ia64/xen-unstable

linux-2.6-xen-sparse/arch/xen/x86_64/mm/fault.c @ 6552:a9873d384da4

Merge.
author   adsharma@los-vmm.sc.intel.com
date     Thu Aug 25 12:24:48 2005 -0700
parents  112d44270733 fa0754a9f64f
children dfaf788ab18c

/*
 *  linux/arch/x86-64/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
 */

#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>              /* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/kprobes.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm/kdebug.h>
#include <asm-generic/sections.h>
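
/*
 * Xen note: the native handler finds the active page directory by
 * reading CR3; here each CPU tracks it in cur_pgd instead (used by
 * dump_pagetable() and vmalloc_fault() below).
 */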
DEFINE_PER_CPU(pgd_t *, cur_pgd);

void bust_spinlocks(int yes)
{
        int loglevel_save = console_loglevel;
        if (yes) {
                oops_in_progress = 1;
        } else {
#ifdef CONFIG_VT
                unblank_screen();
#endif
                oops_in_progress = 0;
                /*
                 * OK, the message is on the console.  Now we call printk()
                 * without oops_in_progress set so that printk will give klogd
                 * a poke.  Hold onto your hats...
                 */
                console_loglevel = 15;  /* NMI oopser may have shut the console up */
                printk(" ");
                console_loglevel = loglevel_save;
        }
}

/* Sometimes the CPU reports invalid exceptions on prefetch.
   Check that here and ignore it.
   Opcode checker based on code by Richard Brunner. */
static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
                                unsigned long error_code)
{
        unsigned char *instr;
        int scan_more = 1;
        int prefetch = 0;
        unsigned char *max_instr;

        /* If it was an exec fault, ignore it. */
        if (error_code & (1<<4))
                return 0;

        instr = (unsigned char *)convert_rip_to_linear(current, regs);
        max_instr = instr + 15;

        if ((regs->cs & 3) != 0 && instr >= (unsigned char *)TASK_SIZE)
                return 0;

        while (scan_more && instr < max_instr) {
                unsigned char opcode;
                unsigned char instr_hi;
                unsigned char instr_lo;

                if (__get_user(opcode, instr))
                        break;

                instr_hi = opcode & 0xf0;
                instr_lo = opcode & 0x0f;
                instr++;

                switch (instr_hi) {
                case 0x20:
                case 0x30:
                        /* Values 0x26, 0x2E, 0x36 and 0x3E are valid x86
                           segment prefixes.  In long mode, the CPU will signal
                           invalid opcode if some of these prefixes are
                           present, so we will never get here anyway. */
                        scan_more = ((instr_lo & 7) == 0x6);
                        break;

                case 0x40:
                        /* In AMD64 long mode, 0x40 to 0x4F are valid REX
                           prefixes.  We would need to know under which
                           instruction mode the instruction was issued.
                           We could check the LDT for lm, but for now it's
                           good enough to assume that long mode only uses
                           well-known segments or the kernel. */
                        scan_more = ((regs->cs & 3) == 0) || (regs->cs == __USER_CS);
                        break;

                case 0x60:
                        /* 0x64 through 0x67 are valid prefixes in all modes. */
                        scan_more = (instr_lo & 0xC) == 0x4;
                        break;
                case 0xF0:
                        /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
                        scan_more = !instr_lo || (instr_lo>>1) == 1;
                        break;
                case 0x00:
                        /* Prefetch instruction is 0x0F0D or 0x0F18 */
                        scan_more = 0;
                        if (__get_user(opcode, instr))
                                break;
                        prefetch = (instr_lo == 0xF) &&
                                (opcode == 0x0D || opcode == 0x18);
                        break;
                default:
                        scan_more = 0;
                        break;
                }
        }
        return prefetch;
}
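
/*
 * Worked example (illustrative): "prefetchnta (%rax)" encodes as
 * 0x0F 0x18 0x00.  The scanner reads 0x0F (case 0x00 above), then the
 * 0x18 opcode byte, and flags the fault as a prefetch.  With a leading
 * segment prefix such as 0x64 (%fs), case 0x60 keeps scanning first.
 */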

static int bad_address(void *p)
{
        unsigned long dummy;
        return __get_user(dummy, (unsigned long *)p);
}

void dump_pagetable(unsigned long address)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = (pgd_t *)per_cpu(cur_pgd, smp_processor_id());
        pgd += pgd_index(address);

        if (bad_address(pgd)) goto bad;
        printk("PGD %lx ", pgd_val(*pgd));
        if (!pgd_present(*pgd)) goto ret;

        pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
        if (bad_address(pud)) goto bad;
        printk("PUD %lx ", pud_val(*pud));
        if (!pud_present(*pud)) goto ret;

        pmd = pmd_offset(pud, address);
        if (bad_address(pmd)) goto bad;
        printk("PMD %lx ", pmd_val(*pmd));
        if (!pmd_present(*pmd)) goto ret;

        pte = pte_offset_kernel(pmd, address);
        if (bad_address(pte)) goto bad;
        printk("PTE %lx", pte_val(*pte));
ret:
        printk("\n");
        return;
bad:
        printk("BAD\n");
}
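
/*
 * Typical (illustrative) output for a present mapping:
 *     PGD 3001067 PUD 3e023067 PMD 3e05e067 PTE 80000000054c5025
 * The walk ends early with "BAD" if a table entry cannot even be read.
 */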

static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/* Workaround for K8 erratum #93 and buggy BIOSes.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64-bit RIP register on C stepping K8.
   A lot of BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note that we only handle faults in the kernel here. */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
        static int warned;
        if (address != regs->rip)
                return 0;
        if ((address >> 32) != 0)
                return 0;
        address |= 0xffffffffUL << 32;
        if ((address >= (u64)_stext && address <= (u64)_etext) ||
            (address >= MODULES_VADDR && address <= MODULES_END)) {
                if (!warned) {
                        printk(errata93_warning);
                        warned = 1;
                }
                regs->rip = address;
                return 1;
        }
        return 0;
}

int unhandled_signal(struct task_struct *tsk, int sig)
{
        if (tsk->pid == 1)
                return 1;
        /* Warn for strace, but not for gdb */
        if (!test_ti_thread_flag(tsk->thread_info, TIF_SYSCALL_TRACE) &&
            (tsk->ptrace & PT_PTRACED))
                return 0;
        return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
                (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
}

static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
                                 unsigned long error_code)
{
        oops_begin();
        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
               current->comm, address);
        dump_pagetable(address);
        __die("Bad pagetable", regs, error_code);
        oops_end();
        do_exit(SIGKILL);
}

/*
 * Handle a fault on the vmalloc or module mapping area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
        pgd_t *pgd, *pgd_ref;
        pud_t *pud, *pud_ref;
        pmd_t *pmd, *pmd_ref;
        pte_t *pte, *pte_ref;

        /* Copy kernel mappings over when needed.  This can also
           happen within a race in page table update.  In the latter
           case just flush. */

        /* On Xen the line below does not always work. Needs investigating! */
        /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
        pgd = (pgd_t *)per_cpu(cur_pgd, smp_processor_id());
        pgd += pgd_index(address);

        pgd_ref = pgd_offset_k(address);
        if (pgd_none(*pgd_ref))
                return -1;
        if (pgd_none(*pgd))
                set_pgd(pgd, *pgd_ref);

        /* Below here mismatches are bugs because these lower tables
           are shared */

        pud = pud_offset(pgd, address);
        pud_ref = pud_offset(pgd_ref, address);
        if (pud_none(*pud_ref))
                return -1;
        if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
                BUG();
        pmd = pmd_offset(pud, address);
        pmd_ref = pmd_offset(pud_ref, address);
        if (pmd_none(*pmd_ref))
                return -1;
        if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
                BUG();
        pte_ref = pte_offset_kernel(pmd_ref, address);
        if (!pte_present(*pte_ref))
                return -1;
        pte = pte_offset_kernel(pmd, address);
        /* Don't use pte_page here, because the mappings can point
           outside mem_map, and the NUMA hash lookup cannot handle
           that. */
        if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
                BUG();
        __flush_tlb_all();
        return 0;
}
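
/*
 * Illustrative trigger: a task touches a vmalloc()ed address through a
 * pgd that was created before that part of the vmalloc area existed.
 * The missing PGD entry is then copied from the init_mm reference
 * tables above rather than treated as a bad access.
 */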

int page_fault_trace = 0;
int exception_trace = 1;

#define MEM_VERBOSE 1

#ifdef MEM_VERBOSE
#define MEM_LOG(_f, _a...)                      \
        printk("fault.c:[%d]-> " _f "\n",       \
               __LINE__ , ## _a )
#else
#define MEM_LOG(_f, _a...) ((void)0)
#endif
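
/*
 * Usage sketch (hypothetical; MEM_LOG has no callers in this file):
 *     MEM_LOG("unexpected pte %lx", pte_val(*pte));
 * would print "fault.c:[<line>]-> unexpected pte <value>".
 */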

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * error_code:
 *      bit 0 == 0 means no page found, 1 means protection fault
 *      bit 1 == 0 means read, 1 means write
 *      bit 2 == 0 means kernel, 1 means user-mode
 *      bit 3 == 1 means use of a reserved bit was detected
 *      bit 4 == 1 means the fault was an instruction fetch
 */
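/*
 * The same bits written as mask macros, purely for illustration.  The
 * names follow later mainline kernels and are hypothetical here; the
 * code below tests the raw values directly, e.g. (error_code & 4) for
 * a user-mode fault.
 */
#define PF_PROT         (1<<0)          /* protection fault */
#define PF_WRITE        (1<<1)          /* write access */
#define PF_USER         (1<<2)          /* fault from user mode */
#define PF_RSVD         (1<<3)          /* use of reserved bit */
#define PF_INSTR        (1<<4)          /* instruction fetch */
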
asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
                              unsigned long address)
{
        struct task_struct *tsk;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        const struct exception_table_entry *fixup;
        int write;
        siginfo_t info;

        if (!user_mode(regs))
                error_code &= ~4;       /* means kernel */

#ifdef CONFIG_CHECKING
        {
                unsigned long gs;
                struct x8664_pda *pda = cpu_pda + stack_smp_processor_id();
                rdmsrl(MSR_GS_BASE, gs);
                if (gs != (unsigned long)pda) {
                        wrmsrl(MSR_GS_BASE, pda);
                        printk("page_fault: wrong gs %lx expected %p\n", gs, pda);
                }
        }
#endif

        if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
                       SIGSEGV) == NOTIFY_STOP)
                return;

        if (likely(regs->eflags & X86_EFLAGS_IF))
                local_irq_enable();

        if (unlikely(page_fault_trace))
                printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
                       regs->rip, regs->rsp, regs->cs, regs->ss, address, error_code);

        tsk = current;
        mm = tsk->mm;
        info.si_code = SEGV_MAPERR;

        /*
         * We fault-in kernel-space virtual memory on-demand. The
         * 'reference' page table is init_mm.pgd.
         *
         * NOTE! We MUST NOT take any locks for this case. We may
         * be in an interrupt or a critical region, and should
         * only copy the information from the master page table,
         * nothing more.
         *
         * This verifies that the fault happens in kernel space
         * (error_code & 4) == 0, and that the fault was not a
         * protection error (error_code & 1) == 0.
         */
        if (unlikely(address >= TASK_SIZE)) {
                if (!(error_code & 5) &&
                    ((address >= VMALLOC_START && address < VMALLOC_END) ||
                     (address >= MODULES_VADDR && address < MODULES_END))) {
                        if (vmalloc_fault(address) < 0)
                                goto bad_area_nosemaphore;
                        return;
                }
                /*
                 * Don't take the mm semaphore here. If we fixup a prefetch
                 * fault we could otherwise deadlock.
                 */
                goto bad_area_nosemaphore;
        }

        if (unlikely(error_code & (1 << 3)))
                pgtable_bad(address, regs, error_code);

        /*
         * If we're in an interrupt or have no user
         * context, we must not take the fault..
         */
        if (unlikely(in_atomic() || !mm))
                goto bad_area_nosemaphore;

again:
        /* When running in the kernel we expect faults to occur only to
         * addresses in user space.  All other faults represent errors in the
         * kernel and should generate an OOPS.  Unfortunately, in the case of
         * an erroneous fault occurring in a code path which already holds
         * mmap_sem we will deadlock attempting to validate the fault against
         * the address space.  Luckily the kernel only validly references user
         * space from well-defined areas of code, which are listed in the
         * exceptions table.
         *
         * As the vast majority of faults will be valid we will only perform
         * the source reference check when there is a possibility of a
         * deadlock.  Attempt to lock the address space; if we cannot, then
         * validate the source.  If this is invalid we can skip the address
         * space check, thus avoiding the deadlock.
         */
        if (!down_read_trylock(&mm->mmap_sem)) {
                if ((error_code & 4) == 0 &&
                    !search_exception_tables(regs->rip))
                        goto bad_area_nosemaphore;
                down_read(&mm->mmap_sem);
        }

        vma = find_vma(mm, address);
        if (!vma)
                goto bad_area;
        if (likely(vma->vm_start <= address))
                goto good_area;
        if (!(vma->vm_flags & VM_GROWSDOWN))
                goto bad_area;
        if (error_code & 4) {
                /* Accesses up to 128 bytes below %rsp are allowed: that
                   is the x86-64 ABI red zone.
                   XXX: align red zone size with the ABI definition. */
                if (address + 128 < regs->rsp)
                        goto bad_area;
        }
        if (expand_stack(vma, address))
                goto bad_area;
        /*
         * Ok, we have a good vm_area for this memory access, so
         * we can handle it..
         */
good_area:
        info.si_code = SEGV_ACCERR;
        write = 0;
        switch (error_code & 3) {
        default:        /* 3: write, present */
                /* fall through */
        case 2:         /* write, not present */
                if (!(vma->vm_flags & VM_WRITE))
                        goto bad_area;
                write++;
                break;
        case 1:         /* read, present */
                goto bad_area;
        case 0:         /* read, not present */
                if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
                        goto bad_area;
        }

        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.
         */
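        /*
         * handle_mm_fault() here returns the raw values tested below:
         * 1 (VM_FAULT_MINOR), 2 (VM_FAULT_MAJOR), 0 (VM_FAULT_SIGBUS),
         * and a negative value (VM_FAULT_OOM) in this kernel generation.
         */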
        switch (handle_mm_fault(mm, vma, address, write)) {
        case 1:
                tsk->min_flt++;
                break;
        case 2:
                tsk->maj_flt++;
                break;
        case 0:
                goto do_sigbus;
        default:
                goto out_of_memory;
        }

        up_read(&mm->mmap_sem);
        return;

        /*
         * Something tried to access memory that isn't in our memory map..
         * Fix it, but check if it's kernel or user first..
         */
bad_area:
        up_read(&mm->mmap_sem);

bad_area_nosemaphore:
        /* User mode accesses just cause a SIGSEGV */
        if (error_code & 4) {
                if (is_prefetch(regs, address, error_code))
                        return;

                /* Work around K8 erratum #100: a K8 in compat mode
                   occasionally jumps to illegal addresses >4GB.  We
                   catch this here in the page fault handler because
                   these addresses are not reachable.  Just detect this
                   case and return.  Any code segment in the LDT is
                   compatibility mode. */
                if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
                    (address >> 32))
                        return;

                if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
                        printk(
                       "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
                                        tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
                                        tsk->comm, tsk->pid, address, regs->rip,
                                        regs->rsp, error_code);
                }

                tsk->thread.cr2 = address;
                /* Kernel addresses are always protection faults */
                tsk->thread.error_code = error_code | (address >= TASK_SIZE);
                tsk->thread.trap_no = 14;
                info.si_signo = SIGSEGV;
                info.si_errno = 0;
                /* info.si_code has been set above */
                info.si_addr = (void __user *)address;
                force_sig_info(SIGSEGV, &info, tsk);
                return;
        }

no_context:
        /* Are we prepared to handle this kernel fault? */
        fixup = search_exception_tables(regs->rip);
        if (fixup) {
                regs->rip = fixup->fixup;
                return;
        }

        /*
         * Hall of shame of CPU/BIOS bugs.
         */
        if (is_prefetch(regs, address, error_code))
                return;

        if (is_errata93(regs, address))
                return;

        /*
         * Oops. The kernel tried to access some bad page. We'll have to
         * terminate things with extreme prejudice.
         */
        oops_begin();

        if (address < PAGE_SIZE)
                printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
        else
                printk(KERN_ALERT "Unable to handle kernel paging request");
        printk(" at %016lx RIP: \n" KERN_ALERT, address);
        printk_address(regs->rip);
        printk("\n");
        dump_pagetable(address);
        __die("Oops", regs, error_code);
        /* Executive summary in case the body of the oops scrolled away */
        printk(KERN_EMERG "CR2: %016lx\n", address);
        oops_end();
        do_exit(SIGKILL);

        /*
         * We ran out of memory, or some other thing happened to us that made
         * us unable to handle the page fault gracefully.
         */
out_of_memory:
        up_read(&mm->mmap_sem);
        if (current->pid == 1) {
                yield();
                goto again;
        }
        printk("VM: killing process %s\n", tsk->comm);
        if (error_code & 4)
                do_exit(SIGKILL);
        goto no_context;

do_sigbus:
        up_read(&mm->mmap_sem);

        /* Kernel mode? Handle exceptions or die */
        if (!(error_code & 4))
                goto no_context;

        tsk->thread.cr2 = address;
        tsk->thread.error_code = error_code;
        tsk->thread.trap_no = 14;
        info.si_signo = SIGBUS;
        info.si_errno = 0;
        info.si_code = BUS_ADRERR;
        info.si_addr = (void __user *)address;
        force_sig_info(SIGBUS, &info, tsk);
        return;
}