ia64/xen-unstable

linux-2.6-xen-sparse/arch/x86_64/mm/fault-xen.c @ 8862:d2f42c284459

Check for the entire kernel range for temporarily cleared PMD entries.

From: Jan Beulich <JBeulich@novell.com>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
author cl349@firebug.cl.cam.ac.uk
date Thu Feb 16 11:30:44 2006 +0000 (2006-02-16)
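
The key change is in do_page_fault(): a clean not-present kernel-mode
fault on any kernel-range address (not just the vmalloc area) is now
routed through vmalloc_fault(), because with writable page tables the
hypervisor may have temporarily cleared the PMD entry. The relevant
check, as it appears in the source below:

	if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
	    address >= PAGE_OFFSET) {
		if (vmalloc_fault(address) < 0)
			goto bad_area_nosemaphore;
		return;
	}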
/*
 *  linux/arch/x86-64/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */
#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/kprobes.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm/kdebug.h>
#include <asm-generic/sections.h>
/* Page fault error code bits */
#define PF_PROT		(1<<0)	/* 0: no page found, 1: protection fault */
#define PF_WRITE	(1<<1)	/* write access */
#define PF_USER		(1<<2)	/* fault originated in user mode */
#define PF_RSVD		(1<<3)	/* reserved bit set in a page-table entry */
#define PF_INSTR	(1<<4)	/* fault was an instruction fetch */
void bust_spinlocks(int yes)
{
	int loglevel_save = console_loglevel;
	if (yes) {
		oops_in_progress = 1;
	} else {
#ifdef CONFIG_VT
		unblank_screen();
#endif
		oops_in_progress = 0;
		/*
		 * OK, the message is on the console.  Now we call printk()
		 * without oops_in_progress set so that printk will give klogd
		 * a poke.  Hold onto your hats...
		 */
		console_loglevel = 15;	/* NMI oopser may have shut the console up */
		printk(" ");
		console_loglevel = loglevel_save;
	}
}
/* Sometimes the CPU reports invalid exceptions on prefetch.
   Check that here and ignore it.
   Opcode checker based on code by Richard Brunner. */
static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
				unsigned long error_code)
{
	unsigned char *instr;
	int scan_more = 1;
	int prefetch = 0;
	unsigned char *max_instr;

	/* If it was an instruction-fetch fault, ignore it. */
	if (error_code & PF_INSTR)
		return 0;

	instr = (unsigned char *)convert_rip_to_linear(current, regs);
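	/* An x86 instruction can be at most 15 bytes long. */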
	max_instr = instr + 15;

	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
		return 0;

	while (scan_more && instr < max_instr) {
		unsigned char opcode;
		unsigned char instr_hi;
		unsigned char instr_lo;

		if (__get_user(opcode, instr))
			break;

		instr_hi = opcode & 0xf0;
		instr_lo = opcode & 0x0f;
		instr++;

		switch (instr_hi) {
		case 0x20:
		case 0x30:
			/* Values 0x26,0x2E,0x36,0x3E are valid x86
			   prefixes.  In long mode, the CPU will signal
			   invalid opcode if some of these prefixes are
			   present, so we will never get here anyway. */
			scan_more = ((instr_lo & 7) == 0x6);
			break;

		case 0x40:
			/* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes.
			   We would need to figure out under what instruction mode
			   the instruction was issued... */
			/* Could check the LDT for lm, but for now it's good
			   enough to assume that long mode only uses well-known
			   segments or the kernel. */
			scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
			break;

		case 0x60:
			/* 0x64 through 0x67 are valid prefixes in all modes. */
			scan_more = (instr_lo & 0xC) == 0x4;
			break;

		case 0xF0:
			/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
			scan_more = !instr_lo || (instr_lo>>1) == 1;
			break;

		case 0x00:
			/* Prefetch instruction is 0x0F0D or 0x0F18 */
			scan_more = 0;
			if (__get_user(opcode, instr))
				break;
			prefetch = (instr_lo == 0xF) &&
				(opcode == 0x0D || opcode == 0x18);
			break;

		default:
			scan_more = 0;
			break;
		}
	}
	return prefetch;
}
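
/* Probe whether an address can be read without faulting, via __get_user(). */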
static int bad_address(void *p)
{
	unsigned long dummy;
	return __get_user(dummy, (unsigned long *)p);
}
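
/*
 * Dump the page-table walk for an address.  Note that under Xen %cr3
 * holds a machine address, which must be translated back to a
 * pseudo-physical address before __va() can be applied to it.
 */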
void dump_pagetable(unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	asm("movq %%cr3,%0" : "=r" (pgd));
	pgd = (pgd_t *)machine_to_phys((maddr_t)pgd);

	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	if (bad_address(pgd)) goto bad;
	printk("PGD %lx ", pgd_val(*pgd));
	if (!pgd_present(*pgd)) goto ret;

	pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
	if (bad_address(pud)) goto bad;
	printk("PUD %lx ", pud_val(*pud));
	if (!pud_present(*pud)) goto ret;

	pmd = pmd_offset(pud, address);
	if (bad_address(pmd)) goto bad;
	printk("PMD %lx ", pmd_val(*pmd));
	if (!pmd_present(*pmd)) goto ret;

	pte = pte_offset_kernel(pmd, address);
	if (bad_address(pte)) goto bad;
	printk("PTE %lx", pte_val(*pte));
ret:
	printk("\n");
	return;
bad:
	printk("BAD\n");
}
static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64-bit RIP register on C-stepping K8.
   Many BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note that we only handle kernel faults here. */

static int is_errata93(struct pt_regs *regs, unsigned long address)
{
	static int warned;
	if (address != regs->rip)
		return 0;
	if ((address >> 32) != 0)
		return 0;
	address |= 0xffffffffUL << 32;
	if ((address >= (u64)_stext && address <= (u64)_etext) ||
	    (address >= MODULES_VADDR && address <= MODULES_END)) {
		if (!warned) {
			printk(errata93_warning);
			warned = 1;
		}
		regs->rip = address;
		return 1;
	}
	return 0;
}
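
/*
 * True when nothing would catch the signal: the task is not ptraced
 * and its handler is SIG_IGN or SIG_DFL (init always counts), so the
 * fault is worth logging.
 */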
int unhandled_signal(struct task_struct *tsk, int sig)
{
	if (tsk->pid == 1)
		return 1;
	if (tsk->ptrace & PT_PTRACED)
		return 0;
	return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
		(tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
}
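
/*
 * A reserved bit was set in a page-table entry, i.e. the page tables
 * are corrupt: dump the entry, oops, and kill the task.
 */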
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
				 unsigned long error_code)
{
	unsigned long flags = oops_begin();
	struct task_struct *tsk;

	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
	       current->comm, address);
	dump_pagetable(address);
	tsk = current;
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	__die("Bad pagetable", regs, error_code);
	oops_end(flags);
	do_exit(SIGKILL);
}
/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
	pgd_t *pgd, *pgd_ref;
	pud_t *pud, *pud_ref;
	pmd_t *pmd, *pmd_ref;
	pte_t *pte, *pte_ref;

	/* Copy kernel mappings over when needed.  This can also
	   happen within a race in page table update.  In the latter
	   case just flush. */

	/* On Xen the line below does not always work.  Needs investigating! */
	/*pgd = pgd_offset(current->mm ?: &init_mm, address);*/
	asm("movq %%cr3,%0" : "=r" (pgd));
	pgd = (pgd_t *)machine_to_phys((maddr_t)pgd);
	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
	pgd += pgd_index(address);
	pgd_ref = pgd_offset_k(address);
	if (pgd_none(*pgd_ref))
		return -1;
	if (pgd_none(*pgd))
		set_pgd(pgd, *pgd_ref);

	/* Below here mismatches are bugs because these lower tables
	   are shared */

	pud = pud_offset(pgd, address);
	pud_ref = pud_offset(pgd_ref, address);
	if (pud_none(*pud_ref))
		return -1;
	if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
		BUG();
	pmd = pmd_offset(pud, address);
	pmd_ref = pmd_offset(pud_ref, address);
	if (pmd_none(*pmd_ref))
		return -1;
	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
		BUG();
	pte_ref = pte_offset_kernel(pmd_ref, address);
	if (!pte_present(*pte_ref))
		return -1;
	pte = pte_offset_kernel(pmd, address);
	/* Don't use pte_page here, because the mappings can point
	   outside mem_map, and the NUMA hash lookup cannot handle
	   that. */
	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
		BUG();
	return 0;
}
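
/*
 * Debugging knobs: page_fault_trace logs every fault (see the
 * "pagefaulttrace" boot parameter at the bottom of this file);
 * exception_trace logs unhandled user-space segfaults.
 */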
int page_fault_trace = 0;
int exception_trace = 1;

#define MEM_VERBOSE 1

#ifdef MEM_VERBOSE
/* Debug helper: tag a printk with the source line it came from. */
#define MEM_LOG(_f, _a...)			\
	printk("fault.c:[%d]-> " _f "\n",	\
	       __LINE__ , ## _a )
#else
#define MEM_LOG(_f, _a...) ((void)0)
#endif
/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
					unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	unsigned long address;
	const struct exception_table_entry *fixup;
	int write;
	unsigned long flags;
	siginfo_t info;

	if (!user_mode(regs))
		error_code &= ~PF_USER; /* means kernel */
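
	/*
	 * Get the faulting address.  Xen does not let the guest read
	 * %cr2 directly; the hypervisor saves the faulting address in
	 * the vcpu_info area of the shared info page instead.
	 */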
	address = HYPERVISOR_shared_info->vcpu_info[
		smp_processor_id()].arch.cr2;

	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
		       SIGSEGV) == NOTIFY_STOP)
		return;

	if (likely(regs->eflags & X86_EFLAGS_IF))
		local_irq_enable();

	if (unlikely(page_fault_trace))
		printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
		       regs->rip, regs->rsp, regs->cs, regs->ss, address, error_code);

	tsk = current;
	mm = tsk->mm;
	info.si_code = SEGV_MAPERR;
	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * ((error_code & PF_USER) == 0), and that the fault was not a
	 * protection error ((error_code & (PF_RSVD|PF_PROT)) == 0).
	 */
	if (unlikely(address >= TASK_SIZE64)) {
		/*
		 * Must check for the entire kernel range here: with writable
		 * page tables the hypervisor may temporarily clear PMD
		 * entries.
		 */
		if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
		    address >= PAGE_OFFSET) {
			if (vmalloc_fault(address) < 0)
				goto bad_area_nosemaphore;
			return;
		}
		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 * fault we could otherwise deadlock.
		 */
		goto bad_area_nosemaphore;
	}
	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(address, regs, error_code);

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (unlikely(in_atomic() || !mm))
		goto bad_area_nosemaphore;
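
	/* init (pid 1) retries from here after yielding in the OOM path below. */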
 again:
	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space; if we cannot, we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
	 */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->rip))
			goto bad_area_nosemaphore;
		down_read(&mm->mmap_sem);
	}
	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
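	/*
	 * The AMD64 ABI reserves a 128-byte "red zone" below %rsp that
	 * user code may touch without adjusting the stack pointer, so
	 * allow accesses that far below %rsp when growing the stack.
	 */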
	if (error_code & PF_USER) {
		/* XXX: align red zone size with ABI */
		if (address + 128 < regs->rsp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & (PF_PROT|PF_WRITE)) {
	default:	/* 3: write, present */
		/* fall through */
	case PF_WRITE:	/* write, not present */
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
		write++;
		break;
	case PF_PROT:	/* read, present */
		goto bad_area;
	case 0:		/* read, not present */
		if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
			goto bad_area;
	}
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, write)) {
	case VM_FAULT_MINOR:
		tsk->min_flt++;
		break;
	case VM_FAULT_MAJOR:
		tsk->maj_flt++;
		break;
	case VM_FAULT_SIGBUS:
		goto do_sigbus;
	default:
		goto out_of_memory;
	}

	up_read(&mm->mmap_sem);
	return;
/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

bad_area_nosemaphore:
	/* User mode accesses just cause a SIGSEGV */
	if (error_code & PF_USER) {
		if (is_prefetch(regs, address, error_code))
			return;

		/* Work around K8 erratum #100: a K8 in compat mode
		   occasionally jumps to illegal addresses >4GB.  We
		   catch this here in the page fault handler because
		   these addresses are not reachable.  Just detect this
		   case and return.  Any code segment in the LDT is
		   compatibility mode. */
		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
		    (address >> 32))
			return;

		if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
			printk(
		       "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
					tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
					tsk->comm, tsk->pid, address, regs->rip,
					regs->rsp, error_code);
		}

		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		info.si_signo = SIGSEGV;
		info.si_errno = 0;
		/* info.si_code has been set above */
		info.si_addr = (void __user *)address;
		force_sig_info(SIGSEGV, &info, tsk);
		return;
	}
no_context:
	/* Are we prepared to handle this kernel fault? */
	fixup = search_exception_tables(regs->rip);
	if (fixup) {
		regs->rip = fixup->fixup;
		return;
	}

	/*
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, address, error_code))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
	flags = oops_begin();

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at %016lx RIP: \n" KERN_ALERT, address);
	printk_address(regs->rip);
	printk("\n");
	dump_pagetable(address);
	tsk->thread.cr2 = address;
	tsk->thread.trap_no = 14;
	tsk->thread.error_code = error_code;
	__die("Oops", regs, error_code);
	/* Executive summary in case the body of the oops scrolled away */
	printk(KERN_EMERG "CR2: %016lx\n", address);
	oops_end(flags);
	do_exit(SIGKILL);
/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
	up_read(&mm->mmap_sem);
	if (current->pid == 1) {
		yield();
		goto again;
	}
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & PF_USER)
		do_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & PF_USER))
		goto no_context;

	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void __user *)address;
	force_sig_info(SIGBUS, &info, tsk);
	return;
}
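
/* Boot with "pagefaulttrace" on the kernel command line to log every fault. */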
static int __init enable_pagefaulttrace(char *str)
{
	page_fault_trace = 1;
	return 0;
}
__setup("pagefaulttrace", enable_pagefaulttrace);