linux-2.6-xen-sparse/arch/x86_64/mm/fault-xen.c @ 11443:af50fb41612c

/*
 *  linux/arch/x86-64/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
 */

#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>              /* For unblank_screen() */
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/kprobes.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/smp.h>
#include <asm/tlbflush.h>
#include <asm/proto.h>
#include <asm/kdebug.h>
#include <asm-generic/sections.h>

/* Page fault error code bits */
#define PF_PROT         (1<<0)          /* or no page found */
#define PF_WRITE        (1<<1)
#define PF_USER         (1<<2)
#define PF_RSVD         (1<<3)
#define PF_INSTR        (1<<4)
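
/*
 * Raise/lower oops_in_progress around oops output: while it is set,
 * printk and the console drivers skip locks that might already be held,
 * and on the way out we poke klogd so the buffered oops text gets logged.
 */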
void bust_spinlocks(int yes)
{
        int loglevel_save = console_loglevel;
        if (yes) {
                oops_in_progress = 1;
        } else {
#ifdef CONFIG_VT
                unblank_screen();
#endif
                oops_in_progress = 0;
                /*
                 * OK, the message is on the console.  Now we call printk()
                 * without oops_in_progress set so that printk will give klogd
                 * a poke.  Hold onto your hats...
                 */
                console_loglevel = 15;  /* NMI oopser may have shut the console up */
                printk(" ");
                console_loglevel = loglevel_save;
        }
}

/* Sometimes the CPU reports invalid exceptions on prefetch.
   Check for that here and ignore it.
   Opcode checker based on code by Richard Brunner. */
static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
                                unsigned long error_code)
{
        unsigned char *instr;
        int scan_more = 1;
        int prefetch = 0;
        unsigned char *max_instr;

        /* If it was an exec fault, ignore. */
        if (error_code & PF_INSTR)
                return 0;

        instr = (unsigned char *)convert_rip_to_linear(current, regs);
        max_instr = instr + 15;

        if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
                return 0;

        while (scan_more && instr < max_instr) {
                unsigned char opcode;
                unsigned char instr_hi;
                unsigned char instr_lo;

                if (__get_user(opcode, instr))
                        break;

                instr_hi = opcode & 0xf0;
                instr_lo = opcode & 0x0f;
                instr++;

                switch (instr_hi) {
                case 0x20:
                case 0x30:
                        /* Values 0x26,0x2E,0x36,0x3E are valid x86
                           prefixes.  In long mode, the CPU will signal
                           invalid opcode if some of these prefixes are
                           present, so we will never get here anyway. */
                        scan_more = ((instr_lo & 7) == 0x6);
                        break;

                case 0x40:
                        /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes.
                           We need to figure out under what instruction mode the
                           instruction was issued ... */
                        /* Could check the LDT for lm, but for now it's good
                           enough to assume that long mode only uses well-known
                           segments or kernel. */
                        scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS);
                        break;

                case 0x60:
                        /* 0x64 thru 0x67 are valid prefixes in all modes. */
                        scan_more = (instr_lo & 0xC) == 0x4;
                        break;
                case 0xF0:
                        /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
                        scan_more = !instr_lo || (instr_lo>>1) == 1;
                        break;
                case 0x00:
                        /* Prefetch instruction is 0x0F0D or 0x0F18 */
                        scan_more = 0;
                        if (__get_user(opcode, instr))
                                break;
                        prefetch = (instr_lo == 0xF) &&
                                   (opcode == 0x0D || opcode == 0x18);
                        break;
                default:
                        scan_more = 0;
                        break;
                }
        }
        return prefetch;
}
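
/* Probe whether an address can be read without faulting; returns
   non-zero if the access faults.  Lets dump_pagetable() walk possibly
   bogus page tables safely. */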
static int bad_address(void *p)
{
        unsigned long dummy;
        return __get_user(dummy, (unsigned long *)p);
}
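
/* Print the page-table walk (PGD/PUD/PMD/PTE) for a faulting address,
   probing each level with bad_address() before dereferencing it. */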
void dump_pagetable(unsigned long address)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
        pgd += pgd_index(address);
        if (bad_address(pgd)) goto bad;
        printk("PGD %lx ", pgd_val(*pgd));
        if (!pgd_present(*pgd)) goto ret;

        pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
        if (bad_address(pud)) goto bad;
        printk("PUD %lx ", pud_val(*pud));
        if (!pud_present(*pud)) goto ret;

        pmd = pmd_offset(pud, address);
        if (bad_address(pmd)) goto bad;
        printk("PMD %lx ", pmd_val(*pmd));
        if (!pmd_present(*pmd)) goto ret;

        pte = pte_offset_kernel(pmd, address);
        if (bad_address(pte)) goto bad;
        printk("PTE %lx", pte_val(*pte));
ret:
        printk("\n");
        return;
bad:
        printk("BAD\n");
}

static const char errata93_warning[] =
KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
KERN_ERR "******* Please consider a BIOS update.\n"
KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";

/* Workaround for K8 erratum #93 & buggy BIOS.
   BIOS SMM functions are required to use a specific workaround
   to avoid corruption of the 64-bit RIP register on C-stepping K8.
   Many BIOSes that didn't get tested properly miss this.
   The OS sees this as a page fault with the upper 32 bits of RIP cleared.
   Try to work around it here.
   Note that we only handle kernel faults here. */

static int is_errata93(struct pt_regs *regs, unsigned long address)
{
        static int warned;
        if (address != regs->rip)
                return 0;
        if ((address >> 32) != 0)
                return 0;
        address |= 0xffffffffUL << 32;
        if ((address >= (u64)_stext && address <= (u64)_etext) ||
            (address >= MODULES_VADDR && address <= MODULES_END)) {
                if (!warned) {
                        printk(errata93_warning);
                        warned = 1;
                }
                regs->rip = address;
                return 1;
        }
        return 0;
}
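
/* Return non-zero if @sig has no user-space handler in @tsk, i.e. a
   fault report for it is worth printing to the log. */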
int unhandled_signal(struct task_struct *tsk, int sig)
{
        if (tsk->pid == 1)
                return 1;
        if (tsk->ptrace & PT_PTRACED)
                return 0;
        return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
               (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
}
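
/* A reserved-bit violation means the page tables themselves are
   corrupted: dump them, oops, and kill the task. */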
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
                                 unsigned long error_code)
{
        unsigned long flags = oops_begin();
        struct task_struct *tsk;

        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
               current->comm, address);
        dump_pagetable(address);
        tsk = current;
        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;
        __die("Bad pagetable", regs, error_code);
        oops_end(flags);
        do_exit(SIGKILL);
}

/*
 * Handle a fault on the vmalloc area
 *
 * This assumes no large pages in there.
 */
static int vmalloc_fault(unsigned long address)
{
        pgd_t *pgd, *pgd_ref;
        pud_t *pud, *pud_ref;
        pmd_t *pmd, *pmd_ref;
        pte_t *pte, *pte_ref;

        /* Copy kernel mappings over when needed.  This can also
           happen within a race in page table update.  In the latter
           case just flush. */

        /* On Xen the line below does not always work.  Needs investigating! */
        /* pgd = pgd_offset(current->mm ?: &init_mm, address); */
        pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
        pgd += pgd_index(address);
        pgd_ref = pgd_offset_k(address);
        if (pgd_none(*pgd_ref))
                return -1;
        if (pgd_none(*pgd))
                set_pgd(pgd, *pgd_ref);

        /* Below here mismatches are bugs because these lower tables
           are shared. */

        pud = pud_offset(pgd, address);
        pud_ref = pud_offset(pgd_ref, address);
        if (pud_none(*pud_ref))
                return -1;
        if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
                BUG();
        pmd = pmd_offset(pud, address);
        pmd_ref = pmd_offset(pud_ref, address);
        if (pmd_none(*pmd_ref))
                return -1;
        if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
                BUG();
        pte_ref = pte_offset_kernel(pmd_ref, address);
        if (!pte_present(*pte_ref))
                return -1;
        pte = pte_offset_kernel(pmd, address);
        /* Don't use pte_page here, because the mappings can point
           outside mem_map, and the NUMA hash lookup cannot handle
           that. */
        if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
                BUG();
        return 0;
}
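
/* Runtime debug knobs: page_fault_trace logs every fault (enabled via
   the "pagefaulttrace" boot parameter below); exception_trace logs
   unhandled user-space segfaults. */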
int page_fault_trace = 0;
int exception_trace = 1;
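
/* Debug logging helper tagged with the source line it was issued from. */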
#define MEM_VERBOSE 1

#ifdef MEM_VERBOSE
#define MEM_LOG(_f, _a...)                      \
        printk("fault.c:[%d]-> " _f "\n",       \
               __LINE__ , ## _a )
#else
#define MEM_LOG(_f, _a...) ((void)0)
#endif
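
/*
 * Decide whether a kernel-space fault is spurious, i.e. whether the
 * current page-table contents already permit the access.  This can
 * happen when a mapping is upgraded (e.g. R/O -> R/W) and a stale TLB
 * entry still raises a fault after the page tables have been fixed up;
 * returns 1 when the fault can simply be ignored and retried.
 */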
static int spurious_fault(struct pt_regs *regs,
                          unsigned long address,
                          unsigned long error_code)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

#ifdef CONFIG_XEN
        /* Faults in hypervisor area are never spurious. */
        if ((address >= HYPERVISOR_VIRT_START) &&
            (address < HYPERVISOR_VIRT_END))
                return 0;
#endif

        /* Reserved-bit violation or user access to kernel space? */
        if (error_code & (PF_RSVD|PF_USER))
                return 0;

        pgd = init_mm.pgd + pgd_index(address);
        if (!pgd_present(*pgd))
                return 0;

        pud = pud_offset(pgd, address);
        if (!pud_present(*pud))
                return 0;

        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
                return 0;

        pte = pte_offset_kernel(pmd, address);
        if (!pte_present(*pte))
                return 0;
        if ((error_code & PF_WRITE) && !pte_write(*pte))
                return 0;
        if ((error_code & PF_INSTR) && (pte_val(*pte) & _PAGE_NX))
                return 0;

        return 1;
}

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
                                        unsigned long error_code)
{
        struct task_struct *tsk;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        unsigned long address;
        const struct exception_table_entry *fixup;
        int write;
        unsigned long flags;
        siginfo_t info;

        if (!user_mode(regs))
                error_code &= ~PF_USER; /* means kernel */

        /* get the address */
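        /* Under Xen the guest cannot read CR2 directly; the hypervisor
           supplies the faulting address through the per-VCPU shared-info
           area instead. */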
        address = HYPERVISOR_shared_info->vcpu_info[
                smp_processor_id()].arch.cr2;
        if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
                       SIGSEGV) == NOTIFY_STOP)
                return;

        if (likely(regs->eflags & X86_EFLAGS_IF))
                local_irq_enable();

        if (unlikely(page_fault_trace))
                printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
                       regs->rip, regs->rsp, regs->cs, regs->ss, address, error_code);

        tsk = current;
        mm = tsk->mm;
        info.si_code = SEGV_MAPERR;

        /*
         * We fault-in kernel-space virtual memory on-demand. The
         * 'reference' page table is init_mm.pgd.
         *
         * NOTE! We MUST NOT take any locks for this case. We may
         * be in an interrupt or a critical region, and should
         * only copy the information from the master page table,
         * nothing more.
         *
         * This verifies that the fault happens in kernel space
         * ((error_code & PF_USER) == 0) and that the fault was not a
         * protection or reserved-bit error
         * ((error_code & (PF_PROT|PF_RSVD)) == 0).
         */
        if (unlikely(address >= TASK_SIZE64)) {
                /*
                 * Don't check for the module range here: its PML4
                 * is always initialized because it's shared with the main
                 * kernel text. Only vmalloc may need PML4 syncups.
                 */
                if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
                    ((address >= VMALLOC_START && address < VMALLOC_END))) {
                        if (vmalloc_fault(address) < 0)
                                goto bad_area_nosemaphore;
                        return;
                }
                /* Can take a spurious fault if a mapping changes R/O -> R/W. */
                if (spurious_fault(regs, address, error_code))
                        return;
                /*
                 * Don't take the mm semaphore here. If we fixup a prefetch
                 * fault we could otherwise deadlock.
                 */
                goto bad_area_nosemaphore;
        }

        if (unlikely(error_code & PF_RSVD))
                pgtable_bad(address, regs, error_code);

        /*
         * If we're in an interrupt or have no user
         * context, we must not take the fault..
         */
        if (unlikely(in_atomic() || !mm))
                goto bad_area_nosemaphore;

again:
        /* When running in the kernel we expect faults to occur only to
         * addresses in user space.  All other faults represent errors in the
         * kernel and should generate an OOPS.  Unfortunately, in the case of an
         * erroneous fault occurring in a code path which already holds mmap_sem
         * we will deadlock attempting to validate the fault against the
         * address space.  Luckily the kernel only validly references user
         * space from well-defined areas of code, which are listed in the
         * exceptions table.
         *
         * As the vast majority of faults will be valid we will only perform
         * the source reference check when there is a possibility of a deadlock.
         * Attempt to lock the address space; if we cannot, we then validate the
         * source.  If this is invalid we can skip the address space check,
         * thus avoiding the deadlock.
         */
        if (!down_read_trylock(&mm->mmap_sem)) {
                if ((error_code & PF_USER) == 0 &&
                    !search_exception_tables(regs->rip))
                        goto bad_area_nosemaphore;
                down_read(&mm->mmap_sem);
        }

        vma = find_vma(mm, address);
        if (!vma)
                goto bad_area;
        if (likely(vma->vm_start <= address))
                goto good_area;
        if (!(vma->vm_flags & VM_GROWSDOWN))
                goto bad_area;
        if (error_code & PF_USER) {
                /* XXX: align red zone size with ABI */
                if (address + 128 < regs->rsp)
                        goto bad_area;
        }
        if (expand_stack(vma, address))
                goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
        info.si_code = SEGV_ACCERR;
        write = 0;
        switch (error_code & (PF_PROT|PF_WRITE)) {
        default:        /* 3: write, present */
                /* fall through */
        case PF_WRITE:  /* write, not present */
                if (!(vma->vm_flags & VM_WRITE))
                        goto bad_area;
                write++;
                break;
        case PF_PROT:   /* read, present */
                goto bad_area;
        case 0:         /* read, not present */
                if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
                        goto bad_area;
        }

        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.
         */
        switch (handle_mm_fault(mm, vma, address, write)) {
        case VM_FAULT_MINOR:
                tsk->min_flt++;
                break;
        case VM_FAULT_MAJOR:
                tsk->maj_flt++;
                break;
        case VM_FAULT_SIGBUS:
                goto do_sigbus;
        default:
                goto out_of_memory;
        }

        up_read(&mm->mmap_sem);
        return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
        up_read(&mm->mmap_sem);

bad_area_nosemaphore:
        /* User mode accesses just cause a SIGSEGV */
        if (error_code & PF_USER) {
                if (is_prefetch(regs, address, error_code))
                        return;

                /* Work around K8 erratum #100: K8 in compat mode
                   occasionally jumps to illegal addresses >4GB.  We
                   catch this here in the page fault handler because
                   these addresses are not reachable.  Just detect this
                   case and return.  Any code segment in the LDT is
                   compatibility mode. */
                if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
                    (address >> 32))
                        return;

                if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
                        printk(
                       "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
                                        tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
                                        tsk->comm, tsk->pid, address, regs->rip,
                                        regs->rsp, error_code);
                }

                tsk->thread.cr2 = address;
                /* Kernel addresses are always protection faults */
                tsk->thread.error_code = error_code | (address >= TASK_SIZE);
                tsk->thread.trap_no = 14;
                info.si_signo = SIGSEGV;
                info.si_errno = 0;
                /* info.si_code has been set above */
                info.si_addr = (void __user *)address;
                force_sig_info(SIGSEGV, &info, tsk);
                return;
        }

no_context:

        /* Are we prepared to handle this kernel fault? */
        fixup = search_exception_tables(regs->rip);
        if (fixup) {
                regs->rip = fixup->fixup;
                return;
        }

        /*
         * Hall of shame of CPU/BIOS bugs.
         */

        if (is_prefetch(regs, address, error_code))
                return;

        if (is_errata93(regs, address))
                return;

        /*
         * Oops. The kernel tried to access some bad page. We'll have to
         * terminate things with extreme prejudice.
         */

        flags = oops_begin();

        if (address < PAGE_SIZE)
                printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
        else
                printk(KERN_ALERT "Unable to handle kernel paging request");
        printk(" at %016lx RIP: \n" KERN_ALERT, address);
        printk_address(regs->rip);
        printk("\n");
        dump_pagetable(address);
        tsk->thread.cr2 = address;
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;
        __die("Oops", regs, error_code);
        /* Executive summary in case the body of the oops scrolled away */
        printk(KERN_EMERG "CR2: %016lx\n", address);
        oops_end(flags);
        do_exit(SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
        up_read(&mm->mmap_sem);
        if (current->pid == 1) {
                yield();
                goto again;
        }
        printk("VM: killing process %s\n", tsk->comm);
        if (error_code & PF_USER)
                do_exit(SIGKILL);
        goto no_context;

do_sigbus:
        up_read(&mm->mmap_sem);

        /* Kernel mode?  Handle exceptions or die */
        if (!(error_code & PF_USER))
                goto no_context;

        tsk->thread.cr2 = address;
        tsk->thread.error_code = error_code;
        tsk->thread.trap_no = 14;
        info.si_signo = SIGBUS;
        info.si_errno = 0;
        info.si_code = BUS_ADRERR;
        info.si_addr = (void __user *)address;
        force_sig_info(SIGBUS, &info, tsk);
        return;
}
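
/* Boot-time switch: booting with "pagefaulttrace" on the kernel command
   line turns on the verbose per-fault log above. */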
static int __init enable_pagefaulttrace(char *str)
{
        page_fault_trace = 1;
        return 0;
}
__setup("pagefaulttrace", enable_pagefaulttrace);