ia64/xen-unstable
changeset 12875:d51e5a7317bb
[LINUX] Kexec: add kexec files to sparse tree.
Signed-off-by: Ian Campbell <ian.campbell@xensource.com>
author   Ian Campbell <ian.campbell@xensource.com>
date     Fri Dec 08 11:47:09 2006 +0000 (2006-12-08)
parents  1db125262365
children 562eee7568a8
files    linux-2.6-xen-sparse/arch/i386/kernel/crash.c
         linux-2.6-xen-sparse/arch/i386/kernel/machine_kexec.c
         linux-2.6-xen-sparse/arch/x86_64/kernel/crash.c
         linux-2.6-xen-sparse/arch/x86_64/kernel/machine_kexec.c
         linux-2.6-xen-sparse/include/asm-i386/kexec.h
         linux-2.6-xen-sparse/include/asm-x86_64/kexec.h
         linux-2.6-xen-sparse/include/linux/kexec.h
         linux-2.6-xen-sparse/kernel/kexec.c
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/crash.c Fri Dec 08 11:47:09 2006 +0000 1.3 @@ -0,0 +1,183 @@ 1.4 +/* 1.5 + * Architecture specific (i386) functions for kexec based crash dumps. 1.6 + * 1.7 + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) 1.8 + * 1.9 + * Copyright (C) IBM Corporation, 2004. All rights reserved. 1.10 + * 1.11 + */ 1.12 + 1.13 +#include <linux/init.h> 1.14 +#include <linux/types.h> 1.15 +#include <linux/kernel.h> 1.16 +#include <linux/smp.h> 1.17 +#include <linux/reboot.h> 1.18 +#include <linux/kexec.h> 1.19 +#include <linux/delay.h> 1.20 +#include <linux/elf.h> 1.21 +#include <linux/elfcore.h> 1.22 + 1.23 +#include <asm/processor.h> 1.24 +#include <asm/hardirq.h> 1.25 +#include <asm/nmi.h> 1.26 +#include <asm/hw_irq.h> 1.27 +#include <asm/apic.h> 1.28 +#include <mach_ipi.h> 1.29 + 1.30 + 1.31 +/* This keeps a track of which one is crashing cpu. */ 1.32 +static int crashing_cpu; 1.33 + 1.34 +static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, 1.35 + size_t data_len) 1.36 +{ 1.37 + struct elf_note note; 1.38 + 1.39 + note.n_namesz = strlen(name) + 1; 1.40 + note.n_descsz = data_len; 1.41 + note.n_type = type; 1.42 + memcpy(buf, ¬e, sizeof(note)); 1.43 + buf += (sizeof(note) +3)/4; 1.44 + memcpy(buf, name, note.n_namesz); 1.45 + buf += (note.n_namesz + 3)/4; 1.46 + memcpy(buf, data, note.n_descsz); 1.47 + buf += (note.n_descsz + 3)/4; 1.48 + 1.49 + return buf; 1.50 +} 1.51 + 1.52 +static void final_note(u32 *buf) 1.53 +{ 1.54 + struct elf_note note; 1.55 + 1.56 + note.n_namesz = 0; 1.57 + note.n_descsz = 0; 1.58 + note.n_type = 0; 1.59 + memcpy(buf, ¬e, sizeof(note)); 1.60 +} 1.61 + 1.62 +static void crash_save_this_cpu(struct pt_regs *regs, int cpu) 1.63 +{ 1.64 + struct elf_prstatus prstatus; 1.65 + u32 *buf; 1.66 + 1.67 + if ((cpu < 0) || (cpu >= NR_CPUS)) 1.68 + return; 1.69 + 1.70 + /* Using ELF notes here is opportunistic. 1.71 + * I need a well defined structure format 1.72 + * for the data I pass, and I need tags 1.73 + * on the data to indicate what information I have 1.74 + * squirrelled away. ELF notes happen to provide 1.75 + * all of that that no need to invent something new. 1.76 + */ 1.77 + buf = (u32*)per_cpu_ptr(crash_notes, cpu); 1.78 + if (!buf) 1.79 + return; 1.80 + memset(&prstatus, 0, sizeof(prstatus)); 1.81 + prstatus.pr_pid = current->pid; 1.82 + elf_core_copy_regs(&prstatus.pr_reg, regs); 1.83 + buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, 1.84 + sizeof(prstatus)); 1.85 + final_note(buf); 1.86 +} 1.87 + 1.88 +static void crash_save_self(struct pt_regs *regs) 1.89 +{ 1.90 + int cpu; 1.91 + 1.92 + cpu = smp_processor_id(); 1.93 + crash_save_this_cpu(regs, cpu); 1.94 +} 1.95 + 1.96 +#ifdef CONFIG_SMP 1.97 +static atomic_t waiting_for_crash_ipi; 1.98 + 1.99 +static int crash_nmi_callback(struct pt_regs *regs, int cpu) 1.100 +{ 1.101 + struct pt_regs fixed_regs; 1.102 + 1.103 + /* Don't do anything if this handler is invoked on crashing cpu. 1.104 + * Otherwise, system will completely hang. Crashing cpu can get 1.105 + * an NMI if system was initially booted with nmi_watchdog parameter. 
1.106 + */ 1.107 + if (cpu == crashing_cpu) 1.108 + return 1; 1.109 + local_irq_disable(); 1.110 + 1.111 + if (!user_mode(regs)) { 1.112 + crash_fixup_ss_esp(&fixed_regs, regs); 1.113 + regs = &fixed_regs; 1.114 + } 1.115 + crash_save_this_cpu(regs, cpu); 1.116 + disable_local_APIC(); 1.117 + atomic_dec(&waiting_for_crash_ipi); 1.118 + /* Assume hlt works */ 1.119 + halt(); 1.120 + for(;;); 1.121 + 1.122 + return 1; 1.123 +} 1.124 + 1.125 +/* 1.126 + * By using the NMI code instead of a vector we just sneak thru the 1.127 + * word generator coming out with just what we want. AND it does 1.128 + * not matter if clustered_apic_mode is set or not. 1.129 + */ 1.130 +static void smp_send_nmi_allbutself(void) 1.131 +{ 1.132 + send_IPI_allbutself(APIC_DM_NMI); 1.133 +} 1.134 + 1.135 +static void nmi_shootdown_cpus(void) 1.136 +{ 1.137 + unsigned long msecs; 1.138 + 1.139 + atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); 1.140 + /* Would it be better to replace the trap vector here? */ 1.141 + set_nmi_callback(crash_nmi_callback); 1.142 + /* Ensure the new callback function is set before sending 1.143 + * out the NMI 1.144 + */ 1.145 + wmb(); 1.146 + 1.147 + smp_send_nmi_allbutself(); 1.148 + 1.149 + msecs = 1000; /* Wait at most a second for the other cpus to stop */ 1.150 + while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { 1.151 + mdelay(1); 1.152 + msecs--; 1.153 + } 1.154 + 1.155 + /* Leave the nmi callback set */ 1.156 + disable_local_APIC(); 1.157 +} 1.158 +#else 1.159 +static void nmi_shootdown_cpus(void) 1.160 +{ 1.161 + /* There are no cpus to shootdown */ 1.162 +} 1.163 +#endif 1.164 + 1.165 +void machine_crash_shutdown(struct pt_regs *regs) 1.166 +{ 1.167 + /* This function is only called after the system 1.168 + * has paniced or is otherwise in a critical state. 1.169 + * The minimum amount of code to allow a kexec'd kernel 1.170 + * to run successfully needs to happen here. 1.171 + * 1.172 + * In practice this means shooting down the other cpus in 1.173 + * an SMP system. 1.174 + */ 1.175 + /* The kernel is broken so disable interrupts */ 1.176 + local_irq_disable(); 1.177 + 1.178 + /* Make a note of crashing cpu. Will be used in NMI callback.*/ 1.179 + crashing_cpu = smp_processor_id(); 1.180 + nmi_shootdown_cpus(); 1.181 + lapic_shutdown(); 1.182 +#if defined(CONFIG_X86_IO_APIC) 1.183 + disable_IO_APIC(); 1.184 +#endif 1.185 + crash_save_self(regs); 1.186 +}
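The append_elf_note()/final_note() pair above packs per-cpu crash state as ordinary ELF notes: a 12-byte header, then the name, then the descriptor, each rounded up to a 4-byte boundary in a u32 buffer. A minimal userspace sketch of that packing scheme follows; it is not kernel code, and the buffer size and NT_PRSTATUS payload are invented for illustration.

/*
 * Userspace sketch of the ELF-note packing used by append_elf_note()
 * and final_note() above.  The payload below is a placeholder, not a
 * real struct elf_prstatus.
 */
#include <elf.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>

static uint32_t *append_note(uint32_t *buf, const char *name,
                             uint32_t type, const void *desc, size_t descsz)
{
	Elf32_Nhdr hdr = {
		.n_namesz = strlen(name) + 1,
		.n_descsz = descsz,
		.n_type   = type,
	};

	memcpy(buf, &hdr, sizeof(hdr));
	buf += (sizeof(hdr) + 3) / 4;          /* header is 12 bytes, already aligned */
	memcpy(buf, name, hdr.n_namesz);
	buf += (hdr.n_namesz + 3) / 4;         /* pad name to a 4-byte boundary */
	memcpy(buf, desc, descsz);
	buf += (descsz + 3) / 4;               /* pad descriptor to a 4-byte boundary */
	return buf;
}

int main(void)
{
	uint32_t notes[256] = { 0 };           /* stand-in for one crash_notes slot */
	uint8_t fake_prstatus[144] = { 0 };    /* placeholder payload, not a real prstatus */
	uint32_t *p = notes;

	p = append_note(p, "CORE", NT_PRSTATUS, fake_prstatus, sizeof(fake_prstatus));
	memset(p, 0, 3 * sizeof(uint32_t));    /* all-zero header == final_note() terminator */

	printf("note buffer uses %zu bytes\n",
	       (size_t)(p + 3 - notes) * sizeof(uint32_t));
	return 0;
}

The three zeroed words at the end play the role of final_note(): a reader stops at the first note whose header is entirely zero.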
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 2.2 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/machine_kexec.c Fri Dec 08 11:47:09 2006 +0000 2.3 @@ -0,0 +1,89 @@ 2.4 +/* 2.5 + * machine_kexec.c - handle transition of Linux booting another kernel 2.6 + * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> 2.7 + * 2.8 + * This source code is licensed under the GNU General Public License, 2.9 + * Version 2. See the file COPYING for more details. 2.10 + */ 2.11 + 2.12 +#include <linux/mm.h> 2.13 +#include <linux/kexec.h> 2.14 +#include <linux/delay.h> 2.15 +#include <asm/pgtable.h> 2.16 +#include <asm/pgalloc.h> 2.17 +#include <asm/tlbflush.h> 2.18 +#include <asm/mmu_context.h> 2.19 +#include <asm/io.h> 2.20 +#include <asm/apic.h> 2.21 +#include <asm/cpufeature.h> 2.22 +#include <asm/desc.h> 2.23 +#include <asm/system.h> 2.24 + 2.25 +#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) 2.26 +static u32 kexec_pgd[1024] PAGE_ALIGNED; 2.27 +#ifdef CONFIG_X86_PAE 2.28 +static u32 kexec_pmd0[1024] PAGE_ALIGNED; 2.29 +static u32 kexec_pmd1[1024] PAGE_ALIGNED; 2.30 +#endif 2.31 +static u32 kexec_pte0[1024] PAGE_ALIGNED; 2.32 +static u32 kexec_pte1[1024] PAGE_ALIGNED; 2.33 + 2.34 +/* 2.35 + * A architecture hook called to validate the 2.36 + * proposed image and prepare the control pages 2.37 + * as needed. The pages for KEXEC_CONTROL_CODE_SIZE 2.38 + * have been allocated, but the segments have yet 2.39 + * been copied into the kernel. 2.40 + * 2.41 + * Do what every setup is needed on image and the 2.42 + * reboot code buffer to allow us to avoid allocations 2.43 + * later. 2.44 + * 2.45 + * Currently nothing. 2.46 + */ 2.47 +int machine_kexec_prepare(struct kimage *image) 2.48 +{ 2.49 + return 0; 2.50 +} 2.51 + 2.52 +/* 2.53 + * Undo anything leftover by machine_kexec_prepare 2.54 + * when an image is freed. 2.55 + */ 2.56 +void machine_kexec_cleanup(struct kimage *image) 2.57 +{ 2.58 +} 2.59 + 2.60 +/* 2.61 + * Do not allocate memory (or fail in any way) in machine_kexec(). 2.62 + * We are past the point of no return, committed to rebooting now. 2.63 + */ 2.64 +NORET_TYPE void machine_kexec(struct kimage *image) 2.65 +{ 2.66 + unsigned long page_list[PAGES_NR]; 2.67 + void *control_page; 2.68 + 2.69 + /* Interrupts aren't acceptable while we reboot */ 2.70 + local_irq_disable(); 2.71 + 2.72 + control_page = page_address(image->control_code_page); 2.73 + memcpy(control_page, relocate_kernel, PAGE_SIZE); 2.74 + 2.75 + page_list[PA_CONTROL_PAGE] = __pa(control_page); 2.76 + page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; 2.77 + page_list[PA_PGD] = __pa(kexec_pgd); 2.78 + page_list[VA_PGD] = (unsigned long)kexec_pgd; 2.79 +#ifdef CONFIG_X86_PAE 2.80 + page_list[PA_PMD_0] = __pa(kexec_pmd0); 2.81 + page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; 2.82 + page_list[PA_PMD_1] = __pa(kexec_pmd1); 2.83 + page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; 2.84 +#endif 2.85 + page_list[PA_PTE_0] = __pa(kexec_pte0); 2.86 + page_list[VA_PTE_0] = (unsigned long)kexec_pte0; 2.87 + page_list[PA_PTE_1] = __pa(kexec_pte1); 2.88 + page_list[VA_PTE_1] = (unsigned long)kexec_pte1; 2.89 + 2.90 + relocate_kernel((unsigned long)image->head, (unsigned long)page_list, 2.91 + image->start, cpu_has_pae); 2.92 +}
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 3.2 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/crash.c Fri Dec 08 11:47:09 2006 +0000 3.3 @@ -0,0 +1,186 @@ 3.4 +/* 3.5 + * Architecture specific (x86_64) functions for kexec based crash dumps. 3.6 + * 3.7 + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) 3.8 + * 3.9 + * Copyright (C) IBM Corporation, 2004. All rights reserved. 3.10 + * 3.11 + */ 3.12 + 3.13 +#include <linux/init.h> 3.14 +#include <linux/types.h> 3.15 +#include <linux/kernel.h> 3.16 +#include <linux/smp.h> 3.17 +#include <linux/irq.h> 3.18 +#include <linux/reboot.h> 3.19 +#include <linux/kexec.h> 3.20 +#include <linux/delay.h> 3.21 +#include <linux/elf.h> 3.22 +#include <linux/elfcore.h> 3.23 + 3.24 +#include <asm/processor.h> 3.25 +#include <asm/hardirq.h> 3.26 +#include <asm/nmi.h> 3.27 +#include <asm/hw_irq.h> 3.28 +#include <asm/mach_apic.h> 3.29 + 3.30 +/* This keeps a track of which one is crashing cpu. */ 3.31 +static int crashing_cpu; 3.32 + 3.33 +static u32 *append_elf_note(u32 *buf, char *name, unsigned type, 3.34 + void *data, size_t data_len) 3.35 +{ 3.36 + struct elf_note note; 3.37 + 3.38 + note.n_namesz = strlen(name) + 1; 3.39 + note.n_descsz = data_len; 3.40 + note.n_type = type; 3.41 + memcpy(buf, ¬e, sizeof(note)); 3.42 + buf += (sizeof(note) +3)/4; 3.43 + memcpy(buf, name, note.n_namesz); 3.44 + buf += (note.n_namesz + 3)/4; 3.45 + memcpy(buf, data, note.n_descsz); 3.46 + buf += (note.n_descsz + 3)/4; 3.47 + 3.48 + return buf; 3.49 +} 3.50 + 3.51 +static void final_note(u32 *buf) 3.52 +{ 3.53 + struct elf_note note; 3.54 + 3.55 + note.n_namesz = 0; 3.56 + note.n_descsz = 0; 3.57 + note.n_type = 0; 3.58 + memcpy(buf, ¬e, sizeof(note)); 3.59 +} 3.60 + 3.61 +static void crash_save_this_cpu(struct pt_regs *regs, int cpu) 3.62 +{ 3.63 + struct elf_prstatus prstatus; 3.64 + u32 *buf; 3.65 + 3.66 + if ((cpu < 0) || (cpu >= NR_CPUS)) 3.67 + return; 3.68 + 3.69 + /* Using ELF notes here is opportunistic. 3.70 + * I need a well defined structure format 3.71 + * for the data I pass, and I need tags 3.72 + * on the data to indicate what information I have 3.73 + * squirrelled away. ELF notes happen to provide 3.74 + * all of that that no need to invent something new. 3.75 + */ 3.76 + 3.77 + buf = (u32*)per_cpu_ptr(crash_notes, cpu); 3.78 + 3.79 + if (!buf) 3.80 + return; 3.81 + 3.82 + memset(&prstatus, 0, sizeof(prstatus)); 3.83 + prstatus.pr_pid = current->pid; 3.84 + elf_core_copy_regs(&prstatus.pr_reg, regs); 3.85 + buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, 3.86 + sizeof(prstatus)); 3.87 + final_note(buf); 3.88 +} 3.89 + 3.90 +static void crash_save_self(struct pt_regs *regs) 3.91 +{ 3.92 + int cpu; 3.93 + 3.94 + cpu = smp_processor_id(); 3.95 + crash_save_this_cpu(regs, cpu); 3.96 +} 3.97 + 3.98 +#ifdef CONFIG_SMP 3.99 +static atomic_t waiting_for_crash_ipi; 3.100 + 3.101 +static int crash_nmi_callback(struct pt_regs *regs, int cpu) 3.102 +{ 3.103 + /* 3.104 + * Don't do anything if this handler is invoked on crashing cpu. 3.105 + * Otherwise, system will completely hang. Crashing cpu can get 3.106 + * an NMI if system was initially booted with nmi_watchdog parameter. 
3.107 + */ 3.108 + if (cpu == crashing_cpu) 3.109 + return 1; 3.110 + local_irq_disable(); 3.111 + 3.112 + crash_save_this_cpu(regs, cpu); 3.113 + disable_local_APIC(); 3.114 + atomic_dec(&waiting_for_crash_ipi); 3.115 + /* Assume hlt works */ 3.116 + for(;;) 3.117 + asm("hlt"); 3.118 + 3.119 + return 1; 3.120 +} 3.121 + 3.122 +static void smp_send_nmi_allbutself(void) 3.123 +{ 3.124 + send_IPI_allbutself(APIC_DM_NMI); 3.125 +} 3.126 + 3.127 +/* 3.128 + * This code is a best effort heuristic to get the 3.129 + * other cpus to stop executing. So races with 3.130 + * cpu hotplug shouldn't matter. 3.131 + */ 3.132 + 3.133 +static void nmi_shootdown_cpus(void) 3.134 +{ 3.135 + unsigned long msecs; 3.136 + 3.137 + atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); 3.138 + set_nmi_callback(crash_nmi_callback); 3.139 + 3.140 + /* 3.141 + * Ensure the new callback function is set before sending 3.142 + * out the NMI 3.143 + */ 3.144 + wmb(); 3.145 + 3.146 + smp_send_nmi_allbutself(); 3.147 + 3.148 + msecs = 1000; /* Wait at most a second for the other cpus to stop */ 3.149 + while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { 3.150 + mdelay(1); 3.151 + msecs--; 3.152 + } 3.153 + /* Leave the nmi callback set */ 3.154 + disable_local_APIC(); 3.155 +} 3.156 +#else 3.157 +static void nmi_shootdown_cpus(void) 3.158 +{ 3.159 + /* There are no cpus to shootdown */ 3.160 +} 3.161 +#endif 3.162 + 3.163 +void machine_crash_shutdown(struct pt_regs *regs) 3.164 +{ 3.165 + /* 3.166 + * This function is only called after the system 3.167 + * has paniced or is otherwise in a critical state. 3.168 + * The minimum amount of code to allow a kexec'd kernel 3.169 + * to run successfully needs to happen here. 3.170 + * 3.171 + * In practice this means shooting down the other cpus in 3.172 + * an SMP system. 3.173 + */ 3.174 + /* The kernel is broken so disable interrupts */ 3.175 + local_irq_disable(); 3.176 + 3.177 + /* Make a note of crashing cpu. Will be used in NMI callback.*/ 3.178 + crashing_cpu = smp_processor_id(); 3.179 + nmi_shootdown_cpus(); 3.180 + 3.181 + if(cpu_has_apic) 3.182 + disable_local_APIC(); 3.183 + 3.184 +#if defined(CONFIG_X86_IO_APIC) 3.185 + disable_IO_APIC(); 3.186 +#endif 3.187 + 3.188 + crash_save_self(regs); 3.189 +}
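nmi_shootdown_cpus() above stops the other cpus with a simple bounded poll: a counter seeded with num_online_cpus() - 1, decremented by each cpu from its NMI handler, and polled for at most about a second before giving up. Below is a hedged userspace analogue of that pattern using C11 atomics and pthreads; the thread count and sleep interval are arbitrary and only illustrate the bounded-wait shape, not the NMI mechanics.

/*
 * Userspace analogue (illustration only) of the bounded wait in
 * nmi_shootdown_cpus(): responders decrement a shared counter, the
 * initiator polls for a limited number of iterations.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

#define NR_OTHERS 3

static atomic_int waiting = NR_OTHERS;

static void *responder(void *arg)
{
	(void)arg;
	usleep(2000);                     /* pretend to save state and halt */
	atomic_fetch_sub(&waiting, 1);    /* like atomic_dec(&waiting_for_crash_ipi) */
	return NULL;
}

int main(void)
{
	pthread_t tid[NR_OTHERS];
	int msecs = 1000;                 /* wait at most ~a second, as in the kernel code */

	for (int i = 0; i < NR_OTHERS; i++)
		pthread_create(&tid[i], NULL, responder, NULL);

	while (atomic_load(&waiting) > 0 && msecs) {
		usleep(1000);             /* stand-in for mdelay(1) */
		msecs--;
	}
	printf("%d responders still pending\n", atomic_load(&waiting));

	for (int i = 0; i < NR_OTHERS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}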
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 4.2 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/machine_kexec.c Fri Dec 08 11:47:09 2006 +0000 4.3 @@ -0,0 +1,173 @@ 4.4 +/* 4.5 + * machine_kexec.c - handle transition of Linux booting another kernel 4.6 + * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> 4.7 + * 4.8 + * This source code is licensed under the GNU General Public License, 4.9 + * Version 2. See the file COPYING for more details. 4.10 + */ 4.11 + 4.12 +#include <linux/mm.h> 4.13 +#include <linux/kexec.h> 4.14 +#include <linux/string.h> 4.15 +#include <linux/reboot.h> 4.16 +#include <asm/pgtable.h> 4.17 +#include <asm/tlbflush.h> 4.18 +#include <asm/mmu_context.h> 4.19 +#include <asm/io.h> 4.20 + 4.21 +#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) 4.22 +static u64 kexec_pgd[512] PAGE_ALIGNED; 4.23 +static u64 kexec_pud0[512] PAGE_ALIGNED; 4.24 +static u64 kexec_pmd0[512] PAGE_ALIGNED; 4.25 +static u64 kexec_pte0[512] PAGE_ALIGNED; 4.26 +static u64 kexec_pud1[512] PAGE_ALIGNED; 4.27 +static u64 kexec_pmd1[512] PAGE_ALIGNED; 4.28 +static u64 kexec_pte1[512] PAGE_ALIGNED; 4.29 + 4.30 +static void init_level2_page(pmd_t *level2p, unsigned long addr) 4.31 +{ 4.32 + unsigned long end_addr; 4.33 + 4.34 + addr &= PAGE_MASK; 4.35 + end_addr = addr + PUD_SIZE; 4.36 + while (addr < end_addr) { 4.37 + set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); 4.38 + addr += PMD_SIZE; 4.39 + } 4.40 +} 4.41 + 4.42 +static int init_level3_page(struct kimage *image, pud_t *level3p, 4.43 + unsigned long addr, unsigned long last_addr) 4.44 +{ 4.45 + unsigned long end_addr; 4.46 + int result; 4.47 + 4.48 + result = 0; 4.49 + addr &= PAGE_MASK; 4.50 + end_addr = addr + PGDIR_SIZE; 4.51 + while ((addr < last_addr) && (addr < end_addr)) { 4.52 + struct page *page; 4.53 + pmd_t *level2p; 4.54 + 4.55 + page = kimage_alloc_control_pages(image, 0); 4.56 + if (!page) { 4.57 + result = -ENOMEM; 4.58 + goto out; 4.59 + } 4.60 + level2p = (pmd_t *)page_address(page); 4.61 + init_level2_page(level2p, addr); 4.62 + set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); 4.63 + addr += PUD_SIZE; 4.64 + } 4.65 + /* clear the unused entries */ 4.66 + while (addr < end_addr) { 4.67 + pud_clear(level3p++); 4.68 + addr += PUD_SIZE; 4.69 + } 4.70 +out: 4.71 + return result; 4.72 +} 4.73 + 4.74 + 4.75 +static int init_level4_page(struct kimage *image, pgd_t *level4p, 4.76 + unsigned long addr, unsigned long last_addr) 4.77 +{ 4.78 + unsigned long end_addr; 4.79 + int result; 4.80 + 4.81 + result = 0; 4.82 + addr &= PAGE_MASK; 4.83 + end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE); 4.84 + while ((addr < last_addr) && (addr < end_addr)) { 4.85 + struct page *page; 4.86 + pud_t *level3p; 4.87 + 4.88 + page = kimage_alloc_control_pages(image, 0); 4.89 + if (!page) { 4.90 + result = -ENOMEM; 4.91 + goto out; 4.92 + } 4.93 + level3p = (pud_t *)page_address(page); 4.94 + result = init_level3_page(image, level3p, addr, last_addr); 4.95 + if (result) { 4.96 + goto out; 4.97 + } 4.98 + set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); 4.99 + addr += PGDIR_SIZE; 4.100 + } 4.101 + /* clear the unused entries */ 4.102 + while (addr < end_addr) { 4.103 + pgd_clear(level4p++); 4.104 + addr += PGDIR_SIZE; 4.105 + } 4.106 +out: 4.107 + return result; 4.108 +} 4.109 + 4.110 + 4.111 +static int init_pgtable(struct kimage *image, unsigned long start_pgtable) 4.112 +{ 4.113 + pgd_t *level4p; 4.114 + level4p = (pgd_t *)__va(start_pgtable); 4.115 + return init_level4_page(image, level4p, 0, end_pfn 
<< PAGE_SHIFT); 4.116 +} 4.117 + 4.118 +int machine_kexec_prepare(struct kimage *image) 4.119 +{ 4.120 + unsigned long start_pgtable; 4.121 + int result; 4.122 + 4.123 + /* Calculate the offsets */ 4.124 + start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; 4.125 + 4.126 + /* Setup the identity mapped 64bit page table */ 4.127 + result = init_pgtable(image, start_pgtable); 4.128 + if (result) 4.129 + return result; 4.130 + 4.131 + return 0; 4.132 +} 4.133 + 4.134 +void machine_kexec_cleanup(struct kimage *image) 4.135 +{ 4.136 + return; 4.137 +} 4.138 + 4.139 +/* 4.140 + * Do not allocate memory (or fail in any way) in machine_kexec(). 4.141 + * We are past the point of no return, committed to rebooting now. 4.142 + */ 4.143 +NORET_TYPE void machine_kexec(struct kimage *image) 4.144 +{ 4.145 + unsigned long page_list[PAGES_NR]; 4.146 + void *control_page; 4.147 + 4.148 + /* Interrupts aren't acceptable while we reboot */ 4.149 + local_irq_disable(); 4.150 + 4.151 + control_page = page_address(image->control_code_page) + PAGE_SIZE; 4.152 + memcpy(control_page, relocate_kernel, PAGE_SIZE); 4.153 + 4.154 + page_list[PA_CONTROL_PAGE] = __pa(control_page); 4.155 + page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; 4.156 + page_list[PA_PGD] = __pa(kexec_pgd); 4.157 + page_list[VA_PGD] = (unsigned long)kexec_pgd; 4.158 + page_list[PA_PUD_0] = __pa(kexec_pud0); 4.159 + page_list[VA_PUD_0] = (unsigned long)kexec_pud0; 4.160 + page_list[PA_PMD_0] = __pa(kexec_pmd0); 4.161 + page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; 4.162 + page_list[PA_PTE_0] = __pa(kexec_pte0); 4.163 + page_list[VA_PTE_0] = (unsigned long)kexec_pte0; 4.164 + page_list[PA_PUD_1] = __pa(kexec_pud1); 4.165 + page_list[VA_PUD_1] = (unsigned long)kexec_pud1; 4.166 + page_list[PA_PMD_1] = __pa(kexec_pmd1); 4.167 + page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; 4.168 + page_list[PA_PTE_1] = __pa(kexec_pte1); 4.169 + page_list[VA_PTE_1] = (unsigned long)kexec_pte1; 4.170 + 4.171 + page_list[PA_TABLE_PAGE] = 4.172 + (unsigned long)__pa(page_address(image->control_code_page)); 4.173 + 4.174 + relocate_kernel((unsigned long)image->head, (unsigned long)page_list, 4.175 + image->start); 4.176 +}
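init_level2_page() above identity-maps one PUD's worth of memory (1 GiB) with 512 large-page PMD entries of 2 MiB each, every entry being the physical address OR'ed with the large-page kernel flags. The standalone sketch below repeats that arithmetic in userspace; the flag value is a placeholder rather than the real __PAGE_KERNEL_LARGE_EXEC bits.

/*
 * Userspace sketch of the identity-mapping idea in init_level2_page():
 * one level-2 (PMD) table of 512 entries covers 1 GiB with 2 MiB pages.
 */
#include <stdint.h>
#include <stdio.h>

#define PMD_SIZE   (2ULL << 20)          /* 2 MiB per large page */
#define PUD_SIZE   (1ULL << 30)          /* 1 GiB covered by one PMD table */
#define FAKE_FLAGS 0xE3ULL               /* placeholder for the large-page kernel flags */

static void init_level2(uint64_t *pmd, uint64_t addr)
{
	uint64_t end = addr + PUD_SIZE;

	while (addr < end) {
		*pmd++ = addr | FAKE_FLAGS;  /* identity map: virtual == physical */
		addr += PMD_SIZE;
	}
}

int main(void)
{
	static uint64_t pmd[512];            /* one page-sized table of 8-byte entries */

	init_level2(pmd, 0);                 /* map the first 1 GiB 1:1 */
	printf("entry[0]   = %#llx\n", (unsigned long long)pmd[0]);
	printf("entry[511] = %#llx\n", (unsigned long long)pmd[511]);
	return 0;
}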
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 5.2 +++ b/linux-2.6-xen-sparse/include/asm-i386/kexec.h Fri Dec 08 11:47:09 2006 +0000 5.3 @@ -0,0 +1,103 @@ 5.4 +#ifndef _I386_KEXEC_H 5.5 +#define _I386_KEXEC_H 5.6 + 5.7 +#define PA_CONTROL_PAGE 0 5.8 +#define VA_CONTROL_PAGE 1 5.9 +#define PA_PGD 2 5.10 +#define VA_PGD 3 5.11 +#define PA_PTE_0 4 5.12 +#define VA_PTE_0 5 5.13 +#define PA_PTE_1 6 5.14 +#define VA_PTE_1 7 5.15 +#ifdef CONFIG_X86_PAE 5.16 +#define PA_PMD_0 8 5.17 +#define VA_PMD_0 9 5.18 +#define PA_PMD_1 10 5.19 +#define VA_PMD_1 11 5.20 +#define PAGES_NR 12 5.21 +#else 5.22 +#define PAGES_NR 8 5.23 +#endif 5.24 + 5.25 +#ifndef __ASSEMBLY__ 5.26 + 5.27 +#include <asm/fixmap.h> 5.28 +#include <asm/ptrace.h> 5.29 +#include <asm/string.h> 5.30 + 5.31 +/* 5.32 + * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. 5.33 + * I.e. Maximum page that is mapped directly into kernel memory, 5.34 + * and kmap is not required. 5.35 + * 5.36 + * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct 5.37 + * calculation for the amount of memory directly mappable into the 5.38 + * kernel memory space. 5.39 + */ 5.40 + 5.41 +/* Maximum physical address we can use pages from */ 5.42 +#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL) 5.43 +/* Maximum address we can reach in physical address mode */ 5.44 +#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL) 5.45 +/* Maximum address we can use for the control code buffer */ 5.46 +#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE 5.47 + 5.48 +#define KEXEC_CONTROL_CODE_SIZE 4096 5.49 + 5.50 +/* The native architecture */ 5.51 +#define KEXEC_ARCH KEXEC_ARCH_386 5.52 + 5.53 +#define MAX_NOTE_BYTES 1024 5.54 + 5.55 +/* CPU does not save ss and esp on stack if execution is already 5.56 + * running in kernel mode at the time of NMI occurrence. This code 5.57 + * fixes it. 5.58 + */ 5.59 +static inline void crash_fixup_ss_esp(struct pt_regs *newregs, 5.60 + struct pt_regs *oldregs) 5.61 +{ 5.62 + memcpy(newregs, oldregs, sizeof(*newregs)); 5.63 + newregs->esp = (unsigned long)&(oldregs->esp); 5.64 + __asm__ __volatile__( 5.65 + "xorl %%eax, %%eax\n\t" 5.66 + "movw %%ss, %%ax\n\t" 5.67 + :"=a"(newregs->xss)); 5.68 +} 5.69 + 5.70 +/* 5.71 + * This function is responsible for capturing register states if coming 5.72 + * via panic otherwise just fix up the ss and esp if coming via kernel 5.73 + * mode exception. 
5.74 + */ 5.75 +static inline void crash_setup_regs(struct pt_regs *newregs, 5.76 + struct pt_regs *oldregs) 5.77 +{ 5.78 + if (oldregs) 5.79 + crash_fixup_ss_esp(newregs, oldregs); 5.80 + else { 5.81 + __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->ebx)); 5.82 + __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->ecx)); 5.83 + __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->edx)); 5.84 + __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->esi)); 5.85 + __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->edi)); 5.86 + __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->ebp)); 5.87 + __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->eax)); 5.88 + __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->esp)); 5.89 + __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(newregs->xss)); 5.90 + __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(newregs->xcs)); 5.91 + __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(newregs->xds)); 5.92 + __asm__ __volatile__("movw %%es, %%ax;" :"=a"(newregs->xes)); 5.93 + __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->eflags)); 5.94 + 5.95 + newregs->eip = (unsigned long)current_text_addr(); 5.96 + } 5.97 +} 5.98 +asmlinkage NORET_TYPE void 5.99 +relocate_kernel(unsigned long indirection_page, 5.100 + unsigned long control_page, 5.101 + unsigned long start_address, 5.102 + unsigned int has_pae) ATTRIB_NORET; 5.103 + 5.104 +#endif /* __ASSEMBLY__ */ 5.105 + 5.106 +#endif /* _I386_KEXEC_H */
6.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 6.2 +++ b/linux-2.6-xen-sparse/include/asm-x86_64/kexec.h Fri Dec 08 11:47:09 2006 +0000 6.3 @@ -0,0 +1,96 @@ 6.4 +#ifndef _X86_64_KEXEC_H 6.5 +#define _X86_64_KEXEC_H 6.6 + 6.7 +#define PA_CONTROL_PAGE 0 6.8 +#define VA_CONTROL_PAGE 1 6.9 +#define PA_PGD 2 6.10 +#define VA_PGD 3 6.11 +#define PA_PUD_0 4 6.12 +#define VA_PUD_0 5 6.13 +#define PA_PMD_0 6 6.14 +#define VA_PMD_0 7 6.15 +#define PA_PTE_0 8 6.16 +#define VA_PTE_0 9 6.17 +#define PA_PUD_1 10 6.18 +#define VA_PUD_1 11 6.19 +#define PA_PMD_1 12 6.20 +#define VA_PMD_1 13 6.21 +#define PA_PTE_1 14 6.22 +#define VA_PTE_1 15 6.23 +#define PA_TABLE_PAGE 16 6.24 +#define PAGES_NR 17 6.25 + 6.26 +#ifndef __ASSEMBLY__ 6.27 + 6.28 +#include <linux/string.h> 6.29 + 6.30 +#include <asm/page.h> 6.31 +#include <asm/ptrace.h> 6.32 + 6.33 +/* 6.34 + * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. 6.35 + * I.e. Maximum page that is mapped directly into kernel memory, 6.36 + * and kmap is not required. 6.37 + * 6.38 + * So far x86_64 is limited to 40 physical address bits. 6.39 + */ 6.40 + 6.41 +/* Maximum physical address we can use pages from */ 6.42 +#define KEXEC_SOURCE_MEMORY_LIMIT (0xFFFFFFFFFFUL) 6.43 +/* Maximum address we can reach in physical address mode */ 6.44 +#define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL) 6.45 +/* Maximum address we can use for the control pages */ 6.46 +#define KEXEC_CONTROL_MEMORY_LIMIT (0xFFFFFFFFFFUL) 6.47 + 6.48 +/* Allocate one page for the pdp and the second for the code */ 6.49 +#define KEXEC_CONTROL_CODE_SIZE (4096UL + 4096UL) 6.50 + 6.51 +/* The native architecture */ 6.52 +#define KEXEC_ARCH KEXEC_ARCH_X86_64 6.53 + 6.54 +#define MAX_NOTE_BYTES 1024 6.55 + 6.56 +/* 6.57 + * Saving the registers of the cpu on which panic occured in 6.58 + * crash_kexec to save a valid sp. The registers of other cpus 6.59 + * will be saved in machine_crash_shutdown while shooting down them. 
6.60 + */ 6.61 + 6.62 +static inline void crash_setup_regs(struct pt_regs *newregs, 6.63 + struct pt_regs *oldregs) 6.64 +{ 6.65 + if (oldregs) 6.66 + memcpy(newregs, oldregs, sizeof(*newregs)); 6.67 + else { 6.68 + __asm__ __volatile__("movq %%rbx,%0" : "=m"(newregs->rbx)); 6.69 + __asm__ __volatile__("movq %%rcx,%0" : "=m"(newregs->rcx)); 6.70 + __asm__ __volatile__("movq %%rdx,%0" : "=m"(newregs->rdx)); 6.71 + __asm__ __volatile__("movq %%rsi,%0" : "=m"(newregs->rsi)); 6.72 + __asm__ __volatile__("movq %%rdi,%0" : "=m"(newregs->rdi)); 6.73 + __asm__ __volatile__("movq %%rbp,%0" : "=m"(newregs->rbp)); 6.74 + __asm__ __volatile__("movq %%rax,%0" : "=m"(newregs->rax)); 6.75 + __asm__ __volatile__("movq %%rsp,%0" : "=m"(newregs->rsp)); 6.76 + __asm__ __volatile__("movq %%r8,%0" : "=m"(newregs->r8)); 6.77 + __asm__ __volatile__("movq %%r9,%0" : "=m"(newregs->r9)); 6.78 + __asm__ __volatile__("movq %%r10,%0" : "=m"(newregs->r10)); 6.79 + __asm__ __volatile__("movq %%r11,%0" : "=m"(newregs->r11)); 6.80 + __asm__ __volatile__("movq %%r12,%0" : "=m"(newregs->r12)); 6.81 + __asm__ __volatile__("movq %%r13,%0" : "=m"(newregs->r13)); 6.82 + __asm__ __volatile__("movq %%r14,%0" : "=m"(newregs->r14)); 6.83 + __asm__ __volatile__("movq %%r15,%0" : "=m"(newregs->r15)); 6.84 + __asm__ __volatile__("movl %%ss, %%eax;" :"=a"(newregs->ss)); 6.85 + __asm__ __volatile__("movl %%cs, %%eax;" :"=a"(newregs->cs)); 6.86 + __asm__ __volatile__("pushfq; popq %0" :"=m"(newregs->eflags)); 6.87 + 6.88 + newregs->rip = (unsigned long)current_text_addr(); 6.89 + } 6.90 +} 6.91 + 6.92 +NORET_TYPE void 6.93 +relocate_kernel(unsigned long indirection_page, 6.94 + unsigned long page_list, 6.95 + unsigned long start_address) ATTRIB_NORET; 6.96 + 6.97 +#endif /* __ASSEMBLY__ */ 6.98 + 6.99 +#endif /* _X86_64_KEXEC_H */
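crash_setup_regs() above snapshots the live register file with one single-instruction asm statement per register when no exception frame is available. The userspace sketch below shows the same idiom; it only builds on x86_64 with GCC-style inline asm, and mini_regs is a made-up subset, not the kernel's pt_regs.

/*
 * Userspace sketch of the per-register snapshot idiom used by
 * crash_setup_regs().  x86_64 only; struct and field choice invented.
 */
#include <stdio.h>

struct mini_regs {
	unsigned long rsp;
	unsigned long rbp;
	unsigned long rax;
	unsigned long eflags;
};

static void capture_regs(struct mini_regs *r)
{
	__asm__ __volatile__("movq %%rsp,%0" : "=m"(r->rsp));
	__asm__ __volatile__("movq %%rbp,%0" : "=m"(r->rbp));
	__asm__ __volatile__("movq %%rax,%0" : "=m"(r->rax));
	__asm__ __volatile__("pushfq; popq %0" : "=m"(r->eflags));
}

int main(void)
{
	struct mini_regs r;

	capture_regs(&r);
	printf("rsp=%#lx rbp=%#lx rax=%#lx eflags=%#lx\n",
	       r.rsp, r.rbp, r.rax, r.eflags);
	return 0;
}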
7.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 7.2 +++ b/linux-2.6-xen-sparse/include/linux/kexec.h Fri Dec 08 11:47:09 2006 +0000 7.3 @@ -0,0 +1,139 @@ 7.4 +#ifndef LINUX_KEXEC_H 7.5 +#define LINUX_KEXEC_H 7.6 + 7.7 +#ifdef CONFIG_KEXEC 7.8 +#include <linux/types.h> 7.9 +#include <linux/list.h> 7.10 +#include <linux/linkage.h> 7.11 +#include <linux/compat.h> 7.12 +#include <linux/ioport.h> 7.13 +#include <asm/kexec.h> 7.14 + 7.15 +/* Verify architecture specific macros are defined */ 7.16 + 7.17 +#ifndef KEXEC_SOURCE_MEMORY_LIMIT 7.18 +#error KEXEC_SOURCE_MEMORY_LIMIT not defined 7.19 +#endif 7.20 + 7.21 +#ifndef KEXEC_DESTINATION_MEMORY_LIMIT 7.22 +#error KEXEC_DESTINATION_MEMORY_LIMIT not defined 7.23 +#endif 7.24 + 7.25 +#ifndef KEXEC_CONTROL_MEMORY_LIMIT 7.26 +#error KEXEC_CONTROL_MEMORY_LIMIT not defined 7.27 +#endif 7.28 + 7.29 +#ifndef KEXEC_CONTROL_CODE_SIZE 7.30 +#error KEXEC_CONTROL_CODE_SIZE not defined 7.31 +#endif 7.32 + 7.33 +#ifndef KEXEC_ARCH 7.34 +#error KEXEC_ARCH not defined 7.35 +#endif 7.36 + 7.37 +/* 7.38 + * This structure is used to hold the arguments that are used when loading 7.39 + * kernel binaries. 7.40 + */ 7.41 + 7.42 +typedef unsigned long kimage_entry_t; 7.43 +#define IND_DESTINATION 0x1 7.44 +#define IND_INDIRECTION 0x2 7.45 +#define IND_DONE 0x4 7.46 +#define IND_SOURCE 0x8 7.47 + 7.48 +#define KEXEC_SEGMENT_MAX 16 7.49 +struct kexec_segment { 7.50 + void __user *buf; 7.51 + size_t bufsz; 7.52 + unsigned long mem; /* User space sees this as a (void *) ... */ 7.53 + size_t memsz; 7.54 +}; 7.55 + 7.56 +#ifdef CONFIG_COMPAT 7.57 +struct compat_kexec_segment { 7.58 + compat_uptr_t buf; 7.59 + compat_size_t bufsz; 7.60 + compat_ulong_t mem; /* User space sees this as a (void *) ... */ 7.61 + compat_size_t memsz; 7.62 +}; 7.63 +#endif 7.64 + 7.65 +struct kimage { 7.66 + kimage_entry_t head; 7.67 + kimage_entry_t *entry; 7.68 + kimage_entry_t *last_entry; 7.69 + 7.70 + unsigned long destination; 7.71 + 7.72 + unsigned long start; 7.73 + struct page *control_code_page; 7.74 + 7.75 + unsigned long nr_segments; 7.76 + struct kexec_segment segment[KEXEC_SEGMENT_MAX]; 7.77 + 7.78 + struct list_head control_pages; 7.79 + struct list_head dest_pages; 7.80 + struct list_head unuseable_pages; 7.81 + 7.82 + /* Address of next control page to allocate for crash kernels. 
*/ 7.83 + unsigned long control_page; 7.84 + 7.85 + /* Flags to indicate special processing */ 7.86 + unsigned int type : 1; 7.87 +#define KEXEC_TYPE_DEFAULT 0 7.88 +#define KEXEC_TYPE_CRASH 1 7.89 +}; 7.90 + 7.91 + 7.92 + 7.93 +/* kexec interface functions */ 7.94 +extern NORET_TYPE void machine_kexec(struct kimage *image) ATTRIB_NORET; 7.95 +extern int machine_kexec_prepare(struct kimage *image); 7.96 +extern void machine_kexec_cleanup(struct kimage *image); 7.97 +extern asmlinkage long sys_kexec_load(unsigned long entry, 7.98 + unsigned long nr_segments, 7.99 + struct kexec_segment __user *segments, 7.100 + unsigned long flags); 7.101 +#ifdef CONFIG_COMPAT 7.102 +extern asmlinkage long compat_sys_kexec_load(unsigned long entry, 7.103 + unsigned long nr_segments, 7.104 + struct compat_kexec_segment __user *segments, 7.105 + unsigned long flags); 7.106 +#endif 7.107 +extern struct page *kimage_alloc_control_pages(struct kimage *image, 7.108 + unsigned int order); 7.109 +extern void crash_kexec(struct pt_regs *); 7.110 +int kexec_should_crash(struct task_struct *); 7.111 +extern struct kimage *kexec_image; 7.112 + 7.113 +#define KEXEC_ON_CRASH 0x00000001 7.114 +#define KEXEC_ARCH_MASK 0xffff0000 7.115 + 7.116 +/* These values match the ELF architecture values. 7.117 + * Unless there is a good reason that should continue to be the case. 7.118 + */ 7.119 +#define KEXEC_ARCH_DEFAULT ( 0 << 16) 7.120 +#define KEXEC_ARCH_386 ( 3 << 16) 7.121 +#define KEXEC_ARCH_X86_64 (62 << 16) 7.122 +#define KEXEC_ARCH_PPC (20 << 16) 7.123 +#define KEXEC_ARCH_PPC64 (21 << 16) 7.124 +#define KEXEC_ARCH_IA_64 (50 << 16) 7.125 +#define KEXEC_ARCH_S390 (22 << 16) 7.126 +#define KEXEC_ARCH_SH (42 << 16) 7.127 + 7.128 +#define KEXEC_FLAGS (KEXEC_ON_CRASH) /* List of defined/legal kexec flags */ 7.129 + 7.130 +/* Location of a reserved region to hold the crash kernel. 7.131 + */ 7.132 +extern struct resource crashk_res; 7.133 +typedef u32 note_buf_t[MAX_NOTE_BYTES/4]; 7.134 +extern note_buf_t *crash_notes; 7.135 + 7.136 +#else /* !CONFIG_KEXEC */ 7.137 +struct pt_regs; 7.138 +struct task_struct; 7.139 +static inline void crash_kexec(struct pt_regs *regs) { } 7.140 +static inline int kexec_should_crash(struct task_struct *p) { return 0; } 7.141 +#endif /* CONFIG_KEXEC */ 7.142 +#endif /* LINUX_KEXEC_H */
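The kimage_entry_t encoding declared above tags each page-aligned address with IND_* bits in its low bits; kernel/kexec.c later walks such a list to learn where every source page must land. The userspace sketch below uses a flat, invented list and ignores IND_INDIRECTION chaining, purely to show how the tags decode.

/*
 * Userspace sketch of decoding a kexec indirection list.  Addresses are
 * made up; a real image chains entries across indirection pages.
 */
#include <stdio.h>

#define IND_DESTINATION 0x1
#define IND_INDIRECTION 0x2
#define IND_DONE        0x4
#define IND_SOURCE      0x8
#define PAGE_MASK       (~0xFFFUL)

typedef unsigned long kimage_entry_t;

int main(void)
{
	/* "copy two source pages to 0x100000, then stop" (addresses invented) */
	kimage_entry_t list[] = {
		0x100000UL | IND_DESTINATION,
		0x345000UL | IND_SOURCE,
		0x678000UL | IND_SOURCE,
		IND_DONE,
	};
	unsigned long dest = 0;

	for (unsigned i = 0; !(list[i] & IND_DONE); i++) {
		unsigned long addr = list[i] & PAGE_MASK;

		if (list[i] & IND_DESTINATION)
			dest = addr;
		else if (list[i] & IND_SOURCE) {
			printf("copy page %#lx -> %#lx\n", addr, dest);
			dest += 0x1000;   /* next destination page */
		}
	}
	return 0;
}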
8.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 8.2 +++ b/linux-2.6-xen-sparse/kernel/kexec.c Fri Dec 08 11:47:09 2006 +0000 8.3 @@ -0,0 +1,1081 @@ 8.4 +/* 8.5 + * kexec.c - kexec system call 8.6 + * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> 8.7 + * 8.8 + * This source code is licensed under the GNU General Public License, 8.9 + * Version 2. See the file COPYING for more details. 8.10 + */ 8.11 + 8.12 +#include <linux/capability.h> 8.13 +#include <linux/mm.h> 8.14 +#include <linux/file.h> 8.15 +#include <linux/slab.h> 8.16 +#include <linux/fs.h> 8.17 +#include <linux/kexec.h> 8.18 +#include <linux/spinlock.h> 8.19 +#include <linux/list.h> 8.20 +#include <linux/highmem.h> 8.21 +#include <linux/syscalls.h> 8.22 +#include <linux/reboot.h> 8.23 +#include <linux/syscalls.h> 8.24 +#include <linux/ioport.h> 8.25 +#include <linux/hardirq.h> 8.26 + 8.27 +#include <asm/page.h> 8.28 +#include <asm/uaccess.h> 8.29 +#include <asm/io.h> 8.30 +#include <asm/system.h> 8.31 +#include <asm/semaphore.h> 8.32 + 8.33 +/* Per cpu memory for storing cpu states in case of system crash. */ 8.34 +note_buf_t* crash_notes; 8.35 + 8.36 +/* Location of the reserved area for the crash kernel */ 8.37 +struct resource crashk_res = { 8.38 + .name = "Crash kernel", 8.39 + .start = 0, 8.40 + .end = 0, 8.41 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM 8.42 +}; 8.43 + 8.44 +int kexec_should_crash(struct task_struct *p) 8.45 +{ 8.46 + if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops) 8.47 + return 1; 8.48 + return 0; 8.49 +} 8.50 + 8.51 +/* 8.52 + * When kexec transitions to the new kernel there is a one-to-one 8.53 + * mapping between physical and virtual addresses. On processors 8.54 + * where you can disable the MMU this is trivial, and easy. For 8.55 + * others it is still a simple predictable page table to setup. 8.56 + * 8.57 + * In that environment kexec copies the new kernel to its final 8.58 + * resting place. This means I can only support memory whose 8.59 + * physical address can fit in an unsigned long. In particular 8.60 + * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. 8.61 + * If the assembly stub has more restrictive requirements 8.62 + * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be 8.63 + * defined more restrictively in <asm/kexec.h>. 8.64 + * 8.65 + * The code for the transition from the current kernel to the 8.66 + * the new kernel is placed in the control_code_buffer, whose size 8.67 + * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single 8.68 + * page of memory is necessary, but some architectures require more. 8.69 + * Because this memory must be identity mapped in the transition from 8.70 + * virtual to physical addresses it must live in the range 8.71 + * 0 - TASK_SIZE, as only the user space mappings are arbitrarily 8.72 + * modifiable. 8.73 + * 8.74 + * The assembly stub in the control code buffer is passed a linked list 8.75 + * of descriptor pages detailing the source pages of the new kernel, 8.76 + * and the destination addresses of those source pages. As this data 8.77 + * structure is not used in the context of the current OS, it must 8.78 + * be self-contained. 8.79 + * 8.80 + * The code has been made to work with highmem pages and will use a 8.81 + * destination page in its final resting place (if it happens 8.82 + * to allocate it). The end product of this is that most of the 8.83 + * physical address space, and most of RAM can be used. 
8.84 + * 8.85 + * Future directions include: 8.86 + * - allocating a page table with the control code buffer identity 8.87 + * mapped, to simplify machine_kexec and make kexec_on_panic more 8.88 + * reliable. 8.89 + */ 8.90 + 8.91 +/* 8.92 + * KIMAGE_NO_DEST is an impossible destination address..., for 8.93 + * allocating pages whose destination address we do not care about. 8.94 + */ 8.95 +#define KIMAGE_NO_DEST (-1UL) 8.96 + 8.97 +static int kimage_is_destination_range(struct kimage *image, 8.98 + unsigned long start, unsigned long end); 8.99 +static struct page *kimage_alloc_page(struct kimage *image, 8.100 + gfp_t gfp_mask, 8.101 + unsigned long dest); 8.102 + 8.103 +static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, 8.104 + unsigned long nr_segments, 8.105 + struct kexec_segment __user *segments) 8.106 +{ 8.107 + size_t segment_bytes; 8.108 + struct kimage *image; 8.109 + unsigned long i; 8.110 + int result; 8.111 + 8.112 + /* Allocate a controlling structure */ 8.113 + result = -ENOMEM; 8.114 + image = kmalloc(sizeof(*image), GFP_KERNEL); 8.115 + if (!image) 8.116 + goto out; 8.117 + 8.118 + memset(image, 0, sizeof(*image)); 8.119 + image->head = 0; 8.120 + image->entry = &image->head; 8.121 + image->last_entry = &image->head; 8.122 + image->control_page = ~0; /* By default this does not apply */ 8.123 + image->start = entry; 8.124 + image->type = KEXEC_TYPE_DEFAULT; 8.125 + 8.126 + /* Initialize the list of control pages */ 8.127 + INIT_LIST_HEAD(&image->control_pages); 8.128 + 8.129 + /* Initialize the list of destination pages */ 8.130 + INIT_LIST_HEAD(&image->dest_pages); 8.131 + 8.132 + /* Initialize the list of unuseable pages */ 8.133 + INIT_LIST_HEAD(&image->unuseable_pages); 8.134 + 8.135 + /* Read in the segments */ 8.136 + image->nr_segments = nr_segments; 8.137 + segment_bytes = nr_segments * sizeof(*segments); 8.138 + result = copy_from_user(image->segment, segments, segment_bytes); 8.139 + if (result) 8.140 + goto out; 8.141 + 8.142 + /* 8.143 + * Verify we have good destination addresses. The caller is 8.144 + * responsible for making certain we don't attempt to load 8.145 + * the new image into invalid or reserved areas of RAM. This 8.146 + * just verifies it is an address we can use. 8.147 + * 8.148 + * Since the kernel does everything in page size chunks ensure 8.149 + * the destination addreses are page aligned. Too many 8.150 + * special cases crop of when we don't do this. The most 8.151 + * insidious is getting overlapping destination addresses 8.152 + * simply because addresses are changed to page size 8.153 + * granularity. 8.154 + */ 8.155 + result = -EADDRNOTAVAIL; 8.156 + for (i = 0; i < nr_segments; i++) { 8.157 + unsigned long mstart, mend; 8.158 + 8.159 + mstart = image->segment[i].mem; 8.160 + mend = mstart + image->segment[i].memsz; 8.161 + if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) 8.162 + goto out; 8.163 + if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) 8.164 + goto out; 8.165 + } 8.166 + 8.167 + /* Verify our destination addresses do not overlap. 8.168 + * If we alloed overlapping destination addresses 8.169 + * through very weird things can happen with no 8.170 + * easy explanation as one segment stops on another. 
8.171 + */ 8.172 + result = -EINVAL; 8.173 + for (i = 0; i < nr_segments; i++) { 8.174 + unsigned long mstart, mend; 8.175 + unsigned long j; 8.176 + 8.177 + mstart = image->segment[i].mem; 8.178 + mend = mstart + image->segment[i].memsz; 8.179 + for (j = 0; j < i; j++) { 8.180 + unsigned long pstart, pend; 8.181 + pstart = image->segment[j].mem; 8.182 + pend = pstart + image->segment[j].memsz; 8.183 + /* Do the segments overlap ? */ 8.184 + if ((mend > pstart) && (mstart < pend)) 8.185 + goto out; 8.186 + } 8.187 + } 8.188 + 8.189 + /* Ensure our buffer sizes are strictly less than 8.190 + * our memory sizes. This should always be the case, 8.191 + * and it is easier to check up front than to be surprised 8.192 + * later on. 8.193 + */ 8.194 + result = -EINVAL; 8.195 + for (i = 0; i < nr_segments; i++) { 8.196 + if (image->segment[i].bufsz > image->segment[i].memsz) 8.197 + goto out; 8.198 + } 8.199 + 8.200 + result = 0; 8.201 +out: 8.202 + if (result == 0) 8.203 + *rimage = image; 8.204 + else 8.205 + kfree(image); 8.206 + 8.207 + return result; 8.208 + 8.209 +} 8.210 + 8.211 +static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, 8.212 + unsigned long nr_segments, 8.213 + struct kexec_segment __user *segments) 8.214 +{ 8.215 + int result; 8.216 + struct kimage *image; 8.217 + 8.218 + /* Allocate and initialize a controlling structure */ 8.219 + image = NULL; 8.220 + result = do_kimage_alloc(&image, entry, nr_segments, segments); 8.221 + if (result) 8.222 + goto out; 8.223 + 8.224 + *rimage = image; 8.225 + 8.226 + /* 8.227 + * Find a location for the control code buffer, and add it 8.228 + * the vector of segments so that it's pages will also be 8.229 + * counted as destination pages. 8.230 + */ 8.231 + result = -ENOMEM; 8.232 + image->control_code_page = kimage_alloc_control_pages(image, 8.233 + get_order(KEXEC_CONTROL_CODE_SIZE)); 8.234 + if (!image->control_code_page) { 8.235 + printk(KERN_ERR "Could not allocate control_code_buffer\n"); 8.236 + goto out; 8.237 + } 8.238 + 8.239 + result = 0; 8.240 + out: 8.241 + if (result == 0) 8.242 + *rimage = image; 8.243 + else 8.244 + kfree(image); 8.245 + 8.246 + return result; 8.247 +} 8.248 + 8.249 +static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, 8.250 + unsigned long nr_segments, 8.251 + struct kexec_segment __user *segments) 8.252 +{ 8.253 + int result; 8.254 + struct kimage *image; 8.255 + unsigned long i; 8.256 + 8.257 + image = NULL; 8.258 + /* Verify we have a valid entry point */ 8.259 + if ((entry < crashk_res.start) || (entry > crashk_res.end)) { 8.260 + result = -EADDRNOTAVAIL; 8.261 + goto out; 8.262 + } 8.263 + 8.264 + /* Allocate and initialize a controlling structure */ 8.265 + result = do_kimage_alloc(&image, entry, nr_segments, segments); 8.266 + if (result) 8.267 + goto out; 8.268 + 8.269 + /* Enable the special crash kernel control page 8.270 + * allocation policy. 8.271 + */ 8.272 + image->control_page = crashk_res.start; 8.273 + image->type = KEXEC_TYPE_CRASH; 8.274 + 8.275 + /* 8.276 + * Verify we have good destination addresses. Normally 8.277 + * the caller is responsible for making certain we don't 8.278 + * attempt to load the new image into invalid or reserved 8.279 + * areas of RAM. But crash kernels are preloaded into a 8.280 + * reserved area of ram. We must ensure the addresses 8.281 + * are in the reserved area otherwise preloading the 8.282 + * kernel could corrupt things. 
8.283 + */ 8.284 + result = -EADDRNOTAVAIL; 8.285 + for (i = 0; i < nr_segments; i++) { 8.286 + unsigned long mstart, mend; 8.287 + 8.288 + mstart = image->segment[i].mem; 8.289 + mend = mstart + image->segment[i].memsz - 1; 8.290 + /* Ensure we are within the crash kernel limits */ 8.291 + if ((mstart < crashk_res.start) || (mend > crashk_res.end)) 8.292 + goto out; 8.293 + } 8.294 + 8.295 + /* 8.296 + * Find a location for the control code buffer, and add 8.297 + * the vector of segments so that it's pages will also be 8.298 + * counted as destination pages. 8.299 + */ 8.300 + result = -ENOMEM; 8.301 + image->control_code_page = kimage_alloc_control_pages(image, 8.302 + get_order(KEXEC_CONTROL_CODE_SIZE)); 8.303 + if (!image->control_code_page) { 8.304 + printk(KERN_ERR "Could not allocate control_code_buffer\n"); 8.305 + goto out; 8.306 + } 8.307 + 8.308 + result = 0; 8.309 +out: 8.310 + if (result == 0) 8.311 + *rimage = image; 8.312 + else 8.313 + kfree(image); 8.314 + 8.315 + return result; 8.316 +} 8.317 + 8.318 +static int kimage_is_destination_range(struct kimage *image, 8.319 + unsigned long start, 8.320 + unsigned long end) 8.321 +{ 8.322 + unsigned long i; 8.323 + 8.324 + for (i = 0; i < image->nr_segments; i++) { 8.325 + unsigned long mstart, mend; 8.326 + 8.327 + mstart = image->segment[i].mem; 8.328 + mend = mstart + image->segment[i].memsz; 8.329 + if ((end > mstart) && (start < mend)) 8.330 + return 1; 8.331 + } 8.332 + 8.333 + return 0; 8.334 +} 8.335 + 8.336 +static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) 8.337 +{ 8.338 + struct page *pages; 8.339 + 8.340 + pages = alloc_pages(gfp_mask, order); 8.341 + if (pages) { 8.342 + unsigned int count, i; 8.343 + pages->mapping = NULL; 8.344 + set_page_private(pages, order); 8.345 + count = 1 << order; 8.346 + for (i = 0; i < count; i++) 8.347 + SetPageReserved(pages + i); 8.348 + } 8.349 + 8.350 + return pages; 8.351 +} 8.352 + 8.353 +static void kimage_free_pages(struct page *page) 8.354 +{ 8.355 + unsigned int order, count, i; 8.356 + 8.357 + order = page_private(page); 8.358 + count = 1 << order; 8.359 + for (i = 0; i < count; i++) 8.360 + ClearPageReserved(page + i); 8.361 + __free_pages(page, order); 8.362 +} 8.363 + 8.364 +static void kimage_free_page_list(struct list_head *list) 8.365 +{ 8.366 + struct list_head *pos, *next; 8.367 + 8.368 + list_for_each_safe(pos, next, list) { 8.369 + struct page *page; 8.370 + 8.371 + page = list_entry(pos, struct page, lru); 8.372 + list_del(&page->lru); 8.373 + kimage_free_pages(page); 8.374 + } 8.375 +} 8.376 + 8.377 +static struct page *kimage_alloc_normal_control_pages(struct kimage *image, 8.378 + unsigned int order) 8.379 +{ 8.380 + /* Control pages are special, they are the intermediaries 8.381 + * that are needed while we copy the rest of the pages 8.382 + * to their final resting place. As such they must 8.383 + * not conflict with either the destination addresses 8.384 + * or memory the kernel is already using. 8.385 + * 8.386 + * The only case where we really need more than one of 8.387 + * these are for architectures where we cannot disable 8.388 + * the MMU and must instead generate an identity mapped 8.389 + * page table for all of the memory. 8.390 + * 8.391 + * At worst this runs in O(N) of the image size. 
8.392 + */ 8.393 + struct list_head extra_pages; 8.394 + struct page *pages; 8.395 + unsigned int count; 8.396 + 8.397 + count = 1 << order; 8.398 + INIT_LIST_HEAD(&extra_pages); 8.399 + 8.400 + /* Loop while I can allocate a page and the page allocated 8.401 + * is a destination page. 8.402 + */ 8.403 + do { 8.404 + unsigned long pfn, epfn, addr, eaddr; 8.405 + 8.406 + pages = kimage_alloc_pages(GFP_KERNEL, order); 8.407 + if (!pages) 8.408 + break; 8.409 + pfn = page_to_pfn(pages); 8.410 + epfn = pfn + count; 8.411 + addr = pfn << PAGE_SHIFT; 8.412 + eaddr = epfn << PAGE_SHIFT; 8.413 + if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || 8.414 + kimage_is_destination_range(image, addr, eaddr)) { 8.415 + list_add(&pages->lru, &extra_pages); 8.416 + pages = NULL; 8.417 + } 8.418 + } while (!pages); 8.419 + 8.420 + if (pages) { 8.421 + /* Remember the allocated page... */ 8.422 + list_add(&pages->lru, &image->control_pages); 8.423 + 8.424 + /* Because the page is already in it's destination 8.425 + * location we will never allocate another page at 8.426 + * that address. Therefore kimage_alloc_pages 8.427 + * will not return it (again) and we don't need 8.428 + * to give it an entry in image->segment[]. 8.429 + */ 8.430 + } 8.431 + /* Deal with the destination pages I have inadvertently allocated. 8.432 + * 8.433 + * Ideally I would convert multi-page allocations into single 8.434 + * page allocations, and add everyting to image->dest_pages. 8.435 + * 8.436 + * For now it is simpler to just free the pages. 8.437 + */ 8.438 + kimage_free_page_list(&extra_pages); 8.439 + 8.440 + return pages; 8.441 +} 8.442 + 8.443 +static struct page *kimage_alloc_crash_control_pages(struct kimage *image, 8.444 + unsigned int order) 8.445 +{ 8.446 + /* Control pages are special, they are the intermediaries 8.447 + * that are needed while we copy the rest of the pages 8.448 + * to their final resting place. As such they must 8.449 + * not conflict with either the destination addresses 8.450 + * or memory the kernel is already using. 8.451 + * 8.452 + * Control pages are also the only pags we must allocate 8.453 + * when loading a crash kernel. All of the other pages 8.454 + * are specified by the segments and we just memcpy 8.455 + * into them directly. 8.456 + * 8.457 + * The only case where we really need more than one of 8.458 + * these are for architectures where we cannot disable 8.459 + * the MMU and must instead generate an identity mapped 8.460 + * page table for all of the memory. 8.461 + * 8.462 + * Given the low demand this implements a very simple 8.463 + * allocator that finds the first hole of the appropriate 8.464 + * size in the reserved memory region, and allocates all 8.465 + * of the memory up to and including the hole. 
8.466 + */ 8.467 + unsigned long hole_start, hole_end, size; 8.468 + struct page *pages; 8.469 + 8.470 + pages = NULL; 8.471 + size = (1 << order) << PAGE_SHIFT; 8.472 + hole_start = (image->control_page + (size - 1)) & ~(size - 1); 8.473 + hole_end = hole_start + size - 1; 8.474 + while (hole_end <= crashk_res.end) { 8.475 + unsigned long i; 8.476 + 8.477 + if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) 8.478 + break; 8.479 + if (hole_end > crashk_res.end) 8.480 + break; 8.481 + /* See if I overlap any of the segments */ 8.482 + for (i = 0; i < image->nr_segments; i++) { 8.483 + unsigned long mstart, mend; 8.484 + 8.485 + mstart = image->segment[i].mem; 8.486 + mend = mstart + image->segment[i].memsz - 1; 8.487 + if ((hole_end >= mstart) && (hole_start <= mend)) { 8.488 + /* Advance the hole to the end of the segment */ 8.489 + hole_start = (mend + (size - 1)) & ~(size - 1); 8.490 + hole_end = hole_start + size - 1; 8.491 + break; 8.492 + } 8.493 + } 8.494 + /* If I don't overlap any segments I have found my hole! */ 8.495 + if (i == image->nr_segments) { 8.496 + pages = pfn_to_page(hole_start >> PAGE_SHIFT); 8.497 + break; 8.498 + } 8.499 + } 8.500 + if (pages) 8.501 + image->control_page = hole_end; 8.502 + 8.503 + return pages; 8.504 +} 8.505 + 8.506 + 8.507 +struct page *kimage_alloc_control_pages(struct kimage *image, 8.508 + unsigned int order) 8.509 +{ 8.510 + struct page *pages = NULL; 8.511 + 8.512 + switch (image->type) { 8.513 + case KEXEC_TYPE_DEFAULT: 8.514 + pages = kimage_alloc_normal_control_pages(image, order); 8.515 + break; 8.516 + case KEXEC_TYPE_CRASH: 8.517 + pages = kimage_alloc_crash_control_pages(image, order); 8.518 + break; 8.519 + } 8.520 + 8.521 + return pages; 8.522 +} 8.523 + 8.524 +static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) 8.525 +{ 8.526 + if (*image->entry != 0) 8.527 + image->entry++; 8.528 + 8.529 + if (image->entry == image->last_entry) { 8.530 + kimage_entry_t *ind_page; 8.531 + struct page *page; 8.532 + 8.533 + page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); 8.534 + if (!page) 8.535 + return -ENOMEM; 8.536 + 8.537 + ind_page = page_address(page); 8.538 + *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; 8.539 + image->entry = ind_page; 8.540 + image->last_entry = ind_page + 8.541 + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); 8.542 + } 8.543 + *image->entry = entry; 8.544 + image->entry++; 8.545 + *image->entry = 0; 8.546 + 8.547 + return 0; 8.548 +} 8.549 + 8.550 +static int kimage_set_destination(struct kimage *image, 8.551 + unsigned long destination) 8.552 +{ 8.553 + int result; 8.554 + 8.555 + destination &= PAGE_MASK; 8.556 + result = kimage_add_entry(image, destination | IND_DESTINATION); 8.557 + if (result == 0) 8.558 + image->destination = destination; 8.559 + 8.560 + return result; 8.561 +} 8.562 + 8.563 + 8.564 +static int kimage_add_page(struct kimage *image, unsigned long page) 8.565 +{ 8.566 + int result; 8.567 + 8.568 + page &= PAGE_MASK; 8.569 + result = kimage_add_entry(image, page | IND_SOURCE); 8.570 + if (result == 0) 8.571 + image->destination += PAGE_SIZE; 8.572 + 8.573 + return result; 8.574 +} 8.575 + 8.576 + 8.577 +static void kimage_free_extra_pages(struct kimage *image) 8.578 +{ 8.579 + /* Walk through and free any extra destination pages I may have */ 8.580 + kimage_free_page_list(&image->dest_pages); 8.581 + 8.582 + /* Walk through and free any unuseable pages I have cached */ 8.583 + kimage_free_page_list(&image->unuseable_pages); 8.584 + 8.585 +} 8.586 +static int 
kimage_terminate(struct kimage *image) 8.587 +{ 8.588 + if (*image->entry != 0) 8.589 + image->entry++; 8.590 + 8.591 + *image->entry = IND_DONE; 8.592 + 8.593 + return 0; 8.594 +} 8.595 + 8.596 +#define for_each_kimage_entry(image, ptr, entry) \ 8.597 + for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ 8.598 + ptr = (entry & IND_INDIRECTION)? \ 8.599 + phys_to_virt((entry & PAGE_MASK)): ptr +1) 8.600 + 8.601 +static void kimage_free_entry(kimage_entry_t entry) 8.602 +{ 8.603 + struct page *page; 8.604 + 8.605 + page = pfn_to_page(entry >> PAGE_SHIFT); 8.606 + kimage_free_pages(page); 8.607 +} 8.608 + 8.609 +static void kimage_free(struct kimage *image) 8.610 +{ 8.611 + kimage_entry_t *ptr, entry; 8.612 + kimage_entry_t ind = 0; 8.613 + 8.614 + if (!image) 8.615 + return; 8.616 + 8.617 + kimage_free_extra_pages(image); 8.618 + for_each_kimage_entry(image, ptr, entry) { 8.619 + if (entry & IND_INDIRECTION) { 8.620 + /* Free the previous indirection page */ 8.621 + if (ind & IND_INDIRECTION) 8.622 + kimage_free_entry(ind); 8.623 + /* Save this indirection page until we are 8.624 + * done with it. 8.625 + */ 8.626 + ind = entry; 8.627 + } 8.628 + else if (entry & IND_SOURCE) 8.629 + kimage_free_entry(entry); 8.630 + } 8.631 + /* Free the final indirection page */ 8.632 + if (ind & IND_INDIRECTION) 8.633 + kimage_free_entry(ind); 8.634 + 8.635 + /* Handle any machine specific cleanup */ 8.636 + machine_kexec_cleanup(image); 8.637 + 8.638 + /* Free the kexec control pages... */ 8.639 + kimage_free_page_list(&image->control_pages); 8.640 + kfree(image); 8.641 +} 8.642 + 8.643 +static kimage_entry_t *kimage_dst_used(struct kimage *image, 8.644 + unsigned long page) 8.645 +{ 8.646 + kimage_entry_t *ptr, entry; 8.647 + unsigned long destination = 0; 8.648 + 8.649 + for_each_kimage_entry(image, ptr, entry) { 8.650 + if (entry & IND_DESTINATION) 8.651 + destination = entry & PAGE_MASK; 8.652 + else if (entry & IND_SOURCE) { 8.653 + if (page == destination) 8.654 + return ptr; 8.655 + destination += PAGE_SIZE; 8.656 + } 8.657 + } 8.658 + 8.659 + return NULL; 8.660 +} 8.661 + 8.662 +static struct page *kimage_alloc_page(struct kimage *image, 8.663 + gfp_t gfp_mask, 8.664 + unsigned long destination) 8.665 +{ 8.666 + /* 8.667 + * Here we implement safeguards to ensure that a source page 8.668 + * is not copied to its destination page before the data on 8.669 + * the destination page is no longer useful. 8.670 + * 8.671 + * To do this we maintain the invariant that a source page is 8.672 + * either its own destination page, or it is not a 8.673 + * destination page at all. 8.674 + * 8.675 + * That is slightly stronger than required, but the proof 8.676 + * that no problems will not occur is trivial, and the 8.677 + * implementation is simply to verify. 8.678 + * 8.679 + * When allocating all pages normally this algorithm will run 8.680 + * in O(N) time, but in the worst case it will run in O(N^2) 8.681 + * time. If the runtime is a problem the data structures can 8.682 + * be fixed. 8.683 + */ 8.684 + struct page *page; 8.685 + unsigned long addr; 8.686 + 8.687 + /* 8.688 + * Walk through the list of destination pages, and see if I 8.689 + * have a match. 
8.690 + */ 8.691 + list_for_each_entry(page, &image->dest_pages, lru) { 8.692 + addr = page_to_pfn(page) << PAGE_SHIFT; 8.693 + if (addr == destination) { 8.694 + list_del(&page->lru); 8.695 + return page; 8.696 + } 8.697 + } 8.698 + page = NULL; 8.699 + while (1) { 8.700 + kimage_entry_t *old; 8.701 + 8.702 + /* Allocate a page, if we run out of memory give up */ 8.703 + page = kimage_alloc_pages(gfp_mask, 0); 8.704 + if (!page) 8.705 + return NULL; 8.706 + /* If the page cannot be used file it away */ 8.707 + if (page_to_pfn(page) > 8.708 + (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { 8.709 + list_add(&page->lru, &image->unuseable_pages); 8.710 + continue; 8.711 + } 8.712 + addr = page_to_pfn(page) << PAGE_SHIFT; 8.713 + 8.714 + /* If it is the destination page we want use it */ 8.715 + if (addr == destination) 8.716 + break; 8.717 + 8.718 + /* If the page is not a destination page use it */ 8.719 + if (!kimage_is_destination_range(image, addr, 8.720 + addr + PAGE_SIZE)) 8.721 + break; 8.722 + 8.723 + /* 8.724 + * I know that the page is someones destination page. 8.725 + * See if there is already a source page for this 8.726 + * destination page. And if so swap the source pages. 8.727 + */ 8.728 + old = kimage_dst_used(image, addr); 8.729 + if (old) { 8.730 + /* If so move it */ 8.731 + unsigned long old_addr; 8.732 + struct page *old_page; 8.733 + 8.734 + old_addr = *old & PAGE_MASK; 8.735 + old_page = pfn_to_page(old_addr >> PAGE_SHIFT); 8.736 + copy_highpage(page, old_page); 8.737 + *old = addr | (*old & ~PAGE_MASK); 8.738 + 8.739 + /* The old page I have found cannot be a 8.740 + * destination page, so return it. 8.741 + */ 8.742 + addr = old_addr; 8.743 + page = old_page; 8.744 + break; 8.745 + } 8.746 + else { 8.747 + /* Place the page on the destination list I 8.748 + * will use it later. 8.749 + */ 8.750 + list_add(&page->lru, &image->dest_pages); 8.751 + } 8.752 + } 8.753 + 8.754 + return page; 8.755 +} 8.756 + 8.757 +static int kimage_load_normal_segment(struct kimage *image, 8.758 + struct kexec_segment *segment) 8.759 +{ 8.760 + unsigned long maddr; 8.761 + unsigned long ubytes, mbytes; 8.762 + int result; 8.763 + unsigned char __user *buf; 8.764 + 8.765 + result = 0; 8.766 + buf = segment->buf; 8.767 + ubytes = segment->bufsz; 8.768 + mbytes = segment->memsz; 8.769 + maddr = segment->mem; 8.770 + 8.771 + result = kimage_set_destination(image, maddr); 8.772 + if (result < 0) 8.773 + goto out; 8.774 + 8.775 + while (mbytes) { 8.776 + struct page *page; 8.777 + char *ptr; 8.778 + size_t uchunk, mchunk; 8.779 + 8.780 + page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); 8.781 + if (page == 0) { 8.782 + result = -ENOMEM; 8.783 + goto out; 8.784 + } 8.785 + result = kimage_add_page(image, page_to_pfn(page) 8.786 + << PAGE_SHIFT); 8.787 + if (result < 0) 8.788 + goto out; 8.789 + 8.790 + ptr = kmap(page); 8.791 + /* Start with a clear page */ 8.792 + memset(ptr, 0, PAGE_SIZE); 8.793 + ptr += maddr & ~PAGE_MASK; 8.794 + mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 8.795 + if (mchunk > mbytes) 8.796 + mchunk = mbytes; 8.797 + 8.798 + uchunk = mchunk; 8.799 + if (uchunk > ubytes) 8.800 + uchunk = ubytes; 8.801 + 8.802 + result = copy_from_user(ptr, buf, uchunk); 8.803 + kunmap(page); 8.804 + if (result) { 8.805 + result = (result < 0) ? 
8.817 +static int kimage_load_crash_segment(struct kimage *image,
8.818 +					struct kexec_segment *segment)
8.819 +{
8.820 +	/* For crash dump kernels we simply copy the data from
8.821 +	 * user space to its destination.
8.822 +	 * We do things a page at a time for the sake of kmap.
8.823 +	 */
8.824 +	unsigned long maddr;
8.825 +	unsigned long ubytes, mbytes;
8.826 +	int result;
8.827 +	unsigned char __user *buf;
8.828 +
8.829 +	result = 0;
8.830 +	buf = segment->buf;
8.831 +	ubytes = segment->bufsz;
8.832 +	mbytes = segment->memsz;
8.833 +	maddr = segment->mem;
8.834 +	while (mbytes) {
8.835 +		struct page *page;
8.836 +		char *ptr;
8.837 +		size_t uchunk, mchunk;
8.838 +
8.839 +		page = pfn_to_page(maddr >> PAGE_SHIFT);
8.840 +		if (page == 0) {
8.841 +			result = -ENOMEM;
8.842 +			goto out;
8.843 +		}
8.844 +		ptr = kmap(page);
8.845 +		ptr += maddr & ~PAGE_MASK;
8.846 +		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
8.847 +		if (mchunk > mbytes)
8.848 +			mchunk = mbytes;
8.849 +
8.850 +		uchunk = mchunk;
8.851 +		if (uchunk > ubytes) {
8.852 +			uchunk = ubytes;
8.853 +			/* Zero the trailing part of the page */
8.854 +			memset(ptr + uchunk, 0, mchunk - uchunk);
8.855 +		}
8.856 +		result = copy_from_user(ptr, buf, uchunk);
8.857 +		kunmap(page);
8.858 +		if (result) {
8.859 +			result = (result < 0) ? result : -EIO;
8.860 +			goto out;
8.861 +		}
8.862 +		ubytes -= uchunk;
8.863 +		maddr += mchunk;
8.864 +		buf += mchunk;
8.865 +		mbytes -= mchunk;
8.866 +	}
8.867 +out:
8.868 +	return result;
8.869 +}
8.870 +
8.871 +static int kimage_load_segment(struct kimage *image,
8.872 +				struct kexec_segment *segment)
8.873 +{
8.874 +	int result = -ENOMEM;
8.875 +
8.876 +	switch (image->type) {
8.877 +	case KEXEC_TYPE_DEFAULT:
8.878 +		result = kimage_load_normal_segment(image, segment);
8.879 +		break;
8.880 +	case KEXEC_TYPE_CRASH:
8.881 +		result = kimage_load_crash_segment(image, segment);
8.882 +		break;
8.883 +	}
8.884 +
8.885 +	return result;
8.886 +}
8.887 +
8.888 +/*
8.889 + * Exec Kernel system call: for obvious reasons only root may call it.
8.890 + *
8.891 + * This call breaks up into three pieces.
8.892 + * - A generic part which loads the new kernel from the current
8.893 + *   address space, and very carefully places the data in the
8.894 + *   allocated pages.
8.895 + *
8.896 + * - A generic part that interacts with the kernel and tells all of
8.897 + *   the devices to shut down, preventing on-going DMAs and placing
8.898 + *   the devices in a consistent state so a later kernel can
8.899 + *   reinitialize them.
8.900 + *
8.901 + * - A machine specific part that includes the syscall number,
8.902 + *   copies the image to its final destination, and
8.903 + *   jumps into the image at entry.
8.904 + *
8.905 + * kexec does not sync or unmount filesystems, so if you need
8.906 + * that to happen you must do it yourself.
8.907 + */
8.908 +struct kimage *kexec_image = NULL;
8.909 +static struct kimage *kexec_crash_image = NULL;
8.910 +/*
8.911 + * A home grown binary mutex.
8.912 + * Nothing can wait so this mutex is safe to use
8.913 + * in interrupt context :)
8.914 + */
8.915 +static int kexec_lock = 0;
8.916 +
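/*
 * Illustrative sketch only, not part of this changeset: the xchg()-based
 * "binary mutex" pattern used for kexec_lock above, expressed with C11
 * atomics so it can run in user space.  Because the lock is only ever
 * try-acquired and never waited on, it is also usable from contexts that
 * must not sleep, which is why crash_kexec() can take it on a panic path.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int example_lock;		/* plays the role of kexec_lock */

static int example_trylock(void)
{
	/* returns the previous value: 0 means we now hold the lock */
	return atomic_exchange(&example_lock, 1);
}

static void example_unlock(void)
{
	atomic_exchange(&example_lock, 0);
}

int main(void)
{
	if (example_trylock()) {
		printf("busy, a real caller would return -EBUSY\n");
		return 1;
	}
	printf("lock held, doing the critical section\n");
	example_unlock();
	return 0;
}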
8.917 +asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
8.918 +				struct kexec_segment __user *segments,
8.919 +				unsigned long flags)
8.920 +{
8.921 +	struct kimage **dest_image, *image;
8.922 +	int locked;
8.923 +	int result;
8.924 +
8.925 +	/* We only trust the superuser with rebooting the system. */
8.926 +	if (!capable(CAP_SYS_BOOT))
8.927 +		return -EPERM;
8.928 +
8.929 +	/*
8.930 +	 * Verify we have a legal set of flags
8.931 +	 * This leaves us room for future extensions.
8.932 +	 */
8.933 +	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
8.934 +		return -EINVAL;
8.935 +
8.936 +	/* Verify we are on the appropriate architecture */
8.937 +	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
8.938 +		((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
8.939 +		return -EINVAL;
8.940 +
8.941 +	/* Put an artificial cap on the number
8.942 +	 * of segments passed to kexec_load.
8.943 +	 */
8.944 +	if (nr_segments > KEXEC_SEGMENT_MAX)
8.945 +		return -EINVAL;
8.946 +
8.947 +	image = NULL;
8.948 +	result = 0;
8.949 +
8.950 +	/* Because we write directly to the reserved memory
8.951 +	 * region when loading crash kernels we need a mutex here to
8.952 +	 * prevent multiple crash kernels from attempting to load
8.953 +	 * simultaneously, and to prevent a crash kernel from loading
8.954 +	 * over the top of an in-use crash kernel.
8.955 +	 *
8.956 +	 * KISS: always take the mutex.
8.957 +	 */
8.958 +	locked = xchg(&kexec_lock, 1);
8.959 +	if (locked)
8.960 +		return -EBUSY;
8.961 +
8.962 +	dest_image = &kexec_image;
8.963 +	if (flags & KEXEC_ON_CRASH)
8.964 +		dest_image = &kexec_crash_image;
8.965 +	if (nr_segments > 0) {
8.966 +		unsigned long i;
8.967 +
8.968 +		/* Loading another kernel to reboot into */
8.969 +		if ((flags & KEXEC_ON_CRASH) == 0)
8.970 +			result = kimage_normal_alloc(&image, entry,
8.971 +							nr_segments, segments);
8.972 +		/* Loading another kernel to switch to if this one crashes */
8.973 +		else if (flags & KEXEC_ON_CRASH) {
8.974 +			/* Free any current crash dump kernel before
8.975 +			 * we corrupt it.
8.976 +			 */
8.977 +			kimage_free(xchg(&kexec_crash_image, NULL));
8.978 +			result = kimage_crash_alloc(&image, entry,
8.979 +							nr_segments, segments);
8.980 +		}
8.981 +		if (result)
8.982 +			goto out;
8.983 +
8.984 +		result = machine_kexec_prepare(image);
8.985 +		if (result)
8.986 +			goto out;
8.987 +
8.988 +		for (i = 0; i < nr_segments; i++) {
8.989 +			result = kimage_load_segment(image, &image->segment[i]);
8.990 +			if (result)
8.991 +				goto out;
8.992 +		}
8.993 +		result = kimage_terminate(image);
8.994 +		if (result)
8.995 +			goto out;
8.996 +	}
8.997 +	/* Install the new kernel, and uninstall the old */
8.998 +	image = xchg(dest_image, image);
8.999 +
8.1000 +out:
8.1001 +	xchg(&kexec_lock, 0); /* Release the mutex */
8.1002 +	kimage_free(image);
8.1003 +
8.1004 +	return result;
8.1005 +}
8.1006 +
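/*
 * Illustrative sketch only, not part of this changeset: how user space
 * reaches sys_kexec_load() above.  The segment structure and flag value
 * are hand-copied, illustrative definitions (real callers such as
 * kexec-tools take them from the kernel headers), the entry point and
 * addresses are placeholder values, and __NR_kexec_load is assumed to be
 * provided by <sys/syscall.h> on the build host.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

struct example_kexec_segment {		/* mirrors struct kexec_segment */
	const void *buf;		/* user-space copy of the data */
	size_t bufsz;
	const void *mem;		/* physical destination */
	size_t memsz;
};

#define EXAMPLE_KEXEC_ARCH_DEFAULT 0UL	/* stands in for KEXEC_ARCH_DEFAULT */

int main(void)
{
	static char payload[4096];	/* placeholder image data */
	struct example_kexec_segment seg = {
		.buf = payload,
		.bufsz = sizeof(payload),
		.mem = (void *)0x100000UL,	/* placeholder load address */
		.memsz = sizeof(payload),
	};
	long ret;

	/* CAP_SYS_BOOT is required, so expect -EPERM when run unprivileged */
	ret = syscall(__NR_kexec_load, 0x100000UL, 1UL, &seg,
		      EXAMPLE_KEXEC_ARCH_DEFAULT);
	if (ret == -1)
		perror("kexec_load");
	return ret == -1;
}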
8.1007 +#ifdef CONFIG_COMPAT
8.1008 +asmlinkage long compat_sys_kexec_load(unsigned long entry,
8.1009 +				unsigned long nr_segments,
8.1010 +				struct compat_kexec_segment __user *segments,
8.1011 +				unsigned long flags)
8.1012 +{
8.1013 +	struct compat_kexec_segment in;
8.1014 +	struct kexec_segment out, __user *ksegments;
8.1015 +	unsigned long i, result;
8.1016 +
8.1017 +	/* Don't allow clients that don't understand the native
8.1018 +	 * architecture to do anything.
8.1019 +	 */
8.1020 +	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
8.1021 +		return -EINVAL;
8.1022 +
8.1023 +	if (nr_segments > KEXEC_SEGMENT_MAX)
8.1024 +		return -EINVAL;
8.1025 +
8.1026 +	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
8.1027 +	for (i=0; i < nr_segments; i++) {
8.1028 +		result = copy_from_user(&in, &segments[i], sizeof(in));
8.1029 +		if (result)
8.1030 +			return -EFAULT;
8.1031 +
8.1032 +		out.buf = compat_ptr(in.buf);
8.1033 +		out.bufsz = in.bufsz;
8.1034 +		out.mem = in.mem;
8.1035 +		out.memsz = in.memsz;
8.1036 +
8.1037 +		result = copy_to_user(&ksegments[i], &out, sizeof(out));
8.1038 +		if (result)
8.1039 +			return -EFAULT;
8.1040 +	}
8.1041 +
8.1042 +	return sys_kexec_load(entry, nr_segments, ksegments, flags);
8.1043 +}
8.1044 +#endif
8.1045 +
8.1046 +void crash_kexec(struct pt_regs *regs)
8.1047 +{
8.1048 +	struct kimage *image;
8.1049 +	int locked;
8.1050 +
8.1051 +
8.1052 +	/* Take the kexec_lock here to prevent sys_kexec_load
8.1053 +	 * running on one cpu from replacing the crash kernel
8.1054 +	 * we are using after a panic on a different cpu.
8.1055 +	 *
8.1056 +	 * If the crash kernel was not located in a fixed area
8.1057 +	 * of memory the xchg(&kexec_crash_image) would be
8.1058 +	 * sufficient. But since I reuse the memory...
8.1059 +	 */
8.1060 +	locked = xchg(&kexec_lock, 1);
8.1061 +	if (!locked) {
8.1062 +		image = xchg(&kexec_crash_image, NULL);
8.1063 +		if (image) {
8.1064 +			struct pt_regs fixed_regs;
8.1065 +			crash_setup_regs(&fixed_regs, regs);
8.1066 +			machine_crash_shutdown(&fixed_regs);
8.1067 +			machine_kexec(image);
8.1068 +		}
8.1069 +		xchg(&kexec_lock, 0);
8.1070 +	}
8.1071 +}
8.1072 +
8.1073 +static int __init crash_notes_memory_init(void)
8.1074 +{
8.1075 +	/* Allocate memory for saving cpu registers. */
8.1076 +	crash_notes = alloc_percpu(note_buf_t);
8.1077 +	if (!crash_notes) {
8.1078 +		printk("Kexec: Memory allocation for saving cpu register"
8.1079 +			" states failed\n");
8.1080 +		return -ENOMEM;
8.1081 +	}
8.1082 +	return 0;
8.1083 +}
8.1084 +module_init(crash_notes_memory_init)
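For reference, the compat entry point above does nothing more than widen each 32-bit segment descriptor into the native layout before re-entering sys_kexec_load(). The following is a minimal stand-alone sketch of that widening step, using simplified stand-in structures (plain 32-bit integers instead of the real compat_kexec_segment and kexec_segment types) purely to show the conversion direction.

/* Illustrative sketch only, not part of this changeset. */
#include <stdint.h>
#include <stdio.h>

struct example_compat_segment {	/* what a 32-bit caller passes */
	uint32_t buf;
	uint32_t bufsz;
	uint32_t mem;
	uint32_t memsz;
};

struct example_segment {	/* the native layout sys_kexec_load() expects */
	void *buf;
	unsigned long bufsz;
	unsigned long mem;
	unsigned long memsz;
};

static void example_widen(const struct example_compat_segment *in,
			  struct example_segment *out)
{
	out->buf = (void *)(uintptr_t)in->buf;	/* compat_ptr() in the kernel */
	out->bufsz = in->bufsz;
	out->mem = in->mem;
	out->memsz = in->memsz;
}

int main(void)
{
	struct example_compat_segment in = { 0x08050000u, 4096u, 0x00100000u, 4096u };
	struct example_segment out;

	example_widen(&in, &out);
	printf("buf=%p bufsz=%lu mem=0x%lx memsz=%lu\n",
	       out.buf, out.bufsz, out.mem, out.memsz);
	return 0;
}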