direct-io.hg

changeset 12806:d51e5a7317bb

[LINUX] Kexec: add kexec files to sparse tree.

Signed-off-by: Ian Campbell <ian.campbell@xensource.com>
author Ian Campbell <ian.campbell@xensource.com>
date Fri Dec 08 11:47:09 2006 +0000 (2006-12-08)
parents 1db125262365
children 562eee7568a8
files linux-2.6-xen-sparse/arch/i386/kernel/crash.c linux-2.6-xen-sparse/arch/i386/kernel/machine_kexec.c linux-2.6-xen-sparse/arch/x86_64/kernel/crash.c linux-2.6-xen-sparse/arch/x86_64/kernel/machine_kexec.c linux-2.6-xen-sparse/include/asm-i386/kexec.h linux-2.6-xen-sparse/include/asm-x86_64/kexec.h linux-2.6-xen-sparse/include/linux/kexec.h linux-2.6-xen-sparse/kernel/kexec.c
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/crash.c	Fri Dec 08 11:47:09 2006 +0000
     1.3 @@ -0,0 +1,183 @@
     1.4 +/*
     1.5 + * Architecture specific (i386) functions for kexec based crash dumps.
     1.6 + *
     1.7 + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
     1.8 + *
     1.9 + * Copyright (C) IBM Corporation, 2004. All rights reserved.
    1.10 + *
    1.11 + */
    1.12 +
    1.13 +#include <linux/init.h>
    1.14 +#include <linux/types.h>
    1.15 +#include <linux/kernel.h>
    1.16 +#include <linux/smp.h>
    1.17 +#include <linux/reboot.h>
    1.18 +#include <linux/kexec.h>
    1.19 +#include <linux/delay.h>
    1.20 +#include <linux/elf.h>
    1.21 +#include <linux/elfcore.h>
    1.22 +
    1.23 +#include <asm/processor.h>
    1.24 +#include <asm/hardirq.h>
    1.25 +#include <asm/nmi.h>
    1.26 +#include <asm/hw_irq.h>
    1.27 +#include <asm/apic.h>
    1.28 +#include <mach_ipi.h>
    1.29 +
    1.30 +
     1.31 +/* This keeps track of which cpu is crashing. */
    1.32 +static int crashing_cpu;
    1.33 +
    1.34 +static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
    1.35 +							       size_t data_len)
    1.36 +{
    1.37 +	struct elf_note note;
    1.38 +
    1.39 +	note.n_namesz = strlen(name) + 1;
    1.40 +	note.n_descsz = data_len;
    1.41 +	note.n_type   = type;
    1.42 +	memcpy(buf, &note, sizeof(note));
    1.43 +	buf += (sizeof(note) +3)/4;
    1.44 +	memcpy(buf, name, note.n_namesz);
    1.45 +	buf += (note.n_namesz + 3)/4;
    1.46 +	memcpy(buf, data, note.n_descsz);
    1.47 +	buf += (note.n_descsz + 3)/4;
    1.48 +
    1.49 +	return buf;
    1.50 +}
    1.51 +
    1.52 +static void final_note(u32 *buf)
    1.53 +{
    1.54 +	struct elf_note note;
    1.55 +
    1.56 +	note.n_namesz = 0;
    1.57 +	note.n_descsz = 0;
    1.58 +	note.n_type   = 0;
    1.59 +	memcpy(buf, &note, sizeof(note));
    1.60 +}
    1.61 +
    1.62 +static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
    1.63 +{
    1.64 +	struct elf_prstatus prstatus;
    1.65 +	u32 *buf;
    1.66 +
    1.67 +	if ((cpu < 0) || (cpu >= NR_CPUS))
    1.68 +		return;
    1.69 +
    1.70 +	/* Using ELF notes here is opportunistic.
    1.71 +	 * I need a well defined structure format
    1.72 +	 * for the data I pass, and I need tags
    1.73 +	 * on the data to indicate what information I have
    1.74 +	 * squirrelled away.  ELF notes happen to provide
     1.75 +	 * all of that, so there is no need to invent something new.
    1.76 +	 */
    1.77 +	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
    1.78 +	if (!buf)
    1.79 +		return;
    1.80 +	memset(&prstatus, 0, sizeof(prstatus));
    1.81 +	prstatus.pr_pid = current->pid;
    1.82 +	elf_core_copy_regs(&prstatus.pr_reg, regs);
    1.83 +	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
    1.84 +				sizeof(prstatus));
    1.85 +	final_note(buf);
    1.86 +}
    1.87 +
    1.88 +static void crash_save_self(struct pt_regs *regs)
    1.89 +{
    1.90 +	int cpu;
    1.91 +
    1.92 +	cpu = smp_processor_id();
    1.93 +	crash_save_this_cpu(regs, cpu);
    1.94 +}
    1.95 +
    1.96 +#ifdef CONFIG_SMP
    1.97 +static atomic_t waiting_for_crash_ipi;
    1.98 +
    1.99 +static int crash_nmi_callback(struct pt_regs *regs, int cpu)
   1.100 +{
   1.101 +	struct pt_regs fixed_regs;
   1.102 +
    1.103 +	/* Don't do anything if this handler is invoked on the crashing cpu.
    1.104 +	 * Otherwise, the system will completely hang. The crashing cpu can get
    1.105 +	 * an NMI if the system was initially booted with the nmi_watchdog parameter.
   1.106 +	 */
   1.107 +	if (cpu == crashing_cpu)
   1.108 +		return 1;
   1.109 +	local_irq_disable();
   1.110 +
   1.111 +	if (!user_mode(regs)) {
   1.112 +		crash_fixup_ss_esp(&fixed_regs, regs);
   1.113 +		regs = &fixed_regs;
   1.114 +	}
   1.115 +	crash_save_this_cpu(regs, cpu);
   1.116 +	disable_local_APIC();
   1.117 +	atomic_dec(&waiting_for_crash_ipi);
   1.118 +	/* Assume hlt works */
   1.119 +	halt();
   1.120 +	for(;;);
   1.121 +
   1.122 +	return 1;
   1.123 +}
   1.124 +
   1.125 +/*
    1.126 + * By using the NMI code instead of a vector we just sneak through the
    1.127 + * word generator, coming out with just what we want.  And it does
    1.128 + * not matter if clustered_apic_mode is set or not.
   1.129 + */
   1.130 +static void smp_send_nmi_allbutself(void)
   1.131 +{
   1.132 +	send_IPI_allbutself(APIC_DM_NMI);
   1.133 +}
   1.134 +
   1.135 +static void nmi_shootdown_cpus(void)
   1.136 +{
   1.137 +	unsigned long msecs;
   1.138 +
   1.139 +	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
   1.140 +	/* Would it be better to replace the trap vector here? */
   1.141 +	set_nmi_callback(crash_nmi_callback);
   1.142 +	/* Ensure the new callback function is set before sending
   1.143 +	 * out the NMI
   1.144 +	 */
   1.145 +	wmb();
   1.146 +
   1.147 +	smp_send_nmi_allbutself();
   1.148 +
   1.149 +	msecs = 1000; /* Wait at most a second for the other cpus to stop */
   1.150 +	while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
   1.151 +		mdelay(1);
   1.152 +		msecs--;
   1.153 +	}
   1.154 +
   1.155 +	/* Leave the nmi callback set */
   1.156 +	disable_local_APIC();
   1.157 +}
   1.158 +#else
   1.159 +static void nmi_shootdown_cpus(void)
   1.160 +{
   1.161 +	/* There are no cpus to shootdown */
   1.162 +}
   1.163 +#endif
   1.164 +
   1.165 +void machine_crash_shutdown(struct pt_regs *regs)
   1.166 +{
   1.167 +	/* This function is only called after the system
    1.168 +	 * has panicked or is otherwise in a critical state.
   1.169 +	 * The minimum amount of code to allow a kexec'd kernel
   1.170 +	 * to run successfully needs to happen here.
   1.171 +	 *
   1.172 +	 * In practice this means shooting down the other cpus in
   1.173 +	 * an SMP system.
   1.174 +	 */
   1.175 +	/* The kernel is broken so disable interrupts */
   1.176 +	local_irq_disable();
   1.177 +
   1.178 +	/* Make a note of crashing cpu. Will be used in NMI callback.*/
   1.179 +	crashing_cpu = smp_processor_id();
   1.180 +	nmi_shootdown_cpus();
   1.181 +	lapic_shutdown();
   1.182 +#if defined(CONFIG_X86_IO_APIC)
   1.183 +	disable_IO_APIC();
   1.184 +#endif
   1.185 +	crash_save_self(regs);
   1.186 +}
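
For reference, the per-cpu crash notes written above use the standard ELF note layout: a three-word header, then the NUL-terminated name and the descriptor, each padded to a 4-byte boundary. A minimal sketch of that arithmetic, assuming the struct elf_note definition from <linux/elf.h> (the helper name below is illustrative only, not part of the changeset):

	/* Sketch: 32-bit words consumed in the crash_notes buffer by one note
	 * written via append_elf_note().  For "CORE"/NT_PRSTATUS this is
	 * 3 + (5 + 3)/4 + (sizeof(struct elf_prstatus) + 3)/4 words;
	 * final_note() then terminates the list with an all-zero header.
	 */
	static inline size_t elf_note_words(size_t namesz, size_t descsz)
	{
		return (sizeof(struct elf_note) + 3) / 4	/* header words */
		     + (namesz + 3) / 4				/* padded name */
		     + (descsz + 3) / 4;			/* padded descriptor */
	}
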
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/machine_kexec.c	Fri Dec 08 11:47:09 2006 +0000
     2.3 @@ -0,0 +1,89 @@
     2.4 +/*
     2.5 + * machine_kexec.c - handle transition of Linux booting another kernel
     2.6 + * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
     2.7 + *
     2.8 + * This source code is licensed under the GNU General Public License,
     2.9 + * Version 2.  See the file COPYING for more details.
    2.10 + */
    2.11 +
    2.12 +#include <linux/mm.h>
    2.13 +#include <linux/kexec.h>
    2.14 +#include <linux/delay.h>
    2.15 +#include <asm/pgtable.h>
    2.16 +#include <asm/pgalloc.h>
    2.17 +#include <asm/tlbflush.h>
    2.18 +#include <asm/mmu_context.h>
    2.19 +#include <asm/io.h>
    2.20 +#include <asm/apic.h>
    2.21 +#include <asm/cpufeature.h>
    2.22 +#include <asm/desc.h>
    2.23 +#include <asm/system.h>
    2.24 +
    2.25 +#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
    2.26 +static u32 kexec_pgd[1024] PAGE_ALIGNED;
    2.27 +#ifdef CONFIG_X86_PAE
    2.28 +static u32 kexec_pmd0[1024] PAGE_ALIGNED;
    2.29 +static u32 kexec_pmd1[1024] PAGE_ALIGNED;
    2.30 +#endif
    2.31 +static u32 kexec_pte0[1024] PAGE_ALIGNED;
    2.32 +static u32 kexec_pte1[1024] PAGE_ALIGNED;
    2.33 +
    2.34 +/*
     2.35 + * An architecture hook called to validate the
    2.36 + * proposed image and prepare the control pages
    2.37 + * as needed.  The pages for KEXEC_CONTROL_CODE_SIZE
     2.38 + * have been allocated, but the segments have not yet
    2.39 + * been copied into the kernel.
    2.40 + *
     2.41 + * Do whatever setup is needed on the image and the
    2.42 + * reboot code buffer to allow us to avoid allocations
    2.43 + * later.
    2.44 + *
    2.45 + * Currently nothing.
    2.46 + */
    2.47 +int machine_kexec_prepare(struct kimage *image)
    2.48 +{
    2.49 +	return 0;
    2.50 +}
    2.51 +
    2.52 +/*
    2.53 + * Undo anything leftover by machine_kexec_prepare
    2.54 + * when an image is freed.
    2.55 + */
    2.56 +void machine_kexec_cleanup(struct kimage *image)
    2.57 +{
    2.58 +}
    2.59 +
    2.60 +/*
    2.61 + * Do not allocate memory (or fail in any way) in machine_kexec().
    2.62 + * We are past the point of no return, committed to rebooting now.
    2.63 + */
    2.64 +NORET_TYPE void machine_kexec(struct kimage *image)
    2.65 +{
    2.66 +	unsigned long page_list[PAGES_NR];
    2.67 +	void *control_page;
    2.68 +
    2.69 +	/* Interrupts aren't acceptable while we reboot */
    2.70 +	local_irq_disable();
    2.71 +
    2.72 +	control_page = page_address(image->control_code_page);
    2.73 +	memcpy(control_page, relocate_kernel, PAGE_SIZE);
    2.74 +
    2.75 +	page_list[PA_CONTROL_PAGE] = __pa(control_page);
    2.76 +	page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
    2.77 +	page_list[PA_PGD] = __pa(kexec_pgd);
    2.78 +	page_list[VA_PGD] = (unsigned long)kexec_pgd;
    2.79 +#ifdef CONFIG_X86_PAE
    2.80 +	page_list[PA_PMD_0] = __pa(kexec_pmd0);
    2.81 +	page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
    2.82 +	page_list[PA_PMD_1] = __pa(kexec_pmd1);
    2.83 +	page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
    2.84 +#endif
    2.85 +	page_list[PA_PTE_0] = __pa(kexec_pte0);
    2.86 +	page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
    2.87 +	page_list[PA_PTE_1] = __pa(kexec_pte1);
    2.88 +	page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
    2.89 +
    2.90 +	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
    2.91 +			image->start, cpu_has_pae);
    2.92 +}
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/crash.c	Fri Dec 08 11:47:09 2006 +0000
     3.3 @@ -0,0 +1,186 @@
     3.4 +/*
     3.5 + * Architecture specific (x86_64) functions for kexec based crash dumps.
     3.6 + *
     3.7 + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
     3.8 + *
     3.9 + * Copyright (C) IBM Corporation, 2004. All rights reserved.
    3.10 + *
    3.11 + */
    3.12 +
    3.13 +#include <linux/init.h>
    3.14 +#include <linux/types.h>
    3.15 +#include <linux/kernel.h>
    3.16 +#include <linux/smp.h>
    3.17 +#include <linux/irq.h>
    3.18 +#include <linux/reboot.h>
    3.19 +#include <linux/kexec.h>
    3.20 +#include <linux/delay.h>
    3.21 +#include <linux/elf.h>
    3.22 +#include <linux/elfcore.h>
    3.23 +
    3.24 +#include <asm/processor.h>
    3.25 +#include <asm/hardirq.h>
    3.26 +#include <asm/nmi.h>
    3.27 +#include <asm/hw_irq.h>
    3.28 +#include <asm/mach_apic.h>
    3.29 +
     3.30 +/* This keeps track of which cpu is crashing. */
    3.31 +static int crashing_cpu;
    3.32 +
    3.33 +static u32 *append_elf_note(u32 *buf, char *name, unsigned type,
    3.34 +						void *data, size_t data_len)
    3.35 +{
    3.36 +	struct elf_note note;
    3.37 +
    3.38 +	note.n_namesz = strlen(name) + 1;
    3.39 +	note.n_descsz = data_len;
    3.40 +	note.n_type   = type;
    3.41 +	memcpy(buf, &note, sizeof(note));
    3.42 +	buf += (sizeof(note) +3)/4;
    3.43 +	memcpy(buf, name, note.n_namesz);
    3.44 +	buf += (note.n_namesz + 3)/4;
    3.45 +	memcpy(buf, data, note.n_descsz);
    3.46 +	buf += (note.n_descsz + 3)/4;
    3.47 +
    3.48 +	return buf;
    3.49 +}
    3.50 +
    3.51 +static void final_note(u32 *buf)
    3.52 +{
    3.53 +	struct elf_note note;
    3.54 +
    3.55 +	note.n_namesz = 0;
    3.56 +	note.n_descsz = 0;
    3.57 +	note.n_type   = 0;
    3.58 +	memcpy(buf, &note, sizeof(note));
    3.59 +}
    3.60 +
    3.61 +static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
    3.62 +{
    3.63 +	struct elf_prstatus prstatus;
    3.64 +	u32 *buf;
    3.65 +
    3.66 +	if ((cpu < 0) || (cpu >= NR_CPUS))
    3.67 +		return;
    3.68 +
    3.69 +	/* Using ELF notes here is opportunistic.
    3.70 +	 * I need a well defined structure format
    3.71 +	 * for the data I pass, and I need tags
    3.72 +	 * on the data to indicate what information I have
    3.73 +	 * squirrelled away.  ELF notes happen to provide
     3.74 +	 * all of that, so there is no need to invent something new.
    3.75 +	 */
    3.76 +
    3.77 +	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
    3.78 +
    3.79 +	if (!buf)
    3.80 +		return;
    3.81 +
    3.82 +	memset(&prstatus, 0, sizeof(prstatus));
    3.83 +	prstatus.pr_pid = current->pid;
    3.84 +	elf_core_copy_regs(&prstatus.pr_reg, regs);
    3.85 +	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
    3.86 +					sizeof(prstatus));
    3.87 +	final_note(buf);
    3.88 +}
    3.89 +
    3.90 +static void crash_save_self(struct pt_regs *regs)
    3.91 +{
    3.92 +	int cpu;
    3.93 +
    3.94 +	cpu = smp_processor_id();
    3.95 +	crash_save_this_cpu(regs, cpu);
    3.96 +}
    3.97 +
    3.98 +#ifdef CONFIG_SMP
    3.99 +static atomic_t waiting_for_crash_ipi;
   3.100 +
   3.101 +static int crash_nmi_callback(struct pt_regs *regs, int cpu)
   3.102 +{
   3.103 +	/*
    3.104 +	 * Don't do anything if this handler is invoked on the crashing cpu.
    3.105 +	 * Otherwise, the system will completely hang. The crashing cpu can get
    3.106 +	 * an NMI if the system was initially booted with the nmi_watchdog parameter.
   3.107 +	 */
   3.108 +	if (cpu == crashing_cpu)
   3.109 +		return 1;
   3.110 +	local_irq_disable();
   3.111 +
   3.112 +	crash_save_this_cpu(regs, cpu);
   3.113 +	disable_local_APIC();
   3.114 +	atomic_dec(&waiting_for_crash_ipi);
   3.115 +	/* Assume hlt works */
   3.116 +	for(;;)
   3.117 +		asm("hlt");
   3.118 +
   3.119 +	return 1;
   3.120 +}
   3.121 +
   3.122 +static void smp_send_nmi_allbutself(void)
   3.123 +{
   3.124 +	send_IPI_allbutself(APIC_DM_NMI);
   3.125 +}
   3.126 +
   3.127 +/*
   3.128 + * This code is a best effort heuristic to get the
   3.129 + * other cpus to stop executing. So races with
   3.130 + * cpu hotplug shouldn't matter.
   3.131 + */
   3.132 +
   3.133 +static void nmi_shootdown_cpus(void)
   3.134 +{
   3.135 +	unsigned long msecs;
   3.136 +
   3.137 +	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
   3.138 +	set_nmi_callback(crash_nmi_callback);
   3.139 +
   3.140 +	/*
   3.141 +	 * Ensure the new callback function is set before sending
   3.142 +	 * out the NMI
   3.143 +	 */
   3.144 +	wmb();
   3.145 +
   3.146 +	smp_send_nmi_allbutself();
   3.147 +
   3.148 +	msecs = 1000; /* Wait at most a second for the other cpus to stop */
   3.149 +	while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
   3.150 +		mdelay(1);
   3.151 +		msecs--;
   3.152 +	}
   3.153 +	/* Leave the nmi callback set */
   3.154 +	disable_local_APIC();
   3.155 +}
   3.156 +#else
   3.157 +static void nmi_shootdown_cpus(void)
   3.158 +{
   3.159 +	/* There are no cpus to shootdown */
   3.160 +}
   3.161 +#endif
   3.162 +
   3.163 +void machine_crash_shutdown(struct pt_regs *regs)
   3.164 +{
   3.165 +	/*
   3.166 +	 * This function is only called after the system
    3.167 +	 * has panicked or is otherwise in a critical state.
   3.168 +	 * The minimum amount of code to allow a kexec'd kernel
   3.169 +	 * to run successfully needs to happen here.
   3.170 +	 *
   3.171 +	 * In practice this means shooting down the other cpus in
   3.172 +	 * an SMP system.
   3.173 +	 */
   3.174 +	/* The kernel is broken so disable interrupts */
   3.175 +	local_irq_disable();
   3.176 +
   3.177 +	/* Make a note of crashing cpu. Will be used in NMI callback.*/
   3.178 +	crashing_cpu = smp_processor_id();
   3.179 +	nmi_shootdown_cpus();
   3.180 +
   3.181 +	if(cpu_has_apic)
   3.182 +		 disable_local_APIC();
   3.183 +
   3.184 +#if defined(CONFIG_X86_IO_APIC)
   3.185 +	disable_IO_APIC();
   3.186 +#endif
   3.187 +
   3.188 +	crash_save_self(regs);
   3.189 +}
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/machine_kexec.c	Fri Dec 08 11:47:09 2006 +0000
     4.3 @@ -0,0 +1,173 @@
     4.4 +/*
     4.5 + * machine_kexec.c - handle transition of Linux booting another kernel
     4.6 + * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
     4.7 + *
     4.8 + * This source code is licensed under the GNU General Public License,
     4.9 + * Version 2.  See the file COPYING for more details.
    4.10 + */
    4.11 +
    4.12 +#include <linux/mm.h>
    4.13 +#include <linux/kexec.h>
    4.14 +#include <linux/string.h>
    4.15 +#include <linux/reboot.h>
    4.16 +#include <asm/pgtable.h>
    4.17 +#include <asm/tlbflush.h>
    4.18 +#include <asm/mmu_context.h>
    4.19 +#include <asm/io.h>
    4.20 +
    4.21 +#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
    4.22 +static u64 kexec_pgd[512] PAGE_ALIGNED;
    4.23 +static u64 kexec_pud0[512] PAGE_ALIGNED;
    4.24 +static u64 kexec_pmd0[512] PAGE_ALIGNED;
    4.25 +static u64 kexec_pte0[512] PAGE_ALIGNED;
    4.26 +static u64 kexec_pud1[512] PAGE_ALIGNED;
    4.27 +static u64 kexec_pmd1[512] PAGE_ALIGNED;
    4.28 +static u64 kexec_pte1[512] PAGE_ALIGNED;
    4.29 +
    4.30 +static void init_level2_page(pmd_t *level2p, unsigned long addr)
    4.31 +{
    4.32 +	unsigned long end_addr;
    4.33 +
    4.34 +	addr &= PAGE_MASK;
    4.35 +	end_addr = addr + PUD_SIZE;
    4.36 +	while (addr < end_addr) {
    4.37 +		set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
    4.38 +		addr += PMD_SIZE;
    4.39 +	}
    4.40 +}
    4.41 +
    4.42 +static int init_level3_page(struct kimage *image, pud_t *level3p,
    4.43 +				unsigned long addr, unsigned long last_addr)
    4.44 +{
    4.45 +	unsigned long end_addr;
    4.46 +	int result;
    4.47 +
    4.48 +	result = 0;
    4.49 +	addr &= PAGE_MASK;
    4.50 +	end_addr = addr + PGDIR_SIZE;
    4.51 +	while ((addr < last_addr) && (addr < end_addr)) {
    4.52 +		struct page *page;
    4.53 +		pmd_t *level2p;
    4.54 +
    4.55 +		page = kimage_alloc_control_pages(image, 0);
    4.56 +		if (!page) {
    4.57 +			result = -ENOMEM;
    4.58 +			goto out;
    4.59 +		}
    4.60 +		level2p = (pmd_t *)page_address(page);
    4.61 +		init_level2_page(level2p, addr);
    4.62 +		set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
    4.63 +		addr += PUD_SIZE;
    4.64 +	}
    4.65 +	/* clear the unused entries */
    4.66 +	while (addr < end_addr) {
    4.67 +		pud_clear(level3p++);
    4.68 +		addr += PUD_SIZE;
    4.69 +	}
    4.70 +out:
    4.71 +	return result;
    4.72 +}
    4.73 +
    4.74 +
    4.75 +static int init_level4_page(struct kimage *image, pgd_t *level4p,
    4.76 +				unsigned long addr, unsigned long last_addr)
    4.77 +{
    4.78 +	unsigned long end_addr;
    4.79 +	int result;
    4.80 +
    4.81 +	result = 0;
    4.82 +	addr &= PAGE_MASK;
    4.83 +	end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
    4.84 +	while ((addr < last_addr) && (addr < end_addr)) {
    4.85 +		struct page *page;
    4.86 +		pud_t *level3p;
    4.87 +
    4.88 +		page = kimage_alloc_control_pages(image, 0);
    4.89 +		if (!page) {
    4.90 +			result = -ENOMEM;
    4.91 +			goto out;
    4.92 +		}
    4.93 +		level3p = (pud_t *)page_address(page);
    4.94 +		result = init_level3_page(image, level3p, addr, last_addr);
    4.95 +		if (result) {
    4.96 +			goto out;
    4.97 +		}
    4.98 +		set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
    4.99 +		addr += PGDIR_SIZE;
   4.100 +	}
   4.101 +	/* clear the unused entries */
   4.102 +	while (addr < end_addr) {
   4.103 +		pgd_clear(level4p++);
   4.104 +		addr += PGDIR_SIZE;
   4.105 +	}
   4.106 +out:
   4.107 +	return result;
   4.108 +}
   4.109 +
   4.110 +
   4.111 +static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
   4.112 +{
   4.113 +	pgd_t *level4p;
   4.114 +	level4p = (pgd_t *)__va(start_pgtable);
   4.115 + 	return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
   4.116 +}
   4.117 +
   4.118 +int machine_kexec_prepare(struct kimage *image)
   4.119 +{
   4.120 +	unsigned long start_pgtable;
   4.121 +	int result;
   4.122 +
   4.123 +	/* Calculate the offsets */
   4.124 +	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
   4.125 +
   4.126 +	/* Setup the identity mapped 64bit page table */
   4.127 +	result = init_pgtable(image, start_pgtable);
   4.128 +	if (result)
   4.129 +		return result;
   4.130 +
   4.131 +	return 0;
   4.132 +}
   4.133 +
   4.134 +void machine_kexec_cleanup(struct kimage *image)
   4.135 +{
   4.136 +	return;
   4.137 +}
   4.138 +
   4.139 +/*
   4.140 + * Do not allocate memory (or fail in any way) in machine_kexec().
   4.141 + * We are past the point of no return, committed to rebooting now.
   4.142 + */
   4.143 +NORET_TYPE void machine_kexec(struct kimage *image)
   4.144 +{
   4.145 +	unsigned long page_list[PAGES_NR];
   4.146 +	void *control_page;
   4.147 +
   4.148 +	/* Interrupts aren't acceptable while we reboot */
   4.149 +	local_irq_disable();
   4.150 +
   4.151 +	control_page = page_address(image->control_code_page) + PAGE_SIZE;
   4.152 +	memcpy(control_page, relocate_kernel, PAGE_SIZE);
   4.153 +
   4.154 +	page_list[PA_CONTROL_PAGE] = __pa(control_page);
   4.155 +	page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
   4.156 +	page_list[PA_PGD] = __pa(kexec_pgd);
   4.157 +	page_list[VA_PGD] = (unsigned long)kexec_pgd;
   4.158 +	page_list[PA_PUD_0] = __pa(kexec_pud0);
   4.159 +	page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
   4.160 +	page_list[PA_PMD_0] = __pa(kexec_pmd0);
   4.161 +	page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
   4.162 +	page_list[PA_PTE_0] = __pa(kexec_pte0);
   4.163 +	page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
   4.164 +	page_list[PA_PUD_1] = __pa(kexec_pud1);
   4.165 +	page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
   4.166 +	page_list[PA_PMD_1] = __pa(kexec_pmd1);
   4.167 +	page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
   4.168 +	page_list[PA_PTE_1] = __pa(kexec_pte1);
   4.169 +	page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
   4.170 +
   4.171 +	page_list[PA_TABLE_PAGE] =
   4.172 +	  (unsigned long)__pa(page_address(image->control_code_page));
   4.173 +
   4.174 +	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
   4.175 +			image->start);
   4.176 +}
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/linux-2.6-xen-sparse/include/asm-i386/kexec.h	Fri Dec 08 11:47:09 2006 +0000
     5.3 @@ -0,0 +1,103 @@
     5.4 +#ifndef _I386_KEXEC_H
     5.5 +#define _I386_KEXEC_H
     5.6 +
     5.7 +#define PA_CONTROL_PAGE  0
     5.8 +#define VA_CONTROL_PAGE  1
     5.9 +#define PA_PGD           2
    5.10 +#define VA_PGD           3
    5.11 +#define PA_PTE_0         4
    5.12 +#define VA_PTE_0         5
    5.13 +#define PA_PTE_1         6
    5.14 +#define VA_PTE_1         7
    5.15 +#ifdef CONFIG_X86_PAE
    5.16 +#define PA_PMD_0         8
    5.17 +#define VA_PMD_0         9
    5.18 +#define PA_PMD_1         10
    5.19 +#define VA_PMD_1         11
    5.20 +#define PAGES_NR         12
    5.21 +#else
    5.22 +#define PAGES_NR         8
    5.23 +#endif
    5.24 +
    5.25 +#ifndef __ASSEMBLY__
    5.26 +
    5.27 +#include <asm/fixmap.h>
    5.28 +#include <asm/ptrace.h>
    5.29 +#include <asm/string.h>
    5.30 +
    5.31 +/*
     5.32 + * KEXEC_SOURCE_MEMORY_LIMIT is the maximum page get_free_page can return.
    5.33 + * I.e. Maximum page that is mapped directly into kernel memory,
    5.34 + * and kmap is not required.
    5.35 + *
    5.36 + * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct
    5.37 + * calculation for the amount of memory directly mappable into the
    5.38 + * kernel memory space.
    5.39 + */
    5.40 +
    5.41 +/* Maximum physical address we can use pages from */
    5.42 +#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
    5.43 +/* Maximum address we can reach in physical address mode */
    5.44 +#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
    5.45 +/* Maximum address we can use for the control code buffer */
    5.46 +#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
    5.47 +
    5.48 +#define KEXEC_CONTROL_CODE_SIZE	4096
    5.49 +
    5.50 +/* The native architecture */
    5.51 +#define KEXEC_ARCH KEXEC_ARCH_386
    5.52 +
    5.53 +#define MAX_NOTE_BYTES 1024
    5.54 +
    5.55 +/* CPU does not save ss and esp on stack if execution is already
    5.56 + * running in kernel mode at the time of NMI occurrence. This code
    5.57 + * fixes it.
    5.58 + */
    5.59 +static inline void crash_fixup_ss_esp(struct pt_regs *newregs,
    5.60 +					struct pt_regs *oldregs)
    5.61 +{
    5.62 +	memcpy(newregs, oldregs, sizeof(*newregs));
    5.63 +	newregs->esp = (unsigned long)&(oldregs->esp);
    5.64 +	__asm__ __volatile__(
    5.65 +			"xorl %%eax, %%eax\n\t"
    5.66 +			"movw %%ss, %%ax\n\t"
    5.67 +			:"=a"(newregs->xss));
    5.68 +}
    5.69 +
    5.70 +/*
    5.71 + * This function is responsible for capturing register states if coming
     5.72 + * via panic; otherwise it just fixes up the ss and esp if coming via a kernel
    5.73 + * mode exception.
    5.74 + */
    5.75 +static inline void crash_setup_regs(struct pt_regs *newregs,
    5.76 +                                       struct pt_regs *oldregs)
    5.77 +{
    5.78 +       if (oldregs)
    5.79 +               crash_fixup_ss_esp(newregs, oldregs);
    5.80 +       else {
    5.81 +               __asm__ __volatile__("movl %%ebx,%0" : "=m"(newregs->ebx));
    5.82 +               __asm__ __volatile__("movl %%ecx,%0" : "=m"(newregs->ecx));
    5.83 +               __asm__ __volatile__("movl %%edx,%0" : "=m"(newregs->edx));
    5.84 +               __asm__ __volatile__("movl %%esi,%0" : "=m"(newregs->esi));
    5.85 +               __asm__ __volatile__("movl %%edi,%0" : "=m"(newregs->edi));
    5.86 +               __asm__ __volatile__("movl %%ebp,%0" : "=m"(newregs->ebp));
    5.87 +               __asm__ __volatile__("movl %%eax,%0" : "=m"(newregs->eax));
    5.88 +               __asm__ __volatile__("movl %%esp,%0" : "=m"(newregs->esp));
    5.89 +               __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(newregs->xss));
    5.90 +               __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(newregs->xcs));
    5.91 +               __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(newregs->xds));
    5.92 +               __asm__ __volatile__("movw %%es, %%ax;" :"=a"(newregs->xes));
    5.93 +               __asm__ __volatile__("pushfl; popl %0" :"=m"(newregs->eflags));
    5.94 +
    5.95 +               newregs->eip = (unsigned long)current_text_addr();
    5.96 +       }
    5.97 +}
    5.98 +asmlinkage NORET_TYPE void
    5.99 +relocate_kernel(unsigned long indirection_page,
   5.100 +		unsigned long control_page,
   5.101 +		unsigned long start_address,
   5.102 +		unsigned int has_pae) ATTRIB_NORET;
   5.103 +
   5.104 +#endif /* __ASSEMBLY__ */
   5.105 +
   5.106 +#endif /* _I386_KEXEC_H */
     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/linux-2.6-xen-sparse/include/asm-x86_64/kexec.h	Fri Dec 08 11:47:09 2006 +0000
     6.3 @@ -0,0 +1,96 @@
     6.4 +#ifndef _X86_64_KEXEC_H
     6.5 +#define _X86_64_KEXEC_H
     6.6 +
     6.7 +#define PA_CONTROL_PAGE  0
     6.8 +#define VA_CONTROL_PAGE  1
     6.9 +#define PA_PGD           2
    6.10 +#define VA_PGD           3
    6.11 +#define PA_PUD_0         4
    6.12 +#define VA_PUD_0         5
    6.13 +#define PA_PMD_0         6
    6.14 +#define VA_PMD_0         7
    6.15 +#define PA_PTE_0         8
    6.16 +#define VA_PTE_0         9
    6.17 +#define PA_PUD_1         10
    6.18 +#define VA_PUD_1         11
    6.19 +#define PA_PMD_1         12
    6.20 +#define VA_PMD_1         13
    6.21 +#define PA_PTE_1         14
    6.22 +#define VA_PTE_1         15
    6.23 +#define PA_TABLE_PAGE    16
    6.24 +#define PAGES_NR         17
    6.25 +
    6.26 +#ifndef __ASSEMBLY__
    6.27 +
    6.28 +#include <linux/string.h>
    6.29 +
    6.30 +#include <asm/page.h>
    6.31 +#include <asm/ptrace.h>
    6.32 +
    6.33 +/*
     6.34 + * KEXEC_SOURCE_MEMORY_LIMIT is the maximum page get_free_page can return.
    6.35 + * I.e. Maximum page that is mapped directly into kernel memory,
    6.36 + * and kmap is not required.
    6.37 + *
    6.38 + * So far x86_64 is limited to 40 physical address bits.
    6.39 + */
    6.40 +
    6.41 +/* Maximum physical address we can use pages from */
    6.42 +#define KEXEC_SOURCE_MEMORY_LIMIT      (0xFFFFFFFFFFUL)
    6.43 +/* Maximum address we can reach in physical address mode */
    6.44 +#define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL)
    6.45 +/* Maximum address we can use for the control pages */
    6.46 +#define KEXEC_CONTROL_MEMORY_LIMIT     (0xFFFFFFFFFFUL)
    6.47 +
    6.48 +/* Allocate one page for the pdp and the second for the code */
    6.49 +#define KEXEC_CONTROL_CODE_SIZE  (4096UL + 4096UL)
    6.50 +
    6.51 +/* The native architecture */
    6.52 +#define KEXEC_ARCH KEXEC_ARCH_X86_64
    6.53 +
    6.54 +#define MAX_NOTE_BYTES 1024
    6.55 +
    6.56 +/*
     6.57 + * Saving the registers of the cpu on which the panic occurred in
     6.58 + * crash_kexec to save a valid sp. The registers of other cpus
     6.59 + * will be saved in machine_crash_shutdown while shooting them down.
    6.60 + */
    6.61 +
    6.62 +static inline void crash_setup_regs(struct pt_regs *newregs,
    6.63 +						struct pt_regs *oldregs)
    6.64 +{
    6.65 +	if (oldregs)
    6.66 +		memcpy(newregs, oldregs, sizeof(*newregs));
    6.67 +	else {
    6.68 +		__asm__ __volatile__("movq %%rbx,%0" : "=m"(newregs->rbx));
    6.69 +		__asm__ __volatile__("movq %%rcx,%0" : "=m"(newregs->rcx));
    6.70 +		__asm__ __volatile__("movq %%rdx,%0" : "=m"(newregs->rdx));
    6.71 +		__asm__ __volatile__("movq %%rsi,%0" : "=m"(newregs->rsi));
    6.72 +		__asm__ __volatile__("movq %%rdi,%0" : "=m"(newregs->rdi));
    6.73 +		__asm__ __volatile__("movq %%rbp,%0" : "=m"(newregs->rbp));
    6.74 +		__asm__ __volatile__("movq %%rax,%0" : "=m"(newregs->rax));
    6.75 +		__asm__ __volatile__("movq %%rsp,%0" : "=m"(newregs->rsp));
    6.76 +		__asm__ __volatile__("movq %%r8,%0" : "=m"(newregs->r8));
    6.77 +		__asm__ __volatile__("movq %%r9,%0" : "=m"(newregs->r9));
    6.78 +		__asm__ __volatile__("movq %%r10,%0" : "=m"(newregs->r10));
    6.79 +		__asm__ __volatile__("movq %%r11,%0" : "=m"(newregs->r11));
    6.80 +		__asm__ __volatile__("movq %%r12,%0" : "=m"(newregs->r12));
    6.81 +		__asm__ __volatile__("movq %%r13,%0" : "=m"(newregs->r13));
    6.82 +		__asm__ __volatile__("movq %%r14,%0" : "=m"(newregs->r14));
    6.83 +		__asm__ __volatile__("movq %%r15,%0" : "=m"(newregs->r15));
    6.84 +		__asm__ __volatile__("movl %%ss, %%eax;" :"=a"(newregs->ss));
    6.85 +		__asm__ __volatile__("movl %%cs, %%eax;" :"=a"(newregs->cs));
    6.86 +		__asm__ __volatile__("pushfq; popq %0" :"=m"(newregs->eflags));
    6.87 +
    6.88 +		newregs->rip = (unsigned long)current_text_addr();
    6.89 +	}
    6.90 +}
    6.91 +
    6.92 +NORET_TYPE void
    6.93 +relocate_kernel(unsigned long indirection_page,
    6.94 +		unsigned long page_list,
    6.95 +		unsigned long start_address) ATTRIB_NORET;
    6.96 +
    6.97 +#endif /* __ASSEMBLY__ */
    6.98 +
    6.99 +#endif /* _X86_64_KEXEC_H */
     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/linux-2.6-xen-sparse/include/linux/kexec.h	Fri Dec 08 11:47:09 2006 +0000
     7.3 @@ -0,0 +1,139 @@
     7.4 +#ifndef LINUX_KEXEC_H
     7.5 +#define LINUX_KEXEC_H
     7.6 +
     7.7 +#ifdef CONFIG_KEXEC
     7.8 +#include <linux/types.h>
     7.9 +#include <linux/list.h>
    7.10 +#include <linux/linkage.h>
    7.11 +#include <linux/compat.h>
    7.12 +#include <linux/ioport.h>
    7.13 +#include <asm/kexec.h>
    7.14 +
    7.15 +/* Verify architecture specific macros are defined */
    7.16 +
    7.17 +#ifndef KEXEC_SOURCE_MEMORY_LIMIT
    7.18 +#error KEXEC_SOURCE_MEMORY_LIMIT not defined
    7.19 +#endif
    7.20 +
    7.21 +#ifndef KEXEC_DESTINATION_MEMORY_LIMIT
    7.22 +#error KEXEC_DESTINATION_MEMORY_LIMIT not defined
    7.23 +#endif
    7.24 +
    7.25 +#ifndef KEXEC_CONTROL_MEMORY_LIMIT
    7.26 +#error KEXEC_CONTROL_MEMORY_LIMIT not defined
    7.27 +#endif
    7.28 +
    7.29 +#ifndef KEXEC_CONTROL_CODE_SIZE
    7.30 +#error KEXEC_CONTROL_CODE_SIZE not defined
    7.31 +#endif
    7.32 +
    7.33 +#ifndef KEXEC_ARCH
    7.34 +#error KEXEC_ARCH not defined
    7.35 +#endif
    7.36 +
    7.37 +/*
    7.38 + * This structure is used to hold the arguments that are used when loading
    7.39 + * kernel binaries.
    7.40 + */
    7.41 +
    7.42 +typedef unsigned long kimage_entry_t;
    7.43 +#define IND_DESTINATION  0x1
    7.44 +#define IND_INDIRECTION  0x2
    7.45 +#define IND_DONE         0x4
    7.46 +#define IND_SOURCE       0x8
    7.47 +
    7.48 +#define KEXEC_SEGMENT_MAX 16
    7.49 +struct kexec_segment {
    7.50 +	void __user *buf;
    7.51 +	size_t bufsz;
    7.52 +	unsigned long mem;	/* User space sees this as a (void *) ... */
    7.53 +	size_t memsz;
    7.54 +};
    7.55 +
    7.56 +#ifdef CONFIG_COMPAT
    7.57 +struct compat_kexec_segment {
    7.58 +	compat_uptr_t buf;
    7.59 +	compat_size_t bufsz;
    7.60 +	compat_ulong_t mem;	/* User space sees this as a (void *) ... */
    7.61 +	compat_size_t memsz;
    7.62 +};
    7.63 +#endif
    7.64 +
    7.65 +struct kimage {
    7.66 +	kimage_entry_t head;
    7.67 +	kimage_entry_t *entry;
    7.68 +	kimage_entry_t *last_entry;
    7.69 +
    7.70 +	unsigned long destination;
    7.71 +
    7.72 +	unsigned long start;
    7.73 +	struct page *control_code_page;
    7.74 +
    7.75 +	unsigned long nr_segments;
    7.76 +	struct kexec_segment segment[KEXEC_SEGMENT_MAX];
    7.77 +
    7.78 +	struct list_head control_pages;
    7.79 +	struct list_head dest_pages;
    7.80 +	struct list_head unuseable_pages;
    7.81 +
    7.82 +	/* Address of next control page to allocate for crash kernels. */
    7.83 +	unsigned long control_page;
    7.84 +
    7.85 +	/* Flags to indicate special processing */
    7.86 +	unsigned int type : 1;
    7.87 +#define KEXEC_TYPE_DEFAULT 0
    7.88 +#define KEXEC_TYPE_CRASH   1
    7.89 +};
    7.90 +
    7.91 +
    7.92 +
    7.93 +/* kexec interface functions */
    7.94 +extern NORET_TYPE void machine_kexec(struct kimage *image) ATTRIB_NORET;
    7.95 +extern int machine_kexec_prepare(struct kimage *image);
    7.96 +extern void machine_kexec_cleanup(struct kimage *image);
    7.97 +extern asmlinkage long sys_kexec_load(unsigned long entry,
    7.98 +					unsigned long nr_segments,
    7.99 +					struct kexec_segment __user *segments,
   7.100 +					unsigned long flags);
   7.101 +#ifdef CONFIG_COMPAT
   7.102 +extern asmlinkage long compat_sys_kexec_load(unsigned long entry,
   7.103 +				unsigned long nr_segments,
   7.104 +				struct compat_kexec_segment __user *segments,
   7.105 +				unsigned long flags);
   7.106 +#endif
   7.107 +extern struct page *kimage_alloc_control_pages(struct kimage *image,
   7.108 +						unsigned int order);
   7.109 +extern void crash_kexec(struct pt_regs *);
   7.110 +int kexec_should_crash(struct task_struct *);
   7.111 +extern struct kimage *kexec_image;
   7.112 +
   7.113 +#define KEXEC_ON_CRASH  0x00000001
   7.114 +#define KEXEC_ARCH_MASK 0xffff0000
   7.115 +
   7.116 +/* These values match the ELF architecture values.
    7.117 + * Unless there is a good reason, that should continue to be the case.
   7.118 + */
   7.119 +#define KEXEC_ARCH_DEFAULT ( 0 << 16)
   7.120 +#define KEXEC_ARCH_386     ( 3 << 16)
   7.121 +#define KEXEC_ARCH_X86_64  (62 << 16)
   7.122 +#define KEXEC_ARCH_PPC     (20 << 16)
   7.123 +#define KEXEC_ARCH_PPC64   (21 << 16)
   7.124 +#define KEXEC_ARCH_IA_64   (50 << 16)
   7.125 +#define KEXEC_ARCH_S390    (22 << 16)
   7.126 +#define KEXEC_ARCH_SH      (42 << 16)
   7.127 +
   7.128 +#define KEXEC_FLAGS    (KEXEC_ON_CRASH)  /* List of defined/legal kexec flags */
   7.129 +
   7.130 +/* Location of a reserved region to hold the crash kernel.
   7.131 + */
   7.132 +extern struct resource crashk_res;
   7.133 +typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
   7.134 +extern note_buf_t *crash_notes;
   7.135 +
   7.136 +#else /* !CONFIG_KEXEC */
   7.137 +struct pt_regs;
   7.138 +struct task_struct;
   7.139 +static inline void crash_kexec(struct pt_regs *regs) { }
   7.140 +static inline int kexec_should_crash(struct task_struct *p) { return 0; }
   7.141 +#endif /* CONFIG_KEXEC */
   7.142 +#endif /* LINUX_KEXEC_H */
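
The kimage_entry_t list declared above is what machine_kexec() ultimately hands to relocate_kernel() via image->head: each entry is a page-aligned physical address ORed with one IND_* flag. IND_DESTINATION starts a new run of destination pages, IND_SOURCE supplies one source page for that run, IND_INDIRECTION chains to the next page of entries, and IND_DONE terminates the list. A minimal sketch of walking such a list (illustrative only; it mirrors the for_each_kimage_entry() macro defined in kernel/kexec.c below):

	/* Sketch: traverse a loaded kimage's entry list. */
	static void walk_kimage_entries(struct kimage *image)
	{
		kimage_entry_t *ptr, entry;
		unsigned long dest = 0;

		for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE);
		     ptr = (entry & IND_INDIRECTION) ?
				phys_to_virt(entry & PAGE_MASK) : ptr + 1) {
			if (entry & IND_DESTINATION)
				dest = entry & PAGE_MASK;	/* new destination run */
			else if (entry & IND_SOURCE) {
				/* page at (entry & PAGE_MASK) is copied to dest */
				dest += PAGE_SIZE;
			}
		}
	}
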
     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/linux-2.6-xen-sparse/kernel/kexec.c	Fri Dec 08 11:47:09 2006 +0000
     8.3 @@ -0,0 +1,1081 @@
     8.4 +/*
     8.5 + * kexec.c - kexec system call
     8.6 + * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
     8.7 + *
     8.8 + * This source code is licensed under the GNU General Public License,
     8.9 + * Version 2.  See the file COPYING for more details.
    8.10 + */
    8.11 +
    8.12 +#include <linux/capability.h>
    8.13 +#include <linux/mm.h>
    8.14 +#include <linux/file.h>
    8.15 +#include <linux/slab.h>
    8.16 +#include <linux/fs.h>
    8.17 +#include <linux/kexec.h>
    8.18 +#include <linux/spinlock.h>
    8.19 +#include <linux/list.h>
    8.20 +#include <linux/highmem.h>
    8.21 +#include <linux/syscalls.h>
    8.22 +#include <linux/reboot.h>
    8.23 +#include <linux/syscalls.h>
    8.24 +#include <linux/ioport.h>
    8.25 +#include <linux/hardirq.h>
    8.26 +
    8.27 +#include <asm/page.h>
    8.28 +#include <asm/uaccess.h>
    8.29 +#include <asm/io.h>
    8.30 +#include <asm/system.h>
    8.31 +#include <asm/semaphore.h>
    8.32 +
    8.33 +/* Per cpu memory for storing cpu states in case of system crash. */
    8.34 +note_buf_t* crash_notes;
    8.35 +
    8.36 +/* Location of the reserved area for the crash kernel */
    8.37 +struct resource crashk_res = {
    8.38 +	.name  = "Crash kernel",
    8.39 +	.start = 0,
    8.40 +	.end   = 0,
    8.41 +	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
    8.42 +};
    8.43 +
    8.44 +int kexec_should_crash(struct task_struct *p)
    8.45 +{
    8.46 +	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
    8.47 +		return 1;
    8.48 +	return 0;
    8.49 +}
    8.50 +
    8.51 +/*
    8.52 + * When kexec transitions to the new kernel there is a one-to-one
    8.53 + * mapping between physical and virtual addresses.  On processors
    8.54 + * where you can disable the MMU this is trivial, and easy.  For
    8.55 + * others it is still a simple predictable page table to setup.
    8.56 + *
    8.57 + * In that environment kexec copies the new kernel to its final
    8.58 + * resting place.  This means I can only support memory whose
    8.59 + * physical address can fit in an unsigned long.  In particular
    8.60 + * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
    8.61 + * If the assembly stub has more restrictive requirements
    8.62 + * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
    8.63 + * defined more restrictively in <asm/kexec.h>.
    8.64 + *
    8.65 + * The code for the transition from the current kernel to the
     8.66 + * new kernel is placed in the control_code_buffer, whose size
    8.67 + * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
    8.68 + * page of memory is necessary, but some architectures require more.
    8.69 + * Because this memory must be identity mapped in the transition from
    8.70 + * virtual to physical addresses it must live in the range
    8.71 + * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
    8.72 + * modifiable.
    8.73 + *
    8.74 + * The assembly stub in the control code buffer is passed a linked list
    8.75 + * of descriptor pages detailing the source pages of the new kernel,
    8.76 + * and the destination addresses of those source pages.  As this data
    8.77 + * structure is not used in the context of the current OS, it must
    8.78 + * be self-contained.
    8.79 + *
    8.80 + * The code has been made to work with highmem pages and will use a
    8.81 + * destination page in its final resting place (if it happens
    8.82 + * to allocate it).  The end product of this is that most of the
    8.83 + * physical address space, and most of RAM can be used.
    8.84 + *
    8.85 + * Future directions include:
    8.86 + *  - allocating a page table with the control code buffer identity
    8.87 + *    mapped, to simplify machine_kexec and make kexec_on_panic more
    8.88 + *    reliable.
    8.89 + */
    8.90 +
    8.91 +/*
    8.92 + * KIMAGE_NO_DEST is an impossible destination address..., for
    8.93 + * allocating pages whose destination address we do not care about.
    8.94 + */
    8.95 +#define KIMAGE_NO_DEST (-1UL)
    8.96 +
    8.97 +static int kimage_is_destination_range(struct kimage *image,
    8.98 +				       unsigned long start, unsigned long end);
    8.99 +static struct page *kimage_alloc_page(struct kimage *image,
   8.100 +				       gfp_t gfp_mask,
   8.101 +				       unsigned long dest);
   8.102 +
   8.103 +static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
   8.104 +	                    unsigned long nr_segments,
   8.105 +                            struct kexec_segment __user *segments)
   8.106 +{
   8.107 +	size_t segment_bytes;
   8.108 +	struct kimage *image;
   8.109 +	unsigned long i;
   8.110 +	int result;
   8.111 +
   8.112 +	/* Allocate a controlling structure */
   8.113 +	result = -ENOMEM;
   8.114 +	image = kmalloc(sizeof(*image), GFP_KERNEL);
   8.115 +	if (!image)
   8.116 +		goto out;
   8.117 +
   8.118 +	memset(image, 0, sizeof(*image));
   8.119 +	image->head = 0;
   8.120 +	image->entry = &image->head;
   8.121 +	image->last_entry = &image->head;
   8.122 +	image->control_page = ~0; /* By default this does not apply */
   8.123 +	image->start = entry;
   8.124 +	image->type = KEXEC_TYPE_DEFAULT;
   8.125 +
   8.126 +	/* Initialize the list of control pages */
   8.127 +	INIT_LIST_HEAD(&image->control_pages);
   8.128 +
   8.129 +	/* Initialize the list of destination pages */
   8.130 +	INIT_LIST_HEAD(&image->dest_pages);
   8.131 +
   8.132 +	/* Initialize the list of unuseable pages */
   8.133 +	INIT_LIST_HEAD(&image->unuseable_pages);
   8.134 +
   8.135 +	/* Read in the segments */
   8.136 +	image->nr_segments = nr_segments;
   8.137 +	segment_bytes = nr_segments * sizeof(*segments);
   8.138 +	result = copy_from_user(image->segment, segments, segment_bytes);
   8.139 +	if (result)
   8.140 +		goto out;
   8.141 +
   8.142 +	/*
   8.143 +	 * Verify we have good destination addresses.  The caller is
   8.144 +	 * responsible for making certain we don't attempt to load
   8.145 +	 * the new image into invalid or reserved areas of RAM.  This
   8.146 +	 * just verifies it is an address we can use.
   8.147 +	 *
   8.148 +	 * Since the kernel does everything in page size chunks ensure
    8.149 +	 * the destination addresses are page aligned.  Too many
    8.150 +	 * special cases crop up when we don't do this.  The most
   8.151 +	 * insidious is getting overlapping destination addresses
   8.152 +	 * simply because addresses are changed to page size
   8.153 +	 * granularity.
   8.154 +	 */
   8.155 +	result = -EADDRNOTAVAIL;
   8.156 +	for (i = 0; i < nr_segments; i++) {
   8.157 +		unsigned long mstart, mend;
   8.158 +
   8.159 +		mstart = image->segment[i].mem;
   8.160 +		mend   = mstart + image->segment[i].memsz;
   8.161 +		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
   8.162 +			goto out;
   8.163 +		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
   8.164 +			goto out;
   8.165 +	}
   8.166 +
   8.167 +	/* Verify our destination addresses do not overlap.
    8.168 +	 * If we allowed overlapping destination addresses
    8.169 +	 * through, very weird things can happen with no
    8.170 +	 * easy explanation as one segment stomps on another.
   8.171 +	 */
   8.172 +	result = -EINVAL;
   8.173 +	for (i = 0; i < nr_segments; i++) {
   8.174 +		unsigned long mstart, mend;
   8.175 +		unsigned long j;
   8.176 +
   8.177 +		mstart = image->segment[i].mem;
   8.178 +		mend   = mstart + image->segment[i].memsz;
   8.179 +		for (j = 0; j < i; j++) {
   8.180 +			unsigned long pstart, pend;
   8.181 +			pstart = image->segment[j].mem;
   8.182 +			pend   = pstart + image->segment[j].memsz;
   8.183 +			/* Do the segments overlap ? */
   8.184 +			if ((mend > pstart) && (mstart < pend))
   8.185 +				goto out;
   8.186 +		}
   8.187 +	}
   8.188 +
    8.189 +	/* Ensure our buffer sizes do not exceed
   8.190 +	 * our memory sizes.  This should always be the case,
   8.191 +	 * and it is easier to check up front than to be surprised
   8.192 +	 * later on.
   8.193 +	 */
   8.194 +	result = -EINVAL;
   8.195 +	for (i = 0; i < nr_segments; i++) {
   8.196 +		if (image->segment[i].bufsz > image->segment[i].memsz)
   8.197 +			goto out;
   8.198 +	}
   8.199 +
   8.200 +	result = 0;
   8.201 +out:
   8.202 +	if (result == 0)
   8.203 +		*rimage = image;
   8.204 +	else
   8.205 +		kfree(image);
   8.206 +
   8.207 +	return result;
   8.208 +
   8.209 +}
   8.210 +
   8.211 +static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
   8.212 +				unsigned long nr_segments,
   8.213 +				struct kexec_segment __user *segments)
   8.214 +{
   8.215 +	int result;
   8.216 +	struct kimage *image;
   8.217 +
   8.218 +	/* Allocate and initialize a controlling structure */
   8.219 +	image = NULL;
   8.220 +	result = do_kimage_alloc(&image, entry, nr_segments, segments);
   8.221 +	if (result)
   8.222 +		goto out;
   8.223 +
   8.224 +	*rimage = image;
   8.225 +
   8.226 +	/*
    8.227 +	 * Find a location for the control code buffer, and add it to
    8.228 +	 * the vector of segments so that its pages will also be
   8.229 +	 * counted as destination pages.
   8.230 +	 */
   8.231 +	result = -ENOMEM;
   8.232 +	image->control_code_page = kimage_alloc_control_pages(image,
   8.233 +					   get_order(KEXEC_CONTROL_CODE_SIZE));
   8.234 +	if (!image->control_code_page) {
   8.235 +		printk(KERN_ERR "Could not allocate control_code_buffer\n");
   8.236 +		goto out;
   8.237 +	}
   8.238 +
   8.239 +	result = 0;
   8.240 + out:
   8.241 +	if (result == 0)
   8.242 +		*rimage = image;
   8.243 +	else
   8.244 +		kfree(image);
   8.245 +
   8.246 +	return result;
   8.247 +}
   8.248 +
   8.249 +static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
   8.250 +				unsigned long nr_segments,
   8.251 +				struct kexec_segment __user *segments)
   8.252 +{
   8.253 +	int result;
   8.254 +	struct kimage *image;
   8.255 +	unsigned long i;
   8.256 +
   8.257 +	image = NULL;
   8.258 +	/* Verify we have a valid entry point */
   8.259 +	if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
   8.260 +		result = -EADDRNOTAVAIL;
   8.261 +		goto out;
   8.262 +	}
   8.263 +
   8.264 +	/* Allocate and initialize a controlling structure */
   8.265 +	result = do_kimage_alloc(&image, entry, nr_segments, segments);
   8.266 +	if (result)
   8.267 +		goto out;
   8.268 +
   8.269 +	/* Enable the special crash kernel control page
   8.270 +	 * allocation policy.
   8.271 +	 */
   8.272 +	image->control_page = crashk_res.start;
   8.273 +	image->type = KEXEC_TYPE_CRASH;
   8.274 +
   8.275 +	/*
   8.276 +	 * Verify we have good destination addresses.  Normally
   8.277 +	 * the caller is responsible for making certain we don't
   8.278 +	 * attempt to load the new image into invalid or reserved
   8.279 +	 * areas of RAM.  But crash kernels are preloaded into a
    8.280 +	 * reserved area of RAM.  We must ensure the addresses
   8.281 +	 * are in the reserved area otherwise preloading the
   8.282 +	 * kernel could corrupt things.
   8.283 +	 */
   8.284 +	result = -EADDRNOTAVAIL;
   8.285 +	for (i = 0; i < nr_segments; i++) {
   8.286 +		unsigned long mstart, mend;
   8.287 +
   8.288 +		mstart = image->segment[i].mem;
   8.289 +		mend = mstart + image->segment[i].memsz - 1;
   8.290 +		/* Ensure we are within the crash kernel limits */
   8.291 +		if ((mstart < crashk_res.start) || (mend > crashk_res.end))
   8.292 +			goto out;
   8.293 +	}
   8.294 +
   8.295 +	/*
    8.296 +	 * Find a location for the control code buffer, and add it to
    8.297 +	 * the vector of segments so that its pages will also be
   8.298 +	 * counted as destination pages.
   8.299 +	 */
   8.300 +	result = -ENOMEM;
   8.301 +	image->control_code_page = kimage_alloc_control_pages(image,
   8.302 +					   get_order(KEXEC_CONTROL_CODE_SIZE));
   8.303 +	if (!image->control_code_page) {
   8.304 +		printk(KERN_ERR "Could not allocate control_code_buffer\n");
   8.305 +		goto out;
   8.306 +	}
   8.307 +
   8.308 +	result = 0;
   8.309 +out:
   8.310 +	if (result == 0)
   8.311 +		*rimage = image;
   8.312 +	else
   8.313 +		kfree(image);
   8.314 +
   8.315 +	return result;
   8.316 +}
   8.317 +
   8.318 +static int kimage_is_destination_range(struct kimage *image,
   8.319 +					unsigned long start,
   8.320 +					unsigned long end)
   8.321 +{
   8.322 +	unsigned long i;
   8.323 +
   8.324 +	for (i = 0; i < image->nr_segments; i++) {
   8.325 +		unsigned long mstart, mend;
   8.326 +
   8.327 +		mstart = image->segment[i].mem;
   8.328 +		mend = mstart + image->segment[i].memsz;
   8.329 +		if ((end > mstart) && (start < mend))
   8.330 +			return 1;
   8.331 +	}
   8.332 +
   8.333 +	return 0;
   8.334 +}
   8.335 +
   8.336 +static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
   8.337 +{
   8.338 +	struct page *pages;
   8.339 +
   8.340 +	pages = alloc_pages(gfp_mask, order);
   8.341 +	if (pages) {
   8.342 +		unsigned int count, i;
   8.343 +		pages->mapping = NULL;
   8.344 +		set_page_private(pages, order);
   8.345 +		count = 1 << order;
   8.346 +		for (i = 0; i < count; i++)
   8.347 +			SetPageReserved(pages + i);
   8.348 +	}
   8.349 +
   8.350 +	return pages;
   8.351 +}
   8.352 +
   8.353 +static void kimage_free_pages(struct page *page)
   8.354 +{
   8.355 +	unsigned int order, count, i;
   8.356 +
   8.357 +	order = page_private(page);
   8.358 +	count = 1 << order;
   8.359 +	for (i = 0; i < count; i++)
   8.360 +		ClearPageReserved(page + i);
   8.361 +	__free_pages(page, order);
   8.362 +}
   8.363 +
   8.364 +static void kimage_free_page_list(struct list_head *list)
   8.365 +{
   8.366 +	struct list_head *pos, *next;
   8.367 +
   8.368 +	list_for_each_safe(pos, next, list) {
   8.369 +		struct page *page;
   8.370 +
   8.371 +		page = list_entry(pos, struct page, lru);
   8.372 +		list_del(&page->lru);
   8.373 +		kimage_free_pages(page);
   8.374 +	}
   8.375 +}
   8.376 +
   8.377 +static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
   8.378 +							unsigned int order)
   8.379 +{
   8.380 +	/* Control pages are special, they are the intermediaries
   8.381 +	 * that are needed while we copy the rest of the pages
   8.382 +	 * to their final resting place.  As such they must
   8.383 +	 * not conflict with either the destination addresses
   8.384 +	 * or memory the kernel is already using.
   8.385 +	 *
   8.386 +	 * The only case where we really need more than one of
    8.387 +	 * these is for architectures where we cannot disable
   8.388 +	 * the MMU and must instead generate an identity mapped
   8.389 +	 * page table for all of the memory.
   8.390 +	 *
   8.391 +	 * At worst this runs in O(N) of the image size.
   8.392 +	 */
   8.393 +	struct list_head extra_pages;
   8.394 +	struct page *pages;
   8.395 +	unsigned int count;
   8.396 +
   8.397 +	count = 1 << order;
   8.398 +	INIT_LIST_HEAD(&extra_pages);
   8.399 +
   8.400 +	/* Loop while I can allocate a page and the page allocated
   8.401 +	 * is a destination page.
   8.402 +	 */
   8.403 +	do {
   8.404 +		unsigned long pfn, epfn, addr, eaddr;
   8.405 +
   8.406 +		pages = kimage_alloc_pages(GFP_KERNEL, order);
   8.407 +		if (!pages)
   8.408 +			break;
   8.409 +		pfn   = page_to_pfn(pages);
   8.410 +		epfn  = pfn + count;
   8.411 +		addr  = pfn << PAGE_SHIFT;
   8.412 +		eaddr = epfn << PAGE_SHIFT;
   8.413 +		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
   8.414 +			      kimage_is_destination_range(image, addr, eaddr)) {
   8.415 +			list_add(&pages->lru, &extra_pages);
   8.416 +			pages = NULL;
   8.417 +		}
   8.418 +	} while (!pages);
   8.419 +
   8.420 +	if (pages) {
   8.421 +		/* Remember the allocated page... */
   8.422 +		list_add(&pages->lru, &image->control_pages);
   8.423 +
    8.424 +		/* Because the page is already in its destination
   8.425 +		 * location we will never allocate another page at
   8.426 +		 * that address.  Therefore kimage_alloc_pages
   8.427 +		 * will not return it (again) and we don't need
   8.428 +		 * to give it an entry in image->segment[].
   8.429 +		 */
   8.430 +	}
   8.431 +	/* Deal with the destination pages I have inadvertently allocated.
   8.432 +	 *
   8.433 +	 * Ideally I would convert multi-page allocations into single
    8.434 +	 * page allocations, and add everything to image->dest_pages.
   8.435 +	 *
   8.436 +	 * For now it is simpler to just free the pages.
   8.437 +	 */
   8.438 +	kimage_free_page_list(&extra_pages);
   8.439 +
   8.440 +	return pages;
   8.441 +}
   8.442 +
   8.443 +static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
   8.444 +						      unsigned int order)
   8.445 +{
   8.446 +	/* Control pages are special, they are the intermediaries
   8.447 +	 * that are needed while we copy the rest of the pages
   8.448 +	 * to their final resting place.  As such they must
   8.449 +	 * not conflict with either the destination addresses
   8.450 +	 * or memory the kernel is already using.
   8.451 +	 *
    8.452 +	 * Control pages are also the only pages we must allocate
   8.453 +	 * when loading a crash kernel.  All of the other pages
   8.454 +	 * are specified by the segments and we just memcpy
   8.455 +	 * into them directly.
   8.456 +	 *
   8.457 +	 * The only case where we really need more than one of
    8.458 +	 * these is for architectures where we cannot disable
   8.459 +	 * the MMU and must instead generate an identity mapped
   8.460 +	 * page table for all of the memory.
   8.461 +	 *
   8.462 +	 * Given the low demand this implements a very simple
   8.463 +	 * allocator that finds the first hole of the appropriate
   8.464 +	 * size in the reserved memory region, and allocates all
   8.465 +	 * of the memory up to and including the hole.
   8.466 +	 */
   8.467 +	unsigned long hole_start, hole_end, size;
   8.468 +	struct page *pages;
   8.469 +
   8.470 +	pages = NULL;
   8.471 +	size = (1 << order) << PAGE_SHIFT;
   8.472 +	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
   8.473 +	hole_end   = hole_start + size - 1;
   8.474 +	while (hole_end <= crashk_res.end) {
   8.475 +		unsigned long i;
   8.476 +
   8.477 +		if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
   8.478 +			break;
   8.479 +		if (hole_end > crashk_res.end)
   8.480 +			break;
   8.481 +		/* See if I overlap any of the segments */
   8.482 +		for (i = 0; i < image->nr_segments; i++) {
   8.483 +			unsigned long mstart, mend;
   8.484 +
   8.485 +			mstart = image->segment[i].mem;
   8.486 +			mend   = mstart + image->segment[i].memsz - 1;
   8.487 +			if ((hole_end >= mstart) && (hole_start <= mend)) {
   8.488 +				/* Advance the hole to the end of the segment */
   8.489 +				hole_start = (mend + (size - 1)) & ~(size - 1);
   8.490 +				hole_end   = hole_start + size - 1;
   8.491 +				break;
   8.492 +			}
   8.493 +		}
   8.494 +		/* If I don't overlap any segments I have found my hole! */
   8.495 +		if (i == image->nr_segments) {
   8.496 +			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
   8.497 +			break;
   8.498 +		}
   8.499 +	}
   8.500 +	if (pages)
   8.501 +		image->control_page = hole_end;
   8.502 +
   8.503 +	return pages;
   8.504 +}
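/*
 * Illustrative walk of the hole search above (addresses are made up):
 * with order 1 (size 0x2000), image->control_page at 0x100000 and a
 * single segment covering [0x100000, 0x104000), the first candidate
 * hole [0x100000, 0x102000) overlaps the segment, so hole_start is
 * rounded up past the segment to 0x104000.  That hole overlaps
 * nothing, the page for 0x104000 is returned, and image->control_page
 * becomes 0x105fff (the new hole_end).
 */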
   8.505 +
   8.506 +
   8.507 +struct page *kimage_alloc_control_pages(struct kimage *image,
   8.508 +					 unsigned int order)
   8.509 +{
   8.510 +	struct page *pages = NULL;
   8.511 +
   8.512 +	switch (image->type) {
   8.513 +	case KEXEC_TYPE_DEFAULT:
   8.514 +		pages = kimage_alloc_normal_control_pages(image, order);
   8.515 +		break;
   8.516 +	case KEXEC_TYPE_CRASH:
   8.517 +		pages = kimage_alloc_crash_control_pages(image, order);
   8.518 +		break;
   8.519 +	}
   8.520 +
   8.521 +	return pages;
   8.522 +}
   8.523 +
   8.524 +static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
   8.525 +{
   8.526 +	if (*image->entry != 0)
   8.527 +		image->entry++;
   8.528 +
   8.529 +	if (image->entry == image->last_entry) {
   8.530 +		kimage_entry_t *ind_page;
   8.531 +		struct page *page;
   8.532 +
   8.533 +		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
   8.534 +		if (!page)
   8.535 +			return -ENOMEM;
   8.536 +
   8.537 +		ind_page = page_address(page);
   8.538 +		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
   8.539 +		image->entry = ind_page;
   8.540 +		image->last_entry = ind_page +
   8.541 +				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
   8.542 +	}
   8.543 +	*image->entry = entry;
   8.544 +	image->entry++;
   8.545 +	*image->entry = 0;
   8.546 +
   8.547 +	return 0;
   8.548 +}
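/*
 * The list kimage_add_entry builds is a flat array of kimage_entry_t
 * values, roughly (illustrative):
 *
 *	dest | IND_DESTINATION,
 *	src0 | IND_SOURCE, src1 | IND_SOURCE, ...,
 *	next_page | IND_INDIRECTION,	(continues on another page)
 *	...,
 *	IND_DONE
 *
 * The last slot of each page is reserved for the IND_INDIRECTION link,
 * the running list is kept zero-terminated after every add, and
 * kimage_terminate() later overwrites the trailing zero with IND_DONE.
 */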
   8.549 +
   8.550 +static int kimage_set_destination(struct kimage *image,
   8.551 +				   unsigned long destination)
   8.552 +{
   8.553 +	int result;
   8.554 +
   8.555 +	destination &= PAGE_MASK;
   8.556 +	result = kimage_add_entry(image, destination | IND_DESTINATION);
   8.557 +	if (result == 0)
   8.558 +		image->destination = destination;
   8.559 +
   8.560 +	return result;
   8.561 +}
   8.562 +
   8.563 +
   8.564 +static int kimage_add_page(struct kimage *image, unsigned long page)
   8.565 +{
   8.566 +	int result;
   8.567 +
   8.568 +	page &= PAGE_MASK;
   8.569 +	result = kimage_add_entry(image, page | IND_SOURCE);
   8.570 +	if (result == 0)
   8.571 +		image->destination += PAGE_SIZE;
   8.572 +
   8.573 +	return result;
   8.574 +}
   8.575 +
   8.576 +
   8.577 +static void kimage_free_extra_pages(struct kimage *image)
   8.578 +{
   8.579 +	/* Walk through and free any extra destination pages I may have */
   8.580 +	kimage_free_page_list(&image->dest_pages);
   8.581 +
    8.582 +	/* Walk through and free any unusable pages I have cached */
   8.583 +	kimage_free_page_list(&image->unuseable_pages);
    8.584 +}
    8.585 +
   8.586 +static int kimage_terminate(struct kimage *image)
   8.587 +{
   8.588 +	if (*image->entry != 0)
   8.589 +		image->entry++;
   8.590 +
   8.591 +	*image->entry = IND_DONE;
   8.592 +
   8.593 +	return 0;
   8.594 +}
   8.595 +
   8.596 +#define for_each_kimage_entry(image, ptr, entry) \
   8.597 +	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
   8.598 +		ptr = (entry & IND_INDIRECTION)? \
   8.599 +			phys_to_virt((entry & PAGE_MASK)): ptr +1)
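/*
 * The iterator above follows IND_INDIRECTION links to the next page of
 * entries rather than stepping past them, and stops at IND_DONE, so a
 * single walk visits every destination and source entry exactly once.
 */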
   8.600 +
   8.601 +static void kimage_free_entry(kimage_entry_t entry)
   8.602 +{
   8.603 +	struct page *page;
   8.604 +
   8.605 +	page = pfn_to_page(entry >> PAGE_SHIFT);
   8.606 +	kimage_free_pages(page);
   8.607 +}
   8.608 +
   8.609 +static void kimage_free(struct kimage *image)
   8.610 +{
   8.611 +	kimage_entry_t *ptr, entry;
   8.612 +	kimage_entry_t ind = 0;
   8.613 +
   8.614 +	if (!image)
   8.615 +		return;
   8.616 +
   8.617 +	kimage_free_extra_pages(image);
   8.618 +	for_each_kimage_entry(image, ptr, entry) {
   8.619 +		if (entry & IND_INDIRECTION) {
   8.620 +			/* Free the previous indirection page */
   8.621 +			if (ind & IND_INDIRECTION)
   8.622 +				kimage_free_entry(ind);
   8.623 +			/* Save this indirection page until we are
   8.624 +			 * done with it.
   8.625 +			 */
   8.626 +			ind = entry;
   8.627 +		}
   8.628 +		else if (entry & IND_SOURCE)
   8.629 +			kimage_free_entry(entry);
   8.630 +	}
   8.631 +	/* Free the final indirection page */
   8.632 +	if (ind & IND_INDIRECTION)
   8.633 +		kimage_free_entry(ind);
   8.634 +
   8.635 +	/* Handle any machine specific cleanup */
   8.636 +	machine_kexec_cleanup(image);
   8.637 +
   8.638 +	/* Free the kexec control pages... */
   8.639 +	kimage_free_page_list(&image->control_pages);
   8.640 +	kfree(image);
   8.641 +}
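/*
 * kimage_free() defers freeing each indirection page until the walk
 * has moved on to the next one: the entries currently being iterated
 * live in that page, so freeing it immediately would pull the list out
 * from under the loop.
 */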
   8.642 +
   8.643 +static kimage_entry_t *kimage_dst_used(struct kimage *image,
   8.644 +					unsigned long page)
   8.645 +{
   8.646 +	kimage_entry_t *ptr, entry;
   8.647 +	unsigned long destination = 0;
   8.648 +
   8.649 +	for_each_kimage_entry(image, ptr, entry) {
   8.650 +		if (entry & IND_DESTINATION)
   8.651 +			destination = entry & PAGE_MASK;
   8.652 +		else if (entry & IND_SOURCE) {
   8.653 +			if (page == destination)
   8.654 +				return ptr;
   8.655 +			destination += PAGE_SIZE;
   8.656 +		}
   8.657 +	}
   8.658 +
   8.659 +	return NULL;
   8.660 +}
   8.661 +
   8.662 +static struct page *kimage_alloc_page(struct kimage *image,
   8.663 +					gfp_t gfp_mask,
   8.664 +					unsigned long destination)
   8.665 +{
   8.666 +	/*
   8.667 +	 * Here we implement safeguards to ensure that a source page
   8.668 +	 * is not copied to its destination page before the data on
   8.669 +	 * the destination page is no longer useful.
   8.670 +	 *
   8.671 +	 * To do this we maintain the invariant that a source page is
   8.672 +	 * either its own destination page, or it is not a
   8.673 +	 * destination page at all.
   8.674 +	 *
   8.675 +	 * That is slightly stronger than required, but the proof
    8.676 +	 * that no problems will occur is trivial, and the
    8.677 +	 * implementation is simple to verify.
   8.678 +	 *
   8.679 +	 * When allocating all pages normally this algorithm will run
   8.680 +	 * in O(N) time, but in the worst case it will run in O(N^2)
    8.681 +	 * time.  If the runtime is a problem, the data structures can
   8.682 +	 * be fixed.
   8.683 +	 */
   8.684 +	struct page *page;
   8.685 +	unsigned long addr;
   8.686 +
   8.687 +	/*
   8.688 +	 * Walk through the list of destination pages, and see if I
   8.689 +	 * have a match.
   8.690 +	 */
   8.691 +	list_for_each_entry(page, &image->dest_pages, lru) {
   8.692 +		addr = page_to_pfn(page) << PAGE_SHIFT;
   8.693 +		if (addr == destination) {
   8.694 +			list_del(&page->lru);
   8.695 +			return page;
   8.696 +		}
   8.697 +	}
   8.698 +	page = NULL;
   8.699 +	while (1) {
   8.700 +		kimage_entry_t *old;
   8.701 +
   8.702 +		/* Allocate a page, if we run out of memory give up */
   8.703 +		page = kimage_alloc_pages(gfp_mask, 0);
   8.704 +		if (!page)
   8.705 +			return NULL;
    8.706 +		/* If the page cannot be used, file it away */
   8.707 +		if (page_to_pfn(page) >
   8.708 +				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
   8.709 +			list_add(&page->lru, &image->unuseable_pages);
   8.710 +			continue;
   8.711 +		}
   8.712 +		addr = page_to_pfn(page) << PAGE_SHIFT;
   8.713 +
    8.714 +		/* If it is the destination page we want, use it */
   8.715 +		if (addr == destination)
   8.716 +			break;
   8.717 +
   8.718 +		/* If the page is not a destination page use it */
   8.719 +		if (!kimage_is_destination_range(image, addr,
   8.720 +						  addr + PAGE_SIZE))
   8.721 +			break;
   8.722 +
   8.723 +		/*
    8.724 +		 * I know that the page is someone's destination page.
    8.725 +		 * See if there is already a source page for this
    8.726 +		 * destination page, and if so swap the source pages.
   8.727 +		 */
   8.728 +		old = kimage_dst_used(image, addr);
   8.729 +		if (old) {
   8.730 +			/* If so move it */
   8.731 +			unsigned long old_addr;
   8.732 +			struct page *old_page;
   8.733 +
   8.734 +			old_addr = *old & PAGE_MASK;
   8.735 +			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
   8.736 +			copy_highpage(page, old_page);
   8.737 +			*old = addr | (*old & ~PAGE_MASK);
   8.738 +
   8.739 +			/* The old page I have found cannot be a
   8.740 +			 * destination page, so return it.
   8.741 +			 */
   8.742 +			addr = old_addr;
   8.743 +			page = old_page;
   8.744 +			break;
   8.745 +		}
   8.746 +		else {
    8.747 +			/* Place the page on the destination list; I
   8.748 +			 * will use it later.
   8.749 +			 */
   8.750 +			list_add(&page->lru, &image->dest_pages);
   8.751 +		}
   8.752 +	}
   8.753 +
   8.754 +	return page;
   8.755 +}
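/*
 * Worked example of the swap above (illustrative): we need a source
 * page for destination D, but the freshly allocated page sits at
 * address D2, which is already some other entry's destination and has
 * source page S.  S's contents are copied into the page at D2, that
 * entry is repointed at D2 (now its own destination, preserving the
 * invariant), and S -- which cannot itself be a destination page -- is
 * returned for use with D.
 */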
   8.756 +
   8.757 +static int kimage_load_normal_segment(struct kimage *image,
   8.758 +					 struct kexec_segment *segment)
   8.759 +{
   8.760 +	unsigned long maddr;
   8.761 +	unsigned long ubytes, mbytes;
   8.762 +	int result;
   8.763 +	unsigned char __user *buf;
   8.764 +
   8.765 +	result = 0;
   8.766 +	buf = segment->buf;
   8.767 +	ubytes = segment->bufsz;
   8.768 +	mbytes = segment->memsz;
   8.769 +	maddr = segment->mem;
   8.770 +
   8.771 +	result = kimage_set_destination(image, maddr);
   8.772 +	if (result < 0)
   8.773 +		goto out;
   8.774 +
   8.775 +	while (mbytes) {
   8.776 +		struct page *page;
   8.777 +		char *ptr;
   8.778 +		size_t uchunk, mchunk;
   8.779 +
   8.780 +		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
    8.781 +		if (!page) {
   8.782 +			result  = -ENOMEM;
   8.783 +			goto out;
   8.784 +		}
   8.785 +		result = kimage_add_page(image, page_to_pfn(page)
   8.786 +								<< PAGE_SHIFT);
   8.787 +		if (result < 0)
   8.788 +			goto out;
   8.789 +
   8.790 +		ptr = kmap(page);
   8.791 +		/* Start with a clear page */
   8.792 +		memset(ptr, 0, PAGE_SIZE);
   8.793 +		ptr += maddr & ~PAGE_MASK;
   8.794 +		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
   8.795 +		if (mchunk > mbytes)
   8.796 +			mchunk = mbytes;
   8.797 +
   8.798 +		uchunk = mchunk;
   8.799 +		if (uchunk > ubytes)
   8.800 +			uchunk = ubytes;
   8.801 +
   8.802 +		result = copy_from_user(ptr, buf, uchunk);
   8.803 +		kunmap(page);
   8.804 +		if (result) {
   8.805 +			result = (result < 0) ? result : -EIO;
   8.806 +			goto out;
   8.807 +		}
   8.808 +		ubytes -= uchunk;
   8.809 +		maddr  += mchunk;
   8.810 +		buf    += mchunk;
   8.811 +		mbytes -= mchunk;
   8.812 +	}
   8.813 +out:
   8.814 +	return result;
   8.815 +}
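/*
 * Because every destination page is cleared before user data is copied
 * in, any part of memsz beyond bufsz ends up zero-filled, so a segment
 * may ask for more memory than it supplies data for (a bss-like
 * region, for example).
 */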
   8.816 +
   8.817 +static int kimage_load_crash_segment(struct kimage *image,
   8.818 +					struct kexec_segment *segment)
   8.819 +{
    8.820 +	/* For crash dump kernels we simply copy the data from
    8.821 +	 * user space to its destination.
   8.822 +	 * We do things a page at a time for the sake of kmap.
   8.823 +	 */
   8.824 +	unsigned long maddr;
   8.825 +	unsigned long ubytes, mbytes;
   8.826 +	int result;
   8.827 +	unsigned char __user *buf;
   8.828 +
   8.829 +	result = 0;
   8.830 +	buf = segment->buf;
   8.831 +	ubytes = segment->bufsz;
   8.832 +	mbytes = segment->memsz;
   8.833 +	maddr = segment->mem;
   8.834 +	while (mbytes) {
   8.835 +		struct page *page;
   8.836 +		char *ptr;
   8.837 +		size_t uchunk, mchunk;
   8.838 +
   8.839 +		page = pfn_to_page(maddr >> PAGE_SHIFT);
    8.840 +		if (!page) {
   8.841 +			result  = -ENOMEM;
   8.842 +			goto out;
   8.843 +		}
   8.844 +		ptr = kmap(page);
   8.845 +		ptr += maddr & ~PAGE_MASK;
   8.846 +		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
   8.847 +		if (mchunk > mbytes)
   8.848 +			mchunk = mbytes;
   8.849 +
   8.850 +		uchunk = mchunk;
   8.851 +		if (uchunk > ubytes) {
   8.852 +			uchunk = ubytes;
   8.853 +			/* Zero the trailing part of the page */
   8.854 +			memset(ptr + uchunk, 0, mchunk - uchunk);
   8.855 +		}
   8.856 +		result = copy_from_user(ptr, buf, uchunk);
   8.857 +		kunmap(page);
   8.858 +		if (result) {
   8.859 +			result = (result < 0) ? result : -EIO;
   8.860 +			goto out;
   8.861 +		}
   8.862 +		ubytes -= uchunk;
   8.863 +		maddr  += mchunk;
   8.864 +		buf    += mchunk;
   8.865 +		mbytes -= mchunk;
   8.866 +	}
   8.867 +out:
   8.868 +	return result;
   8.869 +}
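/*
 * The crash variant writes straight into the reserved region: no page
 * allocation and no indirection entries.  Since the pages are never
 * cleared first, only the tail of each chunk beyond the user-supplied
 * data is explicitly zeroed.
 */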
   8.870 +
   8.871 +static int kimage_load_segment(struct kimage *image,
   8.872 +				struct kexec_segment *segment)
   8.873 +{
   8.874 +	int result = -ENOMEM;
   8.875 +
   8.876 +	switch (image->type) {
   8.877 +	case KEXEC_TYPE_DEFAULT:
   8.878 +		result = kimage_load_normal_segment(image, segment);
   8.879 +		break;
   8.880 +	case KEXEC_TYPE_CRASH:
   8.881 +		result = kimage_load_crash_segment(image, segment);
   8.882 +		break;
   8.883 +	}
   8.884 +
   8.885 +	return result;
   8.886 +}
   8.887 +
   8.888 +/*
   8.889 + * Exec Kernel system call: for obvious reasons only root may call it.
   8.890 + *
   8.891 + * This call breaks up into three pieces.
   8.892 + * - A generic part which loads the new kernel from the current
   8.893 + *   address space, and very carefully places the data in the
   8.894 + *   allocated pages.
   8.895 + *
   8.896 + * - A generic part that interacts with the kernel and tells all of
    8.897 + *   the devices to shut down, preventing ongoing DMAs and placing
   8.898 + *   the devices in a consistent state so a later kernel can
   8.899 + *   reinitialize them.
   8.900 + *
   8.901 + * - A machine specific part that includes the syscall number
    8.902 + *   and then copies the image to its final destination and
   8.903 + *   jumps into the image at entry.
   8.904 + *
    8.905 + * kexec does not sync or unmount filesystems, so if you need
    8.906 + * that to happen you need to do it yourself.
   8.907 + */
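/*
 * A minimal caller-side sketch (not a complete program; buffer and
 * address names are made up, the flag and field names are the ones
 * used below):
 *
 *	struct kexec_segment seg = {
 *		.buf   = kernel_buf,  .bufsz = kernel_len,
 *		.mem   = load_addr,   .memsz = load_len,
 *	};
 *	sys_kexec_load(entry, 1, &seg, KEXEC_ARCH_DEFAULT);
 *
 * Passing KEXEC_ON_CRASH in flags stages a crash kernel in the
 * reserved region instead of a normal reboot image.
 */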
   8.908 +struct kimage *kexec_image = NULL;
   8.909 +static struct kimage *kexec_crash_image = NULL;
   8.910 +/*
   8.911 + * A home grown binary mutex.
   8.912 + * Nothing can wait so this mutex is safe to use
   8.913 + * in interrupt context :)
   8.914 + */
   8.915 +static int kexec_lock = 0;
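/*
 * xchg() is what makes this work: the first caller to swap a 1 into
 * kexec_lock sees the old value 0 and owns the lock; every other
 * caller sees 1 and must back off (sys_kexec_load returns -EBUSY,
 * crash_kexec simply does nothing).  Unlocking is another xchg()
 * writing 0 back.
 */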
   8.916 +
   8.917 +asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
   8.918 +				struct kexec_segment __user *segments,
   8.919 +				unsigned long flags)
   8.920 +{
   8.921 +	struct kimage **dest_image, *image;
   8.922 +	int locked;
   8.923 +	int result;
   8.924 +
   8.925 +	/* We only trust the superuser with rebooting the system. */
   8.926 +	if (!capable(CAP_SYS_BOOT))
   8.927 +		return -EPERM;
   8.928 +
   8.929 +	/*
    8.930 +	 * Verify we have a legal set of flags.
   8.931 +	 * This leaves us room for future extensions.
   8.932 +	 */
   8.933 +	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
   8.934 +		return -EINVAL;
   8.935 +
   8.936 +	/* Verify we are on the appropriate architecture */
   8.937 +	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
   8.938 +		((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
   8.939 +		return -EINVAL;
   8.940 +
   8.941 +	/* Put an artificial cap on the number
   8.942 +	 * of segments passed to kexec_load.
   8.943 +	 */
   8.944 +	if (nr_segments > KEXEC_SEGMENT_MAX)
   8.945 +		return -EINVAL;
   8.946 +
   8.947 +	image = NULL;
   8.948 +	result = 0;
   8.949 +
   8.950 +	/* Because we write directly to the reserved memory
   8.951 +	 * region when loading crash kernels we need a mutex here to
    8.952 +	 * prevent multiple crash kernels from attempting to load
    8.953 +	 * simultaneously, and to prevent a crash kernel from loading
    8.954 +	 * over the top of an in-use crash kernel.
   8.955 +	 *
   8.956 +	 * KISS: always take the mutex.
   8.957 +	 */
   8.958 +	locked = xchg(&kexec_lock, 1);
   8.959 +	if (locked)
   8.960 +		return -EBUSY;
   8.961 +
   8.962 +	dest_image = &kexec_image;
   8.963 +	if (flags & KEXEC_ON_CRASH)
   8.964 +		dest_image = &kexec_crash_image;
   8.965 +	if (nr_segments > 0) {
   8.966 +		unsigned long i;
   8.967 +
   8.968 +		/* Loading another kernel to reboot into */
   8.969 +		if ((flags & KEXEC_ON_CRASH) == 0)
   8.970 +			result = kimage_normal_alloc(&image, entry,
   8.971 +							nr_segments, segments);
   8.972 +		/* Loading another kernel to switch to if this one crashes */
   8.973 +		else if (flags & KEXEC_ON_CRASH) {
   8.974 +			/* Free any current crash dump kernel before
   8.975 +			 * we corrupt it.
   8.976 +			 */
   8.977 +			kimage_free(xchg(&kexec_crash_image, NULL));
   8.978 +			result = kimage_crash_alloc(&image, entry,
   8.979 +						     nr_segments, segments);
   8.980 +		}
   8.981 +		if (result)
   8.982 +			goto out;
   8.983 +
   8.984 +		result = machine_kexec_prepare(image);
   8.985 +		if (result)
   8.986 +			goto out;
   8.987 +
   8.988 +		for (i = 0; i < nr_segments; i++) {
   8.989 +			result = kimage_load_segment(image, &image->segment[i]);
   8.990 +			if (result)
   8.991 +				goto out;
   8.992 +		}
   8.993 +		result = kimage_terminate(image);
   8.994 +		if (result)
   8.995 +			goto out;
   8.996 +	}
    8.997 +	/* Install the new kernel and uninstall the old */
   8.998 +	image = xchg(dest_image, image);
   8.999 +
  8.1000 +out:
  8.1001 +	xchg(&kexec_lock, 0); /* Release the mutex */
  8.1002 +	kimage_free(image);
  8.1003 +
  8.1004 +	return result;
  8.1005 +}
  8.1006 +
  8.1007 +#ifdef CONFIG_COMPAT
  8.1008 +asmlinkage long compat_sys_kexec_load(unsigned long entry,
  8.1009 +				unsigned long nr_segments,
  8.1010 +				struct compat_kexec_segment __user *segments,
  8.1011 +				unsigned long flags)
  8.1012 +{
  8.1013 +	struct compat_kexec_segment in;
  8.1014 +	struct kexec_segment out, __user *ksegments;
  8.1015 +	unsigned long i, result;
  8.1016 +
  8.1017 +	/* Don't allow clients that don't understand the native
  8.1018 +	 * architecture to do anything.
  8.1019 +	 */
  8.1020 +	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
  8.1021 +		return -EINVAL;
  8.1022 +
  8.1023 +	if (nr_segments > KEXEC_SEGMENT_MAX)
  8.1024 +		return -EINVAL;
  8.1025 +
  8.1026 +	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
  8.1027 +	for (i=0; i < nr_segments; i++) {
  8.1028 +		result = copy_from_user(&in, &segments[i], sizeof(in));
  8.1029 +		if (result)
  8.1030 +			return -EFAULT;
  8.1031 +
  8.1032 +		out.buf   = compat_ptr(in.buf);
  8.1033 +		out.bufsz = in.bufsz;
  8.1034 +		out.mem   = in.mem;
  8.1035 +		out.memsz = in.memsz;
  8.1036 +
  8.1037 +		result = copy_to_user(&ksegments[i], &out, sizeof(out));
  8.1038 +		if (result)
  8.1039 +			return -EFAULT;
  8.1040 +	}
  8.1041 +
  8.1042 +	return sys_kexec_load(entry, nr_segments, ksegments, flags);
  8.1043 +}
  8.1044 +#endif
  8.1045 +
  8.1046 +void crash_kexec(struct pt_regs *regs)
  8.1047 +{
  8.1048 +	struct kimage *image;
  8.1049 +	int locked;
   8.1050 +
  8.1052 +	/* Take the kexec_lock here to prevent sys_kexec_load
  8.1053 +	 * running on one cpu from replacing the crash kernel
  8.1054 +	 * we are using after a panic on a different cpu.
  8.1055 +	 *
  8.1056 +	 * If the crash kernel was not located in a fixed area
  8.1057 +	 * of memory the xchg(&kexec_crash_image) would be
  8.1058 +	 * sufficient.  But since I reuse the memory...
  8.1059 +	 */
  8.1060 +	locked = xchg(&kexec_lock, 1);
  8.1061 +	if (!locked) {
  8.1062 +		image = xchg(&kexec_crash_image, NULL);
  8.1063 +		if (image) {
  8.1064 +			struct pt_regs fixed_regs;
  8.1065 +			crash_setup_regs(&fixed_regs, regs);
  8.1066 +			machine_crash_shutdown(&fixed_regs);
  8.1067 +			machine_kexec(image);
  8.1068 +		}
  8.1069 +		xchg(&kexec_lock, 0);
  8.1070 +	}
  8.1071 +}
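/*
 * Note the trylock behaviour: if kexec_lock is already held,
 * crash_kexec() returns without switching kernels.  Blocking is not an
 * option on this path, which runs during a panic and possibly in
 * interrupt context.
 */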
  8.1072 +
  8.1073 +static int __init crash_notes_memory_init(void)
  8.1074 +{
  8.1075 +	/* Allocate memory for saving cpu registers. */
  8.1076 +	crash_notes = alloc_percpu(note_buf_t);
  8.1077 +	if (!crash_notes) {
  8.1078 +		printk("Kexec: Memory allocation for saving cpu register"
  8.1079 +		" states failed\n");
  8.1080 +		return -ENOMEM;
  8.1081 +	}
  8.1082 +	return 0;
  8.1083 +}
  8.1084 +module_init(crash_notes_memory_init)