ia64/xen-unstable
changeset 6055:c4512592a1dc
Attached is a patch to x86_64 xenlinux. It also includes cleanups. We
are also working on SMP + writable pagetable support now.
Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
author | kaf24@firebug.cl.cam.ac.uk |
---|---|
date | Mon Aug 08 08:18:38 2005 +0000 (2005-08-08) |
parents | 69bf77e1b102 |
children | 41ceeb6828b5 |
files | linux-2.6-xen-sparse/arch/xen/x86_64/kernel/head.S linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup64.c linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/processor.h |
line diff
1.1 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/head.S Mon Aug 08 08:18:06 2005 +0000 1.2 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/head.S Mon Aug 08 08:18:38 2005 +0000 1.3 @@ -28,8 +28,6 @@ 1.4 #include <asm/page.h> 1.5 #include <asm/msr.h> 1.6 #include <asm/cache.h> 1.7 -/* #include <asm/thread_info.h> */ 1.8 - 1.9 1.10 /* we are not able to switch in one step to the final KERNEL ADRESS SPACE 1.11 * because we need identity-mapped pages on setup so define __START_KERNEL to 1.12 @@ -116,15 +114,81 @@ ENTRY(init_level4_pgt) 1.13 ENTRY(init_level4_user_pgt) 1.14 .fill 512,8,0 1.15 1.16 + /* 1.17 + * In Xen the following pre-initialized pgt entries are re-initialized. 1.18 + */ 1.19 +.org 0x3000 1.20 +ENTRY(level3_kernel_pgt) 1.21 + .fill 510,8,0 1.22 + /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ 1.23 + .quad 0x0000000000105007 /* -> level2_kernel_pgt */ 1.24 + .fill 1,8,0 1.25 + 1.26 +.org 0x4000 1.27 +ENTRY(level2_ident_pgt) 1.28 + /* 40MB for bootup. */ 1.29 + .quad 0x0000000000000283 1.30 + .quad 0x0000000000200183 1.31 + .quad 0x0000000000400183 1.32 + .quad 0x0000000000600183 1.33 + .quad 0x0000000000800183 1.34 + .quad 0x0000000000A00183 1.35 + .quad 0x0000000000C00183 1.36 + .quad 0x0000000000E00183 1.37 + .quad 0x0000000001000183 1.38 + .quad 0x0000000001200183 1.39 + .quad 0x0000000001400183 1.40 + .quad 0x0000000001600183 1.41 + .quad 0x0000000001800183 1.42 + .quad 0x0000000001A00183 1.43 + .quad 0x0000000001C00183 1.44 + .quad 0x0000000001E00183 1.45 + .quad 0x0000000002000183 1.46 + .quad 0x0000000002200183 1.47 + .quad 0x0000000002400183 1.48 + .quad 0x0000000002600183 1.49 + /* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */ 1.50 + .globl temp_boot_pmds 1.51 +temp_boot_pmds: 1.52 + .fill 492,8,0 1.53 + 1.54 +.org 0x5000 1.55 +ENTRY(level2_kernel_pgt) 1.56 + /* 40MB kernel mapping. The kernel code cannot be bigger than that. 1.57 + When you change this change KERNEL_TEXT_SIZE in page.h too. 
*/ 1.58 + /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */ 1.59 + .quad 0x0000000000000183 1.60 + .quad 0x0000000000200183 1.61 + .quad 0x0000000000400183 1.62 + .quad 0x0000000000600183 1.63 + .quad 0x0000000000800183 1.64 + .quad 0x0000000000A00183 1.65 + .quad 0x0000000000C00183 1.66 + .quad 0x0000000000E00183 1.67 + .quad 0x0000000001000183 1.68 + .quad 0x0000000001200183 1.69 + .quad 0x0000000001400183 1.70 + .quad 0x0000000001600183 1.71 + .quad 0x0000000001800183 1.72 + .quad 0x0000000001A00183 1.73 + .quad 0x0000000001C00183 1.74 + .quad 0x0000000001E00183 1.75 + .quad 0x0000000002000183 1.76 + .quad 0x0000000002200183 1.77 + .quad 0x0000000002400183 1.78 + .quad 0x0000000002600183 1.79 + /* Module mapping starts here */ 1.80 + .fill 492,8,0 1.81 + 1.82 /* 1.83 * This is used for vsyscall area mapping as we have a different 1.84 * level4 page table for user. 1.85 */ 1.86 -.org 0x3000 1.87 +.org 0x6000 1.88 ENTRY(level3_user_pgt) 1.89 .fill 512,8,0 1.90 1.91 -.org 0x4000 1.92 +.org 0x7000 1.93 ENTRY(cpu_gdt_table) 1.94 /* The TLS descriptors are currently at a different place compared to i386. 1.95 Hopefully nobody expects them at a fixed place (Wine?) */ 1.96 @@ -147,19 +211,24 @@ gdt_end: 1.97 /* GDTs of other CPUs: */ 1.98 .fill (GDT_SIZE * NR_CPUS) - (gdt_end - cpu_gdt_table) 1.99 1.100 -.org 0x5000 1.101 +.org 0x8000 1.102 ENTRY(empty_zero_page) 1.103 1.104 -.org 0x6000 1.105 +.org 0x9000 1.106 ENTRY(empty_bad_page) 1.107 1.108 -.org 0x7000 1.109 +.org 0xa000 1.110 ENTRY(empty_bad_pte_table) 1.111 1.112 -.org 0x8000 1.113 +.org 0xb000 1.114 ENTRY(empty_bad_pmd_table) 1.115 1.116 - .org 0x9000 1.117 +.org 0xc000 1.118 +ENTRY(level3_physmem_pgt) 1.119 + .quad 0x0000000000105007 /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */ 1.120 + 1.121 + 1.122 + .org 0xd000 1.123 #ifdef CONFIG_ACPI_SLEEP 1.124 ENTRY(wakeup_level4_pgt) 1.125 .quad 0x0000000000102007 /* -> level3_ident_pgt */
2.1 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c Mon Aug 08 08:18:06 2005 +0000 2.2 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c Mon Aug 08 08:18:38 2005 +0000 2.3 @@ -623,7 +623,9 @@ void __init setup_arch(char **cmdline_p) 2.4 rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); 2.5 rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); 2.6 #endif 2.7 -/* register_console(&xen_console); */ 2.8 + 2.9 + HYPERVISOR_vm_assist(VMASST_CMD_enable, 2.10 + VMASST_TYPE_writable_pagetables); 2.11 2.12 #ifdef CONFIG_XEN_PHYSDEV_ACCESS 2.13 /* This is drawn from a dump from vgacon:startup in standard Linux. */
3.1 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup64.c Mon Aug 08 08:18:06 2005 +0000 3.2 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup64.c Mon Aug 08 08:18:38 2005 +0000 3.3 @@ -30,9 +30,9 @@ 3.4 #include <asm/proto.h> 3.5 #include <asm/mman.h> 3.6 #include <asm/numa.h> 3.7 - 3.8 +#ifdef CONFIG_XEN 3.9 #include <asm-xen/hypervisor.h> 3.10 - 3.11 +#endif 3.12 char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; 3.13 3.14 cpumask_t cpu_initialized __initdata = CPU_MASK_NONE; 3.15 @@ -123,82 +123,11 @@ void __init setup_per_cpu_areas(void) 3.16 } 3.17 } 3.18 3.19 -void pda_init(int cpu) 3.20 -{ 3.21 - pgd_t *old_level4 = (pgd_t *)xen_start_info.pt_base; 3.22 - struct x8664_pda *pda = &cpu_pda[cpu]; 3.23 - 3.24 - /* Setup up data that may be needed in __get_free_pages early */ 3.25 - asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); 3.26 - HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, 3.27 - (unsigned long)(cpu_pda + cpu)); 3.28 - 3.29 - pda->me = pda; 3.30 - pda->cpunumber = cpu; 3.31 - pda->irqcount = -1; 3.32 - pda->kernelstack = 3.33 - (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; 3.34 - pda->active_mm = &init_mm; 3.35 - pda->mmu_state = 0; 3.36 - pda->kernel_mode = 1; 3.37 - 3.38 - if (cpu == 0) { 3.39 - memcpy((void *)init_level4_pgt, 3.40 - (void *) xen_start_info.pt_base, PAGE_SIZE); 3.41 - /* others are initialized in smpboot.c */ 3.42 - pda->pcurrent = &init_task; 3.43 - pda->irqstackptr = boot_cpu_stack; 3.44 - make_page_readonly(init_level4_pgt); 3.45 - make_page_readonly(init_level4_user_pgt); 3.46 - make_page_readonly(level3_user_pgt); /* for vsyscall stuff */ 3.47 - xen_pgd_pin(__pa_symbol(init_level4_user_pgt)); 3.48 - xen_pud_pin(__pa_symbol(level3_user_pgt)); 3.49 - set_pgd((pgd_t *)(init_level4_user_pgt + 511), 3.50 - mk_kernel_pgd(__pa_symbol(level3_user_pgt))); 3.51 - } else { 3.52 - pda->irqstackptr = (char *) 3.53 - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); 3.54 - if (!pda->irqstackptr) 
3.55 - panic("cannot allocate irqstack for cpu %d", cpu); 3.56 - } 3.57 - 3.58 +#ifdef CONFIG_XEN 3.59 +static void switch_pt(void) 3.60 +{ 3.61 xen_pt_switch(__pa(init_level4_pgt)); 3.62 xen_new_user_pt(__pa(init_level4_user_pgt)); 3.63 - 3.64 - if (cpu == 0) { 3.65 - xen_pgd_unpin(__pa(old_level4)); 3.66 -#if 0 3.67 - early_printk("__pa: %x, <machine_phys> old_level 4 %x\n", 3.68 - __pa(xen_start_info.pt_base), 3.69 - pfn_to_mfn(__pa(old_level4) >> PAGE_SHIFT)); 3.70 -#endif 3.71 -// make_page_writable(old_level4); 3.72 -// free_bootmem(__pa(old_level4), PAGE_SIZE); 3.73 - } 3.74 - 3.75 - pda->irqstackptr += IRQSTACKSIZE-64; 3.76 -} 3.77 - 3.78 -char boot_exception_stacks[N_EXCEPTION_STACKS * EXCEPTION_STKSZ] 3.79 -__attribute__((section(".bss.page_aligned"))); 3.80 - 3.81 -/* May not be marked __init: used by software suspend */ 3.82 -void syscall_init(void) 3.83 -{ 3.84 -#ifdef CONFIG_IA32_EMULATION 3.85 - syscall32_cpu_init (); 3.86 -#endif 3.87 -} 3.88 - 3.89 -void __init check_efer(void) 3.90 -{ 3.91 - unsigned long efer; 3.92 - 3.93 - rdmsrl(MSR_EFER, efer); 3.94 - if (!(efer & EFER_NX) || do_not_nx) { 3.95 - __supported_pte_mask &= ~_PAGE_NX; 3.96 - 3.97 - } 3.98 } 3.99 3.100 void __init cpu_gdt_init(struct desc_ptr *gdt_descr) 3.101 @@ -217,7 +146,96 @@ void __init cpu_gdt_init(struct desc_ptr 3.102 sizeof (struct desc_struct))) 3.103 BUG(); 3.104 } 3.105 +#else 3.106 +static void switch_pt(void) 3.107 +{ 3.108 + asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); 3.109 +} 3.110 3.111 +void __init cpu_gdt_init(struct desc_ptr *gdt_descr) 3.112 +{ 3.113 +#ifdef CONFIG_SMP 3.114 + int cpu = stack_smp_processor_id(); 3.115 +#else 3.116 + int cpu = smp_processor_id(); 3.117 +#endif 3.118 + 3.119 + asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu])); 3.120 + asm volatile("lidt %0" :: "m" (idt_descr)); 3.121 +} 3.122 +#endif 3.123 + 3.124 + 3.125 +void pda_init(int cpu) 3.126 +{ 3.127 + struct x8664_pda *pda = &cpu_pda[cpu]; 3.128 + 3.129 + /* 
Setup up data that may be needed in __get_free_pages early */ 3.130 + asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); 3.131 +#ifndef CONFIG_XEN 3.132 + wrmsrl(MSR_GS_BASE, cpu_pda + cpu); 3.133 +#else 3.134 + HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, 3.135 + (unsigned long)(cpu_pda + cpu)); 3.136 +#endif 3.137 + pda->me = pda; 3.138 + pda->cpunumber = cpu; 3.139 + pda->irqcount = -1; 3.140 + pda->kernelstack = 3.141 + (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; 3.142 + pda->active_mm = &init_mm; 3.143 + pda->mmu_state = 0; 3.144 + 3.145 + if (cpu == 0) { 3.146 +#ifdef CONFIG_XEN 3.147 + xen_init_pt(); 3.148 +#endif 3.149 + /* others are initialized in smpboot.c */ 3.150 + pda->pcurrent = &init_task; 3.151 + pda->irqstackptr = boot_cpu_stack; 3.152 + } else { 3.153 + pda->irqstackptr = (char *) 3.154 + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); 3.155 + if (!pda->irqstackptr) 3.156 + panic("cannot allocate irqstack for cpu %d", cpu); 3.157 + } 3.158 + 3.159 + switch_pt(); 3.160 + pda->irqstackptr += IRQSTACKSIZE-64; 3.161 +} 3.162 + 3.163 +char boot_exception_stacks[N_EXCEPTION_STACKS * EXCEPTION_STKSZ] 3.164 +__attribute__((section(".bss.page_aligned"))); 3.165 + 3.166 +/* May not be marked __init: used by software suspend */ 3.167 +void syscall_init(void) 3.168 +{ 3.169 +#ifndef CONFIG_XEN 3.170 + /* 3.171 + * LSTAR and STAR live in a bit strange symbiosis. 3.172 + * They both write to the same internal register. STAR allows to set CS/DS 3.173 + * but only a 32bit target. LSTAR sets the 64bit rip. 
3.174 + */ 3.175 + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); 3.176 + wrmsrl(MSR_LSTAR, system_call); 3.177 + 3.178 + /* Flags to clear on syscall */ 3.179 + wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); 3.180 +#endif 3.181 +#ifdef CONFIG_IA32_EMULATION 3.182 + syscall32_cpu_init (); 3.183 +#endif 3.184 +} 3.185 + 3.186 +void __init check_efer(void) 3.187 +{ 3.188 + unsigned long efer; 3.189 + 3.190 + rdmsrl(MSR_EFER, efer); 3.191 + if (!(efer & EFER_NX) || do_not_nx) { 3.192 + __supported_pte_mask &= ~_PAGE_NX; 3.193 + } 3.194 +} 3.195 3.196 /* 3.197 * cpu_init() initializes state that is per-CPU. Some data is already 3.198 @@ -247,14 +265,13 @@ void __init cpu_init (void) 3.199 3.200 me = current; 3.201 3.202 - if (test_and_set_bit(cpu, &cpu_initialized)) 3.203 + if (cpu_test_and_set(cpu, cpu_initialized)) 3.204 panic("CPU#%d already initialized!\n", cpu); 3.205 3.206 printk("Initializing CPU#%d\n", cpu); 3.207 3.208 -#if 0 3.209 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); 3.210 -#endif 3.211 + 3.212 /* 3.213 * Initialize the per-CPU GDT with the boot GDT, 3.214 * and set up the GDT descriptor: 3.215 @@ -265,18 +282,16 @@ void __init cpu_init (void) 3.216 3.217 cpu_gdt_descr[cpu].size = GDT_SIZE; 3.218 cpu_gdt_descr[cpu].address = (unsigned long)cpu_gdt_table[cpu]; 3.219 -#if 0 3.220 - asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu])); 3.221 - asm volatile("lidt %0" :: "m" (idt_descr)); 3.222 -#endif 3.223 + 3.224 cpu_gdt_init(&cpu_gdt_descr[cpu]); 3.225 3.226 -#if 0 3.227 +#ifndef CONFIG_XEN 3.228 memcpy(me->thread.tls_array, cpu_gdt_table[cpu], GDT_ENTRY_TLS_ENTRIES * 8); 3.229 3.230 -#endif 3.231 +#else 3.232 memcpy(me->thread.tls_array, &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN], 3.233 GDT_ENTRY_TLS_ENTRIES * 8); 3.234 +#endif 3.235 3.236 /* 3.237 * Delete NT 3.238 @@ -284,12 +299,12 @@ void __init cpu_init (void) 3.239 3.240 asm volatile("pushfq ; popq %%rax ; btr $14,%%rax ; pushq %%rax ; popfq" ::: "eax"); 
3.241 3.242 - if (cpu == 0) 3.243 - early_identify_cpu(&boot_cpu_data); 3.244 - 3.245 syscall_init(); 3.246 3.247 + wrmsrl(MSR_FS_BASE, 0); 3.248 + wrmsrl(MSR_KERNEL_GS_BASE, 0); 3.249 barrier(); 3.250 + 3.251 check_efer(); 3.252 3.253 /* 3.254 @@ -321,19 +336,22 @@ void __init cpu_init (void) 3.255 BUG(); 3.256 enter_lazy_tlb(&init_mm, me); 3.257 3.258 +#ifndef CONFIG_XEN 3.259 + set_tss_desc(cpu, t); 3.260 + load_TR_desc(); 3.261 +#endif 3.262 load_LDT(&init_mm.context); 3.263 3.264 /* 3.265 * Clear all 6 debug registers: 3.266 */ 3.267 -#define CD(register) HYPERVISOR_set_debugreg(register, 0) 3.268 3.269 - CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7); 3.270 + set_debug(0UL, 0); 3.271 + set_debug(0UL, 1); 3.272 + set_debug(0UL, 2); 3.273 + set_debug(0UL, 3); 3.274 + set_debug(0UL, 6); 3.275 + set_debug(0UL, 7); 3.276 3.277 -#undef CD 3.278 fpu_init(); 3.279 - 3.280 -#ifdef CONFIG_NUMA 3.281 - numa_add_cpu(cpu); 3.282 -#endif 3.283 }
4.1 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c Mon Aug 08 08:18:06 2005 +0000 4.2 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c Mon Aug 08 08:18:38 2005 +0000 4.3 @@ -280,7 +280,7 @@ static void set_pte_phys(unsigned long v 4.4 if (!pte_none(*pte) && 4.5 pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) 4.6 pte_ERROR(*pte); 4.7 - xen_l1_entry_update(pte, new_pte); 4.8 + set_pte(pte, new_pte); 4.9 4.10 /* 4.11 * It's enough to flush this one mapping. 4.12 @@ -511,6 +511,78 @@ static void __init find_early_table_spac 4.13 round_up(ptes * 8, PAGE_SIZE); 4.14 } 4.15 4.16 +static void xen_copy_pt(void) 4.17 +{ 4.18 + unsigned long va = __START_KERNEL_map; 4.19 + unsigned long addr, *pte_page; 4.20 + int i; 4.21 + pud_t *pud; pmd_t *pmd; pte_t *pte; 4.22 + unsigned long *page = (unsigned long *) init_level4_pgt; 4.23 + 4.24 + addr = (unsigned long) page[pgd_index(va)]; 4.25 + addr_to_page(addr, page); 4.26 + 4.27 + pud = (pud_t *) &page[pud_index(va)]; 4.28 + addr = page[pud_index(va)]; 4.29 + addr_to_page(addr, page); 4.30 + 4.31 + level3_kernel_pgt[pud_index(va)] = 4.32 + __pud(__pa_symbol(level2_kernel_pgt) | _KERNPG_TABLE | _PAGE_USER); 4.33 + 4.34 + for (;;) { 4.35 + pmd = (pmd_t *) &page[pmd_index(va)]; 4.36 + if (pmd_present(*pmd)) { 4.37 + level2_kernel_pgt[pmd_index(va)] = *pmd; 4.38 + /* 4.39 + * if pmd is valid, check pte. 
4.40 + */ 4.41 + addr = page[pmd_index(va)]; 4.42 + addr_to_page(addr, pte_page); 4.43 + 4.44 + for (i = 0; i < PTRS_PER_PTE; i++) { 4.45 + pte = (pte_t *) &pte_page[pte_index(va)]; 4.46 + if (pte_present(*pte)) 4.47 + va += PAGE_SIZE; 4.48 + else 4.49 + break; 4.50 + } 4.51 + 4.52 + } else 4.53 + break; 4.54 + } 4.55 + 4.56 + init_level4_pgt[pgd_index(__START_KERNEL_map)] = 4.57 + mk_kernel_pgd(__pa_symbol(level3_kernel_pgt)); 4.58 +} 4.59 + 4.60 +void __init xen_init_pt(void) 4.61 +{ 4.62 + pgd_t *old_level4 = (pgd_t *)xen_start_info.pt_base; 4.63 + 4.64 + memcpy((void *)init_level4_pgt, 4.65 + (void *)xen_start_info.pt_base, PAGE_SIZE); 4.66 + 4.67 + memset((void *)level3_kernel_pgt, 0, PAGE_SIZE); 4.68 + memset((void *)level2_kernel_pgt, 0, PAGE_SIZE); 4.69 + 4.70 + xen_copy_pt(); 4.71 + 4.72 + make_page_readonly(init_level4_pgt); 4.73 + make_page_readonly(level3_kernel_pgt); 4.74 + make_page_readonly(level2_kernel_pgt); 4.75 + make_page_readonly(init_level4_user_pgt); 4.76 + make_page_readonly(level3_user_pgt); /* for vsyscall stuff */ 4.77 + 4.78 + xen_pgd_pin(__pa_symbol(init_level4_pgt)); 4.79 + xen_pgd_pin(__pa_symbol(init_level4_user_pgt)); 4.80 + xen_pud_pin(__pa_symbol(level3_kernel_pgt)); 4.81 + xen_pud_pin(__pa_symbol(level3_user_pgt)); 4.82 + xen_pmd_pin(__pa_symbol(level2_kernel_pgt)); 4.83 + 4.84 + set_pgd((pgd_t *)(init_level4_user_pgt + 511), 4.85 + mk_kernel_pgd(__pa_symbol(level3_user_pgt))); 4.86 + 4.87 +} 4.88 4.89 /* 4.90 * Extend kernel mapping to access pages for page tables. The initial
5.1 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h Mon Aug 08 08:18:06 2005 +0000 5.2 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h Mon Aug 08 08:18:38 2005 +0000 5.3 @@ -4,31 +4,19 @@ 5.4 /* 5.5 * This file contains the functions and defines necessary to modify and use 5.6 * the x86-64 page table tree. 5.7 - * 5.8 - * x86-64 has a 4 level table setup. Generic linux MM only supports 5.9 - * three levels. The fourth level is currently a single static page that 5.10 - * is shared by everybody and just contains a pointer to the current 5.11 - * three level page setup on the beginning and some kernel mappings at 5.12 - * the end. For more details see Documentation/x86_64/mm.txt 5.13 */ 5.14 #include <asm/processor.h> 5.15 #include <asm/fixmap.h> 5.16 #include <asm/bitops.h> 5.17 #include <linux/threads.h> 5.18 #include <asm/pda.h> 5.19 +#ifdef CONFIG_XEN 5.20 #include <asm-xen/hypervisor.h> 5.21 -extern pud_t level3_user_pgt[512]; 5.22 -extern pud_t init_level4_pgt[]; 5.23 -extern pud_t init_level4_user_pgt[]; 5.24 -extern unsigned long __supported_pte_mask; 5.25 5.26 -#define swapper_pg_dir NULL 5.27 +extern pud_t level3_user_pgt[512]; 5.28 +extern pud_t init_level4_user_pgt[]; 5.29 5.30 -extern int nonx_setup(char *str); 5.31 -extern void paging_init(void); 5.32 -extern void clear_kernel_mapping(unsigned long addr, unsigned long size); 5.33 - 5.34 -extern unsigned long pgkern_mask; 5.35 +extern void xen_init_pt(void); 5.36 5.37 #define virt_to_ptep(__va) \ 5.38 ({ \ 5.39 @@ -44,6 +32,22 @@ extern unsigned long pgkern_mask; 5.40 unsigned long __pa = (*(unsigned long *)__pte) & PAGE_MASK; \ 5.41 __pa | ((unsigned long)(__va) & (PAGE_SIZE-1)); \ 5.42 }) 5.43 +#endif 5.44 + 5.45 +extern pud_t level3_kernel_pgt[512]; 5.46 +extern pud_t level3_physmem_pgt[512]; 5.47 +extern pud_t level3_ident_pgt[512]; 5.48 +extern pmd_t level2_kernel_pgt[512]; 5.49 +extern pgd_t init_level4_pgt[]; 5.50 +extern unsigned long __supported_pte_mask; 5.51 + 
5.52 +#define swapper_pg_dir init_level4_pgt 5.53 + 5.54 +extern int nonx_setup(char *str); 5.55 +extern void paging_init(void); 5.56 +extern void clear_kernel_mapping(unsigned long addr, unsigned long size); 5.57 + 5.58 +extern unsigned long pgkern_mask; 5.59 5.60 /* 5.61 * ZERO_PAGE is a global shared page that is always zero: used 5.62 @@ -52,11 +56,14 @@ extern unsigned long pgkern_mask; 5.63 extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)]; 5.64 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) 5.65 5.66 +/* 5.67 + * PGDIR_SHIFT determines what a top-level page table entry can map 5.68 + */ 5.69 #define PGDIR_SHIFT 39 5.70 #define PTRS_PER_PGD 512 5.71 5.72 /* 5.73 - * PUDIR_SHIFT determines what a top-level page table entry can map 5.74 + * 3rd level page 5.75 */ 5.76 #define PUD_SHIFT 30 5.77 #define PTRS_PER_PUD 512 5.78 @@ -80,7 +87,7 @@ extern unsigned long empty_zero_page[PAG 5.79 #define pud_ERROR(e) \ 5.80 printk("%s:%d: bad pud %p(%016lx).\n", __FILE__, __LINE__, &(e), pud_val(e)) 5.81 #define pgd_ERROR(e) \ 5.82 - printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), pgd_val(e)) 5.83 + printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), pgd_val(e)) 5.84 5.85 #define pgd_none(x) (!pgd_val(x)) 5.86 #define pud_none(x) (!pud_val(x)) 5.87 @@ -90,18 +97,10 @@ extern unsigned long empty_zero_page[PAG 5.88 5.89 extern inline int pud_present(pud_t pud) { return !pud_none(pud); } 5.90 5.91 -#ifdef CONFIG_SMP 5.92 -#define set_pte(pteptr, pteval) xen_l1_entry_update(pteptr, (pteval)) 5.93 - 5.94 -#else 5.95 -#define set_pte(pteptr, pteval) xen_l1_entry_update(pteptr, (pteval)) 5.96 -#if 0 5.97 static inline void set_pte(pte_t *dst, pte_t val) 5.98 { 5.99 *dst = val; 5.100 } 5.101 -#endif 5.102 -#endif 5.103 5.104 #define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval)) 5.105 #define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval)) 5.106 @@ -132,6 +131,9 @@ extern inline void 
pgd_clear (pgd_t * pg 5.107 * each domain will have separate page tables, with their own versions of 5.108 * accessed & dirty state. 5.109 */ 5.110 +#define ptep_get_and_clear(mm,addr,xp) __pte_ma(xchg(&(xp)->pte, 0)) 5.111 + 5.112 +#if 0 5.113 static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp) 5.114 { 5.115 pte_t pte = *xp; 5.116 @@ -139,21 +141,22 @@ static inline pte_t ptep_get_and_clear(s 5.117 set_pte(xp, __pte_ma(0)); 5.118 return pte; 5.119 } 5.120 +#endif 5.121 5.122 #define pte_same(a, b) ((a).pte == (b).pte) 5.123 5.124 -#define PMD_SIZE (1UL << PMD_SHIFT) 5.125 -#define PMD_MASK (~(PMD_SIZE-1)) 5.126 -#define PUD_SIZE (1UL << PUD_SHIFT) 5.127 -#define PUD_MASK (~(PUD_SIZE-1)) 5.128 -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) 5.129 -#define PGDIR_MASK (~(PGDIR_SIZE-1)) 5.130 +#define PMD_SIZE (1UL << PMD_SHIFT) 5.131 +#define PMD_MASK (~(PMD_SIZE-1)) 5.132 +#define PUD_SIZE (1UL << PUD_SHIFT) 5.133 +#define PUD_MASK (~(PUD_SIZE-1)) 5.134 +#define PGDIR_SIZE (1UL << PGDIR_SHIFT) 5.135 +#define PGDIR_MASK (~(PGDIR_SIZE-1)) 5.136 5.137 -#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) 5.138 +#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) 5.139 #define FIRST_USER_ADDRESS 0 5.140 5.141 #ifndef __ASSEMBLY__ 5.142 -#define MAXMEM 0x3fffffffffffUL 5.143 +#define MAXMEM 0x3fffffffffffUL 5.144 #define VMALLOC_START 0xffffc20000000000UL 5.145 #define VMALLOC_END 0xffffe1ffffffffffUL 5.146 #define MODULES_VADDR 0xffffffff88000000UL 5.147 @@ -347,7 +350,7 @@ static inline int ptep_test_and_clear_di 5.148 pte_t pte = *ptep; 5.149 int ret = pte_dirty(pte); 5.150 if (ret) 5.151 - xen_l1_entry_update(ptep, pte_mkclean(pte)); 5.152 + set_pte(ptep, pte_mkclean(pte)); 5.153 return ret; 5.154 } 5.155 5.156 @@ -356,7 +359,7 @@ static inline int ptep_test_and_clear_yo 5.157 pte_t pte = *ptep; 5.158 int ret = pte_young(pte); 5.159 if (ret) 5.160 - xen_l1_entry_update(ptep, pte_mkold(pte)); 5.161 + set_pte(ptep, pte_mkold(pte)); 5.162 
return ret; 5.163 } 5.164 5.165 @@ -398,7 +401,7 @@ static inline int pmd_large(pmd_t pte) { 5.166 5.167 /* PUD - Level3 access */ 5.168 /* to find an entry in a page-table-directory. */ 5.169 -#define pud_index(address) ((address >> PUD_SHIFT) & (PTRS_PER_PUD-1)) 5.170 +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) 5.171 #define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address)) 5.172 static inline pud_t *__pud_offset_k(pud_t *pud, unsigned long address) 5.173 { 5.174 @@ -413,7 +416,7 @@ static inline pud_t *pud_offset_k(unsign 5.175 { 5.176 unsigned long addr; 5.177 5.178 - addr = pud_val(init_level4_pgt[pud_index(address)]); 5.179 + addr = pgd_val(init_level4_pgt[pud_index(address)]); 5.180 addr &= PHYSICAL_PAGE_MASK; /* machine physical */ 5.181 addr = machine_to_phys(addr); 5.182 return __pud_offset_k((pud_t *)__va(addr), address); 5.183 @@ -427,9 +430,11 @@ static inline pud_t *pud_offset_k(unsign 5.184 #define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \ 5.185 pmd_index(address)) 5.186 #define pmd_none(x) (!pmd_val(x)) 5.187 -#define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) 5.188 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. 5.189 + can temporarily clear it. */ 5.190 +#define pmd_present(x) (pmd_val(x)) 5.191 #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) 5.192 -#define pmd_bad(x) ((pmd_val(x) & ~PTE_MASK) != _KERNPG_TABLE ) 5.193 +#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT)) 5.194 #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot))) 5.195 #define pmd_pfn(x) ((pmd_val(x) >> PAGE_SHIFT) & __PHYSICAL_MASK) 5.196 5.197 @@ -479,6 +484,7 @@ extern inline pte_t pte_modify(pte_t pte 5.198 * race with other CPU's that might be updating the dirty 5.199 * bit at the same time. 
*/ 5.200 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS 5.201 +#if 0 5.202 #define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \ 5.203 do { \ 5.204 if (__dirty) { \ 5.205 @@ -486,6 +492,18 @@ extern inline pte_t pte_modify(pte_t pte 5.206 flush_tlb_page(__vma, __address); \ 5.207 } \ 5.208 } while (0) 5.209 +#endif 5.210 +#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \ 5.211 + do { \ 5.212 + if (__dirty) { \ 5.213 + if ( likely((__vma)->vm_mm == current->mm) ) { \ 5.214 + HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG|UVMF_MULTI|(unsigned long)((__vma)->vm_mm->cpu_vm_mask.bits)); \ 5.215 + } else { \ 5.216 + xen_l1_entry_update((__ptep), (__entry)); \ 5.217 + flush_tlb_page((__vma), (__address)); \ 5.218 + } \ 5.219 + } \ 5.220 + } while (0) 5.221 5.222 /* Encode and de-code a swap entry */ 5.223 #define __swp_type(x) (((x).val >> 1) & 0x3f)
6.1 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/processor.h Mon Aug 08 08:18:06 2005 +0000 6.2 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/processor.h Mon Aug 08 08:18:38 2005 +0000 6.3 @@ -154,6 +154,20 @@ static inline void set_in_cr4 (unsigned 6.4 } 6.5 } 6.6 6.7 + 6.8 +static inline void clear_in_cr4 (unsigned long mask) 6.9 +{ 6.10 +#ifndef CONFIG_XEN 6.11 + mmu_cr4_features &= ~mask; 6.12 + __asm__("movq %%cr4,%%rax\n\t" 6.13 + "andq %0,%%rax\n\t" 6.14 + "movq %%rax,%%cr4\n" 6.15 + : : "irg" (~mask) 6.16 + :"ax"); 6.17 +#endif 6.18 +} 6.19 + 6.20 + 6.21 #define load_cr3(pgdir) do { \ 6.22 xen_pt_switch(__pa(pgdir)); \ 6.23 per_cpu(cur_pgd, smp_processor_id()) = pgdir; \ 6.24 @@ -283,9 +297,9 @@ struct thread_struct { 6.25 load_gs_index(0); \ 6.26 (regs)->rip = (new_rip); \ 6.27 (regs)->rsp = (new_rsp); \ 6.28 - write_pda(oldrsp, (new_rsp)); \ 6.29 - (regs)->cs = __USER_CS; \ 6.30 - (regs)->ss = __USER_DS; \ 6.31 + write_pda(oldrsp, (new_rsp)); \ 6.32 + (regs)->cs = __USER_CS; \ 6.33 + (regs)->ss = __USER_DS; \ 6.34 (regs)->eflags = 0x200; \ 6.35 set_fs(USER_DS); \ 6.36 } while(0)