ia64/xen-unstable

changeset 6055:c4512592a1dc

Attached is a patch to x86_64 xenlinux. It also includes cleanups. We
are also working on SMP + writable pagetable support now.
Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
author kaf24@firebug.cl.cam.ac.uk
date Mon Aug 08 08:18:38 2005 +0000 (2005-08-08)
parents 69bf77e1b102
children 41ceeb6828b5
files linux-2.6-xen-sparse/arch/xen/x86_64/kernel/head.S linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup64.c linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/processor.h
line diff
     1.1 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/head.S	Mon Aug 08 08:18:06 2005 +0000
     1.2 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/head.S	Mon Aug 08 08:18:38 2005 +0000
     1.3 @@ -28,8 +28,6 @@
     1.4  #include <asm/page.h>
     1.5  #include <asm/msr.h>
     1.6  #include <asm/cache.h>
     1.7 -/* #include <asm/thread_info.h> */
     1.8 -        
     1.9  	
    1.10  /* we are not able to switch in one step to the final KERNEL ADRESS SPACE
    1.11   * because we need identity-mapped pages on setup so define __START_KERNEL to
    1.12 @@ -116,15 +114,81 @@ ENTRY(init_level4_pgt)
    1.13  ENTRY(init_level4_user_pgt)
    1.14  	.fill	512,8,0
    1.15  
    1.16 +	/*
    1.17 +	 * In Xen the following pre-initialized pgt entries are re-initialized.
    1.18 +	 */
    1.19 +.org 0x3000
    1.20 +ENTRY(level3_kernel_pgt)
    1.21 +	.fill	510,8,0
    1.22 +	/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
    1.23 +	.quad	0x0000000000105007		/* -> level2_kernel_pgt */
    1.24 +	.fill	1,8,0
    1.25 +
    1.26 +.org 0x4000
    1.27 +ENTRY(level2_ident_pgt)
    1.28 +	/* 40MB for bootup. 	*/
    1.29 +	.quad	0x0000000000000283
    1.30 +	.quad	0x0000000000200183
    1.31 +	.quad	0x0000000000400183
    1.32 +	.quad	0x0000000000600183
    1.33 +	.quad	0x0000000000800183
    1.34 +	.quad	0x0000000000A00183
    1.35 +	.quad	0x0000000000C00183
    1.36 +	.quad	0x0000000000E00183
    1.37 +	.quad	0x0000000001000183
    1.38 +	.quad	0x0000000001200183
    1.39 +	.quad	0x0000000001400183
    1.40 +	.quad	0x0000000001600183
    1.41 +	.quad	0x0000000001800183
    1.42 +	.quad	0x0000000001A00183
    1.43 +	.quad	0x0000000001C00183
    1.44 +	.quad	0x0000000001E00183
    1.45 +	.quad	0x0000000002000183
    1.46 +	.quad	0x0000000002200183
    1.47 +	.quad	0x0000000002400183
    1.48 +	.quad	0x0000000002600183
    1.49 +	/* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */
    1.50 +	.globl temp_boot_pmds
    1.51 +temp_boot_pmds:
    1.52 +	.fill	492,8,0
    1.53 +
    1.54 +.org 0x5000
    1.55 +ENTRY(level2_kernel_pgt)
    1.56 +	/* 40MB kernel mapping. The kernel code cannot be bigger than that.
    1.57 +	   When you change this, change KERNEL_TEXT_SIZE in page.h too. */
    1.58 +	/* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */
    1.59 +	.quad	0x0000000000000183
    1.60 +	.quad	0x0000000000200183
    1.61 +	.quad	0x0000000000400183
    1.62 +	.quad	0x0000000000600183
    1.63 +	.quad	0x0000000000800183
    1.64 +	.quad	0x0000000000A00183
    1.65 +	.quad	0x0000000000C00183
    1.66 +	.quad	0x0000000000E00183
    1.67 +	.quad	0x0000000001000183
    1.68 +	.quad	0x0000000001200183
    1.69 +	.quad	0x0000000001400183
    1.70 +	.quad	0x0000000001600183
    1.71 +	.quad	0x0000000001800183
    1.72 +	.quad	0x0000000001A00183
    1.73 +	.quad	0x0000000001C00183
    1.74 +	.quad	0x0000000001E00183
    1.75 +	.quad	0x0000000002000183
    1.76 +	.quad	0x0000000002200183
    1.77 +	.quad	0x0000000002400183
    1.78 +	.quad	0x0000000002600183
    1.79 +	/* Module mapping starts here */
    1.80 +	.fill	492,8,0
    1.81 +	
    1.82          /*
    1.83           * This is used for vsyscall area mapping as we have a different
    1.84           * level4 page table for user.
    1.85           */
    1.86 -.org 0x3000
    1.87 +.org 0x6000
    1.88  ENTRY(level3_user_pgt)
    1.89          .fill	512,8,0
    1.90  
    1.91 -.org 0x4000
    1.92 +.org 0x7000
    1.93  ENTRY(cpu_gdt_table)
    1.94  /* The TLS descriptors are currently at a different place compared to i386.
    1.95     Hopefully nobody expects them at a fixed place (Wine?) */
    1.96 @@ -147,19 +211,24 @@ gdt_end:
    1.97  	/* GDTs of other CPUs: */	
    1.98  	.fill (GDT_SIZE * NR_CPUS) - (gdt_end - cpu_gdt_table)
    1.99  
   1.100 -.org 0x5000
   1.101 +.org 0x8000
   1.102  ENTRY(empty_zero_page)
   1.103  
   1.104 -.org 0x6000
   1.105 +.org 0x9000
   1.106  ENTRY(empty_bad_page)
   1.107  
   1.108 -.org 0x7000
   1.109 +.org 0xa000
   1.110  ENTRY(empty_bad_pte_table)
   1.111  
   1.112 -.org 0x8000
   1.113 +.org 0xb000
   1.114  ENTRY(empty_bad_pmd_table)
   1.115  
   1.116 -	.org 0x9000
   1.117 +.org 0xc000
   1.118 +ENTRY(level3_physmem_pgt)
   1.119 +	.quad	0x0000000000105007		/* -> level2_kernel_pgt (so that __va works even before pagetable_init) */
   1.120 +
   1.121 +	
   1.122 +	.org 0xd000
   1.123  #ifdef CONFIG_ACPI_SLEEP
   1.124  ENTRY(wakeup_level4_pgt)
   1.125  	.quad	0x0000000000102007		/* -> level3_ident_pgt */
     2.1 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c	Mon Aug 08 08:18:06 2005 +0000
     2.2 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c	Mon Aug 08 08:18:38 2005 +0000
     2.3 @@ -623,7 +623,9 @@ void __init setup_arch(char **cmdline_p)
     2.4  	rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
     2.5  	rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
     2.6  #endif
     2.7 -/*        register_console(&xen_console); */
     2.8 +
     2.9 +	HYPERVISOR_vm_assist(VMASST_CMD_enable,
    2.10 +			     VMASST_TYPE_writable_pagetables);
    2.11  
    2.12  #ifdef CONFIG_XEN_PHYSDEV_ACCESS
    2.13  	/* This is drawn from a dump from vgacon:startup in standard Linux. */
     3.1 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup64.c	Mon Aug 08 08:18:06 2005 +0000
     3.2 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup64.c	Mon Aug 08 08:18:38 2005 +0000
     3.3 @@ -30,9 +30,9 @@
     3.4  #include <asm/proto.h>
     3.5  #include <asm/mman.h>
     3.6  #include <asm/numa.h>
     3.7 -
     3.8 +#ifdef CONFIG_XEN
     3.9  #include <asm-xen/hypervisor.h>
    3.10 -
    3.11 +#endif
    3.12  char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
    3.13  
    3.14  cpumask_t cpu_initialized __initdata = CPU_MASK_NONE;
    3.15 @@ -123,82 +123,11 @@ void __init setup_per_cpu_areas(void)
    3.16  	}
    3.17  } 
    3.18  
    3.19 -void pda_init(int cpu)
    3.20 -{ 
    3.21 -        pgd_t *old_level4 = (pgd_t *)xen_start_info.pt_base;
    3.22 -	struct x8664_pda *pda = &cpu_pda[cpu];
    3.23 -
    3.24 -	/* Setup up data that may be needed in __get_free_pages early */
    3.25 -	asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); 
    3.26 -        HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, 
    3.27 -                                    (unsigned long)(cpu_pda + cpu));
    3.28 -
    3.29 -	pda->me = pda;
    3.30 -	pda->cpunumber = cpu; 
    3.31 -	pda->irqcount = -1;
    3.32 -	pda->kernelstack = 
    3.33 -		(unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; 
    3.34 -	pda->active_mm = &init_mm;
    3.35 -	pda->mmu_state = 0;
    3.36 -        pda->kernel_mode = 1;
    3.37 -
    3.38 -	if (cpu == 0) {
    3.39 -                memcpy((void *)init_level4_pgt, 
    3.40 -                       (void *) xen_start_info.pt_base, PAGE_SIZE);
    3.41 -		/* others are initialized in smpboot.c */
    3.42 -		pda->pcurrent = &init_task;
    3.43 -		pda->irqstackptr = boot_cpu_stack; 
    3.44 -                make_page_readonly(init_level4_pgt);
    3.45 -                make_page_readonly(init_level4_user_pgt);
    3.46 -                make_page_readonly(level3_user_pgt); /* for vsyscall stuff */
    3.47 -                xen_pgd_pin(__pa_symbol(init_level4_user_pgt));
    3.48 -                xen_pud_pin(__pa_symbol(level3_user_pgt));
    3.49 -                set_pgd((pgd_t *)(init_level4_user_pgt + 511), 
    3.50 -                        mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
    3.51 -	} else {
    3.52 -		pda->irqstackptr = (char *)
    3.53 -			__get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
    3.54 -		if (!pda->irqstackptr)
    3.55 -			panic("cannot allocate irqstack for cpu %d", cpu); 
    3.56 -	}
    3.57 -
    3.58 +#ifdef CONFIG_XEN
    3.59 +static void switch_pt(void)
    3.60 +{
    3.61  	xen_pt_switch(__pa(init_level4_pgt));
    3.62          xen_new_user_pt(__pa(init_level4_user_pgt));
    3.63 -
    3.64 -	if (cpu == 0) {
    3.65 -                xen_pgd_unpin(__pa(old_level4));
    3.66 -#if 0
    3.67 -                early_printk("__pa: %x, <machine_phys> old_level 4 %x\n", 
    3.68 -                             __pa(xen_start_info.pt_base),
    3.69 -                             pfn_to_mfn(__pa(old_level4) >> PAGE_SHIFT));
    3.70 -#endif
    3.71 -//                make_page_writable(old_level4);
    3.72 -//                free_bootmem(__pa(old_level4), PAGE_SIZE);
    3.73 -        }
    3.74 -
    3.75 -	pda->irqstackptr += IRQSTACKSIZE-64;
    3.76 -} 
    3.77 -
    3.78 -char boot_exception_stacks[N_EXCEPTION_STACKS * EXCEPTION_STKSZ] 
    3.79 -__attribute__((section(".bss.page_aligned")));
    3.80 -
    3.81 -/* May not be marked __init: used by software suspend */
    3.82 -void syscall_init(void)
    3.83 -{
    3.84 -#ifdef CONFIG_IA32_EMULATION   		
    3.85 -	syscall32_cpu_init ();
    3.86 -#endif
    3.87 -}
    3.88 -
    3.89 -void __init check_efer(void)
    3.90 -{
    3.91 -	unsigned long efer;
    3.92 -
    3.93 -	rdmsrl(MSR_EFER, efer); 
    3.94 -        if (!(efer & EFER_NX) || do_not_nx) { 
    3.95 -                __supported_pte_mask &= ~_PAGE_NX; 
    3.96 -
    3.97 -        }       
    3.98  }
    3.99  
   3.100  void __init cpu_gdt_init(struct desc_ptr *gdt_descr)
   3.101 @@ -217,7 +146,96 @@ void __init cpu_gdt_init(struct desc_ptr
   3.102                                 sizeof (struct desc_struct)))
   3.103  		BUG();
   3.104  }
   3.105 +#else
   3.106 +static void switch_pt(void)
   3.107 +{
   3.108 +	asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
   3.109 +}
   3.110  
   3.111 +void __init cpu_gdt_init(struct desc_ptr *gdt_descr)
   3.112 +{
   3.113 +#ifdef CONFIG_SMP
   3.114 +	int cpu = stack_smp_processor_id();
   3.115 +#else
   3.116 +	int cpu = smp_processor_id();
   3.117 +#endif
   3.118 +
   3.119 +	asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu]));
   3.120 +	asm volatile("lidt %0" :: "m" (idt_descr));
   3.121 +}
   3.122 +#endif
   3.123 +
   3.124 +
   3.125 +void pda_init(int cpu)
   3.126 +{ 
   3.127 +	struct x8664_pda *pda = &cpu_pda[cpu];
   3.128 +
    3.129 +	/* Set up data that may be needed early in __get_free_pages */
   3.130 +	asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); 
   3.131 +#ifndef CONFIG_XEN
   3.132 +	wrmsrl(MSR_GS_BASE, cpu_pda + cpu);
   3.133 +#else
   3.134 +        HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, 
   3.135 +                                    (unsigned long)(cpu_pda + cpu));
   3.136 +#endif
   3.137 +	pda->me = pda;
   3.138 +	pda->cpunumber = cpu; 
   3.139 +	pda->irqcount = -1;
   3.140 +	pda->kernelstack = 
   3.141 +		(unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; 
   3.142 +	pda->active_mm = &init_mm;
   3.143 +	pda->mmu_state = 0;
   3.144 +
   3.145 +	if (cpu == 0) {
   3.146 +#ifdef CONFIG_XEN
   3.147 +		xen_init_pt();
   3.148 +#endif
   3.149 +		/* others are initialized in smpboot.c */
   3.150 +		pda->pcurrent = &init_task;
   3.151 +		pda->irqstackptr = boot_cpu_stack; 
   3.152 +	} else {
   3.153 +		pda->irqstackptr = (char *)
   3.154 +			__get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
   3.155 +		if (!pda->irqstackptr)
   3.156 +			panic("cannot allocate irqstack for cpu %d", cpu); 
   3.157 +	}
   3.158 +
   3.159 +	switch_pt();
   3.160 +	pda->irqstackptr += IRQSTACKSIZE-64;
   3.161 +} 
   3.162 +
   3.163 +char boot_exception_stacks[N_EXCEPTION_STACKS * EXCEPTION_STKSZ] 
   3.164 +__attribute__((section(".bss.page_aligned")));
   3.165 +
   3.166 +/* May not be marked __init: used by software suspend */
   3.167 +void syscall_init(void)
   3.168 +{
   3.169 +#ifndef CONFIG_XEN
   3.170 +	/* 
   3.171 +	 * LSTAR and STAR live in a bit strange symbiosis.
   3.172 +	 * They both write to the same internal register. STAR allows to set CS/DS
   3.173 +	 * but only a 32bit target. LSTAR sets the 64bit rip. 	 
   3.174 +	 */ 
   3.175 +	wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32); 
   3.176 +	wrmsrl(MSR_LSTAR, system_call); 
   3.177 +
   3.178 +	/* Flags to clear on syscall */
   3.179 +	wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); 
   3.180 +#endif
   3.181 +#ifdef CONFIG_IA32_EMULATION   		
   3.182 +	syscall32_cpu_init ();
   3.183 +#endif
   3.184 +}
   3.185 +
   3.186 +void __init check_efer(void)
   3.187 +{
   3.188 +	unsigned long efer;
   3.189 +
   3.190 +	rdmsrl(MSR_EFER, efer); 
   3.191 +        if (!(efer & EFER_NX) || do_not_nx) { 
   3.192 +                __supported_pte_mask &= ~_PAGE_NX; 
   3.193 +        }       
   3.194 +}
   3.195  
   3.196  /*
   3.197   * cpu_init() initializes state that is per-CPU. Some data is already
   3.198 @@ -247,14 +265,13 @@ void __init cpu_init (void)
   3.199  
   3.200  	me = current;
   3.201  
   3.202 -	if (test_and_set_bit(cpu, &cpu_initialized))
   3.203 +	if (cpu_test_and_set(cpu, cpu_initialized))
   3.204  		panic("CPU#%d already initialized!\n", cpu);
   3.205  
   3.206  	printk("Initializing CPU#%d\n", cpu);
   3.207  
   3.208 -#if 0
   3.209  		clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
   3.210 -#endif
   3.211 +
   3.212  	/*
   3.213  	 * Initialize the per-CPU GDT with the boot GDT,
   3.214  	 * and set up the GDT descriptor:
   3.215 @@ -265,18 +282,16 @@ void __init cpu_init (void)
   3.216  
   3.217  	cpu_gdt_descr[cpu].size = GDT_SIZE;
   3.218  	cpu_gdt_descr[cpu].address = (unsigned long)cpu_gdt_table[cpu];
   3.219 -#if 0
   3.220 -	asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu]));
   3.221 -	asm volatile("lidt %0" :: "m" (idt_descr));
   3.222 -#endif
   3.223 +
   3.224          cpu_gdt_init(&cpu_gdt_descr[cpu]);
   3.225  
   3.226 -#if 0
   3.227 +#ifndef CONFIG_XEN 
   3.228  	memcpy(me->thread.tls_array, cpu_gdt_table[cpu], GDT_ENTRY_TLS_ENTRIES * 8);
   3.229  
   3.230 -#endif
   3.231 +#else
   3.232   	memcpy(me->thread.tls_array, &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN],
   3.233  	    GDT_ENTRY_TLS_ENTRIES * 8);
   3.234 +#endif
   3.235         
   3.236  	/*
   3.237  	 * Delete NT
   3.238 @@ -284,12 +299,12 @@ void __init cpu_init (void)
   3.239  
   3.240  	asm volatile("pushfq ; popq %%rax ; btr $14,%%rax ; pushq %%rax ; popfq" ::: "eax");
   3.241  
   3.242 -	if (cpu == 0) 
   3.243 -		early_identify_cpu(&boot_cpu_data);
   3.244 -
   3.245  	syscall_init();
   3.246  
   3.247 +	wrmsrl(MSR_FS_BASE, 0);
   3.248 +	wrmsrl(MSR_KERNEL_GS_BASE, 0);
   3.249  	barrier(); 
   3.250 +
   3.251  	check_efer();
   3.252  
   3.253  	/*
   3.254 @@ -321,19 +336,22 @@ void __init cpu_init (void)
   3.255  		BUG();
   3.256  	enter_lazy_tlb(&init_mm, me);
   3.257  
   3.258 +#ifndef CONFIG_XEN
   3.259 +	set_tss_desc(cpu, t);
   3.260 +	load_TR_desc();
   3.261 +#endif
   3.262  	load_LDT(&init_mm.context);
   3.263  
   3.264  	/*
   3.265  	 * Clear all 6 debug registers:
   3.266  	 */
   3.267 -#define CD(register) HYPERVISOR_set_debugreg(register, 0)
   3.268  
   3.269 -	CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7);
   3.270 +	set_debug(0UL, 0);
   3.271 +	set_debug(0UL, 1);
   3.272 +	set_debug(0UL, 2);
   3.273 +	set_debug(0UL, 3);
   3.274 +	set_debug(0UL, 6);
   3.275 +	set_debug(0UL, 7);
   3.276  
   3.277 -#undef CD
   3.278  	fpu_init(); 
   3.279 -
   3.280 -#ifdef CONFIG_NUMA
   3.281 -	numa_add_cpu(cpu);
   3.282 -#endif
   3.283  }
     4.1 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c	Mon Aug 08 08:18:06 2005 +0000
     4.2 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c	Mon Aug 08 08:18:38 2005 +0000
     4.3 @@ -280,7 +280,7 @@ static void set_pte_phys(unsigned long v
     4.4  	if (!pte_none(*pte) &&
     4.5  	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
     4.6  		pte_ERROR(*pte);
     4.7 -        xen_l1_entry_update(pte, new_pte);
     4.8 +        set_pte(pte, new_pte);
     4.9  
    4.10  	/*
    4.11  	 * It's enough to flush this one mapping.
    4.12 @@ -511,6 +511,78 @@ static void __init find_early_table_spac
    4.13  	    		  round_up(ptes * 8, PAGE_SIZE); 
    4.14  }
    4.15  
    4.16 +static void xen_copy_pt(void)
    4.17 +{
    4.18 +	unsigned long va = __START_KERNEL_map;
    4.19 +	unsigned long addr, *pte_page;
    4.20 +	int i;
    4.21 +	pud_t *pud; pmd_t *pmd; pte_t *pte;
    4.22 +	unsigned long *page = (unsigned long *) init_level4_pgt;
    4.23 +
    4.24 +	addr = (unsigned long) page[pgd_index(va)];
    4.25 +	addr_to_page(addr, page);
    4.26 +
    4.27 +	pud = (pud_t *) &page[pud_index(va)];
    4.28 +	addr = page[pud_index(va)];
    4.29 +	addr_to_page(addr, page);
    4.30 +
    4.31 +	level3_kernel_pgt[pud_index(va)] = 
    4.32 +		__pud(__pa_symbol(level2_kernel_pgt) | _KERNPG_TABLE | _PAGE_USER);
    4.33 +
    4.34 +	for (;;) {
    4.35 +		pmd = (pmd_t *) &page[pmd_index(va)];
    4.36 +		if (pmd_present(*pmd)) {
    4.37 +			level2_kernel_pgt[pmd_index(va)] = *pmd;
    4.38 +			/*
    4.39 +			 * if pmd is valid, check pte.
    4.40 +			 */
    4.41 +			addr = page[pmd_index(va)];
    4.42 +			addr_to_page(addr, pte_page);
    4.43 +			
    4.44 +			for (i = 0; i < PTRS_PER_PTE; i++) {
    4.45 +				pte = (pte_t *) &pte_page[pte_index(va)];
    4.46 +				if (pte_present(*pte))
    4.47 +					va += PAGE_SIZE;
    4.48 +				else
    4.49 +				    break;
    4.50 +			}
    4.51 +
    4.52 +		} else
    4.53 +		    break;
    4.54 +	}
    4.55 +
    4.56 +	init_level4_pgt[pgd_index(__START_KERNEL_map)] = 
    4.57 +		mk_kernel_pgd(__pa_symbol(level3_kernel_pgt));
    4.58 +}
    4.59 +
    4.60 +void __init xen_init_pt(void)
    4.61 +{
    4.62 +        pgd_t *old_level4 = (pgd_t *)xen_start_info.pt_base;
    4.63 +
    4.64 +	memcpy((void *)init_level4_pgt, 
    4.65 +	       (void *)xen_start_info.pt_base, PAGE_SIZE);
    4.66 +
    4.67 +	memset((void *)level3_kernel_pgt, 0, PAGE_SIZE);
    4.68 +	memset((void *)level2_kernel_pgt, 0, PAGE_SIZE);
    4.69 +
    4.70 +	xen_copy_pt();
    4.71 +
    4.72 +	make_page_readonly(init_level4_pgt);
    4.73 +	make_page_readonly(level3_kernel_pgt);
    4.74 +	make_page_readonly(level2_kernel_pgt);
    4.75 +	make_page_readonly(init_level4_user_pgt);
    4.76 +	make_page_readonly(level3_user_pgt); /* for vsyscall stuff */
    4.77 +
    4.78 +	xen_pgd_pin(__pa_symbol(init_level4_pgt));
    4.79 +	xen_pgd_pin(__pa_symbol(init_level4_user_pgt));
    4.80 +	xen_pud_pin(__pa_symbol(level3_kernel_pgt));
    4.81 +	xen_pud_pin(__pa_symbol(level3_user_pgt));
    4.82 +	xen_pmd_pin(__pa_symbol(level2_kernel_pgt));
    4.83 +
    4.84 +	set_pgd((pgd_t *)(init_level4_user_pgt + 511), 
    4.85 +		mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
    4.86 +
    4.87 +}
    4.88  
    4.89  /*
    4.90   * Extend kernel mapping to access pages for page tables.  The initial
     5.1 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h	Mon Aug 08 08:18:06 2005 +0000
     5.2 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h	Mon Aug 08 08:18:38 2005 +0000
     5.3 @@ -4,31 +4,19 @@
     5.4  /*
     5.5   * This file contains the functions and defines necessary to modify and use
     5.6   * the x86-64 page table tree.
     5.7 - * 
     5.8 - * x86-64 has a 4 level table setup. Generic linux MM only supports
     5.9 - * three levels. The fourth level is currently a single static page that
    5.10 - * is shared by everybody and just contains a pointer to the current
    5.11 - * three level page setup on the beginning and some kernel mappings at 
    5.12 - * the end. For more details see Documentation/x86_64/mm.txt
    5.13   */
    5.14  #include <asm/processor.h>
    5.15  #include <asm/fixmap.h>
    5.16  #include <asm/bitops.h>
    5.17  #include <linux/threads.h>
    5.18  #include <asm/pda.h>
    5.19 +#ifdef CONFIG_XEN
    5.20  #include <asm-xen/hypervisor.h>
    5.21 -extern pud_t level3_user_pgt[512];
    5.22 -extern pud_t init_level4_pgt[];
    5.23 -extern pud_t init_level4_user_pgt[];
    5.24 -extern unsigned long __supported_pte_mask;
    5.25  
    5.26 -#define swapper_pg_dir NULL
    5.27 +extern pud_t level3_user_pgt[512];
    5.28 +extern pud_t init_level4_user_pgt[];
    5.29  
    5.30 -extern int nonx_setup(char *str);
    5.31 -extern void paging_init(void);
    5.32 -extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
    5.33 -
    5.34 -extern unsigned long pgkern_mask;
    5.35 +extern void xen_init_pt(void);
    5.36  
    5.37  #define virt_to_ptep(__va)						\
    5.38  ({									\
    5.39 @@ -44,6 +32,22 @@ extern unsigned long pgkern_mask;
    5.40  	unsigned long __pa = (*(unsigned long *)__pte) & PAGE_MASK;	\
    5.41  	__pa | ((unsigned long)(__va) & (PAGE_SIZE-1));			\
    5.42  })
    5.43 +#endif
    5.44 +
    5.45 +extern pud_t level3_kernel_pgt[512];
    5.46 +extern pud_t level3_physmem_pgt[512];
    5.47 +extern pud_t level3_ident_pgt[512];
    5.48 +extern pmd_t level2_kernel_pgt[512];
    5.49 +extern pgd_t init_level4_pgt[];
    5.50 +extern unsigned long __supported_pte_mask;
    5.51 +
    5.52 +#define swapper_pg_dir init_level4_pgt
    5.53 +
    5.54 +extern int nonx_setup(char *str);
    5.55 +extern void paging_init(void);
    5.56 +extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
    5.57 +
    5.58 +extern unsigned long pgkern_mask;
    5.59  
    5.60  /*
    5.61   * ZERO_PAGE is a global shared page that is always zero: used
    5.62 @@ -52,11 +56,14 @@ extern unsigned long pgkern_mask;
    5.63  extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
    5.64  #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
    5.65  
    5.66 +/*
    5.67 + * PGDIR_SHIFT determines what a top-level page table entry can map
    5.68 + */
    5.69  #define PGDIR_SHIFT	39
    5.70  #define PTRS_PER_PGD	512
    5.71  
    5.72  /*
    5.73 - * PUDIR_SHIFT determines what a top-level page table entry can map
    5.74 + * 3rd level page
    5.75   */
    5.76  #define PUD_SHIFT	30
    5.77  #define PTRS_PER_PUD	512
    5.78 @@ -80,7 +87,7 @@ extern unsigned long empty_zero_page[PAG
    5.79  #define pud_ERROR(e) \
    5.80  	printk("%s:%d: bad pud %p(%016lx).\n", __FILE__, __LINE__, &(e), pud_val(e))
    5.81  #define pgd_ERROR(e) \
    5.82 -        printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), pgd_val(e))
    5.83 +	printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), pgd_val(e))
    5.84  
    5.85  #define pgd_none(x)	(!pgd_val(x))
    5.86  #define pud_none(x)	(!pud_val(x))
    5.87 @@ -90,18 +97,10 @@ extern unsigned long empty_zero_page[PAG
    5.88  
    5.89  extern inline int pud_present(pud_t pud)	{ return !pud_none(pud); }
    5.90  
    5.91 -#ifdef CONFIG_SMP
    5.92 -#define set_pte(pteptr, pteval) xen_l1_entry_update(pteptr, (pteval))
    5.93 -
    5.94 -#else
    5.95 -#define set_pte(pteptr, pteval) xen_l1_entry_update(pteptr, (pteval))
    5.96 -#if 0
    5.97  static inline void set_pte(pte_t *dst, pte_t val)
    5.98  {
    5.99  	*dst = val;
   5.100  }
   5.101 -#endif
   5.102 -#endif
   5.103  
   5.104  #define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval))
   5.105  #define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval))
   5.106 @@ -132,6 +131,9 @@ extern inline void pgd_clear (pgd_t * pg
   5.107   * each domain will have separate page tables, with their own versions of
   5.108   * accessed & dirty state.
   5.109   */
   5.110 +#define ptep_get_and_clear(mm,addr,xp)	__pte_ma(xchg(&(xp)->pte, 0))
   5.111 +
   5.112 +#if 0
   5.113  static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp)
   5.114  {
   5.115          pte_t pte = *xp;
   5.116 @@ -139,21 +141,22 @@ static inline pte_t ptep_get_and_clear(s
   5.117                  set_pte(xp, __pte_ma(0));
   5.118          return pte;
   5.119  }
   5.120 +#endif
   5.121  
   5.122  #define pte_same(a, b)		((a).pte == (b).pte)
   5.123  
   5.124 -#define PMD_SIZE        (1UL << PMD_SHIFT)
   5.125 -#define PMD_MASK        (~(PMD_SIZE-1))
   5.126 -#define PUD_SIZE        (1UL << PUD_SHIFT)
   5.127 -#define PUD_MASK        (~(PUD_SIZE-1))
   5.128 -#define PGDIR_SIZE      (1UL << PGDIR_SHIFT)
   5.129 -#define PGDIR_MASK      (~(PGDIR_SIZE-1))
   5.130 +#define PMD_SIZE	(1UL << PMD_SHIFT)
   5.131 +#define PMD_MASK	(~(PMD_SIZE-1))
   5.132 +#define PUD_SIZE	(1UL << PUD_SHIFT)
   5.133 +#define PUD_MASK	(~(PUD_SIZE-1))
   5.134 +#define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
   5.135 +#define PGDIR_MASK	(~(PGDIR_SIZE-1))
   5.136  
   5.137 -#define USER_PTRS_PER_PGD       (TASK_SIZE/PGDIR_SIZE)
   5.138 +#define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
   5.139  #define FIRST_USER_ADDRESS	0
   5.140  
   5.141  #ifndef __ASSEMBLY__
   5.142 -#define MAXMEM           0x3fffffffffffUL
   5.143 +#define MAXMEM		 0x3fffffffffffUL
   5.144  #define VMALLOC_START    0xffffc20000000000UL
   5.145  #define VMALLOC_END      0xffffe1ffffffffffUL
   5.146  #define MODULES_VADDR    0xffffffff88000000UL
   5.147 @@ -347,7 +350,7 @@ static inline int ptep_test_and_clear_di
   5.148  	pte_t pte = *ptep;
   5.149  	int ret = pte_dirty(pte);
   5.150  	if (ret)
   5.151 -		xen_l1_entry_update(ptep, pte_mkclean(pte));
   5.152 +		set_pte(ptep, pte_mkclean(pte));
   5.153  	return ret;
   5.154  }
   5.155  
   5.156 @@ -356,7 +359,7 @@ static inline int ptep_test_and_clear_yo
   5.157  	pte_t pte = *ptep;
   5.158  	int ret = pte_young(pte);
   5.159  	if (ret)
   5.160 -		xen_l1_entry_update(ptep, pte_mkold(pte));
   5.161 +		set_pte(ptep, pte_mkold(pte));
   5.162  	return ret;
   5.163  }
   5.164  
   5.165 @@ -398,7 +401,7 @@ static inline int pmd_large(pmd_t pte) {
   5.166  
   5.167  /* PUD - Level3 access */
   5.168  /* to find an entry in a page-table-directory. */
   5.169 -#define pud_index(address) ((address >> PUD_SHIFT) & (PTRS_PER_PUD-1))
   5.170 +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
   5.171  #define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address))
   5.172  static inline pud_t *__pud_offset_k(pud_t *pud, unsigned long address)
   5.173  { 
   5.174 @@ -413,7 +416,7 @@ static inline pud_t *pud_offset_k(unsign
   5.175  {
   5.176  	unsigned long addr;
   5.177  
   5.178 -	addr = pud_val(init_level4_pgt[pud_index(address)]);
   5.179 +	addr = pgd_val(init_level4_pgt[pud_index(address)]);
   5.180  	addr &= PHYSICAL_PAGE_MASK; /* machine physical */
   5.181          addr = machine_to_phys(addr);
   5.182  	return __pud_offset_k((pud_t *)__va(addr), address);
   5.183 @@ -427,9 +430,11 @@ static inline pud_t *pud_offset_k(unsign
   5.184  #define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \
   5.185                                    pmd_index(address))
   5.186  #define pmd_none(x)	(!pmd_val(x))
   5.187 -#define pmd_present(x)	(pmd_val(x) & _PAGE_PRESENT)
    5.188 +/* pmd_present doesn't just test the _PAGE_PRESENT bit, since Xen's
    5.189 +   writable page tables (wr.p.t.) can temporarily clear it. */
   5.190 +#define pmd_present(x)	(pmd_val(x))
   5.191  #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
   5.192 -#define	pmd_bad(x)	((pmd_val(x) & ~PTE_MASK) != _KERNPG_TABLE )
   5.193 +#define	pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
   5.194  #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
   5.195  #define pmd_pfn(x)  ((pmd_val(x) >> PAGE_SHIFT) & __PHYSICAL_MASK)
   5.196  
   5.197 @@ -479,6 +484,7 @@ extern inline pte_t pte_modify(pte_t pte
   5.198   * race with other CPU's that might be updating the dirty
   5.199   * bit at the same time. */
   5.200  #define  __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
   5.201 +#if 0
   5.202  #define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
   5.203  	do {								  \
   5.204  		if (__dirty) {						  \
   5.205 @@ -486,6 +492,18 @@ extern inline pte_t pte_modify(pte_t pte
   5.206  			flush_tlb_page(__vma, __address);		  \
   5.207  		}							  \
   5.208  	} while (0)
   5.209 +#endif
   5.210 +#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
   5.211 +	do {								  \
   5.212 +		if (__dirty) {						  \
   5.213 +		        if ( likely((__vma)->vm_mm == current->mm) ) {    \
   5.214 +			    HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG|UVMF_MULTI|(unsigned long)((__vma)->vm_mm->cpu_vm_mask.bits)); \
   5.215 +			} else {                                          \
   5.216 +                            xen_l1_entry_update((__ptep), (__entry)); \
   5.217 +			    flush_tlb_page((__vma), (__address));         \
   5.218 +			}                                                 \
   5.219 +		}							  \
   5.220 +	} while (0)
   5.221  
   5.222  /* Encode and de-code a swap entry */
   5.223  #define __swp_type(x)			(((x).val >> 1) & 0x3f)
     6.1 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/processor.h	Mon Aug 08 08:18:06 2005 +0000
     6.2 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/processor.h	Mon Aug 08 08:18:38 2005 +0000
     6.3 @@ -154,6 +154,20 @@ static inline void set_in_cr4 (unsigned 
     6.4  	}
     6.5  }
     6.6  
     6.7 +
     6.8 +static inline void clear_in_cr4 (unsigned long mask)
     6.9 +{
    6.10 +#ifndef CONFIG_XEN
    6.11 +	mmu_cr4_features &= ~mask;
    6.12 +	__asm__("movq %%cr4,%%rax\n\t"
    6.13 +		"andq %0,%%rax\n\t"
    6.14 +		"movq %%rax,%%cr4\n"
    6.15 +		: : "irg" (~mask)
    6.16 +		:"ax");
    6.17 +#endif
    6.18 +}
    6.19 +
    6.20 +
    6.21  #define load_cr3(pgdir) do {				\
    6.22  	xen_pt_switch(__pa(pgdir));			\
    6.23  	per_cpu(cur_pgd, smp_processor_id()) = pgdir;	\
    6.24 @@ -283,9 +297,9 @@ struct thread_struct {
    6.25  	load_gs_index(0);							\
    6.26  	(regs)->rip = (new_rip);						 \
    6.27  	(regs)->rsp = (new_rsp);						 \
    6.28 -	write_pda(oldrsp, (new_rsp)); 						 \
    6.29 -	(regs)->cs = __USER_CS;                                                  \
    6.30 -	(regs)->ss = __USER_DS;                                                  \
    6.31 +	write_pda(oldrsp, (new_rsp));						 \
    6.32 +	(regs)->cs = __USER_CS;							 \
    6.33 +	(regs)->ss = __USER_DS;							 \
    6.34  	(regs)->eflags = 0x200;							 \
    6.35  	set_fs(USER_DS);							 \
    6.36  } while(0)