direct-io.hg

changeset 6476:f0bf239844a6

Fix x86/64 pagetable initialisation to not waste several
megabytes of memory.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Tue Aug 30 16:19:07 2005 +0000 (2005-08-30)
parents b043928b0873
children 872e94f8eb69
files linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c
line diff
     1.1 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c	Tue Aug 30 16:15:27 2005 +0000
     1.2 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c	Tue Aug 30 16:19:07 2005 +0000
     1.3 @@ -62,14 +62,16 @@ static int init_mapping_done;
     1.4   * available in init_memory_mapping().
     1.5   */
     1.6  
     1.7 -#define addr_to_page(addr, page)                                             \
     1.8 -        (addr) &= PHYSICAL_PAGE_MASK;                                   \
     1.9 -        (page) = ((unsigned long *) ((unsigned long)(((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + __START_KERNEL_map)))
    1.10 +#define addr_to_page(addr, page)				\
    1.11 +	(addr) &= PHYSICAL_PAGE_MASK;				\
    1.12 +	(page) = ((unsigned long *) ((unsigned long)		\
    1.13 +	(((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) +	\
    1.14 +	__START_KERNEL_map)))
    1.15  
    1.16  static void __make_page_readonly(unsigned long va)
    1.17  {
    1.18 -        unsigned long addr;
    1.19 -        pte_t pte, *ptep;
    1.20 +	unsigned long addr;
    1.21 +	pte_t pte, *ptep;
    1.22  	unsigned long *page = (unsigned long *) init_level4_pgt;
    1.23  
    1.24  	addr = (unsigned long) page[pgd_index(va)];
    1.25 @@ -89,22 +91,22 @@ static void __make_page_readonly(unsigne
    1.26  
    1.27  static void __make_page_writable(unsigned long va)
    1.28  {
    1.29 -        unsigned long addr;
    1.30 -        pte_t pte, *ptep;
    1.31 -        unsigned long *page = (unsigned long *) init_level4_pgt;
    1.32 +	unsigned long addr;
    1.33 +	pte_t pte, *ptep;
    1.34 +	unsigned long *page = (unsigned long *) init_level4_pgt;
    1.35  
    1.36 -        addr = (unsigned long) page[pgd_index(va)];
    1.37 -        addr_to_page(addr, page);
    1.38 +	addr = (unsigned long) page[pgd_index(va)];
    1.39 +	addr_to_page(addr, page);
    1.40  
    1.41 -        addr = page[pud_index(va)];
    1.42 -        addr_to_page(addr, page);
    1.43 -        
    1.44 -        addr = page[pmd_index(va)];
    1.45 -        addr_to_page(addr, page);
    1.46 +	addr = page[pud_index(va)];
    1.47 +	addr_to_page(addr, page);
    1.48 + 
    1.49 +	addr = page[pmd_index(va)];
    1.50 +	addr_to_page(addr, page);
    1.51  
    1.52 -        ptep = (pte_t *) &page[pte_index(va)];
    1.53 +	ptep = (pte_t *) &page[pte_index(va)];
    1.54  	pte.pte = (ptep->pte | _PAGE_RW);
    1.55 -        xen_l1_entry_update(ptep, pte);
    1.56 +	xen_l1_entry_update(ptep, pte);
    1.57  	__flush_tlb_one(addr);
    1.58  }
    1.59  
    1.60 @@ -115,55 +117,55 @@ static void __make_page_writable(unsigne
    1.61  void make_page_readonly(void *va)
    1.62  {
    1.63  	pgd_t* pgd; pud_t *pud; pmd_t* pmd; pte_t pte, *ptep;
    1.64 -        unsigned long addr = (unsigned long) va;
    1.65 +	unsigned long addr = (unsigned long) va;
    1.66  
    1.67 -        if (!init_mapping_done) {
    1.68 -                __make_page_readonly(addr);
    1.69 -                return;
    1.70 -        }
    1.71 -                
    1.72 -        pgd = pgd_offset_k(addr);
    1.73 -        pud = pud_offset(pgd, addr);
    1.74 -        pmd = pmd_offset(pud, addr);
    1.75 -        ptep = pte_offset_kernel(pmd, addr);
    1.76 +	if (!init_mapping_done) {
    1.77 +		__make_page_readonly(addr);
    1.78 +		return;
    1.79 +	}
    1.80 +  
    1.81 +	pgd = pgd_offset_k(addr);
    1.82 +	pud = pud_offset(pgd, addr);
    1.83 +	pmd = pmd_offset(pud, addr);
    1.84 +	ptep = pte_offset_kernel(pmd, addr);
    1.85  	pte.pte = (ptep->pte & ~_PAGE_RW);
    1.86 -        xen_l1_entry_update(ptep, pte);
    1.87 +	xen_l1_entry_update(ptep, pte);
    1.88  	__flush_tlb_one(addr);
    1.89  }
    1.90  
    1.91  void make_page_writable(void *va)
    1.92  {
    1.93 -        pgd_t* pgd; pud_t *pud; pmd_t* pmd; pte_t pte, *ptep;
    1.94 -        unsigned long addr = (unsigned long) va;
    1.95 +	pgd_t* pgd; pud_t *pud; pmd_t* pmd; pte_t pte, *ptep;
    1.96 +	unsigned long addr = (unsigned long) va;
    1.97  
    1.98 -        if (!init_mapping_done) {
    1.99 -                __make_page_writable(addr);
   1.100 -                return;
   1.101 -        }
   1.102 +	if (!init_mapping_done) {
   1.103 +		__make_page_writable(addr);
   1.104 +		return;
   1.105 +	}
   1.106  
   1.107 -        pgd = pgd_offset_k(addr);
   1.108 -        pud = pud_offset(pgd, addr);
   1.109 -        pmd = pmd_offset(pud, addr);
   1.110 -        ptep = pte_offset_kernel(pmd, addr);
   1.111 +	pgd = pgd_offset_k(addr);
   1.112 +	pud = pud_offset(pgd, addr);
   1.113 +	pmd = pmd_offset(pud, addr);
   1.114 +	ptep = pte_offset_kernel(pmd, addr);
   1.115  	pte.pte = (ptep->pte | _PAGE_RW);
   1.116 -        xen_l1_entry_update(ptep, pte);
   1.117 +	xen_l1_entry_update(ptep, pte);
   1.118  	__flush_tlb_one(addr);
   1.119  }
   1.120  
   1.121  void make_pages_readonly(void* va, unsigned nr)
   1.122  {
   1.123 -        while ( nr-- != 0 ) {
   1.124 -                make_page_readonly(va);
   1.125 -                va = (void*)((unsigned long)va + PAGE_SIZE);
   1.126 -        }
   1.127 +	while (nr-- != 0) {
   1.128 +		make_page_readonly(va);
   1.129 +		va = (void*)((unsigned long)va + PAGE_SIZE);
   1.130 +	}
   1.131  }
   1.132  
   1.133  void make_pages_writable(void* va, unsigned nr)
   1.134  {
   1.135 -        while ( nr-- != 0 ) {
   1.136 -                make_page_writable(va);
   1.137 -                va = (void*)((unsigned long)va + PAGE_SIZE);
   1.138 -        }
   1.139 +	while (nr-- != 0) {
   1.140 +		make_page_writable(va);
   1.141 +		va = (void*)((unsigned long)va + PAGE_SIZE);
   1.142 +	}
   1.143  }
   1.144  
   1.145  /*
   1.146 @@ -389,7 +391,7 @@ void __set_fixmap_user (enum fixed_addre
   1.147          set_pte_phys(address, phys, prot, SET_FIXMAP_USER); 
   1.148  }
   1.149  
   1.150 -unsigned long __initdata table_start, table_end, tables_space; 
   1.151 +unsigned long __initdata table_start, tables_space; 
   1.152  
   1.153  unsigned long get_machine_pfn(unsigned long addr)
   1.154  {
   1.155 @@ -400,38 +402,13 @@ unsigned long get_machine_pfn(unsigned l
   1.156          return pte_mfn(*pte);
   1.157  } 
   1.158  
   1.159 -#define ALIGN_TO_4K __attribute__((section(".data.page_aligned")))
   1.160 -#define MAX_LOW_PAGES	0x20
   1.161 -static unsigned long __init_pgt[MAX_LOW_PAGES][512]  ALIGN_TO_4K;
   1.162 -static int __init_pgt_index;
   1.163 -
   1.164 -/*
   1.165 - * We start using from start_pfn
   1.166 - */
   1.167  static __init void *alloc_static_page(unsigned long *phys)
   1.168  {
   1.169 -	int i = __init_pgt_index++;
   1.170 -
   1.171 -	if (__init_pgt_index >= MAX_LOW_PAGES) {
   1.172 -		printk("Need to increase MAX_LOW_PAGES");
   1.173 -		BUG();
   1.174 -	}
   1.175 -		
   1.176 -	*phys = __pa(__init_pgt[i]);
   1.177 -
   1.178 -	return (void *) __init_pgt[i];
   1.179 -} 
   1.180 -
   1.181 -/*
   1.182 - * Get RO page
   1.183 - */
   1.184 -static void __init *alloc_low_page(unsigned long *phys)
   1.185 -{ 
   1.186 -        unsigned long pfn = table_end++;
   1.187 -    
   1.188 -        *phys = (pfn << PAGE_SHIFT);
   1.189 -        memset((void *) ((pfn << PAGE_SHIFT) + __START_KERNEL_map), 0, PAGE_SIZE);
   1.190 -        return (void *)((pfn << PAGE_SHIFT) + __START_KERNEL_map);
   1.191 +	unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
   1.192 +	*phys = start_pfn << PAGE_SHIFT;
   1.193 +	start_pfn++;
   1.194 +	memset((void *)va, 0, PAGE_SIZE);
   1.195 +	return (void *)va;
   1.196  } 
   1.197  
   1.198  #define PTE_SIZE PAGE_SIZE
   1.199 @@ -443,27 +420,21 @@ static inline void __set_pte(pte_t *dst,
   1.200  
   1.201  static inline int make_readonly(unsigned long paddr)
   1.202  {
   1.203 -    int readonly = 0;
   1.204 -
   1.205 -    /* Make new page tables read-only. */
   1.206 -    if ((paddr < ((table_start << PAGE_SHIFT) + tables_space)) &&
   1.207 -        (paddr >= (table_start << PAGE_SHIFT)))
   1.208 -        readonly = 1;
   1.209 +	int readonly = 0;
   1.210  
   1.211 -    /* Make old page tables read-only. */
   1.212 -    if ((paddr < ((xen_start_info.pt_base - __START_KERNEL_map) +
   1.213 -                  (xen_start_info.nr_pt_frames << PAGE_SHIFT))) &&
   1.214 -        (paddr >= (xen_start_info.pt_base - __START_KERNEL_map)))
   1.215 -        readonly = 1;
   1.216 +	/* Make old and new page tables read-only. */
   1.217 +	if ((paddr >= (xen_start_info.pt_base - __START_KERNEL_map))
   1.218 +	    && (paddr < ((table_start << PAGE_SHIFT) + tables_space)))
   1.219 +		readonly = 1;
   1.220 +	/*
   1.221 +	 * No need for writable mapping of kernel image. This also ensures that
   1.222 +	 * page and descriptor tables embedded inside don't have writable
   1.223 +	 * mappings. 
   1.224 +	 */
   1.225 +	if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end)))
   1.226 +		readonly = 1;
   1.227  
   1.228 -    /*
   1.229 -     * No need for writable mapping of kernel image. This also ensures that
   1.230 -     * page and descriptor tables embedded inside don't have writable mappings.
   1.231 -     */
   1.232 -    if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end)))
   1.233 -        readonly = 1;
   1.234 -
   1.235 -    return readonly;
   1.236 +	return readonly;
   1.237  }
   1.238  
   1.239  static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
   1.240 @@ -485,7 +456,7 @@ static void __init phys_pud_init(pud_t *
   1.241  			break;
   1.242  		} 
   1.243  
   1.244 -		pmd = alloc_low_page(&pmd_phys);
   1.245 +		pmd = alloc_static_page(&pmd_phys);
   1.246                  make_page_readonly(pmd);
   1.247                  xen_pmd_pin(pmd_phys);
   1.248  		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
   1.249 @@ -499,7 +470,7 @@ static void __init phys_pud_init(pud_t *
   1.250  					set_pmd(pmd,  __pmd(0)); 
   1.251  				break;
   1.252  			}
   1.253 -                        pte = alloc_low_page(&pte_phys);
   1.254 +                        pte = alloc_static_page(&pte_phys);
   1.255                          pte_save = pte;
   1.256                          for (k = 0; k < PTRS_PER_PTE; pte++, k++, paddr += PTE_SIZE) {
   1.257                                  if ((paddr >= end) ||
   1.258 @@ -526,15 +497,16 @@ static void __init phys_pud_init(pud_t *
   1.259  
   1.260  static void __init find_early_table_space(unsigned long end)
   1.261  {
   1.262 -        unsigned long puds, pmds, ptes; 
   1.263 +	unsigned long puds, pmds, ptes; 
   1.264  
   1.265  	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
   1.266  	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
   1.267 -        ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
   1.268 +	ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
   1.269  
   1.270 -        tables_space = round_up(puds * 8, PAGE_SIZE) + 
   1.271 -	    		  round_up(pmds * 8, PAGE_SIZE) + 
   1.272 -	    		  round_up(ptes * 8, PAGE_SIZE); 
   1.273 +	tables_space =
   1.274 +		round_up(puds * 8, PAGE_SIZE) + 
   1.275 +		round_up(pmds * 8, PAGE_SIZE) + 
   1.276 +		round_up(ptes * 8, PAGE_SIZE); 
   1.277  }
   1.278  
   1.279  void __init xen_init_pt(void)
   1.280 @@ -580,66 +552,59 @@ void __init xen_init_pt(void)
   1.281  		mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
   1.282  }
   1.283  
   1.284 -/*
   1.285 - * Extend kernel mapping to access pages for page tables.  The initial
   1.286 - * mapping done by Xen is minimal (e.g. 8MB) and we need to extend the
   1.287 - * mapping for early initialization.
   1.288 - */
   1.289 -static unsigned long current_size, extended_size;
   1.290 -
   1.291  void __init extend_init_mapping(void) 
   1.292  {
   1.293  	unsigned long va = __START_KERNEL_map;
   1.294  	unsigned long phys, addr, *pte_page;
   1.295 -        pmd_t *pmd;
   1.296 +	pmd_t *pmd;
   1.297  	pte_t *pte, new_pte;
   1.298 -	unsigned long *page = (unsigned long *) init_level4_pgt;
   1.299 -	int i;
   1.300 +	unsigned long *page = (unsigned long *)init_level4_pgt;
   1.301  
   1.302  	addr = page[pgd_index(va)];
   1.303  	addr_to_page(addr, page);
   1.304  	addr = page[pud_index(va)];
   1.305  	addr_to_page(addr, page);
   1.306  
   1.307 -	for (;;) {
   1.308 -		pmd = (pmd_t *)&page[pmd_index(va)];
   1.309 -		if (!pmd_present(*pmd))
   1.310 -			break;
   1.311 -		addr = page[pmd_index(va)];
   1.312 -		addr_to_page(addr, pte_page);
   1.313 -		for (i = 0; i < PTRS_PER_PTE; i++) {
   1.314 -			pte = (pte_t *) &pte_page[pte_index(va)];
   1.315 -			if (!pte_present(*pte))
   1.316 -				break;
   1.317 -			va += PAGE_SIZE;
   1.318 -			current_size += PAGE_SIZE;
   1.319 -		}
   1.320 +	/* Kill mapping of low 1MB. */
   1.321 +	while (va < (unsigned long)&_text) {
   1.322 +		HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
   1.323 +		va += PAGE_SIZE;
   1.324  	}
   1.325  
   1.326 -	while (va < __START_KERNEL_map + current_size + tables_space) {
   1.327 -		pmd = (pmd_t *) &page[pmd_index(va)];
   1.328 -		if (!pmd_none(*pmd))
   1.329 -			continue;
   1.330 -		pte_page = (unsigned long *) alloc_static_page(&phys);
   1.331 -		make_page_readonly(pte_page);
   1.332 -		xen_pte_pin(phys);
   1.333 -		set_pmd(pmd, __pmd(phys | _KERNPG_TABLE | _PAGE_USER));
   1.334 -		for (i = 0; i < PTRS_PER_PTE; i++, va += PAGE_SIZE) {
   1.335 +	/* Ensure init mappings cover kernel text/data and initial tables. */
   1.336 +	while (va < (__START_KERNEL_map
   1.337 +		     + (start_pfn << PAGE_SHIFT)
   1.338 +		     + tables_space)) {
   1.339 +		pmd = (pmd_t *)&page[pmd_index(va)];
   1.340 +		if (pmd_none(*pmd)) {
   1.341 +			pte_page = alloc_static_page(&phys);
   1.342 +			make_page_readonly(pte_page);
   1.343 +			xen_pte_pin(phys);
   1.344 +			set_pmd(pmd, __pmd(phys | _KERNPG_TABLE | _PAGE_USER));
   1.345 +		} else {
   1.346 +			addr = page[pmd_index(va)];
   1.347 +			addr_to_page(addr, pte_page);
   1.348 +		}
   1.349 +		pte = (pte_t *)&pte_page[pte_index(va)];
   1.350 +		if (pte_none(*pte)) {
   1.351  			new_pte = pfn_pte(
   1.352  				(va - __START_KERNEL_map) >> PAGE_SHIFT, 
   1.353  				__pgprot(_KERNPG_TABLE | _PAGE_USER));
   1.354 -			pte = (pte_t *)&pte_page[pte_index(va)];
   1.355  			xen_l1_entry_update(pte, new_pte);
   1.356 -			extended_size += PAGE_SIZE;
   1.357  		}
   1.358 +		va += PAGE_SIZE;
   1.359  	}
   1.360  
   1.361 -	/* Kill mapping of low 1MB. */
   1.362 -	for (va = __START_KERNEL_map; va < (unsigned long)&_text; va += PAGE_SIZE)
   1.363 +	/* Finally, blow away any spurious initial mappings. */
   1.364 +	while (1) {
   1.365 +		pmd = (pmd_t *)&page[pmd_index(va)];
   1.366 +		if (pmd_none(*pmd))
   1.367 +			break;
   1.368  		HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
   1.369 +		va += PAGE_SIZE;
   1.370 +	}
   1.371  }
   1.372  
   1.373 -
   1.374  /* Setup the direct mapping of the physical memory at PAGE_OFFSET.
   1.375     This runs before bootmem is initialized and gets pages directly from the 
   1.376     physical memory. To access them they are temporarily mapped. */
   1.377 @@ -651,34 +616,31 @@ void __init init_memory_mapping(unsigned
   1.378  
   1.379  	find_early_table_space(end);
   1.380  	extend_init_mapping();
   1.381 -	start_pfn = current_size >> PAGE_SHIFT;
   1.382  
   1.383  	table_start = start_pfn;
   1.384 -	table_end = table_start;
   1.385  
   1.386  	start = (unsigned long)__va(start);
   1.387  	end = (unsigned long)__va(end);
   1.388  
   1.389  	for (; start < end; start = next) {
   1.390  		unsigned long pud_phys; 
   1.391 -                pud_t *pud = alloc_low_page(&pud_phys);
   1.392 -                make_page_readonly(pud);
   1.393 -                xen_pud_pin(pud_phys);
   1.394 +		pud_t *pud = alloc_static_page(&pud_phys);
   1.395 +		make_page_readonly(pud);
   1.396 +		xen_pud_pin(pud_phys);
   1.397  		next = start + PGDIR_SIZE;
   1.398  		if (next > end) 
   1.399  			next = end; 
   1.400  		phys_pud_init(pud, __pa(start), __pa(next));
   1.401  		set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
   1.402 -	} 
   1.403 +	}
   1.404  
   1.405 -	printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, 
   1.406 -	       table_start<<PAGE_SHIFT, 
   1.407 -	       table_end<<PAGE_SHIFT);
   1.408 +	printk("kernel direct mapping tables upto %lx @ %lx-%lx\n",
   1.409 +	       __pa(end), table_start<<PAGE_SHIFT, start_pfn<<PAGE_SHIFT);
   1.410  
   1.411 -        start_pfn = ((current_size + extended_size) >> PAGE_SHIFT);
   1.412 +	BUG_ON(start_pfn != (table_start + (tables_space >> PAGE_SHIFT)));
   1.413  
   1.414  	__flush_tlb_all();
   1.415 -        init_mapping_done = 1;
   1.416 +	init_mapping_done = 1;
   1.417  }
   1.418  
   1.419  extern struct x8664_pda cpu_pda[NR_CPUS];
   1.420 @@ -1003,3 +965,13 @@ int in_gate_area_no_task(unsigned long a
   1.421  {
   1.422  	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
   1.423  }
   1.424 +
   1.425 +/*
   1.426 + * Local variables:
   1.427 + *  c-file-style: "linux"
   1.428 + *  indent-tabs-mode: t
   1.429 + *  c-indent-level: 8
   1.430 + *  c-basic-offset: 8
   1.431 + *  tab-width: 8
   1.432 + * End:
   1.433 + */