ia64/linux-2.6.18-xen.hg
arch/x86_64/mm/init-xen.c @ 708:e410857fd83c

Remove contiguous_bitmap[] as it's no longer needed.

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>

Author:   Keir Fraser <keir.fraser@citrix.com>
Date:     Wed Oct 22 14:55:29 2008 +0100 (2008-10-22)
Parents:  70fb735a8e28
Children: 9090872bac23

/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 *
 *  Jun Nakajima <jun.nakajima@intel.com>
 *      Modified for Xen.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>

#include <xen/features.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);

#if CONFIG_XEN_COMPAT <= 0x030002
unsigned int __kernel_page_user;
EXPORT_SYMBOL(__kernel_page_user);
#endif

int after_bootmem;

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
extern unsigned long start_pfn;

/*
 * Use this until direct mapping is established, i.e. before __va() is
 * available in init_memory_mapping().
 */

#define addr_to_page(addr, page)                                \
        (addr) &= PHYSICAL_PAGE_MASK;                           \
        (page) = ((unsigned long *) ((unsigned long)            \
        (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) +   \
        __START_KERNEL_map)))

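/*
 * Mark a single page read-only before the direct mapping exists: walk the
 * boot page tables by hand (via addr_to_page, which converts the machine
 * addresses stored in the entries back to pseudo-physical pages) and clear
 * _PAGE_RW on the PTE through a hypercall. Only used before bootmem is up.
 */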
static void __meminit early_make_page_readonly(void *va, unsigned int feature)
{
        unsigned long addr, _va = (unsigned long)va;
        pte_t pte, *ptep;
        unsigned long *page = (unsigned long *) init_level4_pgt;

        BUG_ON(after_bootmem);

        if (xen_feature(feature))
                return;

        addr = (unsigned long) page[pgd_index(_va)];
        addr_to_page(addr, page);

        addr = page[pud_index(_va)];
        addr_to_page(addr, page);

        addr = page[pmd_index(_va)];
        addr_to_page(addr, page);

        ptep = (pte_t *) &page[pte_index(_va)];

        pte.pte = ptep->pte & ~_PAGE_RW;
        if (HYPERVISOR_update_va_mapping(_va, pte, 0))
                BUG();
}

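/*
 * Once the kernel page tables are live, the RW bit is toggled through the
 * normal pgd/pud/pmd/pte walk. For vmalloc addresses the directly-mapped
 * alias of the underlying page is updated as well, so both mappings agree.
 */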
static void __make_page_readonly(void *va)
{
        pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
        unsigned long addr = (unsigned long) va;

        pgd = pgd_offset_k(addr);
        pud = pud_offset(pgd, addr);
        pmd = pmd_offset(pud, addr);
        ptep = pte_offset_kernel(pmd, addr);

        pte.pte = ptep->pte & ~_PAGE_RW;
        if (HYPERVISOR_update_va_mapping(addr, pte, 0))
                xen_l1_entry_update(ptep, pte); /* fallback */

        if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
                __make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT));
}

static void __make_page_writable(void *va)
{
        pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
        unsigned long addr = (unsigned long) va;

        pgd = pgd_offset_k(addr);
        pud = pud_offset(pgd, addr);
        pmd = pmd_offset(pud, addr);
        ptep = pte_offset_kernel(pmd, addr);

        pte.pte = ptep->pte | _PAGE_RW;
        if (HYPERVISOR_update_va_mapping(addr, pte, 0))
                xen_l1_entry_update(ptep, pte); /* fallback */

        if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
                __make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT));
}

void make_page_readonly(void *va, unsigned int feature)
{
        if (!xen_feature(feature))
                __make_page_readonly(va);
}

void make_page_writable(void *va, unsigned int feature)
{
        if (!xen_feature(feature))
                __make_page_writable(va);
}

void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
{
        if (xen_feature(feature))
                return;

        while (nr-- != 0) {
                __make_page_readonly(va);
                va = (void *)((unsigned long)va + PAGE_SIZE);
        }
}

void make_pages_writable(void *va, unsigned nr, unsigned int feature)
{
        if (xen_feature(feature))
                return;

        while (nr-- != 0) {
                __make_page_writable(va);
                va = (void *)((unsigned long)va + PAGE_SIZE);
        }
}

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
 * physical space, so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

void show_mem(void)
{
        long i, total = 0, reserved = 0;
        long shared = 0, cached = 0;
        pg_data_t *pgdat;
        struct page *page;

        printk(KERN_INFO "Mem-info:\n");
        show_free_areas();
        printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

        for_each_online_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        page = pfn_to_page(pgdat->node_start_pfn + i);
                        total++;
                        if (PageReserved(page))
                                reserved++;
                        else if (PageSwapCache(page))
                                cached++;
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
        }
        printk(KERN_INFO "%ld pages of RAM\n", total);
        printk(KERN_INFO "%ld reserved pages\n", reserved);
        printk(KERN_INFO "%ld pages shared\n", shared);
        printk(KERN_INFO "%ld pages swap cached\n", cached);
}

static __init void *spp_getpage(void)
{
        void *ptr;

        if (after_bootmem)
                ptr = (void *) get_zeroed_page(GFP_ATOMIC);
        else if (start_pfn < table_end) {
                ptr = __va(start_pfn << PAGE_SHIFT);
                start_pfn++;
                memset(ptr, 0, PAGE_SIZE);
        } else
                ptr = alloc_bootmem_pages(PAGE_SIZE);

        if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
                panic("set_pte_phys: cannot allocate page data %s\n",
                      after_bootmem ? "after bootmem" : "");

        Dprintk("spp_getpage %p\n", ptr);
        return ptr;
}

#define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
#define pud_offset_u(address) (level3_user_pgt + pud_index(address))

static __init void set_pte_phys(unsigned long vaddr,
                                unsigned long phys, pgprot_t prot, int user_mode)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

        pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
        if (pud_none(*pud)) {
                pmd = (pmd_t *) spp_getpage();
                make_page_readonly(pmd, XENFEAT_writable_page_tables);
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                make_page_readonly(pte, XENFEAT_writable_page_tables);
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }
        if (pgprot_val(prot))
                new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
        else
                new_pte = __pte(0);

        pte = pte_offset_kernel(pmd, vaddr);
        if (!pte_none(*pte) && __pte_val(new_pte) &&
            __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
                pte_ERROR(*pte);
        set_pte(pte, new_pte);

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

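/*
 * Same as set_pte_phys(), but 'phys' is a machine address rather than a
 * pseudo-physical one, so the PTE is built with pfn_pte_ma(). Used by
 * __set_fixmap() for everything except the vsyscall pages.
 */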
static __init void set_pte_phys_ma(unsigned long vaddr,
                                   unsigned long phys, pgprot_t prot)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys_ma %lx to %lx\n", vaddr, phys);

        pgd = pgd_offset_k(vaddr);
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                pmd = (pmd_t *) spp_getpage();
                make_page_readonly(pmd, XENFEAT_writable_page_tables);
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                make_page_readonly(pte, XENFEAT_writable_page_tables);
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }
        new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);

        pte = pte_offset_kernel(pmd, vaddr);
        if (!pte_none(*pte) && __pte_val(new_pte) &&
#ifdef CONFIG_ACPI
            /* __acpi_map_table() fails to properly call clear_fixmap() */
            (vaddr < __fix_to_virt(FIX_ACPI_END) ||
             vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
#endif
            __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
                pte_ERROR(*pte);
        set_pte(pte, new_pte);

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

/* NOTE: this is meant to be run only at boot */
void __init
__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }
        switch (idx) {
        case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
                set_pte_phys(address, phys, prot, 0);
                set_pte_phys(address, phys, prot, 1);
                break;
        default:
                set_pte_phys_ma(address, phys, prot);
                break;
        }
}

unsigned long __initdata table_start, table_end;

static __meminit void *alloc_static_page(unsigned long *phys)
{
        unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;

        if (after_bootmem) {
                void *adr = (void *)get_zeroed_page(GFP_ATOMIC);

                *phys = __pa(adr);
                return adr;
        }

        *phys = start_pfn << PAGE_SHIFT;
        start_pfn++;
        memset((void *)va, 0, PAGE_SIZE);
        return (void *)va;
}

#define PTE_SIZE PAGE_SIZE

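/*
 * Decide whether a physical address must be mapped read-only in the direct
 * mapping: new and old page-table frames (unless the hypervisor provides
 * writable page tables) and the kernel image itself, with the vsyscall page
 * excluded so that alternative-instruction patching keeps working.
 */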
static inline int make_readonly(unsigned long paddr)
{
        extern char __vsyscall_0;
        int readonly = 0;

        /* Make new page tables read-only. */
        if (!xen_feature(XENFEAT_writable_page_tables)
            && (paddr >= (table_start << PAGE_SHIFT))
            && (paddr < (table_end << PAGE_SHIFT)))
                readonly = 1;
        /* Make old page tables read-only. */
        if (!xen_feature(XENFEAT_writable_page_tables)
            && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
            && (paddr < (start_pfn << PAGE_SHIFT)))
                readonly = 1;

        /*
         * No need for writable mapping of kernel image. This also ensures that
         * page and descriptor tables embedded inside don't have writable
         * mappings. Exclude the vsyscall area here, allowing alternative
         * instruction patching to work.
         */
        if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end))
            && !(paddr >= __pa_symbol(&__vsyscall_0)
                 && paddr < __pa_symbol(&__vsyscall_0) + PAGE_SIZE))
                readonly = 1;

        return readonly;
}

#ifndef CONFIG_XEN
/* Must run before zap_low_mappings */
__init void *early_ioremap(unsigned long addr, unsigned long size)
{
        unsigned long map = round_down(addr, LARGE_PAGE_SIZE);

        /* actually usually some more */
        if (size >= LARGE_PAGE_SIZE) {
                printk("SMBIOS area too long %lu\n", size);
                return NULL;
        }

        set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
        map += LARGE_PAGE_SIZE;
        set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
        __flush_tlb();
        return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
}

/* To avoid virtual aliases later */
__init void early_iounmap(void *addr, unsigned long size)
{
        if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
                printk("early_iounmap: bad address %p\n", addr);
        set_pmd(temp_mappings[0].pmd, __pmd(0));
        set_pmd(temp_mappings[1].pmd, __pmd(0));
        __flush_tlb();
}
#endif

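/*
 * Builders for the direct mapping of physical memory. phys_pmd_init() fills
 * one PMD page with 4k PTE pages (no 2M pages under Xen), marking page-table
 * frames and the kernel image read-only via make_readonly(); phys_pud_init()
 * walks the PUD entries and allocates the PMD pages beneath them.
 */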
static void __meminit
phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
{
        int i, k;

        for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
                unsigned long pte_phys;
                pte_t *pte, *pte_save;

                if (address >= end)
                        break;
                pte = alloc_static_page(&pte_phys);
                pte_save = pte;
                for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
                        unsigned long pteval = address | _PAGE_NX | _KERNPG_TABLE;

                        if (address >= (after_bootmem
                                        ? end
                                        : xen_start_info->nr_pages << PAGE_SHIFT))
                                pteval = 0;
                        else if (make_readonly(address))
                                pteval &= ~_PAGE_RW;
                        set_pte(pte, __pte(pteval & __supported_pte_mask));
                }
                if (!after_bootmem) {
                        early_make_page_readonly(pte_save, XENFEAT_writable_page_tables);
                        *pmd = __pmd(pte_phys | _KERNPG_TABLE);
                } else {
                        make_page_readonly(pte_save, XENFEAT_writable_page_tables);
                        set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
                }
        }
}

static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
        pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));

        if (pmd_none(*pmd)) {
                spin_lock(&init_mm.page_table_lock);
                phys_pmd_init(pmd, address, end);
                spin_unlock(&init_mm.page_table_lock);
                __flush_tlb_all();
        }
}

static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
{
        long i = pud_index(address);

        pud = pud + i;

        if (after_bootmem && pud_val(*pud)) {
                phys_pmd_update(pud, address, end);
                return;
        }

        for (; i < PTRS_PER_PUD; pud++, i++) {
                unsigned long paddr, pmd_phys;
                pmd_t *pmd;

                paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
                if (paddr >= end)
                        break;

                pmd = alloc_static_page(&pmd_phys);

                spin_lock(&init_mm.page_table_lock);
                *pud = __pud(pmd_phys | _KERNPG_TABLE);
                phys_pmd_init(pmd, paddr, end);
                spin_unlock(&init_mm.page_table_lock);

                early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
        }
        __flush_tlb();
}

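/*
 * Build the kernel's own top-level page tables from the initial tables the
 * domain builder handed us: re-use its PMD page for __START_KERNEL_map,
 * detect whether _PAGE_USER must be set explicitly on old hypervisors, mark
 * every new page-table page read-only, and pin the kernel and user PGDs.
 */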
void __init xen_init_pt(void)
{
        unsigned long addr, *page;

        /* Find the initial pte page that was built for us. */
        page = (unsigned long *)xen_start_info->pt_base;
        addr = page[pgd_index(__START_KERNEL_map)];
        addr_to_page(addr, page);
        addr = page[pud_index(__START_KERNEL_map)];
        addr_to_page(addr, page);

#if CONFIG_XEN_COMPAT <= 0x030002
        /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER
           in kernel PTEs. We check that here. */
        if (HYPERVISOR_xen_version(XENVER_version, NULL) <= 0x30000) {
                unsigned long *pg;
                pte_t pte;

                /* Mess with the initial mapping of page 0. It's not needed. */
                BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map);
                addr = page[pmd_index(__START_KERNEL_map)];
                addr_to_page(addr, pg);
                pte.pte = pg[pte_index(__START_KERNEL_map)];
                BUG_ON(!(pte.pte & _PAGE_PRESENT));

                /* If _PAGE_USER isn't set, we obviously do not need it. */
                if (pte.pte & _PAGE_USER) {
                        /* _PAGE_USER is needed, but is it set implicitly? */
                        pte.pte &= ~_PAGE_USER;
                        if ((HYPERVISOR_update_va_mapping(__START_KERNEL_map,
                                                          pte, 0) != 0) ||
                            !(pg[pte_index(__START_KERNEL_map)] & _PAGE_USER))
                                /* We need to explicitly specify _PAGE_USER. */
                                __kernel_page_user = _PAGE_USER;
                }
        }
#endif

        /* Construct mapping of initial pte page in our own directories. */
        init_level4_pgt[pgd_index(__START_KERNEL_map)] =
                __pgd(__pa_symbol(level3_kernel_pgt) | _PAGE_TABLE);
        level3_kernel_pgt[pud_index(__START_KERNEL_map)] =
                __pud(__pa_symbol(level2_kernel_pgt) | _PAGE_TABLE);
        memcpy(level2_kernel_pgt, page, PAGE_SIZE);

        __user_pgd(init_level4_pgt)[pgd_index(VSYSCALL_START)] =
                __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);

        early_make_page_readonly(init_level4_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(__user_pgd(init_level4_pgt),
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(level3_kernel_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(level3_user_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(level2_kernel_pgt,
                                 XENFEAT_writable_page_tables);

        if (!xen_feature(XENFEAT_writable_page_tables)) {
                xen_pgd_pin(__pa_symbol(init_level4_pgt));
                xen_pgd_pin(__pa_symbol(__user_pgd(init_level4_pgt)));
        }
}

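/*
 * Extend the initial kernel mapping so that it covers the kernel image plus
 * the page-table frames about to be allocated, unmap the low 1MB, and
 * finally tear down any leftover mappings beyond that range.
 */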
static void __init extend_init_mapping(unsigned long tables_space)
{
        unsigned long va = __START_KERNEL_map;
        unsigned long phys, addr, *pte_page;
        pmd_t *pmd;
        pte_t *pte, new_pte;
        unsigned long *page = (unsigned long *)init_level4_pgt;

        addr = page[pgd_index(va)];
        addr_to_page(addr, page);
        addr = page[pud_index(va)];
        addr_to_page(addr, page);

        /* Kill mapping of low 1MB. */
        while (va < (unsigned long)&_text) {
                if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
                        BUG();
                va += PAGE_SIZE;
        }

        /* Ensure init mappings cover kernel text/data and initial tables. */
        while (va < (__START_KERNEL_map
                     + (start_pfn << PAGE_SHIFT)
                     + tables_space)) {
                pmd = (pmd_t *)&page[pmd_index(va)];
                if (pmd_none(*pmd)) {
                        pte_page = alloc_static_page(&phys);
                        early_make_page_readonly(
                                pte_page, XENFEAT_writable_page_tables);
                        set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
                } else {
                        addr = page[pmd_index(va)];
                        addr_to_page(addr, pte_page);
                }
                pte = (pte_t *)&pte_page[pte_index(va)];
                if (pte_none(*pte)) {
                        new_pte = pfn_pte(
                                (va - __START_KERNEL_map) >> PAGE_SHIFT,
                                __pgprot(_KERNPG_TABLE));
                        xen_l1_entry_update(pte, new_pte);
                }
                va += PAGE_SIZE;
        }

        /* Finally, blow away any spurious initial mappings. */
        while (1) {
                pmd = (pmd_t *)&page[pmd_index(va)];
                if (pmd_none(*pmd))
                        break;
                if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
                        BUG();
                va += PAGE_SIZE;
        }
}

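/*
 * Work out how much room the direct-mapping page tables will need (one
 * 8-byte entry per PUD/PMD/PTE, each level rounded up to whole pages),
 * extend the init mapping to cover it, and reserve that window at
 * [table_start, table_end).
 */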
static void __init find_early_table_space(unsigned long end)
{
        unsigned long puds, pmds, ptes, tables;

        puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
        ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;

        tables = round_up(puds * 8, PAGE_SIZE) +
                 round_up(pmds * 8, PAGE_SIZE) +
                 round_up(ptes * 8, PAGE_SIZE);

        extend_init_mapping(tables);

        table_start = start_pfn;
        table_end = table_start + (tables >> PAGE_SHIFT);

        early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
                     end, table_start << PAGE_SHIFT,
                     (table_start << PAGE_SHIFT) + tables);
}

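/*
 * Final fix-ups once the permanent direct mapping exists: re-vector the
 * start-of-day pointers (start_info, mfn_list, module start) to their new
 * virtual addresses, drop the temporary Xen-built mappings above the kernel,
 * pre-allocate PTEs for the boot-time fixmaps, switch to the real
 * shared_info page, and map (or stub out) the ISA range.
 */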
static void xen_finish_init_mapping(void)
{
        unsigned long i, start, end;

        /* Re-vector virtual addresses pointing into the initial
           mapping to the just-established permanent ones. */
        xen_start_info = __va(__pa(xen_start_info));
        xen_start_info->pt_base = (unsigned long)
                __va(__pa(xen_start_info->pt_base));
        if (!xen_feature(XENFEAT_auto_translated_physmap)) {
                phys_to_machine_mapping =
                        __va(__pa(xen_start_info->mfn_list));
                xen_start_info->mfn_list = (unsigned long)
                        phys_to_machine_mapping;
        }
        if (xen_start_info->mod_start)
                xen_start_info->mod_start = (unsigned long)
                        __va(__pa(xen_start_info->mod_start));

        /* Destroy the Xen-created mappings beyond the kernel image as
         * well as the temporary mappings created above. Prevents
         * overlap with modules area (if init mapping is very big).
         */
        start = PAGE_ALIGN((unsigned long)_end);
        end = __START_KERNEL_map + (table_end << PAGE_SHIFT);
        for (; start < end; start += PAGE_SIZE)
                if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
                        BUG();

        /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
        table_end = ~0UL;

        /*
         * Prefetch pte's for the bt_ioremap() area. It gets used before the
         * boot-time allocator is online, so allocate-on-demand would fail.
         */
        for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++)
                __set_fixmap(i, 0, __pgprot(0));

        /* Switch to the real shared_info page, and clear the dummy page. */
        set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
        HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
        memset(empty_zero_page, 0, sizeof(empty_zero_page));

        /* Set up mapping of lowest 1MB of physical memory. */
        for (i = 0; i < NR_FIX_ISAMAPS; i++)
                if (is_initial_xendomain())
                        set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
                else
                        __set_fixmap(FIX_ISAMAP_BEGIN - i,
                                     virt_to_mfn(empty_zero_page)
                                     << PAGE_SHIFT,
                                     PAGE_KERNEL_RO);

        /* Disable the 'start_pfn' allocator. */
        table_end = start_pfn;
}

/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from the
   physical memory. To access them they are temporarily mapped. */
void __meminit init_memory_mapping(unsigned long start, unsigned long end)
{
        unsigned long next;

        Dprintk("init_memory_mapping\n");

        /*
         * Find space for the kernel direct mapping tables.
         * Later we should allocate these tables in the local node of the memory
         * mapped. Unfortunately this is done currently before the nodes are
         * discovered.
         */
        if (!after_bootmem)
                find_early_table_space(end);

        start = (unsigned long)__va(start);
        end = (unsigned long)__va(end);

        for (; start < end; start = next) {
                unsigned long pud_phys;
                pgd_t *pgd = pgd_offset_k(start);
                pud_t *pud;

                if (after_bootmem)
                        pud = pud_offset(pgd, start & PGDIR_MASK);
                else
                        pud = alloc_static_page(&pud_phys);
                next = start + PGDIR_SIZE;
                if (next > end)
                        next = end;
                phys_pud_init(pud, __pa(start), __pa(next));
                if (!after_bootmem) {
                        early_make_page_readonly(pud, XENFEAT_writable_page_tables);
                        set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
                }
        }

        if (!after_bootmem) {
                BUG_ON(start_pfn != table_end);
                xen_finish_init_mapping();
        }

        __flush_tlb_all();
}

void __cpuinit zap_low_mappings(int cpu)
{
        /* this is not required for Xen */
#if 0
        swap_low_mappings();
#endif
}

/* Compute zone sizes for the DMA and DMA32 zones in a node. */
__init void
size_zones(unsigned long *z, unsigned long *h,
           unsigned long start_pfn, unsigned long end_pfn)
{
        int i;
        unsigned long w;

        for (i = 0; i < MAX_NR_ZONES; i++)
                z[i] = 0;

        if (start_pfn < MAX_DMA_PFN)
                z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
        if (start_pfn < MAX_DMA32_PFN) {
                unsigned long dma32_pfn = MAX_DMA32_PFN;
                if (dma32_pfn > end_pfn)
                        dma32_pfn = end_pfn;
                z[ZONE_DMA32] = dma32_pfn - start_pfn;
        }
        z[ZONE_NORMAL] = end_pfn - start_pfn;

        /* Remove lower zones from higher ones. */
        w = 0;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                if (z[i])
                        z[i] -= w;
                w += z[i];
        }

        /* Compute holes */
        w = start_pfn;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                unsigned long s = w;
                w += z[i];
                h[i] = e820_hole_size(s, w);
        }

        /* Add the space needed for mem_map to the holes too. */
        for (i = 0; i < MAX_NR_ZONES; i++)
                h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;

        /* The 16MB DMA zone has the kernel and other misc mappings.
           Account them too */
        if (h[ZONE_DMA]) {
                h[ZONE_DMA] += dma_reserve;
                if (h[ZONE_DMA] >= z[ZONE_DMA]) {
                        printk(KERN_WARNING
                               "Kernel too large and filling up ZONE_DMA?\n");
                        h[ZONE_DMA] = z[ZONE_DMA];
                }
        }
}

#ifndef CONFIG_NUMA
void __init paging_init(void)
{
        unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];

        memory_present(0, 0, end_pfn);
        sparse_init();
        size_zones(zones, holes, 0, end_pfn);
        free_area_init_node(0, NODE_DATA(0), zones,
                            __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);

        init_mm.context.pinned = 1;
}
#endif

/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
        unsigned long end = address + size;

        BUG_ON(address & ~LARGE_PAGE_MASK);
        BUG_ON(size & ~LARGE_PAGE_MASK);

        for (; address < end; address += LARGE_PAGE_SIZE) {
                pgd_t *pgd = pgd_offset_k(address);
                pud_t *pud;
                pmd_t *pmd;
                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, address);
                if (pud_none(*pud))
                        continue;
                pmd = pmd_offset(pud, address);
                if (!pmd || pmd_none(*pmd))
                        continue;
                if (0 == (__pmd_val(*pmd) & _PAGE_PSE)) {
                        /* Could handle this, but it should not happen currently. */
                        printk(KERN_ERR
                               "clear_kernel_mapping: mapping has been split. will leak memory\n");
                        pmd_ERROR(*pmd);
                }
                set_pmd(pmd, __pmd(0));
        }
        __flush_tlb_all();
}

/*
 * Memory hotplug specific functions
 */
void online_page(struct page *page)
{
        ClearPageReserved(page);
        init_page_count(page);
        __free_page(page);
        totalram_pages++;
        num_physpages++;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * XXX: memory_add_physaddr_to_nid() is meant to find the node id for a
 * physical address handed in via the sysfs probe interface. When ACPI
 * notifies a hot-add event the node id could be found by searching the DSDT,
 * but the probe interface carries no node id, so return node 0 for now.
 */
#ifdef CONFIG_NUMA
int memory_add_physaddr_to_nid(u64 start)
{
        return 0;
}
#endif

/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
        struct pglist_data *pgdat = NODE_DATA(nid);
        struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
        int ret;

        ret = __add_pages(zone, start_pfn, nr_pages);
        if (ret)
                goto error;

        init_memory_mapping(start, (start + size - 1));

        return ret;
error:
        printk("%s: Problem encountered in __add_pages!\n", __func__);
        return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

int remove_memory(u64 start, u64 size)
{
        return -EINVAL;
}
EXPORT_SYMBOL_GPL(remove_memory);

#else /* CONFIG_MEMORY_HOTPLUG */
/*
 * Memory hot-add without sparsemem. The mem_maps have been allocated in
 * advance, so just online the pages.
 */
int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
{
        int err = -EIO;
        unsigned long pfn;
        unsigned long total = 0, mem = 0;

        for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
                if (pfn_valid(pfn)) {
                        online_page(pfn_to_page(pfn));
                        err = 0;
                        mem++;
                }
                total++;
        }
        if (!err) {
                z->spanned_pages += total;
                z->present_pages += mem;
                z->zone_pgdat->node_spanned_pages += total;
                z->zone_pgdat->node_present_pages += mem;
        }
        return err;
}
#endif /* CONFIG_MEMORY_HOTPLUG */

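/*
 * mem_init(): hand all bootmem pages to the buddy allocator and, as a
 * Xen-specific step, also initialise and count the pages above the domain's
 * initial allocation (xen_start_info->nr_pages) up to max_pfn, then print
 * the usual memory statistics and register the /proc/kcore areas declared
 * below.
 */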
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
                         kcore_vsyscall;

void __init mem_init(void)
{
        long codesize, reservedpages, datasize, initsize;
        unsigned long pfn;

        pci_iommu_alloc();

        /* How many end-of-memory variables you have, grandma! */
        max_low_pfn = end_pfn;
        max_pfn = end_pfn;
        num_physpages = end_pfn;
        high_memory = (void *) __va(end_pfn * PAGE_SIZE);

        /* clear the zero-page */
        memset(empty_zero_page, 0, PAGE_SIZE);

        reservedpages = 0;

        /* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
        totalram_pages = numa_free_all_bootmem();
#else
        totalram_pages = free_all_bootmem();
#endif
        /* XEN: init and count pages outside initial allocation. */
        for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
                ClearPageReserved(pfn_to_page(pfn));
                init_page_count(pfn_to_page(pfn));
                totalram_pages++;
        }
        reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);

        after_bootmem = 1;

        codesize = (unsigned long) &_etext - (unsigned long) &_text;
        datasize = (unsigned long) &_edata - (unsigned long) &_etext;
        initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

        /* Register memory areas for /proc/kcore */
        kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
        kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
                   VMALLOC_END-VMALLOC_START);
        kclist_add(&kcore_kernel, &_stext, _end - _stext);
        kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
        kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
                   VSYSCALL_END - VSYSCALL_START);

        printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
               (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
               end_pfn << (PAGE_SHIFT-10),
               codesize >> 10,
               reservedpages << (PAGE_SHIFT-10),
               datasize >> 10,
               initsize >> 10);

#ifndef CONFIG_XEN
#ifdef CONFIG_SMP
        /*
         * Sync boot_level4_pgt mappings with the init_level4_pgt
         * except for the low identity mappings which are already zapped
         * in init_level4_pgt. This sync-up is essential for AP's bringup
         */
        memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
#endif
#endif
}

void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
        unsigned long addr;

        if (begin >= end)
                return;

        printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
        for (addr = begin; addr < end; addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                init_page_count(virt_to_page(addr));
                memset((void *)(addr & ~(PAGE_SIZE-1)),
                       POISON_FREE_INITMEM, PAGE_SIZE);
                if (addr >= __START_KERNEL_map) {
                        /* make_readonly() reports all kernel addresses. */
                        __make_page_writable(__va(__pa(addr)));
                        if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
                                pgd_t *pgd = pgd_offset_k(addr);
                                pud_t *pud = pud_offset(pgd, addr);
                                pmd_t *pmd = pmd_offset(pud, addr);
                                pte_t *pte = pte_offset_kernel(pmd, addr);

                                xen_l1_entry_update(pte, __pte(0)); /* fallback */
                        }
                }
                free_page(addr);
                totalram_pages++;
        }
}

void free_initmem(void)
{
        memset(__initdata_begin, POISON_FREE_INITDATA,
               __initdata_end - __initdata_begin);
        free_init_pages("unused kernel memory",
                        (unsigned long)(&__init_begin),
                        (unsigned long)(&__init_end));
}

#ifdef CONFIG_DEBUG_RODATA

void mark_rodata_ro(void)
{
        unsigned long addr = (unsigned long)__start_rodata;

        for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
                change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);

        printk("Write protecting the kernel read-only data: %luk\n",
               (__end_rodata - __start_rodata) >> 10);

        /*
         * change_page_attr_addr() requires a global_flush_tlb() call after it.
         * We do this after the printk so that if something went wrong in the
         * change, the printk gets out at least to give a better debug hint
         * of who is the culprit.
         */
        global_flush_tlb();
}
#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
        free_init_pages("initrd memory", start, end);
}
#endif

void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
        /* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
        int nid = phys_to_nid(phys);
        reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
        reserve_bootmem(phys, len);
#endif
        if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
                dma_reserve += len / PAGE_SIZE;
}

int kern_addr_valid(unsigned long addr)
{
        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (above != 0 && above != -1UL)
                return 0;

        pgd = pgd_offset_k(addr);
        if (pgd_none(*pgd))
                return 0;

        pud = pud_offset(pgd, addr);
        if (pud_none(*pud))
                return 0;

        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return 0;
        if (pmd_large(*pmd))
                return pfn_valid(pmd_pfn(*pmd));

        pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte))
                return 0;
        return pfn_valid(pte_pfn(*pte));
}

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>

extern int exception_trace, page_fault_trace;

static ctl_table debug_table2[] = {
        { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
          proc_dointvec },
        { 0, }
};

static ctl_table debug_root_table2[] = {
        { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
          .child = debug_table2 },
        { 0 },
};

static __init int x8664_sysctl_init(void)
{
        register_sysctl_table(debug_root_table2, 1);
        return 0;
}
__initcall(x8664_sysctl_init);
#endif

/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
        .vm_start = VSYSCALL_START,
        .vm_end = VSYSCALL_END,
        .vm_page_prot = PAGE_READONLY
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
        if (test_tsk_thread_flag(tsk, TIF_IA32))
                return NULL;
#endif
        return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
        struct vm_area_struct *vma = get_gate_vma(task);

        if (!vma)
                return 0;
        return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
        return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}