ia64/xen-unstable: linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c @ 13978:477a3bde3b61

linux: miscellaneous changes after upgrade through 2.6.17.
Signed-off-by: Jan Beulich <jbeulich@novell.com>
author kfraser@localhost.localdomain
date Thu Feb 15 14:12:32 2007 +0000 (2007-02-15)
parents 3464bb656a9c
children 7c739272bac8
/*
 * linux/arch/x86_64/mm/init.c
 *
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 *
 * Jun Nakajima <jun.nakajima@intel.com>
 *	Modified for Xen.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>

#include <xen/features.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);

#ifdef CONFIG_XEN_COMPAT_030002
unsigned int __kernel_page_user;
EXPORT_SYMBOL(__kernel_page_user);
#endif

extern unsigned long *contiguous_bitmap;

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
extern unsigned long start_pfn;

/*
 * Use this until direct mapping is established, i.e. before __va() is
 * available in init_memory_mapping().
 */
#define addr_to_page(addr, page)				\
        (addr) &= PHYSICAL_PAGE_MASK;				\
        (page) = ((unsigned long *) ((unsigned long)		\
        (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) +	\
        __START_KERNEL_map)))

static void early_make_page_readonly(void *va, unsigned int feature)
{
        unsigned long addr, _va = (unsigned long)va;
        pte_t pte, *ptep;
        unsigned long *page = (unsigned long *) init_level4_pgt;

        if (xen_feature(feature))
                return;

        addr = (unsigned long) page[pgd_index(_va)];
        addr_to_page(addr, page);

        addr = page[pud_index(_va)];
        addr_to_page(addr, page);

        addr = page[pmd_index(_va)];
        addr_to_page(addr, page);

        ptep = (pte_t *) &page[pte_index(_va)];

        pte.pte = ptep->pte & ~_PAGE_RW;
        if (HYPERVISOR_update_va_mapping(_va, pte, 0))
                BUG();
}

void make_page_readonly(void *va, unsigned int feature)
{
        pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
        unsigned long addr = (unsigned long) va;

        if (xen_feature(feature))
                return;

        pgd = pgd_offset_k(addr);
        pud = pud_offset(pgd, addr);
        pmd = pmd_offset(pud, addr);
        ptep = pte_offset_kernel(pmd, addr);

        pte.pte = ptep->pte & ~_PAGE_RW;
        if (HYPERVISOR_update_va_mapping(addr, pte, 0))
                xen_l1_entry_update(ptep, pte); /* fallback */

        if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
                make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
}

void make_page_writable(void *va, unsigned int feature)
{
        pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
        unsigned long addr = (unsigned long) va;

        if (xen_feature(feature))
                return;

        pgd = pgd_offset_k(addr);
        pud = pud_offset(pgd, addr);
        pmd = pmd_offset(pud, addr);
        ptep = pte_offset_kernel(pmd, addr);

        pte.pte = ptep->pte | _PAGE_RW;
        if (HYPERVISOR_update_va_mapping(addr, pte, 0))
                xen_l1_entry_update(ptep, pte); /* fallback */

        if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
                make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
}

void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
{
        if (xen_feature(feature))
                return;

        while (nr-- != 0) {
                make_page_readonly(va, feature);
                va = (void *)((unsigned long)va + PAGE_SIZE);
        }
}

void make_pages_writable(void *va, unsigned nr, unsigned int feature)
{
        if (xen_feature(feature))
                return;

        while (nr-- != 0) {
                make_page_writable(va, feature);
                va = (void *)((unsigned long)va + PAGE_SIZE);
        }
}
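
/*
 * Illustrative sketch, not part of the upstream file: the recurring pattern
 * in this file is to allocate a page that will serve as a page table, mark
 * it read-only before hooking it into the paging hierarchy (Xen refuses to
 * use writably-mapped page-table pages unless XENFEAT_writable_page_tables
 * is set), and make it writable again only after it has been unhooked.
 * The function name below is hypothetical.
 */
#if 0
static void example_install_pte_page(pmd_t *pmd)
{
        /* Allocate a zeroed page to hold 512 PTEs. */
        pte_t *pte = (pte_t *) get_zeroed_page(GFP_ATOMIC);

        if (!pte)
                return;
        /* Drop _PAGE_RW from our mapping of it before Xen sees it in use. */
        make_page_readonly(pte, XENFEAT_writable_page_tables);
        /* Only now hook it into the hierarchy. */
        set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
}
#endif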
/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
 * physical space, so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */
void show_mem(void)
{
        long i, total = 0, reserved = 0;
        long shared = 0, cached = 0;
        pg_data_t *pgdat;
        struct page *page;

        printk(KERN_INFO "Mem-info:\n");
        show_free_areas();
        printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

        for_each_online_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        page = pfn_to_page(pgdat->node_start_pfn + i);
                        total++;
                        if (PageReserved(page))
                                reserved++;
                        else if (PageSwapCache(page))
                                cached++;
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
        }
        printk(KERN_INFO "%lu pages of RAM\n", total);
        printk(KERN_INFO "%lu reserved pages\n", reserved);
        printk(KERN_INFO "%lu pages shared\n", shared);
        printk(KERN_INFO "%lu pages swap cached\n", cached);
}

int after_bootmem;

static __init void *spp_getpage(void)
{
        void *ptr;

        if (after_bootmem)
                ptr = (void *) get_zeroed_page(GFP_ATOMIC);
        else
                ptr = alloc_bootmem_pages(PAGE_SIZE);

        if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
                panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");

        Dprintk("spp_getpage %p\n", ptr);
        return ptr;
}

#define pgd_offset_u(address) (pgd_t *)(init_level4_user_pgt + pgd_index(address))

static inline pud_t *pud_offset_u(unsigned long address)
{
        pud_t *pud = level3_user_pgt;

        return pud + pud_index(address);
}

static __init void set_pte_phys(unsigned long vaddr,
                                unsigned long phys, pgprot_t prot, int user_mode)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

        pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
        if (pud_none(*pud)) {
                pmd = (pmd_t *) spp_getpage();
                make_page_readonly(pmd, XENFEAT_writable_page_tables);
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                make_page_readonly(pte, XENFEAT_writable_page_tables);
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }
        if (pgprot_val(prot))
                new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
        else
                new_pte = __pte(0);

        pte = pte_offset_kernel(pmd, vaddr);
        if (!pte_none(*pte) &&
            pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
                pte_ERROR(*pte);
        set_pte(pte, new_pte);

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

static __init void set_pte_phys_ma(unsigned long vaddr,
                                   unsigned long phys, pgprot_t prot)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

        pgd = pgd_offset_k(vaddr);
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                pmd = (pmd_t *) spp_getpage();
                make_page_readonly(pmd, XENFEAT_writable_page_tables);

                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));

                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);

        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                make_page_readonly(pte, XENFEAT_writable_page_tables);

                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }
        new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);

        pte = pte_offset_kernel(pmd, vaddr);

        /*
         * Note that the pte page is already RO, thus we want to use
         * xen_l1_entry_update(), not set_pte().
         */
        xen_l1_entry_update(pte,
                            pfn_pte_ma(phys >> PAGE_SHIFT, prot));

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

#define SET_FIXMAP_KERNEL 0
#define SET_FIXMAP_USER 1

/* NOTE: this is meant to be run only at boot */
void __init
__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }
        switch (idx) {
        case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
                set_pte_phys(address, phys, prot, SET_FIXMAP_KERNEL);
                break;
        default:
                set_pte_phys_ma(address, phys, prot);
                break;
        }
}
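
/*
 * Illustrative sketch, not part of the upstream file: for the vsyscall
 * fixmap slots __set_fixmap() receives a pseudo-physical address and goes
 * through set_pte_phys(); every other slot is treated as a machine address
 * and goes through set_pte_phys_ma().  The shared-info mapping established
 * later in paging_init() is an example of the machine-address path:
 */
#if 0
        /* xen_start_info->shared_info is a machine address supplied by Xen. */
        set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
#endif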
/*
 * This only supports vsyscall area.
 */
void __init
__set_fixmap_user (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }

        set_pte_phys(address, phys, prot, SET_FIXMAP_USER);
}

unsigned long __initdata table_start, table_end;

unsigned long get_machine_pfn(unsigned long addr)
{
        pud_t *pud = pud_offset_k(NULL, addr);
        pmd_t *pmd = pmd_offset(pud, addr);
        pte_t *pte = pte_offset_kernel(pmd, addr);

        return pte_mfn(*pte);
}

static __meminit void *alloc_static_page(unsigned long *phys)
{
        unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;

        if (after_bootmem) {
                void *adr = (void *)get_zeroed_page(GFP_ATOMIC);

                *phys = __pa(adr);
                return adr;
        }

        *phys = start_pfn << PAGE_SHIFT;
        start_pfn++;
        memset((void *)va, 0, PAGE_SIZE);
        return (void *)va;
}

#define PTE_SIZE PAGE_SIZE

static inline void __set_pte(pte_t *dst, pte_t val)
{
        *dst = val;
}

static inline int make_readonly(unsigned long paddr)
{
        int readonly = 0;

        /* Make new page tables read-only. */
        if (!xen_feature(XENFEAT_writable_page_tables)
            && (paddr >= (table_start << PAGE_SHIFT))
            && (paddr < (table_end << PAGE_SHIFT)))
                readonly = 1;
        /* Make old page tables read-only. */
        if (!xen_feature(XENFEAT_writable_page_tables)
            && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
            && (paddr < (start_pfn << PAGE_SHIFT)))
                readonly = 1;

        /*
         * No need for writable mapping of kernel image. This also ensures that
         * page and descriptor tables embedded inside don't have writable
         * mappings.
         */
        if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end)))
                readonly = 1;

        return readonly;
}
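
/*
 * Illustrative sketch, not part of the upstream file: make_readonly()
 * answers one question for phys_pmd_init() below -- may this physical page
 * be mapped writable in the 1:1 region?  Three ranges come back read-only:
 * the page tables being built now [table_start, table_end), the boot page
 * tables handed over by Xen [pt_base - __START_KERNEL_map,
 * start_pfn << PAGE_SHIFT), and the kernel image [_text, _end).  The
 * verdict is translated into PTE permission bits like this:
 */
#if 0
        if (make_readonly(address))
                /* present, global, accessed/dirty -- but not writable */
                __set_pte(pte, __pte(address | (_KERNPG_TABLE & ~_PAGE_RW)));
        else
                __set_pte(pte, __pte(address | _KERNPG_TABLE));
#endif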
/* Must run before zap_low_mappings */
__init void *early_ioremap(unsigned long addr, unsigned long size)
{
        return ioremap(addr, size);
}

/* To avoid virtual aliases later */
__init void early_iounmap(void *addr, unsigned long size)
{
        iounmap(addr);
}

static void __meminit
phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
{
        int i, k;

        for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
                unsigned long pte_phys;
                pte_t *pte, *pte_save;

                if (address >= end) {
                        if (!after_bootmem)
                                for (; i < PTRS_PER_PMD; i++, pmd++)
                                        set_pmd(pmd, __pmd(0));
                        break;
                }
                pte = alloc_static_page(&pte_phys);
                pte_save = pte;
                for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
                        if ((address >= end) ||
                            ((address >> PAGE_SHIFT) >=
                             xen_start_info->nr_pages)) {
                                __set_pte(pte, __pte(0));
                                continue;
                        }
                        if (make_readonly(address)) {
                                __set_pte(pte,
                                          __pte(address | (_KERNPG_TABLE & ~_PAGE_RW)));
                                continue;
                        }
                        __set_pte(pte, __pte(address | _KERNPG_TABLE));
                }
                pte = pte_save;
                early_make_page_readonly(pte, XENFEAT_writable_page_tables);
                set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
        }
}

static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
        pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));

        if (pmd_none(*pmd)) {
                spin_lock(&init_mm.page_table_lock);
                phys_pmd_init(pmd, address, end);
                spin_unlock(&init_mm.page_table_lock);
                __flush_tlb_all();
        }
}

static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
{
        long i = pud_index(address);

        pud = pud + i;

        if (after_bootmem && pud_val(*pud)) {
                phys_pmd_update(pud, address, end);
                return;
        }

        for (; i < PTRS_PER_PUD; pud++, i++) {
                unsigned long paddr, pmd_phys;
                pmd_t *pmd;

                paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
                if (paddr >= end)
                        break;

                pmd = alloc_static_page(&pmd_phys);
                early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
                spin_lock(&init_mm.page_table_lock);
                set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
                phys_pmd_init(pmd, paddr, end);
                spin_unlock(&init_mm.page_table_lock);
        }
        __flush_tlb();
}

void __init xen_init_pt(void)
{
        unsigned long addr, *page;

        memset((void *)init_level4_pgt,   0, PAGE_SIZE);
        memset((void *)level3_kernel_pgt, 0, PAGE_SIZE);
        memset((void *)level2_kernel_pgt, 0, PAGE_SIZE);

        /* Find the initial pte page that was built for us. */
        page = (unsigned long *)xen_start_info->pt_base;
        addr = page[pgd_index(__START_KERNEL_map)];
        addr_to_page(addr, page);
        addr = page[pud_index(__START_KERNEL_map)];
        addr_to_page(addr, page);

#ifdef CONFIG_XEN_COMPAT_030002
        /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER
           in kernel PTEs. We check that here. */
        if (HYPERVISOR_xen_version(XENVER_version, NULL) <= 0x30000) {
                unsigned long *pg;
                pte_t pte;

                /* Mess with the initial mapping of page 0. It's not needed. */
                BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map);
                addr = page[pmd_index(__START_KERNEL_map)];
                addr_to_page(addr, pg);
                pte.pte = pg[pte_index(__START_KERNEL_map)];
                BUG_ON(!(pte.pte & _PAGE_PRESENT));

                /* If _PAGE_USER isn't set, we obviously do not need it. */
                if (pte.pte & _PAGE_USER) {
                        /* _PAGE_USER is needed, but is it set implicitly? */
                        pte.pte &= ~_PAGE_USER;
                        if ((HYPERVISOR_update_va_mapping(__START_KERNEL_map,
                                                          pte, 0) != 0) ||
                            !(pg[pte_index(__START_KERNEL_map)] & _PAGE_USER))
                                /* We need to explicitly specify _PAGE_USER. */
                                __kernel_page_user = _PAGE_USER;
                }
        }
#endif

        /* Construct mapping of initial pte page in our own directories. */
        init_level4_pgt[pgd_index(__START_KERNEL_map)] =
                mk_kernel_pgd(__pa_symbol(level3_kernel_pgt));
        level3_kernel_pgt[pud_index(__START_KERNEL_map)] =
                __pud(__pa_symbol(level2_kernel_pgt) |
                      _KERNPG_TABLE);
        memcpy((void *)level2_kernel_pgt, page, PAGE_SIZE);

        early_make_page_readonly(init_level4_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(init_level4_user_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(level3_kernel_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(level3_user_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(level2_kernel_pgt,
                                 XENFEAT_writable_page_tables);

        xen_pgd_pin(__pa_symbol(init_level4_pgt));
        xen_pgd_pin(__pa_symbol(init_level4_user_pgt));

        set_pgd((pgd_t *)(init_level4_user_pgt + 511),
                mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
}

void __init extend_init_mapping(unsigned long tables_space)
{
        unsigned long va = __START_KERNEL_map;
        unsigned long phys, addr, *pte_page;
        pmd_t *pmd;
        pte_t *pte, new_pte;
        unsigned long *page = (unsigned long *)init_level4_pgt;

        addr = page[pgd_index(va)];
        addr_to_page(addr, page);
        addr = page[pud_index(va)];
        addr_to_page(addr, page);

        /* Kill mapping of low 1MB. */
        while (va < (unsigned long)&_text) {
                HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
                va += PAGE_SIZE;
        }

        /* Ensure init mappings cover kernel text/data and initial tables. */
        while (va < (__START_KERNEL_map
                     + (start_pfn << PAGE_SHIFT)
                     + tables_space)) {
                pmd = (pmd_t *)&page[pmd_index(va)];
                if (pmd_none(*pmd)) {
                        pte_page = alloc_static_page(&phys);
                        early_make_page_readonly(
                                pte_page, XENFEAT_writable_page_tables);
                        set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
                } else {
                        addr = page[pmd_index(va)];
                        addr_to_page(addr, pte_page);
                }
                pte = (pte_t *)&pte_page[pte_index(va)];
                if (pte_none(*pte)) {
                        new_pte = pfn_pte(
                                (va - __START_KERNEL_map) >> PAGE_SHIFT,
                                __pgprot(_KERNPG_TABLE));
                        xen_l1_entry_update(pte, new_pte);
                }
                va += PAGE_SIZE;
        }

        /* Finally, blow away any spurious initial mappings. */
        while (1) {
                pmd = (pmd_t *)&page[pmd_index(va)];
                if (pmd_none(*pmd))
                        break;
                HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
                va += PAGE_SIZE;
        }
}

static void __init find_early_table_space(unsigned long end)
{
        unsigned long puds, pmds, ptes, tables;

        puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
        ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;

        tables = round_up(puds * 8, PAGE_SIZE) +
                 round_up(pmds * 8, PAGE_SIZE) +
                 round_up(ptes * 8, PAGE_SIZE);

        extend_init_mapping(tables);

        table_start = start_pfn;
        table_end = table_start + (tables>>PAGE_SHIFT);

        early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
                     end, table_start << PAGE_SHIFT,
                     (table_start << PAGE_SHIFT) + tables);
}
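
/*
 * Worked example (hypothetical figures, not from the original changeset):
 * for end = 4 GiB with 4 KiB pages, PMD_SHIFT = 21 and PUD_SHIFT = 30:
 *
 *      puds = 0x100000000 >> 30 = 4        -> round_up(4 * 8, 4096)       =    4096
 *      pmds = 0x100000000 >> 21 = 2048     -> round_up(2048 * 8, 4096)    =   16384
 *      ptes = 0x100000000 >> 12 = 1048576  -> round_up(1048576 * 8, 4096) = 8388608
 *
 * i.e. tables = 8409088 bytes (2053 pages) reserved starting at start_pfn,
 * which is why table_end = table_start + (tables >> PAGE_SHIFT).
 */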
/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from the
   physical memory. To access them they are temporarily mapped. */
void __meminit init_memory_mapping(unsigned long start, unsigned long end)
{
        unsigned long next;

        Dprintk("init_memory_mapping\n");

        /*
         * Find space for the kernel direct mapping tables.
         * Later we should allocate these tables in the local node of the memory
         * mapped. Unfortunately this is done currently before the nodes are
         * discovered.
         */
        if (!after_bootmem)
                find_early_table_space(end);

        start = (unsigned long)__va(start);
        end = (unsigned long)__va(end);

        for (; start < end; start = next) {
                unsigned long pud_phys;
                pgd_t *pgd = pgd_offset_k(start);
                pud_t *pud;

                if (after_bootmem) {
                        pud = pud_offset(pgd, start & PGDIR_MASK);
                        make_page_readonly(pud, XENFEAT_writable_page_tables);
                        pud_phys = __pa(pud);
                } else {
                        pud = alloc_static_page(&pud_phys);
                        early_make_page_readonly(pud, XENFEAT_writable_page_tables);
                }
                next = start + PGDIR_SIZE;
                if (next > end)
                        next = end;
                phys_pud_init(pud, __pa(start), __pa(next));
                if (!after_bootmem)
                        set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
        }

        if (!after_bootmem) {
                BUG_ON(start_pfn != table_end);

                /* Re-vector virtual addresses pointing into the initial
                   mapping to the just-established permanent ones. */
                xen_start_info = __va(__pa(xen_start_info));
                xen_start_info->pt_base = (unsigned long)
                        __va(__pa(xen_start_info->pt_base));
                if (!xen_feature(XENFEAT_auto_translated_physmap)) {
                        phys_to_machine_mapping =
                                __va(__pa(xen_start_info->mfn_list));
                        xen_start_info->mfn_list = (unsigned long)
                                phys_to_machine_mapping;
                }
                if (xen_start_info->mod_start)
                        xen_start_info->mod_start = (unsigned long)
                                __va(__pa(xen_start_info->mod_start));

                /* Destroy the Xen-created mappings beyond the kernel image as
                 * well as the temporary mappings created above. Prevents
                 * overlap with modules area (if init mapping is very big).
                 */
                start = PAGE_ALIGN((unsigned long)_end);
                end = __START_KERNEL_map + (table_end << PAGE_SHIFT);
                for (; start < end; start += PAGE_SIZE)
                        WARN_ON(HYPERVISOR_update_va_mapping(
                                start, __pte_ma(0), 0));
        }

        __flush_tlb_all();
}

void __cpuinit zap_low_mappings(int cpu)
{
        /* this is not required for Xen */
#if 0
        swap_low_mappings();
#endif
}

/* Compute zone sizes for the DMA and DMA32 zones in a node. */
__init void
size_zones(unsigned long *z, unsigned long *h,
           unsigned long start_pfn, unsigned long end_pfn)
{
        int i;
#ifndef CONFIG_XEN
        unsigned long w;
#endif

        for (i = 0; i < MAX_NR_ZONES; i++)
                z[i] = 0;

#ifndef CONFIG_XEN
        if (start_pfn < MAX_DMA_PFN)
                z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
        if (start_pfn < MAX_DMA32_PFN) {
                unsigned long dma32_pfn = MAX_DMA32_PFN;
                if (dma32_pfn > end_pfn)
                        dma32_pfn = end_pfn;
                z[ZONE_DMA32] = dma32_pfn - start_pfn;
        }
        z[ZONE_NORMAL] = end_pfn - start_pfn;

        /* Remove lower zones from higher ones. */
        w = 0;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                if (z[i])
                        z[i] -= w;
                w += z[i];
        }

        /* Compute holes */
        w = start_pfn;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                unsigned long s = w;
                w += z[i];
                h[i] = e820_hole_size(s, w);
        }
        /* Add the space needed for mem_map to the holes too. */
        for (i = 0; i < MAX_NR_ZONES; i++)
                h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;

        /* The 16MB DMA zone has the kernel and other misc mappings.
           Account them too */
        if (h[ZONE_DMA]) {
                h[ZONE_DMA] += dma_reserve;
                if (h[ZONE_DMA] >= z[ZONE_DMA]) {
                        printk(KERN_WARNING
                               "Kernel too large and filling up ZONE_DMA?\n");
                        h[ZONE_DMA] = z[ZONE_DMA];
                }
        }
#else
        z[ZONE_DMA] = end_pfn;
        for (i = 0; i < MAX_NR_ZONES; i++)
                h[i] = 0;
#endif
}
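
/*
 * Worked example (hypothetical figures, assuming the usual DMA < DMA32 <
 * NORMAL zone ordering of this kernel): in the non-Xen branch, a node with
 * start_pfn = 0 and end_pfn = 262144 (1 GiB of 4 KiB pages) first gets the
 * cumulative sizes z[ZONE_DMA] = 4096, z[ZONE_DMA32] = 262144 and
 * z[ZONE_NORMAL] = 262144; the "remove lower zones" pass then turns these
 * into the disjoint sizes 4096, 258048 and 0, after which the hole sizes
 * and the mem_map overhead are charged per zone.  In the Xen branch all of
 * end_pfn is simply placed in ZONE_DMA and the holes are left at zero.
 */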
#ifndef CONFIG_NUMA
void __init paging_init(void)
{
        unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
        int i;

        memory_present(0, 0, end_pfn);
        sparse_init();
        size_zones(zones, holes, 0, end_pfn);
        free_area_init_node(0, NODE_DATA(0), zones,
                            __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);

        /* Switch to the real shared_info page, and clear the
         * dummy page. */
        set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
        HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
        memset(empty_zero_page, 0, sizeof(empty_zero_page));

        init_mm.context.pinned = 1;

        /* Setup mapping of lower 1st MB */
        for (i = 0; i < NR_FIX_ISAMAPS; i++)
                if (is_initial_xendomain())
                        set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
                else
                        __set_fixmap(FIX_ISAMAP_BEGIN - i,
                                     virt_to_mfn(empty_zero_page) << PAGE_SHIFT,
                                     PAGE_KERNEL_RO);
}
#endif

/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
        unsigned long end = address + size;

        BUG_ON(address & ~LARGE_PAGE_MASK);
        BUG_ON(size & ~LARGE_PAGE_MASK);

        for (; address < end; address += LARGE_PAGE_SIZE) {
                pgd_t *pgd = pgd_offset_k(address);
                pud_t *pud;
                pmd_t *pmd;
                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, address);
                if (pud_none(*pud))
                        continue;
                pmd = pmd_offset(pud, address);
                if (!pmd || pmd_none(*pmd))
                        continue;
                if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
                        /* Could handle this, but it should not happen currently. */
                        printk(KERN_ERR
                               "clear_kernel_mapping: mapping has been split. will leak memory\n");
                        pmd_ERROR(*pmd);
                }
                set_pmd(pmd, __pmd(0));
        }
        __flush_tlb_all();
}

/*
 * Memory hotplug specific functions
 */
void online_page(struct page *page)
{
        ClearPageReserved(page);
        init_page_count(page);
        __free_page(page);
        totalram_pages++;
        num_physpages++;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * XXX: memory_add_physaddr_to_nid() is to find node id from physical address
 *      via probe interface of sysfs. If acpi notifies hot-add event, then it
 *      can tell node id by searching dsdt. But, probe interface doesn't have
 *      node id. So, return 0 as node id at this time.
 */
#ifdef CONFIG_NUMA
int memory_add_physaddr_to_nid(u64 start)
{
        return 0;
}
#endif

/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
        struct pglist_data *pgdat = NODE_DATA(nid);
        struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
        int ret;

        ret = __add_pages(zone, start_pfn, nr_pages);
        if (ret)
                goto error;

        init_memory_mapping(start, (start + size -1));

        return ret;
error:
        printk("%s: Problem encountered in __add_pages!\n", __func__);
        return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

int remove_memory(u64 start, u64 size)
{
        return -EINVAL;
}
EXPORT_SYMBOL_GPL(remove_memory);

#else /* CONFIG_MEMORY_HOTPLUG */
/*
 * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
 * just online the pages.
 */
int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
{
        int err = -EIO;
        unsigned long pfn;
        unsigned long total = 0, mem = 0;
        for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
                if (pfn_valid(pfn)) {
                        online_page(pfn_to_page(pfn));
                        err = 0;
                        mem++;
                }
                total++;
        }
        if (!err) {
                z->spanned_pages += total;
                z->present_pages += mem;
                z->zone_pgdat->node_spanned_pages += total;
                z->zone_pgdat->node_present_pages += mem;
        }
        return err;
}
#endif /* CONFIG_MEMORY_HOTPLUG */
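
/*
 * Illustrative sketch (hypothetical values, not part of the upstream file):
 * a hot-add driver would hand a newly discovered physical range to
 * arch_add_memory(), which registers the pages via __add_pages() and then
 * extends the direct mapping through init_memory_mapping() above.
 */
#if 0
static int example_hotadd(void)         /* hypothetical caller */
{
        u64 start = 0x100000000ULL;     /* hypothetical: new RAM at 4 GiB */
        u64 size  = 0x10000000ULL;      /* hypothetical: 256 MiB of new RAM */

        return arch_add_memory(0, start, size);
}
#endif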
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
                         kcore_vsyscall;

void __init mem_init(void)
{
        long codesize, reservedpages, datasize, initsize;
        unsigned long pfn;

        contiguous_bitmap = alloc_bootmem_low_pages(
                (end_pfn + 2*BITS_PER_LONG) >> 3);
        BUG_ON(!contiguous_bitmap);
        memset(contiguous_bitmap, 0, (end_pfn + 2*BITS_PER_LONG) >> 3);

        pci_iommu_alloc();

        /* How many end-of-memory variables you have, grandma! */
        max_low_pfn = end_pfn;
        max_pfn = end_pfn;
        num_physpages = end_pfn;
        high_memory = (void *) __va(end_pfn * PAGE_SIZE);

        /* clear the zero-page */
        memset(empty_zero_page, 0, PAGE_SIZE);

        reservedpages = 0;

        /* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
        totalram_pages = numa_free_all_bootmem();
#else
        totalram_pages = free_all_bootmem();
#endif
        /* XEN: init and count pages outside initial allocation. */
        for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
                ClearPageReserved(pfn_to_page(pfn));
                init_page_count(pfn_to_page(pfn));
                totalram_pages++;
        }
        reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);

        after_bootmem = 1;

        codesize = (unsigned long) &_etext - (unsigned long) &_text;
        datasize = (unsigned long) &_edata - (unsigned long) &_etext;
        initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

        /* Register memory areas for /proc/kcore */
        kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
        kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
                   VMALLOC_END-VMALLOC_START);
        kclist_add(&kcore_kernel, &_stext, _end - _stext);
        kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
        kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
                   VSYSCALL_END - VSYSCALL_START);

        printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
               (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
               end_pfn << (PAGE_SHIFT-10),
               codesize >> 10,
               reservedpages << (PAGE_SHIFT-10),
               datasize >> 10,
               initsize >> 10);

#ifndef CONFIG_XEN
#ifdef CONFIG_SMP
        /*
         * Sync boot_level4_pgt mappings with the init_level4_pgt
         * except for the low identity mappings which are already zapped
         * in init_level4_pgt. This sync-up is essential for AP's bringup
         */
        memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
#endif
#endif
}

void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
#ifdef __DO_LATER__
        /*
         * Some pages can be pinned, but some are not. Unpinning such pages
         * triggers BUG().
         */
        unsigned long addr;

        if (begin >= end)
                return;

        printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
        for (addr = begin; addr < end; addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                init_page_count(virt_to_page(addr));
                memset((void *)(addr & ~(PAGE_SIZE-1)),
                       POISON_FREE_INITMEM, PAGE_SIZE);
                make_page_writable(
                        __va(__pa(addr)), XENFEAT_writable_page_tables);
                /*
                 * Make pages from __PAGE_OFFSET address as well
                 */
                make_page_writable(
                        (void *)addr, XENFEAT_writable_page_tables);
                free_page(addr);
                totalram_pages++;
        }
#endif
}

void free_initmem(void)
{
#ifdef __DO_LATER__
        memset(__initdata_begin, POISON_FREE_INITDATA,
               __initdata_end - __initdata_begin);
#endif
        free_init_pages("unused kernel memory",
                        (unsigned long)(&__init_begin),
                        (unsigned long)(&__init_end));
}

#ifdef CONFIG_DEBUG_RODATA

void mark_rodata_ro(void)
{
        unsigned long addr = (unsigned long)__start_rodata;

        for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
                change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);

        printk("Write protecting the kernel read-only data: %luk\n",
               (__end_rodata - __start_rodata) >> 10);

        /*
         * change_page_attr_addr() requires a global_flush_tlb() call after it.
         * We do this after the printk so that if something went wrong in the
         * change, the printk gets out at least to give a better debug hint
         * of who is the culprit.
         */
        global_flush_tlb();
}
#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
        free_init_pages("initrd memory", start, end);
}
#endif

void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
        /* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
        int nid = phys_to_nid(phys);
        reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
        reserve_bootmem(phys, len);
#endif
        if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
                dma_reserve += len / PAGE_SIZE;
}

int kern_addr_valid(unsigned long addr)
{
        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (above != 0 && above != -1UL)
                return 0;

        pgd = pgd_offset_k(addr);
        if (pgd_none(*pgd))
                return 0;

        pud = pud_offset_k(pgd, addr);
        if (pud_none(*pud))
                return 0;

        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return 0;
        if (pmd_large(*pmd))
                return pfn_valid(pmd_pfn(*pmd));

        pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte))
                return 0;
        return pfn_valid(pte_pfn(*pte));
}

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>

extern int exception_trace, page_fault_trace;

static ctl_table debug_table2[] = {
        { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
          proc_dointvec },
        { 0, }
};

static ctl_table debug_root_table2[] = {
        { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
          .child = debug_table2 },
        { 0 },
};

static __init int x8664_sysctl_init(void)
{
        register_sysctl_table(debug_root_table2, 1);
        return 0;
}
__initcall(x8664_sysctl_init);
#endif
/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */
static struct vm_area_struct gate_vma = {
        .vm_start = VSYSCALL_START,
        .vm_end = VSYSCALL_END,
        .vm_page_prot = PAGE_READONLY
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
        if (test_tsk_thread_flag(tsk, TIF_IA32))
                return NULL;
#endif
        return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
        struct vm_area_struct *vma = get_gate_vma(task);
        if (!vma)
                return 0;
        return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
        return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}