Repository: ia64/xen-unstable
File: linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c
Changeset: 14279:200d13363a38

linux x86/64: No need to pre-allocate all fixmap pagetables:
just those for bt_ioremap().
Signed-off-by: Keir Fraser <keir@xensource.com>

Author:   kfraser@localhost.localdomain
Date:     Tue Mar 06 13:41:05 2007 +0000 (2007-03-06)
Parents:  53589c343d46
Children: 26a1378d5ece

/*
 * linux/arch/x86_64/mm/init.c
 *
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 *
 * Jun Nakajima <jun.nakajima@intel.com>
 *   Modified for Xen.
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>

#include <xen/features.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

struct dma_mapping_ops* dma_ops;
EXPORT_SYMBOL(dma_ops);

#ifdef CONFIG_XEN_COMPAT_030002
unsigned int __kernel_page_user;
EXPORT_SYMBOL(__kernel_page_user);
#endif

extern unsigned long *contiguous_bitmap;

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
extern unsigned long start_pfn;

/*
 * Use this until direct mapping is established, i.e. before __va() is
 * available in init_memory_mapping().
 */
#define addr_to_page(addr, page)                                \
        (addr) &= PHYSICAL_PAGE_MASK;                           \
        (page) = ((unsigned long *) ((unsigned long)            \
        (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) +   \
        __START_KERNEL_map)))
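
/*
 * Early counterpart of make_page_readonly(): walks init_level4_pgt by
 * hand, translating each level's machine address with addr_to_page(),
 * since the regular pgd/pud/pmd accessors (and __va()) are not usable
 * yet. Clears _PAGE_RW via HYPERVISOR_update_va_mapping() and BUGs if
 * the hypervisor rejects the update.
 */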
static void __meminit early_make_page_readonly(void *va, unsigned int feature)
{
        unsigned long addr, _va = (unsigned long)va;
        pte_t pte, *ptep;
        unsigned long *page = (unsigned long *) init_level4_pgt;

        if (xen_feature(feature))
                return;

        addr = (unsigned long) page[pgd_index(_va)];
        addr_to_page(addr, page);

        addr = page[pud_index(_va)];
        addr_to_page(addr, page);

        addr = page[pmd_index(_va)];
        addr_to_page(addr, page);

        ptep = (pte_t *) &page[pte_index(_va)];

        pte.pte = ptep->pte & ~_PAGE_RW;
        if (HYPERVISOR_update_va_mapping(_va, pte, 0))
                BUG();
}
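
/*
 * Runtime helpers: walk the page tables with the normal accessors and
 * toggle _PAGE_RW through HYPERVISOR_update_va_mapping(), falling back
 * to a direct L1 entry update if the hypercall fails. For addresses in
 * the vmalloc range the direct-mapping alias of the underlying page is
 * adjusted as well, so all mappings of the page agree on writability.
 */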
static void __make_page_readonly(void *va)
{
        pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
        unsigned long addr = (unsigned long) va;

        pgd = pgd_offset_k(addr);
        pud = pud_offset(pgd, addr);
        pmd = pmd_offset(pud, addr);
        ptep = pte_offset_kernel(pmd, addr);

        pte.pte = ptep->pte & ~_PAGE_RW;
        if (HYPERVISOR_update_va_mapping(addr, pte, 0))
                xen_l1_entry_update(ptep, pte); /* fallback */

        if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
                __make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT));
}

static void __make_page_writable(void *va)
{
        pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
        unsigned long addr = (unsigned long) va;

        pgd = pgd_offset_k(addr);
        pud = pud_offset(pgd, addr);
        pmd = pmd_offset(pud, addr);
        ptep = pte_offset_kernel(pmd, addr);

        pte.pte = ptep->pte | _PAGE_RW;
        if (HYPERVISOR_update_va_mapping(addr, pte, 0))
                xen_l1_entry_update(ptep, pte); /* fallback */

        if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
                __make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT));
}

void make_page_readonly(void *va, unsigned int feature)
{
        if (!xen_feature(feature))
                __make_page_readonly(va);
}

void make_page_writable(void *va, unsigned int feature)
{
        if (!xen_feature(feature))
                __make_page_writable(va);
}

void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
{
        if (xen_feature(feature))
                return;

        while (nr-- != 0) {
                __make_page_readonly(va);
                va = (void*)((unsigned long)va + PAGE_SIZE);
        }
}

void make_pages_writable(void *va, unsigned nr, unsigned int feature)
{
        if (xen_feature(feature))
                return;

        while (nr-- != 0) {
                __make_page_writable(va);
                va = (void*)((unsigned long)va + PAGE_SIZE);
        }
}

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the location of the first one and
 * move around without checking the pgd every time.
 */

void show_mem(void)
{
        long i, total = 0, reserved = 0;
        long shared = 0, cached = 0;
        pg_data_t *pgdat;
        struct page *page;

        printk(KERN_INFO "Mem-info:\n");
        show_free_areas();
        printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

        for_each_online_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        page = pfn_to_page(pgdat->node_start_pfn + i);
                        total++;
                        if (PageReserved(page))
                                reserved++;
                        else if (PageSwapCache(page))
                                cached++;
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
        }
        printk(KERN_INFO "%ld pages of RAM\n", total);
        printk(KERN_INFO "%ld reserved pages\n", reserved);
        printk(KERN_INFO "%ld pages shared\n", shared);
        printk(KERN_INFO "%ld pages swap cached\n", cached);
}

int after_bootmem;

static __init void *spp_getpage(void)
{
        void *ptr;

        if (after_bootmem)
                ptr = (void *) get_zeroed_page(GFP_ATOMIC);
        else if (start_pfn < table_end) {
                ptr = __va(start_pfn << PAGE_SHIFT);
                start_pfn++;
                memset(ptr, 0, PAGE_SIZE);
        } else
                ptr = alloc_bootmem_pages(PAGE_SIZE);

        if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
                panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");

        Dprintk("spp_getpage %p\n", ptr);
        return ptr;
}

#define pgd_offset_u(address) (pgd_t *)(init_level4_user_pgt + pgd_index(address))

static inline pud_t *pud_offset_u(unsigned long address)
{
        pud_t *pud = level3_user_pgt;

        return pud + pud_index(address);
}
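
/*
 * Install a single pte at 'vaddr' for pseudo-physical address 'phys',
 * either in the kernel page tables or (for the vsyscall fixmaps) in the
 * user-visible init_level4_user_pgt, allocating any missing pmd/pte
 * pages from spp_getpage() and marking them read-only for Xen.
 */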
static __init void set_pte_phys(unsigned long vaddr,
                                unsigned long phys, pgprot_t prot, int user_mode)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

        pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
        if (pud_none(*pud)) {
                pmd = (pmd_t *) spp_getpage();
                make_page_readonly(pmd, XENFEAT_writable_page_tables);
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                make_page_readonly(pte, XENFEAT_writable_page_tables);
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }
        if (pgprot_val(prot))
                new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
        else
                new_pte = __pte(0);

        pte = pte_offset_kernel(pmd, vaddr);
        if (!pte_none(*pte) &&
            pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
                pte_ERROR(*pte);
        set_pte(pte, new_pte);

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}
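
/*
 * Same as set_pte_phys(), but 'phys' is a machine address: the pte is
 * built with pfn_pte_ma() and no pseudo-physical translation is applied.
 * Used for fixmap slots that map hypervisor- or hardware-owned frames,
 * e.g. the shared_info page and the ISA mappings.
 */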
static __init void set_pte_phys_ma(unsigned long vaddr,
                                   unsigned long phys, pgprot_t prot)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

        pgd = pgd_offset_k(vaddr);
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                pmd = (pmd_t *) spp_getpage();
                make_page_readonly(pmd, XENFEAT_writable_page_tables);
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                make_page_readonly(pte, XENFEAT_writable_page_tables);
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }
        new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);

        pte = pte_offset_kernel(pmd, vaddr);
        set_pte(pte, new_pte);

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

#define SET_FIXMAP_KERNEL 0
#define SET_FIXMAP_USER   1
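
/*
 * The vsyscall fixmap slots are backed by kernel pages and are therefore
 * set up via set_pte_phys() with a pseudo-physical address (and mirrored
 * into the user page tables by __set_fixmap_user()); every other fixmap
 * slot takes a machine address and goes through set_pte_phys_ma().
 */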
/* NOTE: this is meant to be run only at boot */
void __init
__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }
        switch (idx) {
        case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
                set_pte_phys(address, phys, prot, SET_FIXMAP_KERNEL);
                break;
        default:
                set_pte_phys_ma(address, phys, prot);
                break;
        }
}

/*
 * This only supports vsyscall area.
 */
void __init
__set_fixmap_user (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }

        set_pte_phys(address, phys, prot, SET_FIXMAP_USER);
}

unsigned long __initdata table_start, table_end;

static __meminit void *alloc_static_page(unsigned long *phys)
{
        unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;

        if (after_bootmem) {
                void *adr = (void *)get_zeroed_page(GFP_ATOMIC);

                *phys = __pa(adr);
                return adr;
        }

        *phys = start_pfn << PAGE_SHIFT;
        start_pfn++;
        memset((void *)va, 0, PAGE_SIZE);
        return (void *)va;
}

#define PTE_SIZE PAGE_SIZE

static inline void __set_pte(pte_t *dst, pte_t val)
{
        *dst = val;
}
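
/*
 * Decide whether a given pseudo-physical address must be mapped
 * read-only in the 1:1 region: the page tables being built here, the
 * page tables handed over by the domain builder, and the kernel image
 * itself, excluding the vsyscall page so that alternative-instruction
 * patching can still write to it.
 */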
static inline int make_readonly(unsigned long paddr)
{
        extern char __vsyscall_0;
        int readonly = 0;

        /* Make new page tables read-only. */
        if (!xen_feature(XENFEAT_writable_page_tables)
            && (paddr >= (table_start << PAGE_SHIFT))
            && (paddr < (table_end << PAGE_SHIFT)))
                readonly = 1;
        /* Make old page tables read-only. */
        if (!xen_feature(XENFEAT_writable_page_tables)
            && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
            && (paddr < (start_pfn << PAGE_SHIFT)))
                readonly = 1;

        /*
         * No need for writable mapping of kernel image. This also ensures that
         * page and descriptor tables embedded inside don't have writable
         * mappings. Exclude the vsyscall area here, allowing alternative
         * instruction patching to work.
         */
        if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end))
            && !(paddr >= __pa_symbol(&__vsyscall_0)
                 && paddr < __pa_symbol(&__vsyscall_0) + PAGE_SIZE))
                readonly = 1;

        return readonly;
}

#ifndef CONFIG_XEN
/* Must run before zap_low_mappings */
__init void *early_ioremap(unsigned long addr, unsigned long size)
{
        unsigned long map = round_down(addr, LARGE_PAGE_SIZE);

        /* actually usually some more */
        if (size >= LARGE_PAGE_SIZE) {
                printk("SMBIOS area too long %lu\n", size);
                return NULL;
        }
        set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
        map += LARGE_PAGE_SIZE;
        set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
        __flush_tlb();
        return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
}

/* To avoid virtual aliases later */
__init void early_iounmap(void *addr, unsigned long size)
{
        if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
                printk("early_iounmap: bad address %p\n", addr);
        set_pmd(temp_mappings[0].pmd, __pmd(0));
        set_pmd(temp_mappings[1].pmd, __pmd(0));
        __flush_tlb();
}
#endif
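
/*
 * Build the kernel direct mapping with 4k ptes rather than the 2MB
 * mappings used natively: phys_pmd_init() fills one pmd's worth of pte
 * pages (clearing entries beyond nr_pages and honouring make_readonly()),
 * and phys_pud_init() iterates a pud, allocating a pmd page per entry.
 */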
static void __meminit
phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
{
        int i, k;

        for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
                unsigned long pte_phys;
                pte_t *pte, *pte_save;

                if (address >= end) {
                        if (!after_bootmem)
                                for (; i < PTRS_PER_PMD; i++, pmd++)
                                        set_pmd(pmd, __pmd(0));
                        break;
                }
                pte = alloc_static_page(&pte_phys);
                pte_save = pte;
                for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
                        if ((address >= end) ||
                            ((address >> PAGE_SHIFT) >=
                             xen_start_info->nr_pages)) {
                                __set_pte(pte, __pte(0));
                                continue;
                        }
                        if (make_readonly(address)) {
                                __set_pte(pte,
                                          __pte(address | (_KERNPG_TABLE & ~_PAGE_RW)));
                                continue;
                        }
                        __set_pte(pte, __pte(address | _KERNPG_TABLE));
                }
                pte = pte_save;
                early_make_page_readonly(pte, XENFEAT_writable_page_tables);
                set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
        }
}

static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
        pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));

        if (pmd_none(*pmd)) {
                spin_lock(&init_mm.page_table_lock);
                phys_pmd_init(pmd, address, end);
                spin_unlock(&init_mm.page_table_lock);
                __flush_tlb_all();
        }
}

static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
{
        long i = pud_index(address);

        pud = pud + i;

        if (after_bootmem && pud_val(*pud)) {
                phys_pmd_update(pud, address, end);
                return;
        }

        for (; i < PTRS_PER_PUD; pud++, i++) {
                unsigned long paddr, pmd_phys;
                pmd_t *pmd;

                paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
                if (paddr >= end)
                        break;

                pmd = alloc_static_page(&pmd_phys);
                early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
                spin_lock(&init_mm.page_table_lock);
                set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
                phys_pmd_init(pmd, paddr, end);
                spin_unlock(&init_mm.page_table_lock);
        }
        __flush_tlb();
}
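
/*
 * Take over the page tables built by the domain builder: splice the
 * initial kernel-mapping pte page into our own level2/3/4 tables, make
 * every level read-only, pin the kernel and user top-level tables, and
 * (for Xen <= 3.0.2) probe whether kernel ptes need an explicit
 * _PAGE_USER.
 */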
void __init xen_init_pt(void)
{
        unsigned long addr, *page;

        /* Find the initial pte page that was built for us. */
        page = (unsigned long *)xen_start_info->pt_base;
        addr = page[pgd_index(__START_KERNEL_map)];
        addr_to_page(addr, page);
        addr = page[pud_index(__START_KERNEL_map)];
        addr_to_page(addr, page);

#ifdef CONFIG_XEN_COMPAT_030002
        /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER
           in kernel PTEs. We check that here. */
        if (HYPERVISOR_xen_version(XENVER_version, NULL) <= 0x30000) {
                unsigned long *pg;
                pte_t pte;

                /* Mess with the initial mapping of page 0. It's not needed. */
                BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map);
                addr = page[pmd_index(__START_KERNEL_map)];
                addr_to_page(addr, pg);
                pte.pte = pg[pte_index(__START_KERNEL_map)];
                BUG_ON(!(pte.pte & _PAGE_PRESENT));

                /* If _PAGE_USER isn't set, we obviously do not need it. */
                if (pte.pte & _PAGE_USER) {
                        /* _PAGE_USER is needed, but is it set implicitly? */
                        pte.pte &= ~_PAGE_USER;
                        if ((HYPERVISOR_update_va_mapping(__START_KERNEL_map,
                                                          pte, 0) != 0) ||
                            !(pg[pte_index(__START_KERNEL_map)] & _PAGE_USER))
                                /* We need to explicitly specify _PAGE_USER. */
                                __kernel_page_user = _PAGE_USER;
                }
        }
#endif

        /* Construct mapping of initial pte page in our own directories. */
        init_level4_pgt[pgd_index(__START_KERNEL_map)] =
                mk_kernel_pgd(__pa_symbol(level3_kernel_pgt));
        level3_kernel_pgt[pud_index(__START_KERNEL_map)] =
                __pud(__pa_symbol(level2_kernel_pgt) |
                      _KERNPG_TABLE);
        memcpy((void *)level2_kernel_pgt, page, PAGE_SIZE);

        early_make_page_readonly(init_level4_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(init_level4_user_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(level3_kernel_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(level3_user_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(level2_kernel_pgt,
                                 XENFEAT_writable_page_tables);

        xen_pgd_pin(__pa_symbol(init_level4_pgt));
        xen_pgd_pin(__pa_symbol(init_level4_user_pgt));

        set_pgd((pgd_t *)(init_level4_user_pgt + 511),
                mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
}
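
/*
 * Extend the initial mapping supplied by the domain builder so that it
 * covers the kernel image plus 'tables_space' bytes of early page-table
 * allocations, unmap the low 1MB below _text, and tear down any initial
 * mappings beyond what is needed.
 */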
static void __init extend_init_mapping(unsigned long tables_space)
{
        unsigned long va = __START_KERNEL_map;
        unsigned long phys, addr, *pte_page;
        pmd_t *pmd;
        pte_t *pte, new_pte;
        unsigned long *page = (unsigned long *)init_level4_pgt;

        addr = page[pgd_index(va)];
        addr_to_page(addr, page);
        addr = page[pud_index(va)];
        addr_to_page(addr, page);

        /* Kill mapping of low 1MB. */
        while (va < (unsigned long)&_text) {
                HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
                va += PAGE_SIZE;
        }

        /* Ensure init mappings cover kernel text/data and initial tables. */
        while (va < (__START_KERNEL_map
                     + (start_pfn << PAGE_SHIFT)
                     + tables_space)) {
                pmd = (pmd_t *)&page[pmd_index(va)];
                if (pmd_none(*pmd)) {
                        pte_page = alloc_static_page(&phys);
                        early_make_page_readonly(
                                pte_page, XENFEAT_writable_page_tables);
                        set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
                } else {
                        addr = page[pmd_index(va)];
                        addr_to_page(addr, pte_page);
                }
                pte = (pte_t *)&pte_page[pte_index(va)];
                if (pte_none(*pte)) {
                        new_pte = pfn_pte(
                                (va - __START_KERNEL_map) >> PAGE_SHIFT,
                                __pgprot(_KERNPG_TABLE));
                        xen_l1_entry_update(pte, new_pte);
                }
                va += PAGE_SIZE;
        }

        /* Finally, blow away any spurious initial mappings. */
        while (1) {
                pmd = (pmd_t *)&page[pmd_index(va)];
                if (pmd_none(*pmd))
                        break;
                HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
                va += PAGE_SIZE;
        }
}
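
/*
 * Estimate how much memory the direct-mapping page tables will need
 * (8 bytes per pud/pmd/pte entry, rounded up to whole pages), make sure
 * the init mapping covers that much, and reserve the range by setting
 * table_start/table_end for the early 'start_pfn' allocator.
 */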
static void __init find_early_table_space(unsigned long end)
{
        unsigned long puds, pmds, ptes, tables;

        puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
        ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;

        tables = round_up(puds * 8, PAGE_SIZE) +
                 round_up(pmds * 8, PAGE_SIZE) +
                 round_up(ptes * 8, PAGE_SIZE);

        extend_init_mapping(tables);

        table_start = start_pfn;
        table_end = table_start + (tables>>PAGE_SHIFT);

        early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
                     end, table_start << PAGE_SHIFT,
                     (table_start << PAGE_SHIFT) + tables);
}
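
/*
 * Finish off once the direct mapping exists: re-point xen_start_info,
 * the mfn_list and any module area at their __va() addresses, destroy
 * the temporary mappings above the kernel image, pre-allocate ptes for
 * the bt_ioremap() fixmaps (the boot-time allocator is not up yet),
 * switch to the real shared_info page, set up the ISA fixmaps, and
 * finally disable the early 'start_pfn' allocator.
 */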
static void xen_finish_init_mapping(void)
{
        unsigned long i, start, end;

        /* Re-vector virtual addresses pointing into the initial
           mapping to the just-established permanent ones. */
        xen_start_info = __va(__pa(xen_start_info));
        xen_start_info->pt_base = (unsigned long)
                __va(__pa(xen_start_info->pt_base));
        if (!xen_feature(XENFEAT_auto_translated_physmap)) {
                phys_to_machine_mapping =
                        __va(__pa(xen_start_info->mfn_list));
                xen_start_info->mfn_list = (unsigned long)
                        phys_to_machine_mapping;
        }
        if (xen_start_info->mod_start)
                xen_start_info->mod_start = (unsigned long)
                        __va(__pa(xen_start_info->mod_start));

        /* Destroy the Xen-created mappings beyond the kernel image as
         * well as the temporary mappings created above. Prevents
         * overlap with modules area (if init mapping is very big).
         */
        start = PAGE_ALIGN((unsigned long)_end);
        end = __START_KERNEL_map + (table_end << PAGE_SHIFT);
        for (; start < end; start += PAGE_SIZE)
                WARN_ON(HYPERVISOR_update_va_mapping(
                        start, __pte_ma(0), 0));

        /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
        table_end = ~0UL;

        /*
         * Prefetch pte's for the bt_ioremap() area. It gets used before the
         * boot-time allocator is online, so allocate-on-demand would fail.
         */
        for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++)
                __set_fixmap(i, 0, __pgprot(0));

        /* Switch to the real shared_info page, and clear the dummy page. */
        set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
        HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
        memset(empty_zero_page, 0, sizeof(empty_zero_page));

        /* Set up mapping of lowest 1MB of physical memory. */
        for (i = 0; i < NR_FIX_ISAMAPS; i++)
                if (is_initial_xendomain())
                        set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
                else
                        __set_fixmap(FIX_ISAMAP_BEGIN - i,
                                     virt_to_mfn(empty_zero_page)
                                     << PAGE_SHIFT,
                                     PAGE_KERNEL_RO);

        /* Disable the 'start_pfn' allocator. */
        table_end = start_pfn;
}

/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from the
   physical memory. To access them they are temporarily mapped. */
void __meminit init_memory_mapping(unsigned long start, unsigned long end)
{
        unsigned long next;

        Dprintk("init_memory_mapping\n");

        /*
         * Find space for the kernel direct mapping tables.
         * Later we should allocate these tables in the local node of the memory
         * mapped. Unfortunately this is done currently before the nodes are
         * discovered.
         */
        if (!after_bootmem)
                find_early_table_space(end);

        start = (unsigned long)__va(start);
        end = (unsigned long)__va(end);

        for (; start < end; start = next) {
                unsigned long pud_phys;
                pgd_t *pgd = pgd_offset_k(start);
                pud_t *pud;

                if (after_bootmem) {
                        pud = pud_offset(pgd, start & PGDIR_MASK);
                        make_page_readonly(pud, XENFEAT_writable_page_tables);
                        pud_phys = __pa(pud);
                } else {
                        pud = alloc_static_page(&pud_phys);
                        early_make_page_readonly(pud, XENFEAT_writable_page_tables);
                }
                next = start + PGDIR_SIZE;
                if (next > end)
                        next = end;
                phys_pud_init(pud, __pa(start), __pa(next));
                if (!after_bootmem)
                        set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
        }

        if (!after_bootmem) {
                BUG_ON(start_pfn != table_end);
                xen_finish_init_mapping();
        }

        __flush_tlb_all();
}

void __cpuinit zap_low_mappings(int cpu)
{
        /* this is not required for Xen */
#if 0
        swap_low_mappings();
#endif
}

/* Compute zone sizes for the DMA and DMA32 zones in a node. */
__init void
size_zones(unsigned long *z, unsigned long *h,
           unsigned long start_pfn, unsigned long end_pfn)
{
        int i;
#ifndef CONFIG_XEN
        unsigned long w;
#endif

        for (i = 0; i < MAX_NR_ZONES; i++)
                z[i] = 0;

#ifndef CONFIG_XEN
        if (start_pfn < MAX_DMA_PFN)
                z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
        if (start_pfn < MAX_DMA32_PFN) {
                unsigned long dma32_pfn = MAX_DMA32_PFN;
                if (dma32_pfn > end_pfn)
                        dma32_pfn = end_pfn;
                z[ZONE_DMA32] = dma32_pfn - start_pfn;
        }
        z[ZONE_NORMAL] = end_pfn - start_pfn;

        /* Remove lower zones from higher ones. */
        w = 0;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                if (z[i])
                        z[i] -= w;
                w += z[i];
        }

        /* Compute holes */
        w = start_pfn;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                unsigned long s = w;
                w += z[i];
                h[i] = e820_hole_size(s, w);
        }

        /* Add the space needed for mem_map to the holes too. */
        for (i = 0; i < MAX_NR_ZONES; i++)
                h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;

        /* The 16MB DMA zone has the kernel and other misc mappings.
           Account them too */
        if (h[ZONE_DMA]) {
                h[ZONE_DMA] += dma_reserve;
                if (h[ZONE_DMA] >= z[ZONE_DMA]) {
                        printk(KERN_WARNING
                               "Kernel too large and filling up ZONE_DMA?\n");
                        h[ZONE_DMA] = z[ZONE_DMA];
                }
        }
#else
        z[ZONE_DMA] = end_pfn;
        for (i = 0; i < MAX_NR_ZONES; i++)
                h[i] = 0;
#endif
}

#ifndef CONFIG_NUMA
void __init paging_init(void)
{
        unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];

        memory_present(0, 0, end_pfn);
        sparse_init();
        size_zones(zones, holes, 0, end_pfn);
        free_area_init_node(0, NODE_DATA(0), zones,
                            __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);

        init_mm.context.pinned = 1;
}
#endif

/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
        unsigned long end = address + size;

        BUG_ON(address & ~LARGE_PAGE_MASK);
        BUG_ON(size & ~LARGE_PAGE_MASK);

        for (; address < end; address += LARGE_PAGE_SIZE) {
                pgd_t *pgd = pgd_offset_k(address);
                pud_t *pud;
                pmd_t *pmd;
                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, address);
                if (pud_none(*pud))
                        continue;
                pmd = pmd_offset(pud, address);
                if (!pmd || pmd_none(*pmd))
                        continue;
                if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
                        /* Could handle this, but it should not happen currently. */
                        printk(KERN_ERR
                               "clear_kernel_mapping: mapping has been split. will leak memory\n");
                        pmd_ERROR(*pmd);
                }
                set_pmd(pmd, __pmd(0));
        }
        __flush_tlb_all();
}

/*
 * Memory hotplug specific functions
 */
void online_page(struct page *page)
{
        ClearPageReserved(page);
        init_page_count(page);
        __free_page(page);
        totalram_pages++;
        num_physpages++;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * XXX: memory_add_physaddr_to_nid() is to find node id from physical address
 * via probe interface of sysfs. If acpi notifies hot-add event, then it
 * can tell node id by searching dsdt. But, probe interface doesn't have
 * node id. So, return 0 as node id at this time.
 */
#ifdef CONFIG_NUMA
int memory_add_physaddr_to_nid(u64 start)
{
        return 0;
}
#endif

/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
        struct pglist_data *pgdat = NODE_DATA(nid);
        struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
        int ret;

        ret = __add_pages(zone, start_pfn, nr_pages);
        if (ret)
                goto error;

        init_memory_mapping(start, (start + size -1));

        return ret;
error:
        printk("%s: Problem encountered in __add_pages!\n", __func__);
        return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

int remove_memory(u64 start, u64 size)
{
        return -EINVAL;
}
EXPORT_SYMBOL_GPL(remove_memory);

#else /* CONFIG_MEMORY_HOTPLUG */
/*
 * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
 * just online the pages.
 */
int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
{
        int err = -EIO;
        unsigned long pfn;
        unsigned long total = 0, mem = 0;
        for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
                if (pfn_valid(pfn)) {
                        online_page(pfn_to_page(pfn));
                        err = 0;
                        mem++;
                }
                total++;
        }
        if (!err) {
                z->spanned_pages += total;
                z->present_pages += mem;
                z->zone_pgdat->node_spanned_pages += total;
                z->zone_pgdat->node_present_pages += mem;
        }
        return err;
}
#endif /* CONFIG_MEMORY_HOTPLUG */

static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
                         kcore_vsyscall;
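
/*
 * Late memory setup: allocate the Xen contiguous_bitmap, release all of
 * low memory to the page allocator, initialise and count the pages that
 * lie outside the initial Xen allocation, register /proc/kcore areas and
 * print the usual "Memory: ..." summary.
 */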
void __init mem_init(void)
{
        long codesize, reservedpages, datasize, initsize;
        unsigned long pfn;

        contiguous_bitmap = alloc_bootmem_low_pages(
                (end_pfn + 2*BITS_PER_LONG) >> 3);
        BUG_ON(!contiguous_bitmap);
        memset(contiguous_bitmap, 0, (end_pfn + 2*BITS_PER_LONG) >> 3);

        pci_iommu_alloc();

        /* How many end-of-memory variables you have, grandma! */
        max_low_pfn = end_pfn;
        max_pfn = end_pfn;
        num_physpages = end_pfn;
        high_memory = (void *) __va(end_pfn * PAGE_SIZE);

        /* clear the zero-page */
        memset(empty_zero_page, 0, PAGE_SIZE);

        reservedpages = 0;

        /* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
        totalram_pages = numa_free_all_bootmem();
#else
        totalram_pages = free_all_bootmem();
#endif
        /* XEN: init and count pages outside initial allocation. */
        for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
                ClearPageReserved(pfn_to_page(pfn));
                init_page_count(pfn_to_page(pfn));
                totalram_pages++;
        }

        reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);

        after_bootmem = 1;

        codesize = (unsigned long) &_etext - (unsigned long) &_text;
        datasize = (unsigned long) &_edata - (unsigned long) &_etext;
        initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

        /* Register memory areas for /proc/kcore */
        kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
        kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
                   VMALLOC_END-VMALLOC_START);
        kclist_add(&kcore_kernel, &_stext, _end - _stext);
        kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
        kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
                   VSYSCALL_END - VSYSCALL_START);

        printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
               (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
               end_pfn << (PAGE_SHIFT-10),
               codesize >> 10,
               reservedpages << (PAGE_SHIFT-10),
               datasize >> 10,
               initsize >> 10);

#ifndef CONFIG_XEN
#ifdef CONFIG_SMP
        /*
         * Sync boot_level4_pgt mappings with the init_level4_pgt
         * except for the low identity mappings which are already zapped
         * in init_level4_pgt. This sync-up is essential for AP's bringup
         */
        memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
#endif
#endif
}

void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
        unsigned long addr;

        if (begin >= end)
                return;

        printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
        for (addr = begin; addr < end; addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                init_page_count(virt_to_page(addr));
                memset((void *)(addr & ~(PAGE_SIZE-1)),
                       POISON_FREE_INITMEM, PAGE_SIZE);
                if (addr >= __START_KERNEL_map) {
                        /* make_readonly() reports all kernel addresses. */
                        __make_page_writable(__va(__pa(addr)));
                        if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
                                pgd_t *pgd = pgd_offset_k(addr);
                                pud_t *pud = pud_offset(pgd, addr);
                                pmd_t *pmd = pmd_offset(pud, addr);
                                pte_t *pte = pte_offset_kernel(pmd, addr);

                                xen_l1_entry_update(pte, __pte(0)); /* fallback */
                        }
                }
                free_page(addr);
                totalram_pages++;
        }
}

void free_initmem(void)
{
        memset(__initdata_begin, POISON_FREE_INITDATA,
               __initdata_end - __initdata_begin);
        free_init_pages("unused kernel memory",
                        (unsigned long)(&__init_begin),
                        (unsigned long)(&__init_end));
}

#ifdef CONFIG_DEBUG_RODATA

void mark_rodata_ro(void)
{
        unsigned long addr = (unsigned long)__start_rodata;

        for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
                change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);

        printk("Write protecting the kernel read-only data: %luk\n",
               (__end_rodata - __start_rodata) >> 10);

        /*
         * change_page_attr_addr() requires a global_flush_tlb() call after it.
         * We do this after the printk so that if something went wrong in the
         * change, the printk gets out at least to give a better debug hint
         * of who is the culprit.
         */
        global_flush_tlb();
}
#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
        free_init_pages("initrd memory", start, end);
}
#endif

void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
        /* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
        int nid = phys_to_nid(phys);
        reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
        reserve_bootmem(phys, len);
#endif
        if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
                dma_reserve += len / PAGE_SIZE;
}

int kern_addr_valid(unsigned long addr)
{
        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (above != 0 && above != -1UL)
                return 0;

        pgd = pgd_offset_k(addr);
        if (pgd_none(*pgd))
                return 0;

        pud = pud_offset(pgd, addr);
        if (pud_none(*pud))
                return 0;

        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return 0;
        if (pmd_large(*pmd))
                return pfn_valid(pmd_pfn(*pmd));

        pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte))
                return 0;
        return pfn_valid(pte_pfn(*pte));
}

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>

extern int exception_trace, page_fault_trace;

static ctl_table debug_table2[] = {
        { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
          proc_dointvec },
        { 0, }
};

static ctl_table debug_root_table2[] = {
        { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
          .child = debug_table2 },
        { 0 },
};

static __init int x8664_sysctl_init(void)
{
        register_sysctl_table(debug_root_table2, 1);
        return 0;
}
__initcall(x8664_sysctl_init);
#endif

/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
        .vm_start = VSYSCALL_START,
        .vm_end = VSYSCALL_END,
        .vm_page_prot = PAGE_READONLY
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
        if (test_tsk_thread_flag(tsk, TIF_IA32))
                return NULL;
#endif
        return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
        struct vm_area_struct *vma = get_gate_vma(task);

        if (!vma)
                return 0;
        return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
        return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}