ia64/xen-unstable: linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c @ 15225:ca62b4b4f762

LINUX/x86_64: Cleanup 15129:a40967e39652 by using __pgd rather than
abusing mk_kernel_pgd.

Also set the user bit on the vsyscall entry in the user pgd.

Both changes suggested by Jan Beulich.

Signed-off-by: Ian Campbell <ian.campbell@xensource.com>
author Ian Campbell <ian.campbell@xensource.com>
date Fri May 25 11:24:02 2007 +0100 (2007-05-25)
parents 1fde9ebb8019
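
The description above is prose only. As a rough, hypothetical sketch of the kind of cleanup it describes (not the recorded diff of this changeset), building the vsyscall entry of the user pgd directly with __pgd() using a flag set that already includes _PAGE_USER might look like this:

	/*
	 * Hypothetical sketch, not the changeset's actual diff: construct the
	 * user-pgd entry with __pgd() instead of mk_kernel_pgd().  _PAGE_TABLE
	 * is _KERNPG_TABLE plus _PAGE_USER, so the user bit is set as well.
	 */
	set_pgd((pgd_t *)(init_level4_user_pgt + 511),
		__pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE));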
line source
1 /*
2 * linux/arch/x86_64/mm/init.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
6 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
7 *
8 * Jun Nakajima <jun.nakajima@intel.com>
9 * Modified for Xen.
10 */
12 #include <linux/signal.h>
13 #include <linux/sched.h>
14 #include <linux/kernel.h>
15 #include <linux/errno.h>
16 #include <linux/string.h>
17 #include <linux/types.h>
18 #include <linux/ptrace.h>
19 #include <linux/mman.h>
20 #include <linux/mm.h>
21 #include <linux/swap.h>
22 #include <linux/smp.h>
23 #include <linux/init.h>
24 #include <linux/pagemap.h>
25 #include <linux/bootmem.h>
26 #include <linux/proc_fs.h>
27 #include <linux/pci.h>
28 #include <linux/poison.h>
29 #include <linux/dma-mapping.h>
30 #include <linux/module.h>
31 #include <linux/memory_hotplug.h>
33 #include <asm/processor.h>
34 #include <asm/system.h>
35 #include <asm/uaccess.h>
36 #include <asm/pgtable.h>
37 #include <asm/pgalloc.h>
38 #include <asm/dma.h>
39 #include <asm/fixmap.h>
40 #include <asm/e820.h>
41 #include <asm/apic.h>
42 #include <asm/tlb.h>
43 #include <asm/mmu_context.h>
44 #include <asm/proto.h>
45 #include <asm/smp.h>
46 #include <asm/sections.h>
48 #include <xen/features.h>
50 #ifndef Dprintk
51 #define Dprintk(x...)
52 #endif
54 struct dma_mapping_ops* dma_ops;
55 EXPORT_SYMBOL(dma_ops);
57 #if CONFIG_XEN_COMPAT <= 0x030002
58 unsigned int __kernel_page_user;
59 EXPORT_SYMBOL(__kernel_page_user);
60 #endif
62 extern unsigned long *contiguous_bitmap;
64 static unsigned long dma_reserve __initdata;
66 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
67 extern unsigned long start_pfn;
69 /*
70 * Use this until direct mapping is established, i.e. before __va() is
71 * available in init_memory_mapping().
72 */
74 #define addr_to_page(addr, page) \
75 (addr) &= PHYSICAL_PAGE_MASK; \
76 (page) = ((unsigned long *) ((unsigned long) \
77 (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \
78 __START_KERNEL_map)))
80 static void __meminit early_make_page_readonly(void *va, unsigned int feature)
81 {
82 unsigned long addr, _va = (unsigned long)va;
83 pte_t pte, *ptep;
84 unsigned long *page = (unsigned long *) init_level4_pgt;
86 if (xen_feature(feature))
87 return;
89 addr = (unsigned long) page[pgd_index(_va)];
90 addr_to_page(addr, page);
92 addr = page[pud_index(_va)];
93 addr_to_page(addr, page);
95 addr = page[pmd_index(_va)];
96 addr_to_page(addr, page);
98 ptep = (pte_t *) &page[pte_index(_va)];
100 pte.pte = ptep->pte & ~_PAGE_RW;
101 if (HYPERVISOR_update_va_mapping(_va, pte, 0))
102 BUG();
103 }
105 static void __make_page_readonly(void *va)
106 {
107 pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
108 unsigned long addr = (unsigned long) va;
110 pgd = pgd_offset_k(addr);
111 pud = pud_offset(pgd, addr);
112 pmd = pmd_offset(pud, addr);
113 ptep = pte_offset_kernel(pmd, addr);
115 pte.pte = ptep->pte & ~_PAGE_RW;
116 if (HYPERVISOR_update_va_mapping(addr, pte, 0))
117 xen_l1_entry_update(ptep, pte); /* fallback */
119 if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
120 __make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT));
121 }
123 static void __make_page_writable(void *va)
124 {
125 pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
126 unsigned long addr = (unsigned long) va;
128 pgd = pgd_offset_k(addr);
129 pud = pud_offset(pgd, addr);
130 pmd = pmd_offset(pud, addr);
131 ptep = pte_offset_kernel(pmd, addr);
133 pte.pte = ptep->pte | _PAGE_RW;
134 if (HYPERVISOR_update_va_mapping(addr, pte, 0))
135 xen_l1_entry_update(ptep, pte); /* fallback */
137 if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
138 __make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT));
139 }
141 void make_page_readonly(void *va, unsigned int feature)
142 {
143 if (!xen_feature(feature))
144 __make_page_readonly(va);
145 }
147 void make_page_writable(void *va, unsigned int feature)
148 {
149 if (!xen_feature(feature))
150 __make_page_writable(va);
151 }
153 void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
154 {
155 if (xen_feature(feature))
156 return;
158 while (nr-- != 0) {
159 __make_page_readonly(va);
160 va = (void*)((unsigned long)va + PAGE_SIZE);
161 }
162 }
164 void make_pages_writable(void *va, unsigned nr, unsigned int feature)
165 {
166 if (xen_feature(feature))
167 return;
169 while (nr-- != 0) {
170 __make_page_writable(va);
171 va = (void*)((unsigned long)va + PAGE_SIZE);
172 }
173 }
175 /*
176 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
177 * physical space so we can cache the place of the first one and move
178 * around without checking the pgd every time.
179 */
181 void show_mem(void)
182 {
183 long i, total = 0, reserved = 0;
184 long shared = 0, cached = 0;
185 pg_data_t *pgdat;
186 struct page *page;
188 printk(KERN_INFO "Mem-info:\n");
189 show_free_areas();
190 printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
192 for_each_online_pgdat(pgdat) {
193 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
194 page = pfn_to_page(pgdat->node_start_pfn + i);
195 total++;
196 if (PageReserved(page))
197 reserved++;
198 else if (PageSwapCache(page))
199 cached++;
200 else if (page_count(page))
201 shared += page_count(page) - 1;
202 }
203 }
204 printk(KERN_INFO "%lu pages of RAM\n", total);
205 printk(KERN_INFO "%lu reserved pages\n",reserved);
206 printk(KERN_INFO "%lu pages shared\n",shared);
207 printk(KERN_INFO "%lu pages swap cached\n",cached);
208 }
210 int after_bootmem;
212 static __init void *spp_getpage(void)
213 {
214 void *ptr;
215 if (after_bootmem)
216 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
217 else if (start_pfn < table_end) {
218 ptr = __va(start_pfn << PAGE_SHIFT);
219 start_pfn++;
220 memset(ptr, 0, PAGE_SIZE);
221 } else
222 ptr = alloc_bootmem_pages(PAGE_SIZE);
223 if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
224 panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
226 Dprintk("spp_getpage %p\n", ptr);
227 return ptr;
228 }
230 #define pgd_offset_u(address) (pgd_t *)(init_level4_user_pgt + pgd_index(address))
232 static inline pud_t *pud_offset_u(unsigned long address)
233 {
234 pud_t *pud = level3_user_pgt;
236 return pud + pud_index(address);
237 }
239 static __init void set_pte_phys(unsigned long vaddr,
240 unsigned long phys, pgprot_t prot, int user_mode)
241 {
242 pgd_t *pgd;
243 pud_t *pud;
244 pmd_t *pmd;
245 pte_t *pte, new_pte;
247 Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
249 pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
250 if (pgd_none(*pgd)) {
251 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
252 return;
253 }
254 pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
255 if (pud_none(*pud)) {
256 pmd = (pmd_t *) spp_getpage();
257 make_page_readonly(pmd, XENFEAT_writable_page_tables);
258 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
259 if (pmd != pmd_offset(pud, 0)) {
260 printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
261 return;
262 }
263 }
264 pmd = pmd_offset(pud, vaddr);
265 if (pmd_none(*pmd)) {
266 pte = (pte_t *) spp_getpage();
267 make_page_readonly(pte, XENFEAT_writable_page_tables);
268 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
269 if (pte != pte_offset_kernel(pmd, 0)) {
270 printk("PAGETABLE BUG #02!\n");
271 return;
272 }
273 }
274 if (pgprot_val(prot))
275 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
276 else
277 new_pte = __pte(0);
279 pte = pte_offset_kernel(pmd, vaddr);
280 if (!pte_none(*pte) &&
281 pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
282 pte_ERROR(*pte);
283 set_pte(pte, new_pte);
285 /*
286 * It's enough to flush this one mapping.
287 * (PGE mappings get flushed as well)
288 */
289 __flush_tlb_one(vaddr);
290 }
292 static __init void set_pte_phys_ma(unsigned long vaddr,
293 unsigned long phys, pgprot_t prot)
294 {
295 pgd_t *pgd;
296 pud_t *pud;
297 pmd_t *pmd;
298 pte_t *pte, new_pte;
300 Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
302 pgd = pgd_offset_k(vaddr);
303 if (pgd_none(*pgd)) {
304 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
305 return;
306 }
307 pud = pud_offset(pgd, vaddr);
308 if (pud_none(*pud)) {
310 pmd = (pmd_t *) spp_getpage();
311 make_page_readonly(pmd, XENFEAT_writable_page_tables);
312 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
313 if (pmd != pmd_offset(pud, 0)) {
314 printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
315 return;
316 }
317 }
318 pmd = pmd_offset(pud, vaddr);
319 if (pmd_none(*pmd)) {
320 pte = (pte_t *) spp_getpage();
321 make_page_readonly(pte, XENFEAT_writable_page_tables);
322 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
323 if (pte != pte_offset_kernel(pmd, 0)) {
324 printk("PAGETABLE BUG #02!\n");
325 return;
326 }
327 }
328 new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
330 pte = pte_offset_kernel(pmd, vaddr);
331 set_pte(pte, new_pte);
333 /*
334 * It's enough to flush this one mapping.
335 * (PGE mappings get flushed as well)
336 */
337 __flush_tlb_one(vaddr);
338 }
340 #define SET_FIXMAP_KERNEL 0
341 #define SET_FIXMAP_USER 1
343 /* NOTE: this is meant to be run only at boot */
344 void __init
345 __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
346 {
347 unsigned long address = __fix_to_virt(idx);
349 if (idx >= __end_of_fixed_addresses) {
350 printk("Invalid __set_fixmap\n");
351 return;
352 }
353 switch (idx) {
354 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
355 set_pte_phys(address, phys, prot, SET_FIXMAP_KERNEL);
356 break;
357 default:
358 set_pte_phys_ma(address, phys, prot);
359 break;
360 }
361 }
363 /*
364 * This only supports vsyscall area.
365 */
366 void __init
367 __set_fixmap_user (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
368 {
369 unsigned long address = __fix_to_virt(idx);
371 if (idx >= __end_of_fixed_addresses) {
372 printk("Invalid __set_fixmap\n");
373 return;
374 }
376 set_pte_phys(address, phys, prot, SET_FIXMAP_USER);
377 }
379 unsigned long __initdata table_start, table_end;
381 static __meminit void *alloc_static_page(unsigned long *phys)
382 {
383 unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
385 if (after_bootmem) {
386 void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
388 *phys = __pa(adr);
389 return adr;
390 }
392 *phys = start_pfn << PAGE_SHIFT;
393 start_pfn++;
394 memset((void *)va, 0, PAGE_SIZE);
395 return (void *)va;
396 }
398 #define PTE_SIZE PAGE_SIZE
400 static inline void __set_pte(pte_t *dst, pte_t val)
401 {
402 *dst = val;
403 }
405 static inline int make_readonly(unsigned long paddr)
406 {
407 extern char __vsyscall_0;
408 int readonly = 0;
410 /* Make new page tables read-only. */
411 if (!xen_feature(XENFEAT_writable_page_tables)
412 && (paddr >= (table_start << PAGE_SHIFT))
413 && (paddr < (table_end << PAGE_SHIFT)))
414 readonly = 1;
415 /* Make old page tables read-only. */
416 if (!xen_feature(XENFEAT_writable_page_tables)
417 && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
418 && (paddr < (start_pfn << PAGE_SHIFT)))
419 readonly = 1;
421 /*
422 * No need for writable mapping of kernel image. This also ensures that
423 * page and descriptor tables embedded inside don't have writable
424 * mappings. Exclude the vsyscall area here, allowing alternative
425 * instruction patching to work.
426 */
427 if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end))
428 && !(paddr >= __pa_symbol(&__vsyscall_0)
429 && paddr < __pa_symbol(&__vsyscall_0) + PAGE_SIZE))
430 readonly = 1;
432 return readonly;
433 }
435 #ifndef CONFIG_XEN
436 /* Must run before zap_low_mappings */
437 __init void *early_ioremap(unsigned long addr, unsigned long size)
438 {
439 unsigned long map = round_down(addr, LARGE_PAGE_SIZE);
441 /* actually usually some more */
442 if (size >= LARGE_PAGE_SIZE) {
443 printk("SMBIOS area too long %lu\n", size);
444 return NULL;
445 }
446 set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
447 map += LARGE_PAGE_SIZE;
448 set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
449 __flush_tlb();
450 return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
451 }
453 /* To avoid virtual aliases later */
454 __init void early_iounmap(void *addr, unsigned long size)
455 {
456 if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
457 printk("early_iounmap: bad address %p\n", addr);
458 set_pmd(temp_mappings[0].pmd, __pmd(0));
459 set_pmd(temp_mappings[1].pmd, __pmd(0));
460 __flush_tlb();
461 }
462 #endif
464 static void __meminit
465 phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
466 {
467 int i, k;
469 for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
470 unsigned long pte_phys;
471 pte_t *pte, *pte_save;
473 if (address >= end) {
474 if (!after_bootmem)
475 for (; i < PTRS_PER_PMD; i++, pmd++)
476 set_pmd(pmd, __pmd(0));
477 break;
478 }
479 pte = alloc_static_page(&pte_phys);
480 pte_save = pte;
481 for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
482 unsigned long pteval = address | _PAGE_NX | _KERNPG_TABLE;
484 if ((address >= end) ||
485 ((address >> PAGE_SHIFT) >=
486 xen_start_info->nr_pages))
487 pteval = 0;
488 else if (make_readonly(address))
489 pteval &= ~_PAGE_RW;
490 __set_pte(pte, __pte(pteval & __supported_pte_mask));
491 }
492 pte = pte_save;
493 early_make_page_readonly(pte, XENFEAT_writable_page_tables);
494 set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
495 }
496 }
498 static void __meminit
499 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
500 {
501 pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
503 if (pmd_none(*pmd)) {
504 spin_lock(&init_mm.page_table_lock);
505 phys_pmd_init(pmd, address, end);
506 spin_unlock(&init_mm.page_table_lock);
507 __flush_tlb_all();
508 }
509 }
511 static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
512 {
513 long i = pud_index(address);
515 pud = pud + i;
517 if (after_bootmem && pud_val(*pud)) {
518 phys_pmd_update(pud, address, end);
519 return;
520 }
522 for (; i < PTRS_PER_PUD; pud++, i++) {
523 unsigned long paddr, pmd_phys;
524 pmd_t *pmd;
526 paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
527 if (paddr >= end)
528 break;
530 pmd = alloc_static_page(&pmd_phys);
531 early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
532 spin_lock(&init_mm.page_table_lock);
533 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
534 phys_pmd_init(pmd, paddr, end);
535 spin_unlock(&init_mm.page_table_lock);
536 }
537 __flush_tlb();
538 }
540 void __init xen_init_pt(void)
541 {
542 unsigned long addr, *page;
544 /* Find the initial pte page that was built for us. */
545 page = (unsigned long *)xen_start_info->pt_base;
546 addr = page[pgd_index(__START_KERNEL_map)];
547 addr_to_page(addr, page);
548 addr = page[pud_index(__START_KERNEL_map)];
549 addr_to_page(addr, page);
551 #if CONFIG_XEN_COMPAT <= 0x030002
552 /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER
553 in kernel PTEs. We check that here. */
554 if (HYPERVISOR_xen_version(XENVER_version, NULL) <= 0x30000) {
555 unsigned long *pg;
556 pte_t pte;
558 /* Mess with the initial mapping of page 0. It's not needed. */
559 BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map);
560 addr = page[pmd_index(__START_KERNEL_map)];
561 addr_to_page(addr, pg);
562 pte.pte = pg[pte_index(__START_KERNEL_map)];
563 BUG_ON(!(pte.pte & _PAGE_PRESENT));
565 /* If _PAGE_USER isn't set, we obviously do not need it. */
566 if (pte.pte & _PAGE_USER) {
567 /* _PAGE_USER is needed, but is it set implicitly? */
568 pte.pte &= ~_PAGE_USER;
569 if ((HYPERVISOR_update_va_mapping(__START_KERNEL_map,
570 pte, 0) != 0) ||
571 !(pg[pte_index(__START_KERNEL_map)] & _PAGE_USER))
572 /* We need to explicitly specify _PAGE_USER. */
573 __kernel_page_user = _PAGE_USER;
574 }
575 }
576 #endif
578 /* Construct mapping of initial pte page in our own directories. */
579 init_level4_pgt[pgd_index(__START_KERNEL_map)] =
580 __pgd(__pa_symbol(level3_kernel_pgt) | _PAGE_TABLE);
581 level3_kernel_pgt[pud_index(__START_KERNEL_map)] =
582 __pud(__pa_symbol(level2_kernel_pgt) |
583 _KERNPG_TABLE);
584 memcpy((void *)level2_kernel_pgt, page, PAGE_SIZE);
586 early_make_page_readonly(init_level4_pgt,
587 XENFEAT_writable_page_tables);
588 early_make_page_readonly(init_level4_user_pgt,
589 XENFEAT_writable_page_tables);
590 early_make_page_readonly(level3_kernel_pgt,
591 XENFEAT_writable_page_tables);
592 early_make_page_readonly(level3_user_pgt,
593 XENFEAT_writable_page_tables);
594 early_make_page_readonly(level2_kernel_pgt,
595 XENFEAT_writable_page_tables);
597 if (!xen_feature(XENFEAT_writable_page_tables)) {
598 xen_pgd_pin(__pa_symbol(init_level4_pgt));
599 xen_pgd_pin(__pa_symbol(init_level4_user_pgt));
600 }
602 set_pgd((pgd_t *)(init_level4_user_pgt + 511),
603 mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
604 }
606 static void __init extend_init_mapping(unsigned long tables_space)
607 {
608 unsigned long va = __START_KERNEL_map;
609 unsigned long phys, addr, *pte_page;
610 pmd_t *pmd;
611 pte_t *pte, new_pte;
612 unsigned long *page = (unsigned long *)init_level4_pgt;
614 addr = page[pgd_index(va)];
615 addr_to_page(addr, page);
616 addr = page[pud_index(va)];
617 addr_to_page(addr, page);
619 /* Kill mapping of low 1MB. */
620 while (va < (unsigned long)&_text) {
621 HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
622 va += PAGE_SIZE;
623 }
625 /* Ensure init mappings cover kernel text/data and initial tables. */
626 while (va < (__START_KERNEL_map
627 + (start_pfn << PAGE_SHIFT)
628 + tables_space)) {
629 pmd = (pmd_t *)&page[pmd_index(va)];
630 if (pmd_none(*pmd)) {
631 pte_page = alloc_static_page(&phys);
632 early_make_page_readonly(
633 pte_page, XENFEAT_writable_page_tables);
634 set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
635 } else {
636 addr = page[pmd_index(va)];
637 addr_to_page(addr, pte_page);
638 }
639 pte = (pte_t *)&pte_page[pte_index(va)];
640 if (pte_none(*pte)) {
641 new_pte = pfn_pte(
642 (va - __START_KERNEL_map) >> PAGE_SHIFT,
643 __pgprot(_KERNPG_TABLE));
644 xen_l1_entry_update(pte, new_pte);
645 }
646 va += PAGE_SIZE;
647 }
649 /* Finally, blow away any spurious initial mappings. */
650 while (1) {
651 pmd = (pmd_t *)&page[pmd_index(va)];
652 if (pmd_none(*pmd))
653 break;
654 HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
655 va += PAGE_SIZE;
656 }
657 }
659 static void __init find_early_table_space(unsigned long end)
660 {
661 unsigned long puds, pmds, ptes, tables;
663 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
664 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
665 ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
667 tables = round_up(puds * 8, PAGE_SIZE) +
668 round_up(pmds * 8, PAGE_SIZE) +
669 round_up(ptes * 8, PAGE_SIZE);
671 extend_init_mapping(tables);
673 table_start = start_pfn;
674 table_end = table_start + (tables>>PAGE_SHIFT);
676 early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
677 end, table_start << PAGE_SHIFT,
678 (table_start << PAGE_SHIFT) + tables);
679 }
681 static void xen_finish_init_mapping(void)
682 {
683 unsigned long i, start, end;
685 /* Re-vector virtual addresses pointing into the initial
686 mapping to the just-established permanent ones. */
687 xen_start_info = __va(__pa(xen_start_info));
688 xen_start_info->pt_base = (unsigned long)
689 __va(__pa(xen_start_info->pt_base));
690 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
691 phys_to_machine_mapping =
692 __va(__pa(xen_start_info->mfn_list));
693 xen_start_info->mfn_list = (unsigned long)
694 phys_to_machine_mapping;
695 }
696 if (xen_start_info->mod_start)
697 xen_start_info->mod_start = (unsigned long)
698 __va(__pa(xen_start_info->mod_start));
700 /* Destroy the Xen-created mappings beyond the kernel image as
701 * well as the temporary mappings created above. Prevents
702 * overlap with modules area (if init mapping is very big).
703 */
704 start = PAGE_ALIGN((unsigned long)_end);
705 end = __START_KERNEL_map + (table_end << PAGE_SHIFT);
706 for (; start < end; start += PAGE_SIZE)
707 WARN_ON(HYPERVISOR_update_va_mapping(
708 start, __pte_ma(0), 0));
710 /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
711 table_end = ~0UL;
713 /*
714 * Prefetch pte's for the bt_ioremap() area. It gets used before the
715 * boot-time allocator is online, so allocate-on-demand would fail.
716 */
717 for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++)
718 __set_fixmap(i, 0, __pgprot(0));
720 /* Switch to the real shared_info page, and clear the dummy page. */
721 set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
722 HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
723 memset(empty_zero_page, 0, sizeof(empty_zero_page));
725 /* Set up mapping of lowest 1MB of physical memory. */
726 for (i = 0; i < NR_FIX_ISAMAPS; i++)
727 if (is_initial_xendomain())
728 set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
729 else
730 __set_fixmap(FIX_ISAMAP_BEGIN - i,
731 virt_to_mfn(empty_zero_page)
732 << PAGE_SHIFT,
733 PAGE_KERNEL_RO);
735 /* Disable the 'start_pfn' allocator. */
736 table_end = start_pfn;
737 }
739 /* Setup the direct mapping of the physical memory at PAGE_OFFSET.
740 This runs before bootmem is initialized and gets pages directly from the
741 physical memory. To access them they are temporarily mapped. */
742 void __meminit init_memory_mapping(unsigned long start, unsigned long end)
743 {
744 unsigned long next;
746 Dprintk("init_memory_mapping\n");
748 /*
749 * Find space for the kernel direct mapping tables.
750 * Later we should allocate these tables in the local node of the memory
751 * mapped. Unfortunately this is done currently before the nodes are
752 * discovered.
753 */
754 if (!after_bootmem)
755 find_early_table_space(end);
757 start = (unsigned long)__va(start);
758 end = (unsigned long)__va(end);
760 for (; start < end; start = next) {
761 unsigned long pud_phys;
762 pgd_t *pgd = pgd_offset_k(start);
763 pud_t *pud;
765 if (after_bootmem) {
766 pud = pud_offset(pgd, start & PGDIR_MASK);
767 make_page_readonly(pud, XENFEAT_writable_page_tables);
768 pud_phys = __pa(pud);
769 } else {
770 pud = alloc_static_page(&pud_phys);
771 early_make_page_readonly(pud, XENFEAT_writable_page_tables);
772 }
773 next = start + PGDIR_SIZE;
774 if (next > end)
775 next = end;
776 phys_pud_init(pud, __pa(start), __pa(next));
777 if (!after_bootmem)
778 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
779 }
781 if (!after_bootmem) {
782 BUG_ON(start_pfn != table_end);
783 xen_finish_init_mapping();
784 }
786 __flush_tlb_all();
787 }
789 void __cpuinit zap_low_mappings(int cpu)
790 {
791 /* this is not required for Xen */
792 #if 0
793 swap_low_mappings();
794 #endif
795 }
797 /* Compute zone sizes for the DMA and DMA32 zones in a node. */
798 __init void
799 size_zones(unsigned long *z, unsigned long *h,
800 unsigned long start_pfn, unsigned long end_pfn)
801 {
802 int i;
803 #ifndef CONFIG_XEN
804 unsigned long w;
805 #endif
807 for (i = 0; i < MAX_NR_ZONES; i++)
808 z[i] = 0;
810 #ifndef CONFIG_XEN
811 if (start_pfn < MAX_DMA_PFN)
812 z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
813 if (start_pfn < MAX_DMA32_PFN) {
814 unsigned long dma32_pfn = MAX_DMA32_PFN;
815 if (dma32_pfn > end_pfn)
816 dma32_pfn = end_pfn;
817 z[ZONE_DMA32] = dma32_pfn - start_pfn;
818 }
819 z[ZONE_NORMAL] = end_pfn - start_pfn;
821 /* Remove lower zones from higher ones. */
822 w = 0;
823 for (i = 0; i < MAX_NR_ZONES; i++) {
824 if (z[i])
825 z[i] -= w;
826 w += z[i];
827 }
829 /* Compute holes */
830 w = start_pfn;
831 for (i = 0; i < MAX_NR_ZONES; i++) {
832 unsigned long s = w;
833 w += z[i];
834 h[i] = e820_hole_size(s, w);
835 }
837 /* Add the space needed for mem_map to the holes too. */
838 for (i = 0; i < MAX_NR_ZONES; i++)
839 h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
841 /* The 16MB DMA zone has the kernel and other misc mappings.
842 Account them too */
843 if (h[ZONE_DMA]) {
844 h[ZONE_DMA] += dma_reserve;
845 if (h[ZONE_DMA] >= z[ZONE_DMA]) {
846 printk(KERN_WARNING
847 "Kernel too large and filling up ZONE_DMA?\n");
848 h[ZONE_DMA] = z[ZONE_DMA];
849 }
850 }
851 #else
852 z[ZONE_DMA] = end_pfn;
853 for (i = 0; i < MAX_NR_ZONES; i++)
854 h[i] = 0;
855 #endif
856 }
858 #ifndef CONFIG_NUMA
859 void __init paging_init(void)
860 {
861 unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
863 memory_present(0, 0, end_pfn);
864 sparse_init();
865 size_zones(zones, holes, 0, end_pfn);
866 free_area_init_node(0, NODE_DATA(0), zones,
867 __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
869 init_mm.context.pinned = 1;
870 }
871 #endif
873 /* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
874 from the CPU leading to inconsistent cache lines. address and size
875 must be aligned to 2MB boundaries.
876 Does nothing when the mapping doesn't exist. */
877 void __init clear_kernel_mapping(unsigned long address, unsigned long size)
878 {
879 unsigned long end = address + size;
881 BUG_ON(address & ~LARGE_PAGE_MASK);
882 BUG_ON(size & ~LARGE_PAGE_MASK);
884 for (; address < end; address += LARGE_PAGE_SIZE) {
885 pgd_t *pgd = pgd_offset_k(address);
886 pud_t *pud;
887 pmd_t *pmd;
888 if (pgd_none(*pgd))
889 continue;
890 pud = pud_offset(pgd, address);
891 if (pud_none(*pud))
892 continue;
893 pmd = pmd_offset(pud, address);
894 if (!pmd || pmd_none(*pmd))
895 continue;
896 if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
897 /* Could handle this, but it should not happen currently. */
898 printk(KERN_ERR
899 "clear_kernel_mapping: mapping has been split. will leak memory\n");
900 pmd_ERROR(*pmd);
901 }
902 set_pmd(pmd, __pmd(0));
903 }
904 __flush_tlb_all();
905 }
907 /*
908 * Memory hotplug specific functions
909 */
910 void online_page(struct page *page)
911 {
912 ClearPageReserved(page);
913 init_page_count(page);
914 __free_page(page);
915 totalram_pages++;
916 num_physpages++;
917 }
919 #ifdef CONFIG_MEMORY_HOTPLUG
920 /*
921 * XXX: memory_add_physaddr_to_nid() is to find node id from physical address
922 * via probe interface of sysfs. If acpi notifies hot-add event, then it
923 * can tell node id by searching dsdt. But, probe interface doesn't have
924 * node id. So, return 0 as node id at this time.
925 */
926 #ifdef CONFIG_NUMA
927 int memory_add_physaddr_to_nid(u64 start)
928 {
929 return 0;
930 }
931 #endif
933 /*
934 * Memory is added always to NORMAL zone. This means you will never get
935 * additional DMA/DMA32 memory.
936 */
937 int arch_add_memory(int nid, u64 start, u64 size)
938 {
939 struct pglist_data *pgdat = NODE_DATA(nid);
940 struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
941 unsigned long start_pfn = start >> PAGE_SHIFT;
942 unsigned long nr_pages = size >> PAGE_SHIFT;
943 int ret;
945 ret = __add_pages(zone, start_pfn, nr_pages);
946 if (ret)
947 goto error;
949 init_memory_mapping(start, (start + size -1));
951 return ret;
952 error:
953 printk("%s: Problem encountered in __add_pages!\n", __func__);
954 return ret;
955 }
956 EXPORT_SYMBOL_GPL(arch_add_memory);
958 int remove_memory(u64 start, u64 size)
959 {
960 return -EINVAL;
961 }
962 EXPORT_SYMBOL_GPL(remove_memory);
964 #else /* CONFIG_MEMORY_HOTPLUG */
965 /*
966 * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
967 * just online the pages.
968 */
969 int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
970 {
971 int err = -EIO;
972 unsigned long pfn;
973 unsigned long total = 0, mem = 0;
974 for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
975 if (pfn_valid(pfn)) {
976 online_page(pfn_to_page(pfn));
977 err = 0;
978 mem++;
979 }
980 total++;
981 }
982 if (!err) {
983 z->spanned_pages += total;
984 z->present_pages += mem;
985 z->zone_pgdat->node_spanned_pages += total;
986 z->zone_pgdat->node_present_pages += mem;
987 }
988 return err;
989 }
990 #endif /* CONFIG_MEMORY_HOTPLUG */
992 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
993 kcore_vsyscall;
995 void __init mem_init(void)
996 {
997 long codesize, reservedpages, datasize, initsize;
998 unsigned long pfn;
1000 contiguous_bitmap = alloc_bootmem_low_pages(
1001 (end_pfn + 2*BITS_PER_LONG) >> 3);
1002 BUG_ON(!contiguous_bitmap);
1003 memset(contiguous_bitmap, 0, (end_pfn + 2*BITS_PER_LONG) >> 3);
1005 pci_iommu_alloc();
1007 /* How many end-of-memory variables you have, grandma! */
1008 max_low_pfn = end_pfn;
1009 max_pfn = end_pfn;
1010 num_physpages = end_pfn;
1011 high_memory = (void *) __va(end_pfn * PAGE_SIZE);
1013 /* clear the zero-page */
1014 memset(empty_zero_page, 0, PAGE_SIZE);
1016 reservedpages = 0;
1018 /* this will put all low memory onto the freelists */
1019 #ifdef CONFIG_NUMA
1020 totalram_pages = numa_free_all_bootmem();
1021 #else
1022 totalram_pages = free_all_bootmem();
1023 #endif
1024 /* XEN: init and count pages outside initial allocation. */
1025 for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
1026 ClearPageReserved(pfn_to_page(pfn));
1027 init_page_count(pfn_to_page(pfn));
1028 totalram_pages++;
1029 }
1030 reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);
1032 after_bootmem = 1;
1034 codesize = (unsigned long) &_etext - (unsigned long) &_text;
1035 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
1036 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
1038 /* Register memory areas for /proc/kcore */
1039 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
1040 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
1041 VMALLOC_END-VMALLOC_START);
1042 kclist_add(&kcore_kernel, &_stext, _end - _stext);
1043 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
1044 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
1045 VSYSCALL_END - VSYSCALL_START);
1047 printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
1048 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
1049 end_pfn << (PAGE_SHIFT-10),
1050 codesize >> 10,
1051 reservedpages << (PAGE_SHIFT-10),
1052 datasize >> 10,
1053 initsize >> 10);
1055 #ifndef CONFIG_XEN
1056 #ifdef CONFIG_SMP
1057 /*
1058 * Sync boot_level4_pgt mappings with the init_level4_pgt
1059 * except for the low identity mappings which are already zapped
1060 * in init_level4_pgt. This sync-up is essential for AP's bringup
1061 */
1062 memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
1063 #endif
1064 #endif
1065 }
1067 void free_init_pages(char *what, unsigned long begin, unsigned long end)
1068 {
1069 unsigned long addr;
1071 if (begin >= end)
1072 return;
1074 printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
1075 for (addr = begin; addr < end; addr += PAGE_SIZE) {
1076 ClearPageReserved(virt_to_page(addr));
1077 init_page_count(virt_to_page(addr));
1078 memset((void *)(addr & ~(PAGE_SIZE-1)),
1079 POISON_FREE_INITMEM, PAGE_SIZE);
1080 if (addr >= __START_KERNEL_map) {
1081 /* make_readonly() reports all kernel addresses. */
1082 __make_page_writable(__va(__pa(addr)));
1083 if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
1084 pgd_t *pgd = pgd_offset_k(addr);
1085 pud_t *pud = pud_offset(pgd, addr);
1086 pmd_t *pmd = pmd_offset(pud, addr);
1087 pte_t *pte = pte_offset_kernel(pmd, addr);
1089 xen_l1_entry_update(pte, __pte(0)); /* fallback */
1090 }
1091 }
1092 free_page(addr);
1093 totalram_pages++;
1094 }
1095 }
1097 void free_initmem(void)
1098 {
1099 memset(__initdata_begin, POISON_FREE_INITDATA,
1100 __initdata_end - __initdata_begin);
1101 free_init_pages("unused kernel memory",
1102 (unsigned long)(&__init_begin),
1103 (unsigned long)(&__init_end));
1104 }
1106 #ifdef CONFIG_DEBUG_RODATA
1108 void mark_rodata_ro(void)
1109 {
1110 unsigned long addr = (unsigned long)__start_rodata;
1112 for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
1113 change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
1115 printk ("Write protecting the kernel read-only data: %luk\n",
1116 (__end_rodata - __start_rodata) >> 10);
1118 /*
1119 * change_page_attr_addr() requires a global_flush_tlb() call after it.
1120 * We do this after the printk so that if something went wrong in the
1121 * change, the printk gets out at least to give a better debug hint
1122 * of who is the culprit.
1123 */
1124 global_flush_tlb();
1125 }
1126 #endif
1128 #ifdef CONFIG_BLK_DEV_INITRD
1129 void free_initrd_mem(unsigned long start, unsigned long end)
1130 {
1131 free_init_pages("initrd memory", start, end);
1132 }
1133 #endif
1135 void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
1136 {
1137 /* Should check here against the e820 map to avoid double free */
1138 #ifdef CONFIG_NUMA
1139 int nid = phys_to_nid(phys);
1140 reserve_bootmem_node(NODE_DATA(nid), phys, len);
1141 #else
1142 reserve_bootmem(phys, len);
1143 #endif
1144 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
1145 dma_reserve += len / PAGE_SIZE;
1146 }
1148 int kern_addr_valid(unsigned long addr)
1149 {
1150 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
1151 pgd_t *pgd;
1152 pud_t *pud;
1153 pmd_t *pmd;
1154 pte_t *pte;
1156 if (above != 0 && above != -1UL)
1157 return 0;
1159 pgd = pgd_offset_k(addr);
1160 if (pgd_none(*pgd))
1161 return 0;
1163 pud = pud_offset(pgd, addr);
1164 if (pud_none(*pud))
1165 return 0;
1167 pmd = pmd_offset(pud, addr);
1168 if (pmd_none(*pmd))
1169 return 0;
1170 if (pmd_large(*pmd))
1171 return pfn_valid(pmd_pfn(*pmd));
1173 pte = pte_offset_kernel(pmd, addr);
1174 if (pte_none(*pte))
1175 return 0;
1176 return pfn_valid(pte_pfn(*pte));
1177 }
1179 #ifdef CONFIG_SYSCTL
1180 #include <linux/sysctl.h>
1182 extern int exception_trace, page_fault_trace;
1184 static ctl_table debug_table2[] = {
1185 { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
1186 proc_dointvec },
1187 { 0, }
1188 };
1190 static ctl_table debug_root_table2[] = {
1191 { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
1192 .child = debug_table2 },
1193 { 0 },
1194 };
1196 static __init int x8664_sysctl_init(void)
1197 {
1198 register_sysctl_table(debug_root_table2, 1);
1199 return 0;
1200 }
1201 __initcall(x8664_sysctl_init);
1202 #endif
1204 /* A pseudo VMA to allow ptrace access for the vsyscall page. This only
1205 covers the 64bit vsyscall page now. 32bit has a real VMA now and does
1206 not need special handling anymore. */
1208 static struct vm_area_struct gate_vma = {
1209 .vm_start = VSYSCALL_START,
1210 .vm_end = VSYSCALL_END,
1211 .vm_page_prot = PAGE_READONLY
1212 };
1214 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
1215 {
1216 #ifdef CONFIG_IA32_EMULATION
1217 if (test_tsk_thread_flag(tsk, TIF_IA32))
1218 return NULL;
1219 #endif
1220 return &gate_vma;
1221 }
1223 int in_gate_area(struct task_struct *task, unsigned long addr)
1224 {
1225 struct vm_area_struct *vma = get_gate_vma(task);
1226 if (!vma)
1227 return 0;
1228 return (addr >= vma->vm_start) && (addr < vma->vm_end);
1229 }
1231 /* Use this when you have no reliable task/vma, typically from interrupt
1232 * context. It is less reliable than using the task's vma and may give
1233 * false positives.
1234 */
1235 int in_gate_area_no_task(unsigned long addr)
1236 {
1237 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
1238 }