linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c @ 14403:26a1378d5ece (direct-io.hg)

[LINUX] Don't pin the initial page tables on x86_64 when XENFEAT_writable_page_tables.

Signed-off-by: Ian Campbell <ian.campbell@xensource.com>
author   Ian Campbell <ian.campbell@xensource.com>
date     Fri Mar 16 10:24:56 2007 +0000
parents  200d13363a38
children 2b24d842bbd3

1 /*
2 * linux/arch/x86_64/mm/init.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
6 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
7 *
8 * Jun Nakajima <jun.nakajima@intel.com>
9 * Modified for Xen.
10 */
12 #include <linux/signal.h>
13 #include <linux/sched.h>
14 #include <linux/kernel.h>
15 #include <linux/errno.h>
16 #include <linux/string.h>
17 #include <linux/types.h>
18 #include <linux/ptrace.h>
19 #include <linux/mman.h>
20 #include <linux/mm.h>
21 #include <linux/swap.h>
22 #include <linux/smp.h>
23 #include <linux/init.h>
24 #include <linux/pagemap.h>
25 #include <linux/bootmem.h>
26 #include <linux/proc_fs.h>
27 #include <linux/pci.h>
28 #include <linux/poison.h>
29 #include <linux/dma-mapping.h>
30 #include <linux/module.h>
31 #include <linux/memory_hotplug.h>
33 #include <asm/processor.h>
34 #include <asm/system.h>
35 #include <asm/uaccess.h>
36 #include <asm/pgtable.h>
37 #include <asm/pgalloc.h>
38 #include <asm/dma.h>
39 #include <asm/fixmap.h>
40 #include <asm/e820.h>
41 #include <asm/apic.h>
42 #include <asm/tlb.h>
43 #include <asm/mmu_context.h>
44 #include <asm/proto.h>
45 #include <asm/smp.h>
46 #include <asm/sections.h>
48 #include <xen/features.h>
50 #ifndef Dprintk
51 #define Dprintk(x...)
52 #endif
54 struct dma_mapping_ops* dma_ops;
55 EXPORT_SYMBOL(dma_ops);
57 #ifdef CONFIG_XEN_COMPAT_030002
58 unsigned int __kernel_page_user;
59 EXPORT_SYMBOL(__kernel_page_user);
60 #endif
62 extern unsigned long *contiguous_bitmap;
64 static unsigned long dma_reserve __initdata;
66 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
67 extern unsigned long start_pfn;
69 /*
70 * Use this until direct mapping is established, i.e. before __va() is
71 * available in init_memory_mapping().
72 */
74 #define addr_to_page(addr, page) \
75 (addr) &= PHYSICAL_PAGE_MASK; \
76 (page) = ((unsigned long *) ((unsigned long) \
77 (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \
78 __START_KERNEL_map)))
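/*
 * addr_to_page(): given a page-table entry value 'addr' (a machine address
 * plus flag bits), strip the flags, convert the machine frame to its
 * pseudo-physical frame, and point 'page' at that frame through the
 * __START_KERNEL_map alias. Note it modifies 'addr' in place and assigns
 * 'page' rather than returning a value.
 */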
80 static void __meminit early_make_page_readonly(void *va, unsigned int feature)
81 {
82 unsigned long addr, _va = (unsigned long)va;
83 pte_t pte, *ptep;
84 unsigned long *page = (unsigned long *) init_level4_pgt;
86 if (xen_feature(feature))
87 return;
89 addr = (unsigned long) page[pgd_index(_va)];
90 addr_to_page(addr, page);
92 addr = page[pud_index(_va)];
93 addr_to_page(addr, page);
95 addr = page[pmd_index(_va)];
96 addr_to_page(addr, page);
98 ptep = (pte_t *) &page[pte_index(_va)];
100 pte.pte = ptep->pte & ~_PAGE_RW;
101 if (HYPERVISOR_update_va_mapping(_va, pte, 0))
102 BUG();
103 }
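/*
 * The early_* helper above walks the boot page tables by hand because the
 * direct mapping (and therefore __va()) is not usable yet. The two helpers
 * below are the runtime equivalents: they walk the kernel page tables with
 * pgd_offset_k() and friends, and for addresses in the vmalloc area they
 * recurse on the direct-mapping (__va) alias of the frame the pte maps, so
 * that alias gets the same protection.
 */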
105 static void __make_page_readonly(void *va)
106 {
107 pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
108 unsigned long addr = (unsigned long) va;
110 pgd = pgd_offset_k(addr);
111 pud = pud_offset(pgd, addr);
112 pmd = pmd_offset(pud, addr);
113 ptep = pte_offset_kernel(pmd, addr);
115 pte.pte = ptep->pte & ~_PAGE_RW;
116 if (HYPERVISOR_update_va_mapping(addr, pte, 0))
117 xen_l1_entry_update(ptep, pte); /* fallback */
119 if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
120 __make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT));
121 }
123 static void __make_page_writable(void *va)
124 {
125 pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
126 unsigned long addr = (unsigned long) va;
128 pgd = pgd_offset_k(addr);
129 pud = pud_offset(pgd, addr);
130 pmd = pmd_offset(pud, addr);
131 ptep = pte_offset_kernel(pmd, addr);
133 pte.pte = ptep->pte | _PAGE_RW;
134 if (HYPERVISOR_update_va_mapping(addr, pte, 0))
135 xen_l1_entry_update(ptep, pte); /* fallback */
137 if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
138 __make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT));
139 }
141 void make_page_readonly(void *va, unsigned int feature)
142 {
143 if (!xen_feature(feature))
144 __make_page_readonly(va);
145 }
147 void make_page_writable(void *va, unsigned int feature)
148 {
149 if (!xen_feature(feature))
150 __make_page_writable(va);
151 }
153 void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
154 {
155 if (xen_feature(feature))
156 return;
158 while (nr-- != 0) {
159 __make_page_readonly(va);
160 va = (void*)((unsigned long)va + PAGE_SIZE);
161 }
162 }
164 void make_pages_writable(void *va, unsigned nr, unsigned int feature)
165 {
166 if (xen_feature(feature))
167 return;
169 while (nr-- != 0) {
170 __make_page_writable(va);
171 va = (void*)((unsigned long)va + PAGE_SIZE);
172 }
173 }
175 /*
176 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
177 * physical space, so we can cache the location of the first one and move
178 * around without checking the pgd every time.
179 */
181 void show_mem(void)
182 {
183 long i, total = 0, reserved = 0;
184 long shared = 0, cached = 0;
185 pg_data_t *pgdat;
186 struct page *page;
188 printk(KERN_INFO "Mem-info:\n");
189 show_free_areas();
190 printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
192 for_each_online_pgdat(pgdat) {
193 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
194 page = pfn_to_page(pgdat->node_start_pfn + i);
195 total++;
196 if (PageReserved(page))
197 reserved++;
198 else if (PageSwapCache(page))
199 cached++;
200 else if (page_count(page))
201 shared += page_count(page) - 1;
202 }
203 }
204 printk(KERN_INFO "%ld pages of RAM\n", total);
205 printk(KERN_INFO "%ld reserved pages\n", reserved);
206 printk(KERN_INFO "%ld pages shared\n", shared);
207 printk(KERN_INFO "%ld pages swap cached\n", cached);
208 }
210 int after_bootmem;
212 static __init void *spp_getpage(void)
213 {
214 void *ptr;
215 if (after_bootmem)
216 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
217 else if (start_pfn < table_end) {
218 ptr = __va(start_pfn << PAGE_SHIFT);
219 start_pfn++;
220 memset(ptr, 0, PAGE_SIZE);
221 } else
222 ptr = alloc_bootmem_pages(PAGE_SIZE);
223 if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
224 panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
226 Dprintk("spp_getpage %p\n", ptr);
227 return ptr;
228 }
230 #define pgd_offset_u(address) (pgd_t *)(init_level4_user_pgt + pgd_index(address))
232 static inline pud_t *pud_offset_u(unsigned long address)
233 {
234 pud_t *pud = level3_user_pgt;
236 return pud + pud_index(address);
237 }
239 static __init void set_pte_phys(unsigned long vaddr,
240 unsigned long phys, pgprot_t prot, int user_mode)
241 {
242 pgd_t *pgd;
243 pud_t *pud;
244 pmd_t *pmd;
245 pte_t *pte, new_pte;
247 Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
249 pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
250 if (pgd_none(*pgd)) {
251 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
252 return;
253 }
254 pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
255 if (pud_none(*pud)) {
256 pmd = (pmd_t *) spp_getpage();
257 make_page_readonly(pmd, XENFEAT_writable_page_tables);
258 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
259 if (pmd != pmd_offset(pud, 0)) {
260 printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
261 return;
262 }
263 }
264 pmd = pmd_offset(pud, vaddr);
265 if (pmd_none(*pmd)) {
266 pte = (pte_t *) spp_getpage();
267 make_page_readonly(pte, XENFEAT_writable_page_tables);
268 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
269 if (pte != pte_offset_kernel(pmd, 0)) {
270 printk("PAGETABLE BUG #02!\n");
271 return;
272 }
273 }
274 if (pgprot_val(prot))
275 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
276 else
277 new_pte = __pte(0);
279 pte = pte_offset_kernel(pmd, vaddr);
280 if (!pte_none(*pte) &&
281 pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
282 pte_ERROR(*pte);
283 set_pte(pte, new_pte);
285 /*
286 * It's enough to flush this one mapping.
287 * (PGE mappings get flushed as well)
288 */
289 __flush_tlb_one(vaddr);
290 }
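/*
 * Same as set_pte_phys() above except that 'phys' is a machine address: the
 * pte is built with pfn_pte_ma() and installed without re-checking any
 * existing entry. Used by __set_fixmap() for fixmaps that must point at
 * machine frames, such as the shared info page.
 */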
292 static __init void set_pte_phys_ma(unsigned long vaddr,
293 unsigned long phys, pgprot_t prot)
294 {
295 pgd_t *pgd;
296 pud_t *pud;
297 pmd_t *pmd;
298 pte_t *pte, new_pte;
300 Dprintk("set_pte_phys_ma %lx to %lx\n", vaddr, phys);
302 pgd = pgd_offset_k(vaddr);
303 if (pgd_none(*pgd)) {
304 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
305 return;
306 }
307 pud = pud_offset(pgd, vaddr);
308 if (pud_none(*pud)) {
310 pmd = (pmd_t *) spp_getpage();
311 make_page_readonly(pmd, XENFEAT_writable_page_tables);
312 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
313 if (pmd != pmd_offset(pud, 0)) {
314 printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
315 return;
316 }
317 }
318 pmd = pmd_offset(pud, vaddr);
319 if (pmd_none(*pmd)) {
320 pte = (pte_t *) spp_getpage();
321 make_page_readonly(pte, XENFEAT_writable_page_tables);
322 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
323 if (pte != pte_offset_kernel(pmd, 0)) {
324 printk("PAGETABLE BUG #02!\n");
325 return;
326 }
327 }
328 new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
330 pte = pte_offset_kernel(pmd, vaddr);
331 set_pte(pte, new_pte);
333 /*
334 * It's enough to flush this one mapping.
335 * (PGE mappings get flushed as well)
336 */
337 __flush_tlb_one(vaddr);
338 }
340 #define SET_FIXMAP_KERNEL 0
341 #define SET_FIXMAP_USER 1
343 /* NOTE: this is meant to be run only at boot */
344 void __init
345 __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
346 {
347 unsigned long address = __fix_to_virt(idx);
349 if (idx >= __end_of_fixed_addresses) {
350 printk("Invalid __set_fixmap\n");
351 return;
352 }
353 switch (idx) {
354 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
355 set_pte_phys(address, phys, prot, SET_FIXMAP_KERNEL);
356 break;
357 default:
358 set_pte_phys_ma(address, phys, prot);
359 break;
360 }
361 }
363 /*
364 * This only supports vsyscall area.
365 */
366 void __init
367 __set_fixmap_user (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
368 {
369 unsigned long address = __fix_to_virt(idx);
371 if (idx >= __end_of_fixed_addresses) {
372 printk("Invalid __set_fixmap\n");
373 return;
374 }
376 set_pte_phys(address, phys, prot, SET_FIXMAP_USER);
377 }
379 unsigned long __initdata table_start, table_end;
381 static __meminit void *alloc_static_page(unsigned long *phys)
382 {
383 unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
385 if (after_bootmem) {
386 void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
388 *phys = __pa(adr);
389 return adr;
390 }
392 *phys = start_pfn << PAGE_SHIFT;
393 start_pfn++;
394 memset((void *)va, 0, PAGE_SIZE);
395 return (void *)va;
396 }
398 #define PTE_SIZE PAGE_SIZE
400 static inline void __set_pte(pte_t *dst, pte_t val)
401 {
402 *dst = val;
403 }
405 static inline int make_readonly(unsigned long paddr)
406 {
407 extern char __vsyscall_0;
408 int readonly = 0;
410 /* Make new page tables read-only. */
411 if (!xen_feature(XENFEAT_writable_page_tables)
412 && (paddr >= (table_start << PAGE_SHIFT))
413 && (paddr < (table_end << PAGE_SHIFT)))
414 readonly = 1;
415 /* Make old page tables read-only. */
416 if (!xen_feature(XENFEAT_writable_page_tables)
417 && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
418 && (paddr < (start_pfn << PAGE_SHIFT)))
419 readonly = 1;
421 /*
422 * No need for writable mapping of kernel image. This also ensures that
423 * page and descriptor tables embedded inside don't have writable
424 * mappings. Exclude the vsyscall area here, allowing alternative
425 * instruction patching to work.
426 */
427 if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end))
428 && !(paddr >= __pa_symbol(&__vsyscall_0)
429 && paddr < __pa_symbol(&__vsyscall_0) + PAGE_SIZE))
430 readonly = 1;
432 return readonly;
433 }
435 #ifndef CONFIG_XEN
436 /* Must run before zap_low_mappings */
437 __init void *early_ioremap(unsigned long addr, unsigned long size)
438 {
439 unsigned long map = round_down(addr, LARGE_PAGE_SIZE);
441 /* actually usually some more */
442 if (size >= LARGE_PAGE_SIZE) {
443 printk("SMBIOS area too long %lu\n", size);
444 return NULL;
445 }
446 set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
447 map += LARGE_PAGE_SIZE;
448 set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
449 __flush_tlb();
450 return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
451 }
453 /* To avoid virtual aliases later */
454 __init void early_iounmap(void *addr, unsigned long size)
455 {
456 if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
457 printk("early_iounmap: bad address %p\n", addr);
458 set_pmd(temp_mappings[0].pmd, __pmd(0));
459 set_pmd(temp_mappings[1].pmd, __pmd(0));
460 __flush_tlb();
461 }
462 #endif
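/*
 * The direct mapping is built with 4k ptes rather than 2MB PSE entries:
 * under Xen individual frames (the old and new page tables, the kernel
 * image) must be mappable read-only, which make_readonly() above decides
 * on a per-page basis.
 */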
464 static void __meminit
465 phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
466 {
467 int i, k;
469 for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
470 unsigned long pte_phys;
471 pte_t *pte, *pte_save;
473 if (address >= end) {
474 if (!after_bootmem)
475 for (; i < PTRS_PER_PMD; i++, pmd++)
476 set_pmd(pmd, __pmd(0));
477 break;
478 }
479 pte = alloc_static_page(&pte_phys);
480 pte_save = pte;
481 for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
482 if ((address >= end) ||
483 ((address >> PAGE_SHIFT) >=
484 xen_start_info->nr_pages)) {
485 __set_pte(pte, __pte(0));
486 continue;
487 }
488 if (make_readonly(address)) {
489 __set_pte(pte,
490 __pte(address | (_KERNPG_TABLE & ~_PAGE_RW)));
491 continue;
492 }
493 __set_pte(pte, __pte(address | _KERNPG_TABLE));
494 }
495 pte = pte_save;
496 early_make_page_readonly(pte, XENFEAT_writable_page_tables);
497 set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
498 }
499 }
501 static void __meminit
502 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
503 {
504 pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
506 if (pmd_none(*pmd)) {
507 spin_lock(&init_mm.page_table_lock);
508 phys_pmd_init(pmd, address, end);
509 spin_unlock(&init_mm.page_table_lock);
510 __flush_tlb_all();
511 }
512 }
514 static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
515 {
516 long i = pud_index(address);
518 pud = pud + i;
520 if (after_bootmem && pud_val(*pud)) {
521 phys_pmd_update(pud, address, end);
522 return;
523 }
525 for (; i < PTRS_PER_PUD; pud++, i++) {
526 unsigned long paddr, pmd_phys;
527 pmd_t *pmd;
529 paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
530 if (paddr >= end)
531 break;
533 pmd = alloc_static_page(&pmd_phys);
534 early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
535 spin_lock(&init_mm.page_table_lock);
536 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
537 phys_pmd_init(pmd, paddr, end);
538 spin_unlock(&init_mm.page_table_lock);
539 }
540 __flush_tlb();
541 }
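/*
 * Switch from the page tables supplied by the domain builder (at
 * xen_start_info->pt_base) to the kernel's own init_level4_pgt: the
 * builder's kernel pmd page is copied into level2_kernel_pgt, the new
 * tables are made read-only, and, unless XENFEAT_writable_page_tables is
 * advertised, the two pgd roots are pinned before use.
 */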
543 void __init xen_init_pt(void)
544 {
545 unsigned long addr, *page;
547 /* Find the initial pte page that was built for us. */
548 page = (unsigned long *)xen_start_info->pt_base;
549 addr = page[pgd_index(__START_KERNEL_map)];
550 addr_to_page(addr, page);
551 addr = page[pud_index(__START_KERNEL_map)];
552 addr_to_page(addr, page);
554 #ifdef CONFIG_XEN_COMPAT_030002
555 /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER
556 in kernel PTEs. We check that here. */
557 if (HYPERVISOR_xen_version(XENVER_version, NULL) <= 0x30000) {
558 unsigned long *pg;
559 pte_t pte;
561 /* Mess with the initial mapping of page 0. It's not needed. */
562 BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map);
563 addr = page[pmd_index(__START_KERNEL_map)];
564 addr_to_page(addr, pg);
565 pte.pte = pg[pte_index(__START_KERNEL_map)];
566 BUG_ON(!(pte.pte & _PAGE_PRESENT));
568 /* If _PAGE_USER isn't set, we obviously do not need it. */
569 if (pte.pte & _PAGE_USER) {
570 /* _PAGE_USER is needed, but is it set implicitly? */
571 pte.pte &= ~_PAGE_USER;
572 if ((HYPERVISOR_update_va_mapping(__START_KERNEL_map,
573 pte, 0) != 0) ||
574 !(pg[pte_index(__START_KERNEL_map)] & _PAGE_USER))
575 /* We need to explicitly specify _PAGE_USER. */
576 __kernel_page_user = _PAGE_USER;
577 }
578 }
579 #endif
581 /* Construct mapping of initial pte page in our own directories. */
582 init_level4_pgt[pgd_index(__START_KERNEL_map)] =
583 mk_kernel_pgd(__pa_symbol(level3_kernel_pgt));
584 level3_kernel_pgt[pud_index(__START_KERNEL_map)] =
585 __pud(__pa_symbol(level2_kernel_pgt) |
586 _KERNPG_TABLE);
587 memcpy((void *)level2_kernel_pgt, page, PAGE_SIZE);
589 early_make_page_readonly(init_level4_pgt,
590 XENFEAT_writable_page_tables);
591 early_make_page_readonly(init_level4_user_pgt,
592 XENFEAT_writable_page_tables);
593 early_make_page_readonly(level3_kernel_pgt,
594 XENFEAT_writable_page_tables);
595 early_make_page_readonly(level3_user_pgt,
596 XENFEAT_writable_page_tables);
597 early_make_page_readonly(level2_kernel_pgt,
598 XENFEAT_writable_page_tables);
600 if (!xen_feature(XENFEAT_writable_page_tables)) {
601 xen_pgd_pin(__pa_symbol(init_level4_pgt));
602 xen_pgd_pin(__pa_symbol(init_level4_user_pgt));
603 }
605 set_pgd((pgd_t *)(init_level4_user_pgt + 511),
606 mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
607 }
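/*
 * Grow the initial __START_KERNEL_map mapping so that it covers the kernel
 * image plus 'tables_space' bytes of early page-table allocations, unmap
 * the low 1MB below _text, and tear down any builder-provided mappings
 * beyond the extended range.
 */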
609 static void __init extend_init_mapping(unsigned long tables_space)
610 {
611 unsigned long va = __START_KERNEL_map;
612 unsigned long phys, addr, *pte_page;
613 pmd_t *pmd;
614 pte_t *pte, new_pte;
615 unsigned long *page = (unsigned long *)init_level4_pgt;
617 addr = page[pgd_index(va)];
618 addr_to_page(addr, page);
619 addr = page[pud_index(va)];
620 addr_to_page(addr, page);
622 /* Kill mapping of low 1MB. */
623 while (va < (unsigned long)&_text) {
624 HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
625 va += PAGE_SIZE;
626 }
628 /* Ensure init mappings cover kernel text/data and initial tables. */
629 while (va < (__START_KERNEL_map
630 + (start_pfn << PAGE_SHIFT)
631 + tables_space)) {
632 pmd = (pmd_t *)&page[pmd_index(va)];
633 if (pmd_none(*pmd)) {
634 pte_page = alloc_static_page(&phys);
635 early_make_page_readonly(
636 pte_page, XENFEAT_writable_page_tables);
637 set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
638 } else {
639 addr = page[pmd_index(va)];
640 addr_to_page(addr, pte_page);
641 }
642 pte = (pte_t *)&pte_page[pte_index(va)];
643 if (pte_none(*pte)) {
644 new_pte = pfn_pte(
645 (va - __START_KERNEL_map) >> PAGE_SHIFT,
646 __pgprot(_KERNPG_TABLE));
647 xen_l1_entry_update(pte, new_pte);
648 }
649 va += PAGE_SIZE;
650 }
652 /* Finally, blow away any spurious initial mappings. */
653 while (1) {
654 pmd = (pmd_t *)&page[pmd_index(va)];
655 if (pmd_none(*pmd))
656 break;
657 HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
658 va += PAGE_SIZE;
659 }
660 }
662 static void __init find_early_table_space(unsigned long end)
663 {
664 unsigned long puds, pmds, ptes, tables;
666 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
667 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
668 ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
670 tables = round_up(puds * 8, PAGE_SIZE) +
671 round_up(pmds * 8, PAGE_SIZE) +
672 round_up(ptes * 8, PAGE_SIZE);
674 extend_init_mapping(tables);
676 table_start = start_pfn;
677 table_end = table_start + (tables>>PAGE_SHIFT);
679 early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
680 end, table_start << PAGE_SHIFT,
681 (table_start << PAGE_SHIFT) + tables);
682 }
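/*
 * Runs once the permanent direct mapping exists: rewrite the boot-time
 * pointers (xen_start_info, pt_base, mfn_list, mod_start) in terms of
 * __va(), drop the leftover initial mappings above _end, and set up the
 * early fixmaps (bt_ioremap slots, shared_info, the ISA range) while the
 * start_pfn allocator is still usable; it is disabled again at the end.
 */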
684 static void xen_finish_init_mapping(void)
685 {
686 unsigned long i, start, end;
688 /* Re-vector virtual addresses pointing into the initial
689 mapping to the just-established permanent ones. */
690 xen_start_info = __va(__pa(xen_start_info));
691 xen_start_info->pt_base = (unsigned long)
692 __va(__pa(xen_start_info->pt_base));
693 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
694 phys_to_machine_mapping =
695 __va(__pa(xen_start_info->mfn_list));
696 xen_start_info->mfn_list = (unsigned long)
697 phys_to_machine_mapping;
698 }
699 if (xen_start_info->mod_start)
700 xen_start_info->mod_start = (unsigned long)
701 __va(__pa(xen_start_info->mod_start));
703 /* Destroy the Xen-created mappings beyond the kernel image as
704 * well as the temporary mappings created above. Prevents
705 * overlap with modules area (if init mapping is very big).
706 */
707 start = PAGE_ALIGN((unsigned long)_end);
708 end = __START_KERNEL_map + (table_end << PAGE_SHIFT);
709 for (; start < end; start += PAGE_SIZE)
710 WARN_ON(HYPERVISOR_update_va_mapping(
711 start, __pte_ma(0), 0));
713 /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
714 table_end = ~0UL;
716 /*
717 * Prefetch pte's for the bt_ioremap() area. It gets used before the
718 * boot-time allocator is online, so allocate-on-demand would fail.
719 */
720 for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++)
721 __set_fixmap(i, 0, __pgprot(0));
723 /* Switch to the real shared_info page, and clear the dummy page. */
724 set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
725 HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
726 memset(empty_zero_page, 0, sizeof(empty_zero_page));
728 /* Set up mapping of lowest 1MB of physical memory. */
729 for (i = 0; i < NR_FIX_ISAMAPS; i++)
730 if (is_initial_xendomain())
731 set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
732 else
733 __set_fixmap(FIX_ISAMAP_BEGIN - i,
734 virt_to_mfn(empty_zero_page)
735 << PAGE_SHIFT,
736 PAGE_KERNEL_RO);
738 /* Disable the 'start_pfn' allocator. */
739 table_end = start_pfn;
740 }
742 /* Set up the direct mapping of the physical memory at PAGE_OFFSET.
743 This runs before bootmem is initialized and gets pages directly from the
744 physical memory. To access them they are temporarily mapped. */
745 void __meminit init_memory_mapping(unsigned long start, unsigned long end)
746 {
747 unsigned long next;
749 Dprintk("init_memory_mapping\n");
751 /*
752 * Find space for the kernel direct mapping tables.
753 * Later we should allocate these tables in the local node of the memory
754 * mapped. Unfortunately this is done currently before the nodes are
755 * discovered.
756 */
757 if (!after_bootmem)
758 find_early_table_space(end);
760 start = (unsigned long)__va(start);
761 end = (unsigned long)__va(end);
763 for (; start < end; start = next) {
764 unsigned long pud_phys;
765 pgd_t *pgd = pgd_offset_k(start);
766 pud_t *pud;
768 if (after_bootmem) {
769 pud = pud_offset(pgd, start & PGDIR_MASK);
770 make_page_readonly(pud, XENFEAT_writable_page_tables);
771 pud_phys = __pa(pud);
772 } else {
773 pud = alloc_static_page(&pud_phys);
774 early_make_page_readonly(pud, XENFEAT_writable_page_tables);
775 }
776 next = start + PGDIR_SIZE;
777 if (next > end)
778 next = end;
779 phys_pud_init(pud, __pa(start), __pa(next));
780 if (!after_bootmem)
781 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
782 }
784 if (!after_bootmem) {
785 BUG_ON(start_pfn != table_end);
786 xen_finish_init_mapping();
787 }
789 __flush_tlb_all();
790 }
792 void __cpuinit zap_low_mappings(int cpu)
793 {
794 /* this is not required for Xen */
795 #if 0
796 swap_low_mappings();
797 #endif
798 }
800 /* Compute zone sizes for the DMA and DMA32 zones in a node. */
801 __init void
802 size_zones(unsigned long *z, unsigned long *h,
803 unsigned long start_pfn, unsigned long end_pfn)
804 {
805 int i;
806 #ifndef CONFIG_XEN
807 unsigned long w;
808 #endif
810 for (i = 0; i < MAX_NR_ZONES; i++)
811 z[i] = 0;
813 #ifndef CONFIG_XEN
814 if (start_pfn < MAX_DMA_PFN)
815 z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
816 if (start_pfn < MAX_DMA32_PFN) {
817 unsigned long dma32_pfn = MAX_DMA32_PFN;
818 if (dma32_pfn > end_pfn)
819 dma32_pfn = end_pfn;
820 z[ZONE_DMA32] = dma32_pfn - start_pfn;
821 }
822 z[ZONE_NORMAL] = end_pfn - start_pfn;
824 /* Remove lower zones from higher ones. */
825 w = 0;
826 for (i = 0; i < MAX_NR_ZONES; i++) {
827 if (z[i])
828 z[i] -= w;
829 w += z[i];
830 }
832 /* Compute holes */
833 w = start_pfn;
834 for (i = 0; i < MAX_NR_ZONES; i++) {
835 unsigned long s = w;
836 w += z[i];
837 h[i] = e820_hole_size(s, w);
838 }
840 /* Add the space needed for mem_map to the holes too. */
841 for (i = 0; i < MAX_NR_ZONES; i++)
842 h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
844 /* The 16MB DMA zone has the kernel and other misc mappings.
845 Account them too */
846 if (h[ZONE_DMA]) {
847 h[ZONE_DMA] += dma_reserve;
848 if (h[ZONE_DMA] >= z[ZONE_DMA]) {
849 printk(KERN_WARNING
850 "Kernel too large and filling up ZONE_DMA?\n");
851 h[ZONE_DMA] = z[ZONE_DMA];
852 }
853 }
854 #else
855 z[ZONE_DMA] = end_pfn;
856 for (i = 0; i < MAX_NR_ZONES; i++)
857 h[i] = 0;
858 #endif
859 }
861 #ifndef CONFIG_NUMA
862 void __init paging_init(void)
863 {
864 unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
866 memory_present(0, 0, end_pfn);
867 sparse_init();
868 size_zones(zones, holes, 0, end_pfn);
869 free_area_init_node(0, NODE_DATA(0), zones,
870 __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
872 init_mm.context.pinned = 1;
873 }
874 #endif
876 /* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
877 from the CPU leading to inconsistent cache lines. address and size
878 must be aligned to 2MB boundaries.
879 Does nothing when the mapping doesn't exist. */
880 void __init clear_kernel_mapping(unsigned long address, unsigned long size)
881 {
882 unsigned long end = address + size;
884 BUG_ON(address & ~LARGE_PAGE_MASK);
885 BUG_ON(size & ~LARGE_PAGE_MASK);
887 for (; address < end; address += LARGE_PAGE_SIZE) {
888 pgd_t *pgd = pgd_offset_k(address);
889 pud_t *pud;
890 pmd_t *pmd;
891 if (pgd_none(*pgd))
892 continue;
893 pud = pud_offset(pgd, address);
894 if (pud_none(*pud))
895 continue;
896 pmd = pmd_offset(pud, address);
897 if (!pmd || pmd_none(*pmd))
898 continue;
899 if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
900 /* Could handle this, but it should not happen currently. */
901 printk(KERN_ERR
902 "clear_kernel_mapping: mapping has been split. will leak memory\n");
903 pmd_ERROR(*pmd);
904 }
905 set_pmd(pmd, __pmd(0));
906 }
907 __flush_tlb_all();
908 }
910 /*
911 * Memory hotplug specific functions
912 */
913 void online_page(struct page *page)
914 {
915 ClearPageReserved(page);
916 init_page_count(page);
917 __free_page(page);
918 totalram_pages++;
919 num_physpages++;
920 }
922 #ifdef CONFIG_MEMORY_HOTPLUG
923 /*
924 * XXX: memory_add_physaddr_to_nid() is meant to find the node id for a
925 * physical address passed in via the sysfs probe interface. When ACPI
926 * notifies a hot-add event the node id can be found by searching the DSDT,
927 * but the probe interface carries no node id, so return node 0 for now.
928 */
929 #ifdef CONFIG_NUMA
930 int memory_add_physaddr_to_nid(u64 start)
931 {
932 return 0;
933 }
934 #endif
936 /*
937 * Memory is added always to NORMAL zone. This means you will never get
938 * additional DMA/DMA32 memory.
939 */
940 int arch_add_memory(int nid, u64 start, u64 size)
941 {
942 struct pglist_data *pgdat = NODE_DATA(nid);
943 struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
944 unsigned long start_pfn = start >> PAGE_SHIFT;
945 unsigned long nr_pages = size >> PAGE_SHIFT;
946 int ret;
948 ret = __add_pages(zone, start_pfn, nr_pages);
949 if (ret)
950 goto error;
952 init_memory_mapping(start, (start + size -1));
954 return ret;
955 error:
956 printk("%s: Problem encountered in __add_pages!\n", __func__);
957 return ret;
958 }
959 EXPORT_SYMBOL_GPL(arch_add_memory);
961 int remove_memory(u64 start, u64 size)
962 {
963 return -EINVAL;
964 }
965 EXPORT_SYMBOL_GPL(remove_memory);
967 #else /* CONFIG_MEMORY_HOTPLUG */
968 /*
969 * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
970 * just online the pages.
971 */
972 int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
973 {
974 int err = -EIO;
975 unsigned long pfn;
976 unsigned long total = 0, mem = 0;
977 for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
978 if (pfn_valid(pfn)) {
979 online_page(pfn_to_page(pfn));
980 err = 0;
981 mem++;
982 }
983 total++;
984 }
985 if (!err) {
986 z->spanned_pages += total;
987 z->present_pages += mem;
988 z->zone_pgdat->node_spanned_pages += total;
989 z->zone_pgdat->node_present_pages += mem;
990 }
991 return err;
992 }
993 #endif /* CONFIG_MEMORY_HOTPLUG */
995 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
996 kcore_vsyscall;
998 void __init mem_init(void)
999 {
1000 long codesize, reservedpages, datasize, initsize;
1001 unsigned long pfn;
1003 contiguous_bitmap = alloc_bootmem_low_pages(
1004 (end_pfn + 2*BITS_PER_LONG) >> 3);
1005 BUG_ON(!contiguous_bitmap);
1006 memset(contiguous_bitmap, 0, (end_pfn + 2*BITS_PER_LONG) >> 3);
1008 pci_iommu_alloc();
1010 /* How many end-of-memory variables you have, grandma! */
1011 max_low_pfn = end_pfn;
1012 max_pfn = end_pfn;
1013 num_physpages = end_pfn;
1014 high_memory = (void *) __va(end_pfn * PAGE_SIZE);
1016 /* clear the zero-page */
1017 memset(empty_zero_page, 0, PAGE_SIZE);
1019 reservedpages = 0;
1021 /* this will put all low memory onto the freelists */
1022 #ifdef CONFIG_NUMA
1023 totalram_pages = numa_free_all_bootmem();
1024 #else
1025 totalram_pages = free_all_bootmem();
1026 #endif
1027 /* XEN: init and count pages outside initial allocation. */
1028 for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
1029 ClearPageReserved(pfn_to_page(pfn));
1030 init_page_count(pfn_to_page(pfn));
1031 totalram_pages++;
1032 }
1033 reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);
1035 after_bootmem = 1;
1037 codesize = (unsigned long) &_etext - (unsigned long) &_text;
1038 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
1039 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
1041 /* Register memory areas for /proc/kcore */
1042 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
1043 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
1044 VMALLOC_END-VMALLOC_START);
1045 kclist_add(&kcore_kernel, &_stext, _end - _stext);
1046 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
1047 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
1048 VSYSCALL_END - VSYSCALL_START);
1050 printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
1051 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
1052 end_pfn << (PAGE_SHIFT-10),
1053 codesize >> 10,
1054 reservedpages << (PAGE_SHIFT-10),
1055 datasize >> 10,
1056 initsize >> 10);
1058 #ifndef CONFIG_XEN
1059 #ifdef CONFIG_SMP
1060 /*
1061 * Sync boot_level4_pgt mappings with the init_level4_pgt
1062 * except for the low identity mappings which are already zapped
1063 * in init_level4_pgt. This sync-up is essential for AP bringup.
1064 */
1065 memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
1066 #endif
1067 #endif
1068 }
1070 void free_init_pages(char *what, unsigned long begin, unsigned long end)
1071 {
1072 unsigned long addr;
1074 if (begin >= end)
1075 return;
1077 printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
1078 for (addr = begin; addr < end; addr += PAGE_SIZE) {
1079 ClearPageReserved(virt_to_page(addr));
1080 init_page_count(virt_to_page(addr));
1081 memset((void *)(addr & ~(PAGE_SIZE-1)),
1082 POISON_FREE_INITMEM, PAGE_SIZE);
1083 if (addr >= __START_KERNEL_map) {
1084 /* make_readonly() reports all kernel addresses. */
1085 __make_page_writable(__va(__pa(addr)));
1086 if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
1087 pgd_t *pgd = pgd_offset_k(addr);
1088 pud_t *pud = pud_offset(pgd, addr);
1089 pmd_t *pmd = pmd_offset(pud, addr);
1090 pte_t *pte = pte_offset_kernel(pmd, addr);
1092 xen_l1_entry_update(pte, __pte(0)); /* fallback */
1093 }
1094 }
1095 free_page(addr);
1096 totalram_pages++;
1097 }
1098 }
1100 void free_initmem(void)
1101 {
1102 memset(__initdata_begin, POISON_FREE_INITDATA,
1103 __initdata_end - __initdata_begin);
1104 free_init_pages("unused kernel memory",
1105 (unsigned long)(&__init_begin),
1106 (unsigned long)(&__init_end));
1107 }
1109 #ifdef CONFIG_DEBUG_RODATA
1111 void mark_rodata_ro(void)
1112 {
1113 unsigned long addr = (unsigned long)__start_rodata;
1115 for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
1116 change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
1118 printk ("Write protecting the kernel read-only data: %luk\n",
1119 (__end_rodata - __start_rodata) >> 10);
1121 /*
1122 * change_page_attr_addr() requires a global_flush_tlb() call after it.
1123 * We do this after the printk so that if something went wrong in the
1124 * change, the printk gets out at least to give a better debug hint
1125 * of who is the culprit.
1126 */
1127 global_flush_tlb();
1128 }
1129 #endif
1131 #ifdef CONFIG_BLK_DEV_INITRD
1132 void free_initrd_mem(unsigned long start, unsigned long end)
1133 {
1134 free_init_pages("initrd memory", start, end);
1135 }
1136 #endif
1138 void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
1139 {
1140 /* Should check here against the e820 map to avoid double free */
1141 #ifdef CONFIG_NUMA
1142 int nid = phys_to_nid(phys);
1143 reserve_bootmem_node(NODE_DATA(nid), phys, len);
1144 #else
1145 reserve_bootmem(phys, len);
1146 #endif
1147 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
1148 dma_reserve += len / PAGE_SIZE;
1149 }
1151 int kern_addr_valid(unsigned long addr)
1152 {
1153 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
1154 pgd_t *pgd;
1155 pud_t *pud;
1156 pmd_t *pmd;
1157 pte_t *pte;
1159 if (above != 0 && above != -1UL)
1160 return 0;
1162 pgd = pgd_offset_k(addr);
1163 if (pgd_none(*pgd))
1164 return 0;
1166 pud = pud_offset(pgd, addr);
1167 if (pud_none(*pud))
1168 return 0;
1170 pmd = pmd_offset(pud, addr);
1171 if (pmd_none(*pmd))
1172 return 0;
1173 if (pmd_large(*pmd))
1174 return pfn_valid(pmd_pfn(*pmd));
1176 pte = pte_offset_kernel(pmd, addr);
1177 if (pte_none(*pte))
1178 return 0;
1179 return pfn_valid(pte_pfn(*pte));
1180 }
1182 #ifdef CONFIG_SYSCTL
1183 #include <linux/sysctl.h>
1185 extern int exception_trace, page_fault_trace;
1187 static ctl_table debug_table2[] = {
1188 { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
1189 proc_dointvec },
1190 { 0, }
1191 };
1193 static ctl_table debug_root_table2[] = {
1194 { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
1195 .child = debug_table2 },
1196 { 0 },
1197 };
1199 static __init int x8664_sysctl_init(void)
1200 {
1201 register_sysctl_table(debug_root_table2, 1);
1202 return 0;
1203 }
1204 __initcall(x8664_sysctl_init);
1205 #endif
1207 /* A pseudo VMA to allow ptrace access for the vsyscall page. This only
1208 covers the 64bit vsyscall page now. 32bit has a real VMA now and does
1209 not need special handling anymore. */
1211 static struct vm_area_struct gate_vma = {
1212 .vm_start = VSYSCALL_START,
1213 .vm_end = VSYSCALL_END,
1214 .vm_page_prot = PAGE_READONLY
1215 };
1217 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
1218 {
1219 #ifdef CONFIG_IA32_EMULATION
1220 if (test_tsk_thread_flag(tsk, TIF_IA32))
1221 return NULL;
1222 #endif
1223 return &gate_vma;
1224 }
1226 int in_gate_area(struct task_struct *task, unsigned long addr)
1227 {
1228 struct vm_area_struct *vma = get_gate_vma(task);
1229 if (!vma)
1230 return 0;
1231 return (addr >= vma->vm_start) && (addr < vma->vm_end);
1232 }
1234 /* Use this when you have no reliable task/vma, typically from interrupt
1235 * context. It is less reliable than using the task's vma and may give
1236 * false positives.
1237 */
1238 int in_gate_area_no_task(unsigned long addr)
1239 {
1240 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
1241 }