ia64/linux-2.6.18-xen.hg

arch/x86_64/mm/init-xen.c @ changeset 912:dd42cdb0ab89 (parent: baeb818cd2dc)

[IA64] Build blktap2 driver by default in x86 builds.

Add CONFIG_XEN_BLKDEV_TAP2=y to buildconfigs/linux-defconfig_xen_ia64.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
Author: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Mon Jun 29 12:09:16 2009 +0900
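The change described by this changeset is a single line in the ia64 Xen defconfig. A minimal sketch of the relevant fragment of buildconfigs/linux-defconfig_xen_ia64 after the change (only the CONFIG_XEN_BLKDEV_TAP2=y line comes from the description above; the neighbouring options are assumptions, shown purely for context):

    CONFIG_XEN_BLKDEV_BACKEND=y
    CONFIG_XEN_BLKDEV_TAP=y
    CONFIG_XEN_BLKDEV_TAP2=y
    CONFIG_XEN_BLKDEV_FRONTEND=y

Setting the option to y builds the blktap2 driver into the kernel image rather than as a module, which is what "by default" refers to in the title.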
arch/x86_64/mm/init-xen.c, file contents at this changeset (source line numbers as shown in the repository view):
1 /*
2 * linux/arch/x86_64/mm/init.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
6 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
7 *
8 * Jun Nakajima <jun.nakajima@intel.com>
9 * Modified for Xen.
10 */
12 #include <linux/signal.h>
13 #include <linux/sched.h>
14 #include <linux/kernel.h>
15 #include <linux/errno.h>
16 #include <linux/string.h>
17 #include <linux/types.h>
18 #include <linux/ptrace.h>
19 #include <linux/mman.h>
20 #include <linux/mm.h>
21 #include <linux/swap.h>
22 #include <linux/smp.h>
23 #include <linux/init.h>
24 #include <linux/pagemap.h>
25 #include <linux/bootmem.h>
26 #include <linux/proc_fs.h>
27 #include <linux/pci.h>
28 #include <linux/poison.h>
29 #include <linux/dma-mapping.h>
30 #include <linux/module.h>
31 #include <linux/memory_hotplug.h>
33 #include <asm/processor.h>
34 #include <asm/system.h>
35 #include <asm/uaccess.h>
36 #include <asm/pgtable.h>
37 #include <asm/pgalloc.h>
38 #include <asm/dma.h>
39 #include <asm/fixmap.h>
40 #include <asm/e820.h>
41 #include <asm/apic.h>
42 #include <asm/tlb.h>
43 #include <asm/mmu_context.h>
44 #include <asm/proto.h>
45 #include <asm/smp.h>
46 #include <asm/sections.h>
48 #include <xen/features.h>
50 #ifndef Dprintk
51 #define Dprintk(x...)
52 #endif
54 struct dma_mapping_ops* dma_ops;
55 EXPORT_SYMBOL(dma_ops);
57 #if CONFIG_XEN_COMPAT <= 0x030002
58 unsigned int __kernel_page_user;
59 EXPORT_SYMBOL(__kernel_page_user);
60 #endif
62 int after_bootmem;
64 static unsigned long dma_reserve __initdata;
66 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
67 extern unsigned long start_pfn;
69 /*
70 * Use this until direct mapping is established, i.e. before __va() is
71 * available in init_memory_mapping().
72 */
74 #define addr_to_page(addr, page) \
75 (addr) &= PHYSICAL_PAGE_MASK; \
76 (page) = ((unsigned long *) ((unsigned long) \
77 (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \
78 __START_KERNEL_map)))
80 static void __meminit early_make_page_readonly(void *va, unsigned int feature)
81 {
82 unsigned long addr, _va = (unsigned long)va;
83 pte_t pte, *ptep;
84 unsigned long *page = (unsigned long *) init_level4_pgt;
86 BUG_ON(after_bootmem);
88 if (xen_feature(feature))
89 return;
91 addr = (unsigned long) page[pgd_index(_va)];
92 addr_to_page(addr, page);
94 addr = page[pud_index(_va)];
95 addr_to_page(addr, page);
97 addr = page[pmd_index(_va)];
98 addr_to_page(addr, page);
100 ptep = (pte_t *) &page[pte_index(_va)];
102 pte.pte = ptep->pte & ~_PAGE_RW;
103 if (HYPERVISOR_update_va_mapping(_va, pte, 0))
104 BUG();
105 }
107 static void __make_page_readonly(void *va)
108 {
109 pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
110 unsigned long addr = (unsigned long) va;
112 pgd = pgd_offset_k(addr);
113 pud = pud_offset(pgd, addr);
114 pmd = pmd_offset(pud, addr);
115 ptep = pte_offset_kernel(pmd, addr);
117 pte.pte = ptep->pte & ~_PAGE_RW;
118 if (HYPERVISOR_update_va_mapping(addr, pte, 0))
119 xen_l1_entry_update(ptep, pte); /* fallback */
121 if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
122 __make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT));
123 }
125 static void __make_page_writable(void *va)
126 {
127 pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
128 unsigned long addr = (unsigned long) va;
130 pgd = pgd_offset_k(addr);
131 pud = pud_offset(pgd, addr);
132 pmd = pmd_offset(pud, addr);
133 ptep = pte_offset_kernel(pmd, addr);
135 pte.pte = ptep->pte | _PAGE_RW;
136 if (HYPERVISOR_update_va_mapping(addr, pte, 0))
137 xen_l1_entry_update(ptep, pte); /* fallback */
139 if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
140 __make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT));
141 }
143 void make_page_readonly(void *va, unsigned int feature)
144 {
145 if (!xen_feature(feature))
146 __make_page_readonly(va);
147 }
149 void make_page_writable(void *va, unsigned int feature)
150 {
151 if (!xen_feature(feature))
152 __make_page_writable(va);
153 }
155 void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
156 {
157 if (xen_feature(feature))
158 return;
160 while (nr-- != 0) {
161 __make_page_readonly(va);
162 va = (void*)((unsigned long)va + PAGE_SIZE);
163 }
164 }
166 void make_pages_writable(void *va, unsigned nr, unsigned int feature)
167 {
168 if (xen_feature(feature))
169 return;
171 while (nr-- != 0) {
172 __make_page_writable(va);
173 va = (void*)((unsigned long)va + PAGE_SIZE);
174 }
175 }
177 /*
178 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
179 * physical space, so we can cache the location of the first one and move
180 * around without checking the pgd every time.
181 */
183 void show_mem(void)
184 {
185 long i, total = 0, reserved = 0;
186 long shared = 0, cached = 0;
187 pg_data_t *pgdat;
188 struct page *page;
190 printk(KERN_INFO "Mem-info:\n");
191 show_free_areas();
192 printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
194 for_each_online_pgdat(pgdat) {
195 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
196 page = pfn_to_page(pgdat->node_start_pfn + i);
197 total++;
198 if (PageReserved(page))
199 reserved++;
200 else if (PageSwapCache(page))
201 cached++;
202 else if (page_count(page))
203 shared += page_count(page) - 1;
204 }
205 }
206 printk(KERN_INFO "%lu pages of RAM\n", total);
207 printk(KERN_INFO "%lu reserved pages\n",reserved);
208 printk(KERN_INFO "%lu pages shared\n",shared);
209 printk(KERN_INFO "%lu pages swap cached\n",cached);
210 }
213 static __init void *spp_getpage(void)
214 {
215 void *ptr;
216 if (after_bootmem)
217 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
218 else if (start_pfn < table_end) {
219 ptr = __va(start_pfn << PAGE_SHIFT);
220 start_pfn++;
221 memset(ptr, 0, PAGE_SIZE);
222 } else
223 ptr = alloc_bootmem_pages(PAGE_SIZE);
224 if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
225 panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
227 Dprintk("spp_getpage %p\n", ptr);
228 return ptr;
229 }
231 #define pgd_offset_u(address) (__user_pgd(init_level4_pgt) + pgd_index(address))
232 #define pud_offset_u(address) (level3_user_pgt + pud_index(address))
234 static __init void set_pte_phys(unsigned long vaddr,
235 unsigned long phys, pgprot_t prot, int user_mode)
236 {
237 pgd_t *pgd;
238 pud_t *pud;
239 pmd_t *pmd;
240 pte_t *pte, new_pte;
242 Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
244 pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
245 if (pgd_none(*pgd)) {
246 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
247 return;
248 }
249 pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
250 if (pud_none(*pud)) {
251 pmd = (pmd_t *) spp_getpage();
252 make_page_readonly(pmd, XENFEAT_writable_page_tables);
253 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
254 if (pmd != pmd_offset(pud, 0)) {
255 printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
256 return;
257 }
258 }
259 pmd = pmd_offset(pud, vaddr);
260 if (pmd_none(*pmd)) {
261 pte = (pte_t *) spp_getpage();
262 make_page_readonly(pte, XENFEAT_writable_page_tables);
263 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
264 if (pte != pte_offset_kernel(pmd, 0)) {
265 printk("PAGETABLE BUG #02!\n");
266 return;
267 }
268 }
269 if (pgprot_val(prot))
270 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
271 else
272 new_pte = __pte(0);
274 pte = pte_offset_kernel(pmd, vaddr);
275 if (!pte_none(*pte) && __pte_val(new_pte) &&
276 __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
277 pte_ERROR(*pte);
278 set_pte(pte, new_pte);
280 /*
281 * It's enough to flush this one mapping.
282 * (PGE mappings get flushed as well)
283 */
284 __flush_tlb_one(vaddr);
285 }
287 static __init void set_pte_phys_ma(unsigned long vaddr,
288 unsigned long phys, pgprot_t prot)
289 {
290 pgd_t *pgd;
291 pud_t *pud;
292 pmd_t *pmd;
293 pte_t *pte, new_pte;
295 Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
297 pgd = pgd_offset_k(vaddr);
298 if (pgd_none(*pgd)) {
299 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
300 return;
301 }
302 pud = pud_offset(pgd, vaddr);
303 if (pud_none(*pud)) {
305 pmd = (pmd_t *) spp_getpage();
306 make_page_readonly(pmd, XENFEAT_writable_page_tables);
307 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
308 if (pmd != pmd_offset(pud, 0)) {
309 printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
310 return;
311 }
312 }
313 pmd = pmd_offset(pud, vaddr);
314 if (pmd_none(*pmd)) {
315 pte = (pte_t *) spp_getpage();
316 make_page_readonly(pte, XENFEAT_writable_page_tables);
317 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
318 if (pte != pte_offset_kernel(pmd, 0)) {
319 printk("PAGETABLE BUG #02!\n");
320 return;
321 }
322 }
323 new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
325 pte = pte_offset_kernel(pmd, vaddr);
326 if (!pte_none(*pte) && __pte_val(new_pte) &&
327 #ifdef CONFIG_ACPI
328 /* __acpi_map_table() fails to properly call clear_fixmap() */
329 (vaddr < __fix_to_virt(FIX_ACPI_END) ||
330 vaddr > __fix_to_virt(FIX_ACPI_BEGIN)) &&
331 #endif
332 __pte_val(*pte) != (__pte_val(new_pte) & __supported_pte_mask))
333 pte_ERROR(*pte);
334 set_pte(pte, new_pte);
336 /*
337 * It's enough to flush this one mapping.
338 * (PGE mappings get flushed as well)
339 */
340 __flush_tlb_one(vaddr);
341 }
343 /* NOTE: this is meant to be run only at boot */
344 void __init
345 __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
346 {
347 unsigned long address = __fix_to_virt(idx);
349 if (idx >= __end_of_fixed_addresses) {
350 printk("Invalid __set_fixmap\n");
351 return;
352 }
353 switch (idx) {
354 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
355 set_pte_phys(address, phys, prot, 0);
356 set_pte_phys(address, phys, prot, 1);
357 break;
358 default:
359 set_pte_phys_ma(address, phys, prot);
360 break;
361 }
362 }
364 unsigned long __initdata table_start, table_end;
366 static __meminit void *alloc_static_page(unsigned long *phys)
367 {
368 unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;
370 if (after_bootmem) {
371 void *adr = (void *)get_zeroed_page(GFP_ATOMIC);
373 *phys = __pa(adr);
374 return adr;
375 }
377 *phys = start_pfn << PAGE_SHIFT;
378 start_pfn++;
379 memset((void *)va, 0, PAGE_SIZE);
380 return (void *)va;
381 }
383 #define PTE_SIZE PAGE_SIZE
385 static inline int make_readonly(unsigned long paddr)
386 {
387 extern char __vsyscall_0;
388 int readonly = 0;
390 /* Make new page tables read-only. */
391 if (!xen_feature(XENFEAT_writable_page_tables)
392 && (paddr >= (table_start << PAGE_SHIFT))
393 && (paddr < (table_end << PAGE_SHIFT)))
394 readonly = 1;
395 /* Make old page tables read-only. */
396 if (!xen_feature(XENFEAT_writable_page_tables)
397 && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
398 && (paddr < (start_pfn << PAGE_SHIFT)))
399 readonly = 1;
401 /*
402 * No need for writable mapping of kernel image. This also ensures that
403 * page and descriptor tables embedded inside don't have writable
404 * mappings. Exclude the vsyscall area here, allowing alternative
405 * instruction patching to work.
406 */
407 if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end))
408 && !(paddr >= __pa_symbol(&__vsyscall_0)
409 && paddr < __pa_symbol(&__vsyscall_0) + PAGE_SIZE))
410 readonly = 1;
412 return readonly;
413 }
415 #ifndef CONFIG_XEN
416 /* Must run before zap_low_mappings */
417 __init void *early_ioremap(unsigned long addr, unsigned long size)
418 {
419 unsigned long map = round_down(addr, LARGE_PAGE_SIZE);
421 /* actually usually some more */
422 if (size >= LARGE_PAGE_SIZE) {
423 printk("SMBIOS area too long %lu\n", size);
424 return NULL;
425 }
426 set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
427 map += LARGE_PAGE_SIZE;
428 set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
429 __flush_tlb();
430 return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
431 }
433 /* To avoid virtual aliases later */
434 __init void early_iounmap(void *addr, unsigned long size)
435 {
436 if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
437 printk("early_iounmap: bad address %p\n", addr);
438 set_pmd(temp_mappings[0].pmd, __pmd(0));
439 set_pmd(temp_mappings[1].pmd, __pmd(0));
440 __flush_tlb();
441 }
442 #endif
444 static void __meminit
445 phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
446 {
447 int i, k;
449 for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
450 unsigned long pte_phys;
451 pte_t *pte, *pte_save;
453 if (address >= end)
454 break;
455 pte = alloc_static_page(&pte_phys);
456 pte_save = pte;
457 for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
458 unsigned long pteval = address | _PAGE_NX | _KERNPG_TABLE;
460 if (address >= end ||
461 (!after_bootmem &&
462 (address >> PAGE_SHIFT) >= xen_start_info->nr_pages))
463 pteval = 0;
464 else if (make_readonly(address))
465 pteval &= ~_PAGE_RW;
466 set_pte(pte, __pte(pteval & __supported_pte_mask));
467 }
468 if (!after_bootmem) {
469 early_make_page_readonly(pte_save, XENFEAT_writable_page_tables);
470 *pmd = __pmd(pte_phys | _KERNPG_TABLE);
471 } else {
472 make_page_readonly(pte_save, XENFEAT_writable_page_tables);
473 set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
474 }
475 }
476 }
478 static void __meminit
479 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
480 {
481 pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
483 if (pmd_none(*pmd)) {
484 spin_lock(&init_mm.page_table_lock);
485 phys_pmd_init(pmd, address, end);
486 spin_unlock(&init_mm.page_table_lock);
487 __flush_tlb_all();
488 }
489 }
491 static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
492 {
493 long i = pud_index(address);
495 pud = pud + i;
497 if (after_bootmem && pud_val(*pud)) {
498 phys_pmd_update(pud, address, end);
499 return;
500 }
502 for (; i < PTRS_PER_PUD; pud++, i++) {
503 unsigned long paddr, pmd_phys;
504 pmd_t *pmd;
506 paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
507 if (paddr >= end)
508 break;
510 pmd = alloc_static_page(&pmd_phys);
512 spin_lock(&init_mm.page_table_lock);
513 *pud = __pud(pmd_phys | _KERNPG_TABLE);
514 phys_pmd_init(pmd, paddr, end);
515 spin_unlock(&init_mm.page_table_lock);
517 early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
518 }
519 __flush_tlb();
520 }
522 void __init xen_init_pt(void)
523 {
524 unsigned long addr, *page;
526 /* Find the initial pte page that was built for us. */
527 page = (unsigned long *)xen_start_info->pt_base;
528 addr = page[pgd_index(__START_KERNEL_map)];
529 addr_to_page(addr, page);
531 #if CONFIG_XEN_COMPAT <= 0x030002
532 /* On Xen 3.0.2 and older we may need to explicitly specify _PAGE_USER
533 in kernel PTEs. We check that here. */
534 if (HYPERVISOR_xen_version(XENVER_version, NULL) <= 0x30000) {
535 unsigned long *pg;
536 pte_t pte;
538 /* Mess with the initial mapping of page 0. It's not needed. */
539 BUILD_BUG_ON(__START_KERNEL <= __START_KERNEL_map);
540 addr = page[pud_index(__START_KERNEL_map)];
541 addr_to_page(addr, pg);
542 addr = pg[pmd_index(__START_KERNEL_map)];
543 addr_to_page(addr, pg);
544 pte.pte = pg[pte_index(__START_KERNEL_map)];
545 BUG_ON(!(pte.pte & _PAGE_PRESENT));
547 /* If _PAGE_USER isn't set, we obviously do not need it. */
548 if (pte.pte & _PAGE_USER) {
549 /* _PAGE_USER is needed, but is it set implicitly? */
550 pte.pte &= ~_PAGE_USER;
551 if ((HYPERVISOR_update_va_mapping(__START_KERNEL_map,
552 pte, 0) != 0) ||
553 !(pg[pte_index(__START_KERNEL_map)] & _PAGE_USER))
554 /* We need to explicitly specify _PAGE_USER. */
555 __kernel_page_user = _PAGE_USER;
556 }
557 }
558 #endif
560 /* Construct mapping of initial pte page in our own directories. */
561 init_level4_pgt[pgd_index(__START_KERNEL_map)] =
562 __pgd(__pa_symbol(level3_kernel_pgt) | _PAGE_TABLE);
563 memcpy(level3_kernel_pgt + pud_index(__START_KERNEL_map),
564 page + pud_index(__START_KERNEL_map),
565 (PTRS_PER_PUD - pud_index(__START_KERNEL_map))
566 * sizeof(*level3_kernel_pgt));
568 __user_pgd(init_level4_pgt)[pgd_index(VSYSCALL_START)] =
569 __pgd(__pa_symbol(level3_user_pgt) | _PAGE_TABLE);
571 early_make_page_readonly(init_level4_pgt,
572 XENFEAT_writable_page_tables);
573 early_make_page_readonly(__user_pgd(init_level4_pgt),
574 XENFEAT_writable_page_tables);
575 early_make_page_readonly(level3_kernel_pgt,
576 XENFEAT_writable_page_tables);
577 early_make_page_readonly(level3_user_pgt,
578 XENFEAT_writable_page_tables);
580 if (!xen_feature(XENFEAT_writable_page_tables)) {
581 xen_pgd_pin(__pa_symbol(init_level4_pgt));
582 xen_pgd_pin(__pa_symbol(__user_pgd(init_level4_pgt)));
583 }
584 }
586 static void __init extend_init_mapping(unsigned long tables_space)
587 {
588 unsigned long va = __START_KERNEL_map;
589 unsigned long phys, addr, *pte_page;
590 pmd_t *pmd;
591 pte_t *pte, new_pte;
592 unsigned long *page = (unsigned long *)init_level4_pgt;
594 addr = page[pgd_index(va)];
595 addr_to_page(addr, page);
596 addr = page[pud_index(va)];
597 addr_to_page(addr, page);
599 /* Kill mapping of low 1MB. */
600 while (va < (unsigned long)&_text) {
601 if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
602 BUG();
603 va += PAGE_SIZE;
604 }
606 /* Ensure init mappings cover kernel text/data and initial tables. */
607 while (va < (__START_KERNEL_map
608 + (start_pfn << PAGE_SHIFT)
609 + tables_space)) {
610 if (!(pmd_index(va) | pte_index(va))) {
611 pud_t *pud;
613 page = (unsigned long *)init_level4_pgt;
614 addr = page[pgd_index(va)];
615 addr_to_page(addr, page);
616 pud = (pud_t *)&page[pud_index(va)];
617 if (pud_none(*pud)) {
618 page = alloc_static_page(&phys);
619 early_make_page_readonly(
620 page, XENFEAT_writable_page_tables);
621 set_pud(pud, __pud(phys | _KERNPG_TABLE));
622 } else {
623 addr = page[pud_index(va)];
624 addr_to_page(addr, page);
625 }
626 }
627 pmd = (pmd_t *)&page[pmd_index(va)];
628 if (pmd_none(*pmd)) {
629 pte_page = alloc_static_page(&phys);
630 early_make_page_readonly(
631 pte_page, XENFEAT_writable_page_tables);
632 set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
633 } else {
634 addr = page[pmd_index(va)];
635 addr_to_page(addr, pte_page);
636 }
637 pte = (pte_t *)&pte_page[pte_index(va)];
638 if (pte_none(*pte)) {
639 new_pte = pfn_pte(
640 (va - __START_KERNEL_map) >> PAGE_SHIFT,
641 __pgprot(_KERNPG_TABLE));
642 xen_l1_entry_update(pte, new_pte);
643 }
644 va += PAGE_SIZE;
645 }
647 /* Finally, blow away any spurious initial mappings. */
648 while (1) {
649 if (!(pmd_index(va) | pte_index(va))) {
650 page = (unsigned long *)init_level4_pgt;
651 addr = page[pgd_index(va)];
652 addr_to_page(addr, page);
653 if (pud_none(((pud_t *)page)[pud_index(va)]))
654 break;
655 addr = page[pud_index(va)];
656 addr_to_page(addr, page);
657 }
658 pmd = (pmd_t *)&page[pmd_index(va)];
659 if (pmd_none(*pmd))
660 break;
661 if (HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0))
662 BUG();
663 va += PAGE_SIZE;
664 }
665 }
667 static void __init find_early_table_space(unsigned long end)
668 {
669 unsigned long puds, pmds, ptes, tables;
671 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
672 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
673 ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;
675 tables = round_up(puds * 8, PAGE_SIZE) +
676 round_up(pmds * 8, PAGE_SIZE) +
677 round_up(ptes * 8, PAGE_SIZE);
679 extend_init_mapping(tables);
681 table_start = start_pfn;
682 table_end = table_start + (tables>>PAGE_SHIFT);
684 early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
685 end, table_start << PAGE_SHIFT,
686 (table_start << PAGE_SHIFT) + tables);
687 }
689 static void xen_finish_init_mapping(void)
690 {
691 unsigned long i, start, end;
693 /* Re-vector virtual addresses pointing into the initial
694 mapping to the just-established permanent ones. */
695 xen_start_info = __va(__pa(xen_start_info));
696 xen_start_info->pt_base = (unsigned long)
697 __va(__pa(xen_start_info->pt_base));
698 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
699 phys_to_machine_mapping =
700 __va(__pa(xen_start_info->mfn_list));
701 xen_start_info->mfn_list = (unsigned long)
702 phys_to_machine_mapping;
703 }
704 if (xen_start_info->mod_start)
705 xen_start_info->mod_start = (unsigned long)
706 __va(__pa(xen_start_info->mod_start));
708 /* Destroy the Xen-created mappings beyond the kernel image as
709 * well as the temporary mappings created above. Prevents
710 * overlap with modules area (if init mapping is very big).
711 */
712 start = PAGE_ALIGN((unsigned long)_end);
713 end = __START_KERNEL_map + (table_end << PAGE_SHIFT);
714 for (; start < end; start += PAGE_SIZE)
715 if (HYPERVISOR_update_va_mapping(start, __pte_ma(0), 0))
716 BUG();
718 /* Allocate pte's for initial fixmaps from 'start_pfn' allocator. */
719 table_end = ~0UL;
721 /*
722 * Prefetch pte's for the bt_ioremap() area. It gets used before the
723 * boot-time allocator is online, so allocate-on-demand would fail.
724 */
725 for (i = FIX_BTMAP_END; i <= FIX_BTMAP_BEGIN; i++)
726 __set_fixmap(i, 0, __pgprot(0));
728 /* Switch to the real shared_info page, and clear the dummy page. */
729 set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
730 HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
731 memset(empty_zero_page, 0, sizeof(empty_zero_page));
733 /* Set up mapping of lowest 1MB of physical memory. */
734 for (i = 0; i < NR_FIX_ISAMAPS; i++)
735 if (is_initial_xendomain())
736 set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
737 else
738 __set_fixmap(FIX_ISAMAP_BEGIN - i,
739 virt_to_mfn(empty_zero_page)
740 << PAGE_SHIFT,
741 PAGE_KERNEL_RO);
743 /* Disable the 'start_pfn' allocator. */
744 table_end = start_pfn;
745 }
747 /* Setup the direct mapping of the physical memory at PAGE_OFFSET.
748 This runs before bootmem is initialized and gets pages directly from the
749 physical memory. To access them they are temporarily mapped. */
750 void __meminit init_memory_mapping(unsigned long start, unsigned long end)
751 {
752 unsigned long next;
754 Dprintk("init_memory_mapping\n");
756 /*
757 * Find space for the kernel direct mapping tables.
758 * Later we should allocate these tables in the local node of the memory
759 * mapped. Unfortunately this is done currently before the nodes are
760 * discovered.
761 */
762 if (!after_bootmem)
763 find_early_table_space(end);
765 start = (unsigned long)__va(start);
766 end = (unsigned long)__va(end);
768 for (; start < end; start = next) {
769 unsigned long pud_phys;
770 pgd_t *pgd = pgd_offset_k(start);
771 pud_t *pud;
773 if (after_bootmem)
774 pud = pud_offset(pgd, start & PGDIR_MASK);
775 else
776 pud = alloc_static_page(&pud_phys);
777 next = start + PGDIR_SIZE;
778 if (next > end)
779 next = end;
780 phys_pud_init(pud, __pa(start), __pa(next));
781 if (!after_bootmem) {
782 early_make_page_readonly(pud, XENFEAT_writable_page_tables);
783 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
784 }
785 }
787 if (!after_bootmem) {
788 BUG_ON(start_pfn != table_end);
789 xen_finish_init_mapping();
790 }
792 __flush_tlb_all();
793 }
795 void __cpuinit zap_low_mappings(int cpu)
796 {
797 /* this is not required for Xen */
798 #if 0
799 swap_low_mappings();
800 #endif
801 }
803 /* Compute zone sizes for the DMA and DMA32 zones in a node. */
804 __init void
805 size_zones(unsigned long *z, unsigned long *h,
806 unsigned long start_pfn, unsigned long end_pfn)
807 {
808 int i;
809 unsigned long w;
811 for (i = 0; i < MAX_NR_ZONES; i++)
812 z[i] = 0;
814 if (start_pfn < MAX_DMA_PFN)
815 z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
816 if (start_pfn < MAX_DMA32_PFN) {
817 unsigned long dma32_pfn = MAX_DMA32_PFN;
818 if (dma32_pfn > end_pfn)
819 dma32_pfn = end_pfn;
820 z[ZONE_DMA32] = dma32_pfn - start_pfn;
821 }
822 z[ZONE_NORMAL] = end_pfn - start_pfn;
824 /* Remove lower zones from higher ones. */
825 w = 0;
826 for (i = 0; i < MAX_NR_ZONES; i++) {
827 if (z[i])
828 z[i] -= w;
829 w += z[i];
830 }
832 /* Compute holes */
833 w = start_pfn;
834 for (i = 0; i < MAX_NR_ZONES; i++) {
835 unsigned long s = w;
836 w += z[i];
837 h[i] = e820_hole_size(s, w);
838 }
840 /* Add the space needed for mem_map to the holes too. */
841 for (i = 0; i < MAX_NR_ZONES; i++)
842 h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
844 /* The 16MB DMA zone has the kernel and other misc mappings.
845 Account them too */
846 if (h[ZONE_DMA]) {
847 h[ZONE_DMA] += dma_reserve;
848 if (h[ZONE_DMA] >= z[ZONE_DMA]) {
849 printk(KERN_WARNING
850 "Kernel too large and filling up ZONE_DMA?\n");
851 h[ZONE_DMA] = z[ZONE_DMA];
852 }
853 }
854 }
856 #ifndef CONFIG_NUMA
857 void __init paging_init(void)
858 {
859 unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
861 memory_present(0, 0, end_pfn);
862 sparse_init();
863 size_zones(zones, holes, 0, end_pfn);
864 free_area_init_node(0, NODE_DATA(0), zones,
865 __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
867 init_mm.context.pinned = 1;
868 }
869 #endif
871 /* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
872 from the CPU leading to inconsistent cache lines. address and size
873 must be aligned to 2MB boundaries.
874 Does nothing when the mapping doesn't exist. */
875 void __init clear_kernel_mapping(unsigned long address, unsigned long size)
876 {
877 unsigned long end = address + size;
879 BUG_ON(address & ~LARGE_PAGE_MASK);
880 BUG_ON(size & ~LARGE_PAGE_MASK);
882 for (; address < end; address += LARGE_PAGE_SIZE) {
883 pgd_t *pgd = pgd_offset_k(address);
884 pud_t *pud;
885 pmd_t *pmd;
886 if (pgd_none(*pgd))
887 continue;
888 pud = pud_offset(pgd, address);
889 if (pud_none(*pud))
890 continue;
891 pmd = pmd_offset(pud, address);
892 if (!pmd || pmd_none(*pmd))
893 continue;
894 if (0 == (__pmd_val(*pmd) & _PAGE_PSE)) {
895 /* Could handle this, but it should not happen currently. */
896 printk(KERN_ERR
897 "clear_kernel_mapping: mapping has been split. will leak memory\n");
898 pmd_ERROR(*pmd);
899 }
900 set_pmd(pmd, __pmd(0));
901 }
902 __flush_tlb_all();
903 }
905 /*
906 * Memory hotplug specific functions
907 */
908 void online_page(struct page *page)
909 {
910 ClearPageReserved(page);
911 init_page_count(page);
912 __free_page(page);
913 totalram_pages++;
914 num_physpages++;
915 }
917 #ifdef CONFIG_MEMORY_HOTPLUG
918 /*
919 * XXX: memory_add_physaddr_to_nid() is to find node id from physical address
920 * via probe interface of sysfs. If acpi notifies hot-add event, then it
921 * can tell node id by searching dsdt. But, probe interface doesn't have
922 * node id. So, return 0 as node id at this time.
923 */
924 #ifdef CONFIG_NUMA
925 int memory_add_physaddr_to_nid(u64 start)
926 {
927 return 0;
928 }
929 #endif
931 /*
932 * Memory is added always to NORMAL zone. This means you will never get
933 * additional DMA/DMA32 memory.
934 */
935 int arch_add_memory(int nid, u64 start, u64 size)
936 {
937 struct pglist_data *pgdat = NODE_DATA(nid);
938 struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
939 unsigned long start_pfn = start >> PAGE_SHIFT;
940 unsigned long nr_pages = size >> PAGE_SHIFT;
941 int ret;
943 ret = __add_pages(zone, start_pfn, nr_pages);
944 if (ret)
945 goto error;
947 init_memory_mapping(start, (start + size -1));
949 return ret;
950 error:
951 printk("%s: Problem encountered in __add_pages!\n", __func__);
952 return ret;
953 }
954 EXPORT_SYMBOL_GPL(arch_add_memory);
956 int remove_memory(u64 start, u64 size)
957 {
958 return -EINVAL;
959 }
960 EXPORT_SYMBOL_GPL(remove_memory);
962 #else /* CONFIG_MEMORY_HOTPLUG */
963 /*
964 * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
965 * just online the pages.
966 */
967 int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
968 {
969 int err = -EIO;
970 unsigned long pfn;
971 unsigned long total = 0, mem = 0;
972 for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
973 if (pfn_valid(pfn)) {
974 online_page(pfn_to_page(pfn));
975 err = 0;
976 mem++;
977 }
978 total++;
979 }
980 if (!err) {
981 z->spanned_pages += total;
982 z->present_pages += mem;
983 z->zone_pgdat->node_spanned_pages += total;
984 z->zone_pgdat->node_present_pages += mem;
985 }
986 return err;
987 }
988 #endif /* CONFIG_MEMORY_HOTPLUG */
990 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
991 kcore_vsyscall;
993 void __init mem_init(void)
994 {
995 long codesize, reservedpages, datasize, initsize;
996 unsigned long pfn;
998 pci_iommu_alloc();
1000 /* How many end-of-memory variables you have, grandma! */
1001 max_low_pfn = end_pfn;
1002 max_pfn = end_pfn;
1003 num_physpages = end_pfn;
1004 high_memory = (void *) __va(end_pfn * PAGE_SIZE);
1006 /* clear the zero-page */
1007 memset(empty_zero_page, 0, PAGE_SIZE);
1009 reservedpages = 0;
1011 /* this will put all low memory onto the freelists */
1012 #ifdef CONFIG_NUMA
1013 totalram_pages = numa_free_all_bootmem();
1014 #else
1015 totalram_pages = free_all_bootmem();
1016 #endif
1017 /* XEN: init and count pages outside initial allocation. */
1018 for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
1019 ClearPageReserved(pfn_to_page(pfn));
1020 init_page_count(pfn_to_page(pfn));
1021 totalram_pages++;
1022 }
1023 reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);
1025 after_bootmem = 1;
1027 codesize = (unsigned long) &_etext - (unsigned long) &_text;
1028 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
1029 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
1031 /* Register memory areas for /proc/kcore */
1032 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
1033 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
1034 VMALLOC_END-VMALLOC_START);
1035 kclist_add(&kcore_kernel, &_stext, _end - _stext);
1036 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
1037 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
1038 VSYSCALL_END - VSYSCALL_START);
1040 printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
1041 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
1042 end_pfn << (PAGE_SHIFT-10),
1043 codesize >> 10,
1044 reservedpages << (PAGE_SHIFT-10),
1045 datasize >> 10,
1046 initsize >> 10);
1048 #ifndef CONFIG_XEN
1049 #ifdef CONFIG_SMP
1050 /*
1051 * Sync boot_level4_pgt mappings with the init_level4_pgt
1052 * except for the low identity mappings which are already zapped
1053 * in init_level4_pgt. This sync-up is essential for AP's bringup
1054 */
1055 memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
1056 #endif
1057 #endif
1058 }
1060 void free_init_pages(char *what, unsigned long begin, unsigned long end)
1061 {
1062 unsigned long addr;
1064 if (begin >= end)
1065 return;
1067 printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
1068 for (addr = begin; addr < end; addr += PAGE_SIZE) {
1069 ClearPageReserved(virt_to_page(addr));
1070 init_page_count(virt_to_page(addr));
1071 memset((void *)(addr & ~(PAGE_SIZE-1)),
1072 POISON_FREE_INITMEM, PAGE_SIZE);
1073 if (addr >= __START_KERNEL_map) {
1074 /* make_readonly() reports all kernel addresses. */
1075 __make_page_writable(__va(__pa(addr)));
1076 if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
1077 pgd_t *pgd = pgd_offset_k(addr);
1078 pud_t *pud = pud_offset(pgd, addr);
1079 pmd_t *pmd = pmd_offset(pud, addr);
1080 pte_t *pte = pte_offset_kernel(pmd, addr);
1082 xen_l1_entry_update(pte, __pte(0)); /* fallback */
1083 }
1084 }
1085 free_page(addr);
1086 totalram_pages++;
1087 }
1088 }
1090 void free_initmem(void)
1091 {
1092 memset(__initdata_begin, POISON_FREE_INITDATA,
1093 __initdata_end - __initdata_begin);
1094 free_init_pages("unused kernel memory",
1095 (unsigned long)(&__init_begin),
1096 (unsigned long)(&__init_end));
1097 }
1099 #ifdef CONFIG_DEBUG_RODATA
1101 void mark_rodata_ro(void)
1102 {
1103 unsigned long addr = (unsigned long)__start_rodata;
1105 for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
1106 change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
1108 printk ("Write protecting the kernel read-only data: %luk\n",
1109 (__end_rodata - __start_rodata) >> 10);
1111 /*
1112 * change_page_attr_addr() requires a global_flush_tlb() call after it.
1113 * We do this after the printk so that if something went wrong in the
1114 * change, the printk gets out at least to give a better debug hint
1115 * of who is the culprit.
1116 */
1117 global_flush_tlb();
1118 }
1119 #endif
1121 #ifdef CONFIG_BLK_DEV_INITRD
1122 void free_initrd_mem(unsigned long start, unsigned long end)
1123 {
1124 free_init_pages("initrd memory", start, end);
1125 }
1126 #endif
1128 void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
1129 {
1130 /* Should check here against the e820 map to avoid double free */
1131 #ifdef CONFIG_NUMA
1132 int nid = phys_to_nid(phys);
1133 reserve_bootmem_node(NODE_DATA(nid), phys, len);
1134 #else
1135 reserve_bootmem(phys, len);
1136 #endif
1137 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
1138 dma_reserve += len / PAGE_SIZE;
1139 }
1141 int kern_addr_valid(unsigned long addr)
1142 {
1143 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
1144 pgd_t *pgd;
1145 pud_t *pud;
1146 pmd_t *pmd;
1147 pte_t *pte;
1149 if (above != 0 && above != -1UL)
1150 return 0;
1152 pgd = pgd_offset_k(addr);
1153 if (pgd_none(*pgd))
1154 return 0;
1156 pud = pud_offset(pgd, addr);
1157 if (pud_none(*pud))
1158 return 0;
1160 pmd = pmd_offset(pud, addr);
1161 if (pmd_none(*pmd))
1162 return 0;
1163 if (pmd_large(*pmd))
1164 return pfn_valid(pmd_pfn(*pmd));
1166 pte = pte_offset_kernel(pmd, addr);
1167 if (pte_none(*pte))
1168 return 0;
1169 return pfn_valid(pte_pfn(*pte));
1170 }
1172 #ifdef CONFIG_SYSCTL
1173 #include <linux/sysctl.h>
1175 extern int exception_trace, page_fault_trace;
1177 static ctl_table debug_table2[] = {
1178 { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
1179 proc_dointvec },
1180 { 0, }
1181 };
1183 static ctl_table debug_root_table2[] = {
1184 { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
1185 .child = debug_table2 },
1186 { 0 },
1187 };
1189 static __init int x8664_sysctl_init(void)
1190 {
1191 register_sysctl_table(debug_root_table2, 1);
1192 return 0;
1193 }
1194 __initcall(x8664_sysctl_init);
1195 #endif
1197 /* A pseudo VMA to allow ptrace access for the vsyscall page. This only
1198 covers the 64bit vsyscall page now. 32bit has a real VMA now and does
1199 not need special handling anymore. */
1201 static struct vm_area_struct gate_vma = {
1202 .vm_start = VSYSCALL_START,
1203 .vm_end = VSYSCALL_END,
1204 .vm_page_prot = PAGE_READONLY
1205 };
1207 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
1208 {
1209 #ifdef CONFIG_IA32_EMULATION
1210 if (test_tsk_thread_flag(tsk, TIF_IA32))
1211 return NULL;
1212 #endif
1213 return &gate_vma;
1214 }
1216 int in_gate_area(struct task_struct *task, unsigned long addr)
1217 {
1218 struct vm_area_struct *vma = get_gate_vma(task);
1219 if (!vma)
1220 return 0;
1221 return (addr >= vma->vm_start) && (addr < vma->vm_end);
1222 }
1224 /* Use this when you have no reliable task/vma, typically from interrupt
1225 * context. It is less reliable than using the task's vma and may give
1226 * false positives.
1227 */
1228 int in_gate_area_no_task(unsigned long addr)
1229 {
1230 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
1231 }