
linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c @ 11443:af50fb41612c

[IA64] Expand memory_op for PV-on-HVM on IPF

Signed-off-by: Tsunehisa Doi <Doi.Tsunehisa@jp.fujitsu.com>
Signed-off-by: Tomonari Horikoshi <t.horikoshi@jp.fujitsu.com>
author awilliam@xenbuild.aw
date Fri Sep 01 08:46:02 2006 -0600 (2006-09-01)
parents fc5736e0a2eb
children 8f552314e45a
/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 *
 *  Jun Nakajima <jun.nakajima@intel.com>
 *      Modified for Xen.
 */

#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/dma-mapping.h>
#include <asm/swiotlb.h>

#include <xen/features.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);

extern unsigned long *contiguous_bitmap;

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
extern unsigned long start_pfn;

/*
 * Use this until direct mapping is established, i.e. before __va() is
 * available in init_memory_mapping().
 */

#define addr_to_page(addr, page) \
        (addr) &= PHYSICAL_PAGE_MASK; \
        (page) = ((unsigned long *) ((unsigned long) \
        (((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + \
        __START_KERNEL_map)))
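
/*
 * Walk the boot page tables (init_level4_pgt) by hand, hopping between
 * levels with addr_to_page(), and clear _PAGE_RW on the PTE that maps 'va'.
 * Xen requires page-table pages to be mapped read-only unless the given
 * feature (normally XENFEAT_writable_page_tables) is present, in which case
 * this is a no-op.
 */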
static void early_make_page_readonly(void *va, unsigned int feature)
{
        unsigned long addr, _va = (unsigned long)va;
        pte_t pte, *ptep;
        unsigned long *page = (unsigned long *) init_level4_pgt;

        if (xen_feature(feature))
                return;

        addr = (unsigned long) page[pgd_index(_va)];
        addr_to_page(addr, page);

        addr = page[pud_index(_va)];
        addr_to_page(addr, page);

        addr = page[pmd_index(_va)];
        addr_to_page(addr, page);

        ptep = (pte_t *) &page[pte_index(_va)];

        pte.pte = ptep->pte & ~_PAGE_RW;
        if (HYPERVISOR_update_va_mapping(_va, pte, 0))
                BUG();
}
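
/*
 * Post-boot variants: walk the live kernel page tables with the usual
 * pgd/pud/pmd/pte accessors and toggle _PAGE_RW on the mapping of 'va',
 * falling back to a direct L1 entry update if the hypercall fails.  For
 * vmalloc addresses the underlying page's direct-map alias is adjusted too.
 */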
void make_page_readonly(void *va, unsigned int feature)
{
        pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
        unsigned long addr = (unsigned long) va;

        if (xen_feature(feature))
                return;

        pgd = pgd_offset_k(addr);
        pud = pud_offset(pgd, addr);
        pmd = pmd_offset(pud, addr);
        ptep = pte_offset_kernel(pmd, addr);

        pte.pte = ptep->pte & ~_PAGE_RW;
        if (HYPERVISOR_update_va_mapping(addr, pte, 0))
                xen_l1_entry_update(ptep, pte); /* fallback */

        if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
                make_page_readonly(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
}

void make_page_writable(void *va, unsigned int feature)
{
        pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t pte, *ptep;
        unsigned long addr = (unsigned long) va;

        if (xen_feature(feature))
                return;

        pgd = pgd_offset_k(addr);
        pud = pud_offset(pgd, addr);
        pmd = pmd_offset(pud, addr);
        ptep = pte_offset_kernel(pmd, addr);

        pte.pte = ptep->pte | _PAGE_RW;
        if (HYPERVISOR_update_va_mapping(addr, pte, 0))
                xen_l1_entry_update(ptep, pte); /* fallback */

        if ((addr >= VMALLOC_START) && (addr < VMALLOC_END))
                make_page_writable(__va(pte_pfn(pte) << PAGE_SHIFT), feature);
}

void make_pages_readonly(void *va, unsigned nr, unsigned int feature)
{
        if (xen_feature(feature))
                return;

        while (nr-- != 0) {
                make_page_readonly(va, feature);
                va = (void *)((unsigned long)va + PAGE_SIZE);
        }
}

void make_pages_writable(void *va, unsigned nr, unsigned int feature)
{
        if (xen_feature(feature))
                return;

        while (nr-- != 0) {
                make_page_writable(va, feature);
                va = (void *)((unsigned long)va + PAGE_SIZE);
        }
}

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
 * physical space so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

void show_mem(void)
{
        long i, total = 0, reserved = 0;
        long shared = 0, cached = 0;
        pg_data_t *pgdat;
        struct page *page;

        printk(KERN_INFO "Mem-info:\n");
        show_free_areas();
        printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

        for_each_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        page = pfn_to_page(pgdat->node_start_pfn + i);
                        total++;
                        if (PageReserved(page))
                                reserved++;
                        else if (PageSwapCache(page))
                                cached++;
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
        }
        printk(KERN_INFO "%lu pages of RAM\n", total);
        printk(KERN_INFO "%lu reserved pages\n", reserved);
        printk(KERN_INFO "%lu pages shared\n", shared);
        printk(KERN_INFO "%lu pages swap cached\n", cached);
}

/* References to section boundaries */

int after_bootmem;

static void *spp_getpage(void)
{
        void *ptr;
        if (after_bootmem)
                ptr = (void *) get_zeroed_page(GFP_ATOMIC);
        else
                ptr = alloc_bootmem_pages(PAGE_SIZE);
        if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
                panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");

        Dprintk("spp_getpage %p\n", ptr);
        return ptr;
}

#define pgd_offset_u(address) (pgd_t *)(init_level4_user_pgt + pgd_index(address))

static inline pud_t *pud_offset_u(unsigned long address)
{
        pud_t *pud = level3_user_pgt;

        return pud + pud_index(address);
}
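
/*
 * Install a fixmap PTE.  set_pte_phys() takes a pseudo-physical frame and
 * uses set_pte(); set_pte_phys_ma() below takes a machine frame and goes
 * through xen_l1_entry_update() because the PTE page itself is read-only.
 * Missing intermediate levels are allocated on demand via spp_getpage().
 */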
static void set_pte_phys(unsigned long vaddr,
                         unsigned long phys, pgprot_t prot, int user_mode)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

        pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr));
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr));
        if (pud_none(*pud)) {
                pmd = (pmd_t *) spp_getpage();
                make_page_readonly(pmd, XENFEAT_writable_page_tables);
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                make_page_readonly(pte, XENFEAT_writable_page_tables);
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }
        new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

        pte = pte_offset_kernel(pmd, vaddr);
        if (!pte_none(*pte) &&
            pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
                pte_ERROR(*pte);
        set_pte(pte, new_pte);

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

static void set_pte_phys_ma(unsigned long vaddr,
                            unsigned long phys, pgprot_t prot)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

        pgd = pgd_offset_k(vaddr);
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                pmd = (pmd_t *) spp_getpage();
                make_page_readonly(pmd, XENFEAT_writable_page_tables);

                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));

                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);

        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                make_page_readonly(pte, XENFEAT_writable_page_tables);

                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }

        new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot);
        pte = pte_offset_kernel(pmd, vaddr);

        /*
         * Note that the pte page is already RO, thus we want to use
         * xen_l1_entry_update(), not set_pte().
         */
        xen_l1_entry_update(pte,
                            pfn_pte_ma(phys >> PAGE_SHIFT, prot));

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

#define SET_FIXMAP_KERNEL 0
#define SET_FIXMAP_USER   1

/* NOTE: this is meant to be run only at boot */
void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }
        switch (idx) {
        case VSYSCALL_FIRST_PAGE:
                set_pte_phys(address, phys, prot, SET_FIXMAP_KERNEL);
                break;
        default:
                set_pte_phys_ma(address, phys, prot);
                break;
        }
}

/*
 * At this point it only supports vsyscall area.
 */
void __set_fixmap_user (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }

        set_pte_phys(address, phys, prot, SET_FIXMAP_USER);
}

unsigned long __initdata table_start, table_end;

unsigned long get_machine_pfn(unsigned long addr)
{
        pud_t *pud = pud_offset_k(NULL, addr);
        pmd_t *pmd = pmd_offset(pud, addr);
        pte_t *pte = pte_offset_kernel(pmd, addr);

        return pte_mfn(*pte);
}
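
/*
 * Hand out zeroed pages for early page-table construction: before bootmem
 * is available, pages are taken from the initial allocation by bumping
 * start_pfn; afterwards they come from the page allocator.
 */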
static __meminit void *alloc_static_page(unsigned long *phys)
{
        unsigned long va = (start_pfn << PAGE_SHIFT) + __START_KERNEL_map;

        if (after_bootmem) {
                void *adr = (void *)get_zeroed_page(GFP_ATOMIC);

                *phys = __pa(adr);
                return adr;
        }

        *phys = start_pfn << PAGE_SHIFT;
        start_pfn++;
        memset((void *)va, 0, PAGE_SIZE);
        return (void *)va;
}

#define PTE_SIZE PAGE_SIZE

static inline void __set_pte(pte_t *dst, pte_t val)
{
        *dst = val;
}
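
/*
 * Decide whether a physical address must be mapped read-only in the direct
 * map: the page tables built here, the boot page tables Xen handed us, and
 * the kernel image itself all qualify (unless writable page tables are
 * allowed by the hypervisor).
 */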
static inline int make_readonly(unsigned long paddr)
{
        int readonly = 0;

        /* Make new page tables read-only. */
        if (!xen_feature(XENFEAT_writable_page_tables)
            && (paddr >= (table_start << PAGE_SHIFT))
            && (paddr < (table_end << PAGE_SHIFT)))
                readonly = 1;
        /* Make old page tables read-only. */
        if (!xen_feature(XENFEAT_writable_page_tables)
            && (paddr >= (xen_start_info->pt_base - __START_KERNEL_map))
            && (paddr < (start_pfn << PAGE_SHIFT)))
                readonly = 1;

        /*
         * No need for writable mapping of kernel image. This also ensures that
         * page and descriptor tables embedded inside don't have writable
         * mappings.
         */
        if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end)))
                readonly = 1;

        return readonly;
}
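
/*
 * Fill one PMD's worth of PTE pages for the direct mapping: each PTE page is
 * taken from alloc_static_page(), populated with 4k entries (read-only where
 * make_readonly() says so, empty beyond the domain's nr_pages), then itself
 * made read-only before being hooked into the PMD.
 */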
static void __meminit
phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
{
        int i, k;

        for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
                unsigned long pte_phys;
                pte_t *pte, *pte_save;

                if (address >= end) {
                        for (; i < PTRS_PER_PMD; i++, pmd++)
                                set_pmd(pmd, __pmd(0));
                        break;
                }
                pte = alloc_static_page(&pte_phys);
                pte_save = pte;
                for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
                        if ((address >= end) ||
                            ((address >> PAGE_SHIFT) >=
                             xen_start_info->nr_pages)) {
                                __set_pte(pte, __pte(0));
                                continue;
                        }
                        if (make_readonly(address)) {
                                __set_pte(pte,
                                          __pte(address | (_KERNPG_TABLE & ~_PAGE_RW)));
                                continue;
                        }
                        __set_pte(pte, __pte(address | _KERNPG_TABLE));
                }
                pte = pte_save;
                early_make_page_readonly(pte, XENFEAT_writable_page_tables);
                set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE));
        }
}

static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
        pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));

        if (pmd_none(*pmd)) {
                spin_lock(&init_mm.page_table_lock);
                phys_pmd_init(pmd, address, end);
                spin_unlock(&init_mm.page_table_lock);
                __flush_tlb_all();
        }
}

static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
{
        long i = pud_index(address);

        pud = pud + i;

        if (after_bootmem && pud_val(*pud)) {
                phys_pmd_update(pud, address, end);
                return;
        }

        for (; i < PTRS_PER_PUD; pud++, i++) {
                unsigned long paddr, pmd_phys;
                pmd_t *pmd;

                paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
                if (paddr >= end)
                        break;

                pmd = alloc_static_page(&pmd_phys);
                early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
                spin_lock(&init_mm.page_table_lock);
                set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
                phys_pmd_init(pmd, paddr, end);
                spin_unlock(&init_mm.page_table_lock);
        }
        __flush_tlb();
}
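
/*
 * Build the kernel's own initial page tables from the ones Xen constructed
 * for the domain: copy the Xen-built L2 that maps the kernel image into
 * level2_kernel_pgt, mark every table page read-only, pin both L4 tables,
 * and point slot 511 of the user L4 at level3_user_pgt.
 */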
void __init xen_init_pt(void)
{
        unsigned long addr, *page;

        memset((void *)init_level4_pgt,   0, PAGE_SIZE);
        memset((void *)level3_kernel_pgt, 0, PAGE_SIZE);
        memset((void *)level2_kernel_pgt, 0, PAGE_SIZE);

        /* Find the initial pte page that was built for us. */
        page = (unsigned long *)xen_start_info->pt_base;
        addr = page[pgd_index(__START_KERNEL_map)];
        addr_to_page(addr, page);
        addr = page[pud_index(__START_KERNEL_map)];
        addr_to_page(addr, page);

        /* Construct mapping of initial pte page in our own directories. */
        init_level4_pgt[pgd_index(__START_KERNEL_map)] =
                mk_kernel_pgd(__pa_symbol(level3_kernel_pgt));
        level3_kernel_pgt[pud_index(__START_KERNEL_map)] =
                __pud(__pa_symbol(level2_kernel_pgt) |
                      _KERNPG_TABLE);
        memcpy((void *)level2_kernel_pgt, page, PAGE_SIZE);

        early_make_page_readonly(init_level4_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(init_level4_user_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(level3_kernel_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(level3_user_pgt,
                                 XENFEAT_writable_page_tables);
        early_make_page_readonly(level2_kernel_pgt,
                                 XENFEAT_writable_page_tables);

        xen_pgd_pin(__pa_symbol(init_level4_pgt));
        xen_pgd_pin(__pa_symbol(init_level4_user_pgt));

        set_pgd((pgd_t *)(init_level4_user_pgt + 511),
                mk_kernel_pgd(__pa_symbol(level3_user_pgt)));
}
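
/*
 * Grow the boot-time __START_KERNEL_map mapping so it covers the kernel
 * image plus the page tables about to be built (tables_space bytes past the
 * current start_pfn), after first unmapping the low-1MB alias; any leftover
 * Xen-created mappings beyond that point are torn down at the end.
 */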
void __init extend_init_mapping(unsigned long tables_space)
{
        unsigned long va = __START_KERNEL_map;
        unsigned long phys, addr, *pte_page;
        pmd_t *pmd;
        pte_t *pte, new_pte;
        unsigned long *page = (unsigned long *)init_level4_pgt;

        addr = page[pgd_index(va)];
        addr_to_page(addr, page);
        addr = page[pud_index(va)];
        addr_to_page(addr, page);

        /* Kill mapping of low 1MB. */
        while (va < (unsigned long)&_text) {
                HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
                va += PAGE_SIZE;
        }

        /* Ensure init mappings cover kernel text/data and initial tables. */
        while (va < (__START_KERNEL_map
                     + (start_pfn << PAGE_SHIFT)
                     + tables_space)) {
                pmd = (pmd_t *)&page[pmd_index(va)];
                if (pmd_none(*pmd)) {
                        pte_page = alloc_static_page(&phys);
                        early_make_page_readonly(
                                pte_page, XENFEAT_writable_page_tables);
                        set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
                } else {
                        addr = page[pmd_index(va)];
                        addr_to_page(addr, pte_page);
                }
                pte = (pte_t *)&pte_page[pte_index(va)];
                if (pte_none(*pte)) {
                        new_pte = pfn_pte(
                                (va - __START_KERNEL_map) >> PAGE_SHIFT,
                                __pgprot(_KERNPG_TABLE));
                        xen_l1_entry_update(pte, new_pte);
                }
                va += PAGE_SIZE;
        }

        /* Finally, blow away any spurious initial mappings. */
        while (1) {
                pmd = (pmd_t *)&page[pmd_index(va)];
                if (pmd_none(*pmd))
                        break;
                HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
                va += PAGE_SIZE;
        }
}
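
/*
 * Estimate how much room the direct-mapping page tables will need (8 bytes
 * per PUD/PMD/PTE entry, rounded up to whole pages), extend the init mapping
 * to cover it, and record the resulting pfn range in table_start/table_end.
 */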
static void __init find_early_table_space(unsigned long end)
{
        unsigned long puds, pmds, ptes, tables;

        puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
        ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT;

        tables = round_up(puds * 8, PAGE_SIZE) +
                round_up(pmds * 8, PAGE_SIZE) +
                round_up(ptes * 8, PAGE_SIZE);

        extend_init_mapping(tables);

        table_start = start_pfn;
        table_end = table_start + (tables>>PAGE_SHIFT);

        early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
                end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
}

/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from the
   physical memory. To access them they are temporarily mapped. */
void __meminit init_memory_mapping(unsigned long start, unsigned long end)
{
        unsigned long next;

        Dprintk("init_memory_mapping\n");

        /*
         * Find space for the kernel direct mapping tables.
         * Later we should allocate these tables in the local node of the memory
         * mapped.  Unfortunately this is done currently before the nodes are
         * discovered.
         */
        if (!after_bootmem)
                find_early_table_space(end);

        start = (unsigned long)__va(start);
        end = (unsigned long)__va(end);

        for (; start < end; start = next) {
                unsigned long pud_phys;
                pgd_t *pgd = pgd_offset_k(start);
                pud_t *pud;

                if (after_bootmem) {
                        pud = pud_offset_k(pgd, __PAGE_OFFSET);
                        make_page_readonly(pud, XENFEAT_writable_page_tables);
                        pud_phys = __pa(pud);
                } else {
                        pud = alloc_static_page(&pud_phys);
                        early_make_page_readonly(pud, XENFEAT_writable_page_tables);
                }
                next = start + PGDIR_SIZE;
                if (next > end)
                        next = end;
                phys_pud_init(pud, __pa(start), __pa(next));
                if (!after_bootmem)
                        set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
        }

        if (!after_bootmem) {
                BUG_ON(start_pfn != table_end);

                /* Re-vector virtual addresses pointing into the initial
                   mapping to the just-established permanent ones. */
                xen_start_info = __va(__pa(xen_start_info));
                xen_start_info->pt_base = (unsigned long)
                        __va(__pa(xen_start_info->pt_base));
                if (!xen_feature(XENFEAT_auto_translated_physmap)) {
                        phys_to_machine_mapping =
                                __va(__pa(xen_start_info->mfn_list));
                        xen_start_info->mfn_list = (unsigned long)
                                phys_to_machine_mapping;
                }
                if (xen_start_info->mod_start)
                        xen_start_info->mod_start = (unsigned long)
                                __va(__pa(xen_start_info->mod_start));

                /* Destroy the Xen-created mappings beyond the kernel image as
                 * well as the temporary mappings created above. Prevents
                 * overlap with modules area (if init mapping is very big).
                 */
                start = PAGE_ALIGN((unsigned long)_end);
                end   = __START_KERNEL_map + (table_end << PAGE_SHIFT);
                for (; start < end; start += PAGE_SIZE)
                        WARN_ON(HYPERVISOR_update_va_mapping(
                                start, __pte_ma(0), 0));
        }

        __flush_tlb_all();
}

void __cpuinit zap_low_mappings(int cpu)
{
        /* this is not required for Xen */
#if 0
        swap_low_mappings();
#endif
}

/* Compute zone sizes for the DMA and DMA32 zones in a node. */
__init void
size_zones(unsigned long *z, unsigned long *h,
           unsigned long start_pfn, unsigned long end_pfn)
{
        int i;
#ifndef CONFIG_XEN
        unsigned long w;
#endif

        for (i = 0; i < MAX_NR_ZONES; i++)
                z[i] = 0;

#ifndef CONFIG_XEN
        if (start_pfn < MAX_DMA_PFN)
                z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
        if (start_pfn < MAX_DMA32_PFN) {
                unsigned long dma32_pfn = MAX_DMA32_PFN;
                if (dma32_pfn > end_pfn)
                        dma32_pfn = end_pfn;
                z[ZONE_DMA32] = dma32_pfn - start_pfn;
        }
        z[ZONE_NORMAL] = end_pfn - start_pfn;

        /* Remove lower zones from higher ones. */
        w = 0;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                if (z[i])
                        z[i] -= w;
                w += z[i];
        }

        /* Compute holes */
        w = start_pfn;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                unsigned long s = w;
                w += z[i];
                h[i] = e820_hole_size(s, w);
        }

        /* Add the space needed for mem_map to the holes too. */
        for (i = 0; i < MAX_NR_ZONES; i++)
                h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;

        /* The 16MB DMA zone has the kernel and other misc mappings.
           Account them too */
        if (h[ZONE_DMA]) {
                h[ZONE_DMA] += dma_reserve;
                if (h[ZONE_DMA] >= z[ZONE_DMA]) {
                        printk(KERN_WARNING
                                "Kernel too large and filling up ZONE_DMA?\n");
                        h[ZONE_DMA] = z[ZONE_DMA];
                }
        }
#else
        z[ZONE_DMA] = end_pfn;
        for (i = 0; i < MAX_NR_ZONES; i++)
                h[i] = 0;
#endif
}

#ifndef CONFIG_NUMA
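/*
 * Non-NUMA paging_init: size the zones, hand them to free_area_init_node(),
 * switch from the dummy shared_info page to the real one provided by Xen,
 * and map the ISA range (the initial domain sees the real low 1MB, other
 * domains get the zero page mapped read-only).
 */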
void __init paging_init(void)
{
        unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
        int i;

        memory_present(0, 0, end_pfn);
        sparse_init();
        size_zones(zones, holes, 0, end_pfn);
        free_area_init_node(0, NODE_DATA(0), zones,
                            __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);

        /* Switch to the real shared_info page, and clear the
         * dummy page. */
        set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
        HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
        memset(empty_zero_page, 0, sizeof(empty_zero_page));

        init_mm.context.pinned = 1;

        /* Setup mapping of lower 1st MB */
        for (i = 0; i < NR_FIX_ISAMAPS; i++)
                if (is_initial_xendomain())
                        set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
                else
                        __set_fixmap(FIX_ISAMAP_BEGIN - i,
                                     virt_to_mfn(empty_zero_page) << PAGE_SHIFT,
                                     PAGE_KERNEL_RO);
}
#endif

/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
        unsigned long end = address + size;

        BUG_ON(address & ~LARGE_PAGE_MASK);
        BUG_ON(size & ~LARGE_PAGE_MASK);

        for (; address < end; address += LARGE_PAGE_SIZE) {
                pgd_t *pgd = pgd_offset_k(address);
                pud_t *pud;
                pmd_t *pmd;
                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, address);
                if (pud_none(*pud))
                        continue;
                pmd = pmd_offset(pud, address);
                if (!pmd || pmd_none(*pmd))
                        continue;
                if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
                        /* Could handle this, but it should not happen currently. */
                        printk(KERN_ERR
                               "clear_kernel_mapping: mapping has been split. will leak memory\n");
                        pmd_ERROR(*pmd);
                }
                set_pmd(pmd, __pmd(0));
        }
        __flush_tlb_all();
}

/*
 * Memory hotplug specific functions
 * These are only for non-NUMA machines right now.
 */
#ifdef CONFIG_MEMORY_HOTPLUG

void online_page(struct page *page)
{
        ClearPageReserved(page);
        set_page_count(page, 1);
        __free_page(page);
        totalram_pages++;
        num_physpages++;
}
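
/*
 * Hot-add a physical memory range: register the new pages with node 0 via
 * __add_pages() and extend the kernel direct mapping to cover the range.
 */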
int add_memory(u64 start, u64 size)
{
        struct pglist_data *pgdat = NODE_DATA(0);
        struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
        unsigned long start_pfn = start >> PAGE_SHIFT;
        unsigned long nr_pages = size >> PAGE_SHIFT;
        int ret;

        ret = __add_pages(zone, start_pfn, nr_pages);
        if (ret)
                goto error;

        init_memory_mapping(start, (start + size - 1));

        return ret;
error:
        printk("%s: Problem encountered in __add_pages!\n", __func__);
        return ret;
}
EXPORT_SYMBOL_GPL(add_memory);

int remove_memory(u64 start, u64 size)
{
        return -EINVAL;
}
EXPORT_SYMBOL_GPL(remove_memory);

#endif

static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
                         kcore_vsyscall;
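
/*
 * Late memory setup: allocate the contiguous_bitmap used by the Xen DMA
 * code, set up swiotlb/no-IOMMU handling, release bootmem onto the buddy
 * lists, account for pages beyond the domain's initial allocation, and
 * register the /proc/kcore areas before printing the memory summary.
 */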
void __init mem_init(void)
{
        long codesize, reservedpages, datasize, initsize;
        unsigned long pfn;

        contiguous_bitmap = alloc_bootmem_low_pages(
                (end_pfn + 2*BITS_PER_LONG) >> 3);
        BUG_ON(!contiguous_bitmap);
        memset(contiguous_bitmap, 0, (end_pfn + 2*BITS_PER_LONG) >> 3);

#if defined(CONFIG_SWIOTLB)
        pci_swiotlb_init();
#endif
        no_iommu_init();

        /* How many end-of-memory variables you have, grandma! */
        max_low_pfn = end_pfn;
        max_pfn = end_pfn;
        num_physpages = end_pfn;
        high_memory = (void *) __va(end_pfn * PAGE_SIZE);

        /* clear the zero-page */
        memset(empty_zero_page, 0, PAGE_SIZE);

        reservedpages = 0;

        /* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
        totalram_pages = numa_free_all_bootmem();
#else
        totalram_pages = free_all_bootmem();
#endif
        /* XEN: init and count pages outside initial allocation. */
        for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) {
                ClearPageReserved(&mem_map[pfn]);
                set_page_count(&mem_map[pfn], 1);
                totalram_pages++;
        }
        reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);

        after_bootmem = 1;

        codesize = (unsigned long) &_etext - (unsigned long) &_text;
        datasize = (unsigned long) &_edata - (unsigned long) &_etext;
        initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

        /* Register memory areas for /proc/kcore */
        kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
        kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
                   VMALLOC_END-VMALLOC_START);
        kclist_add(&kcore_kernel, &_stext, _end - _stext);
        kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
        kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
                   VSYSCALL_END - VSYSCALL_START);

        printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
                (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
                end_pfn << (PAGE_SHIFT-10),
                codesize >> 10,
                reservedpages << (PAGE_SHIFT-10),
                datasize >> 10,
                initsize >> 10);

#ifndef CONFIG_XEN
#ifdef CONFIG_SMP
        /*
         * Sync boot_level4_pgt mappings with the init_level4_pgt
         * except for the low identity mappings which are already zapped
         * in init_level4_pgt. This sync-up is essential for AP's bringup
         */
        memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
#endif
#endif
}

void free_initmem(void)
{
#ifdef __DO_LATER__
        /*
         * Some pages can be pinned, but some are not. Unpinning such pages
         * triggers BUG().
         */
        unsigned long addr;

        addr = (unsigned long)(&__init_begin);
        for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                set_page_count(virt_to_page(addr), 1);
                memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);
                make_page_writable(
                        __va(__pa(addr)), XENFEAT_writable_page_tables);
                /*
                 * Make pages from __PAGE_OFFSET address as well
                 */
                make_page_writable(
                        (void *)addr, XENFEAT_writable_page_tables);
                free_page(addr);
                totalram_pages++;
        }
        memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
        printk("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10);
#endif
}

#ifdef CONFIG_DEBUG_RODATA

extern char __start_rodata, __end_rodata;
void mark_rodata_ro(void)
{
        unsigned long addr = (unsigned long)&__start_rodata;

        for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
                change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);

        printk("Write protecting the kernel read-only data: %luk\n",
               (&__end_rodata - &__start_rodata) >> 10);

        /*
         * change_page_attr_addr() requires a global_flush_tlb() call after it.
         * We do this after the printk so that if something went wrong in the
         * change, the printk gets out at least to give a better debug hint
         * of who is the culprit.
         */
        global_flush_tlb();
}
#endif

#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
        if (start >= end)
                return;
        printk("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
        for (; start < end; start += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(start));
                set_page_count(virt_to_page(start), 1);
                free_page(start);
                totalram_pages++;
        }
}
#endif

void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
        /* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
        int nid = phys_to_nid(phys);
        reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
        reserve_bootmem(phys, len);
#endif
        if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
                dma_reserve += len / PAGE_SIZE;
}
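
/*
 * Report whether a kernel virtual address is backed by a valid mapping,
 * walking pgd/pud/pmd/pte and accepting 2MB (pmd_large) mappings.
 */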
int kern_addr_valid(unsigned long addr)
{
        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (above != 0 && above != -1UL)
                return 0;

        pgd = pgd_offset_k(addr);
        if (pgd_none(*pgd))
                return 0;

        pud = pud_offset_k(pgd, addr);
        if (pud_none(*pud))
                return 0;

        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return 0;
        if (pmd_large(*pmd))
                return pfn_valid(pmd_pfn(*pmd));

        pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte))
                return 0;
        return pfn_valid(pte_pfn(*pte));
}

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>

extern int exception_trace, page_fault_trace;

static ctl_table debug_table2[] = {
        { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
          proc_dointvec },
        { 0, }
};

static ctl_table debug_root_table2[] = {
        { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
          .child = debug_table2 },
        { 0 },
};

static __init int x8664_sysctl_init(void)
{
        register_sysctl_table(debug_root_table2, 1);
        return 0;
}
__initcall(x8664_sysctl_init);
#endif

/* A pseudo VMA to allow ptrace access for the vsyscall page.  This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
        .vm_start = VSYSCALL_START,
        .vm_end = VSYSCALL_END,
        .vm_page_prot = PAGE_READONLY
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
        if (test_tsk_thread_flag(tsk, TIF_IA32))
                return NULL;
#endif
        return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
        struct vm_area_struct *vma = get_gate_vma(task);
        if (!vma)
                return 0;
        return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
        return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}