ia64/xen-unstable

view linux-2.6-xen-sparse/arch/xen/x86_64/kernel/e820.c @ 7959:ff18a169e866

Update the memory_op() hypercall. Add two new subcommands to
query a domain's current and maximum memory reservation. Also,
XENMEM_maximum_ram_page now returns the max_page directly,
rather than writing through a passed-in pointer.
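
For illustration only: a rough sketch of the calling convention described
above. The subcommand names XENMEM_current_reservation /
XENMEM_maximum_reservation, the domid_t argument, and the header paths are
assumptions rather than part of this changeset; the file below only
exercises XENMEM_maximum_ram_page (see e820_reserve_resources() in the
Xen section).

/* Sketch only -- subcommand names, argument shape and headers are assumed. */
#include <linux/kernel.h>
#include <linux/init.h>
#include <asm-xen/hypervisor.h>
#include <asm-xen/xen-public/memory.h>

static void __init report_mem_limits(void)
{
        domid_t domid = DOMID_SELF;

        /* New convention: the result is the hypercall's return value. */
        unsigned long max_ram_page =
                HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);

        /* Assumed shape of the two new reservation queries (units: pages). */
        unsigned long cur = HYPERVISOR_memory_op(XENMEM_current_reservation, &domid);
        unsigned long max = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);

        printk(KERN_INFO "max_ram_page=%lu current=%lu maximum=%lu\n",
               max_ram_page, cur, max);
}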

Also, disable PAE in the default config (accidentally checked
in two changesets ago).

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Mon Nov 21 16:56:39 2005 +0100 (2005-11-21)
parents 6a2d93c2b32d
children d3a4485a41fc
line source
/*
 * Handle the memory map.
 * The functions here do the job until bootmem takes over.
 * $Id: e820.c,v 1.4 2002/09/19 19:25:32 ak Exp $
 *
 * Getting sanitize_e820_map() in sync with i386 version by applying change:
 * - Provisions for empty E820 memory regions (reported by certain BIOSes).
 *   Alex Achenbach <xela@slit.de>, December 2002.
 * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
 *
 */
#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/ioport.h>
#include <linux/string.h>
#include <asm/page.h>
#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/bootsetup.h>
#include <asm-xen/xen-public/memory.h>

unsigned long pci_mem_start = 0xaeedbabe;

/*
 * PFN of last memory page.
 */
unsigned long end_pfn;
unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT;
unsigned long end_pfn_map;

/*
 * Add a memory region to the kernel e820 map.
 */
void __init add_memory_region(unsigned long start, unsigned long size, int type)
{
        int x = e820.nr_map;

        if (x == E820MAX) {
                printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
                return;
        }

        e820.map[x].addr = start;
        e820.map[x].size = size;
        e820.map[x].type = type;
        e820.nr_map++;
}

#ifndef CONFIG_XEN
extern char _end[];

/*
 * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
 * The direct mapping extends to end_pfn_map, so that we can directly access
 * apertures, ACPI and other tables without having to play with fixmaps.
 */

/*
 * Last pfn which the user wants to use.
 */

extern struct resource code_resource, data_resource;

/* Check for some hardcoded bad areas that early boot is not allowed to touch */
static inline int bad_addr(unsigned long *addrp, unsigned long size)
{
        unsigned long addr = *addrp, last = addr + size;

        /* various gunk below that needed for SMP startup */
        if (addr < 0x8000) {
                *addrp = 0x8000;
                return 1;
        }

        /* direct mapping tables of the kernel */
        if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
                *addrp = table_end << PAGE_SHIFT;
                return 1;
        }

        /* initrd */
#ifdef CONFIG_BLK_DEV_INITRD
        if (LOADER_TYPE && INITRD_START && last >= INITRD_START &&
            addr < INITRD_START+INITRD_SIZE) {
                *addrp = INITRD_START + INITRD_SIZE;
                return 1;
        }
#endif
        /* kernel code + 640k memory hole (later should not be needed, but
           be paranoid for now) */
        if (last >= 640*1024 && addr < __pa_symbol(&_end)) {
                *addrp = __pa_symbol(&_end);
                return 1;
        }
        /* XXX ramdisk image here? */
        return 0;
}

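/* Return 1 if an e820 entry of the given type (any type if 0) overlaps [start, end). */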
int __init e820_mapped(unsigned long start, unsigned long end, unsigned type)
{
        int i;
        for (i = 0; i < e820.nr_map; i++) {
                struct e820entry *ei = &e820.map[i];
                if (type && ei->type != type)
                        continue;
                if (ei->addr >= end || ei->addr + ei->size < start)
                        continue;
                return 1;
        }
        return 0;
}

/*
 * Find a free area in a specific range.
 */
unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
{
        int i;
        for (i = 0; i < e820.nr_map; i++) {
                struct e820entry *ei = &e820.map[i];
                unsigned long addr = ei->addr, last;
                if (ei->type != E820_RAM)
                        continue;
                if (addr < start)
                        addr = start;
                if (addr > ei->addr + ei->size)
                        continue;
                while (bad_addr(&addr, size) && addr+size < ei->addr + ei->size)
                        ;
                last = addr + size;
                if (last > ei->addr + ei->size)
                        continue;
                if (last > end)
                        continue;
                return addr;
        }
        return -1UL;
}

/*
 * Free bootmem based on the e820 table for a node.
 */
void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start, unsigned long end)
{
        int i;
        for (i = 0; i < e820.nr_map; i++) {
                struct e820entry *ei = &e820.map[i];
                unsigned long last, addr;

                if (ei->type != E820_RAM ||
                    ei->addr+ei->size <= start ||
                    ei->addr > end)
                        continue;

                addr = round_up(ei->addr, PAGE_SIZE);
                if (addr < start)
                        addr = start;

                last = round_down(ei->addr + ei->size, PAGE_SIZE);
                if (last >= end)
                        last = end;

                if (last > addr && last-addr >= PAGE_SIZE)
                        free_bootmem_node(pgdat, addr, last-addr);
        }
}

/*
 * Find the highest page frame number we have available
 */
unsigned long __init e820_end_of_ram(void)
{
        int i;
        unsigned long end_pfn = 0;

        for (i = 0; i < e820.nr_map; i++) {
                struct e820entry *ei = &e820.map[i];
                unsigned long start, end;

                start = round_up(ei->addr, PAGE_SIZE);
                end = round_down(ei->addr + ei->size, PAGE_SIZE);
                if (start >= end)
                        continue;
                if (ei->type == E820_RAM) {
                        if (end > end_pfn<<PAGE_SHIFT)
                                end_pfn = end>>PAGE_SHIFT;
                } else {
                        if (end > end_pfn_map<<PAGE_SHIFT)
                                end_pfn_map = end>>PAGE_SHIFT;
                }
        }

        if (end_pfn > end_pfn_map)
                end_pfn_map = end_pfn;
        if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
                end_pfn_map = MAXMEM>>PAGE_SHIFT;
        if (end_pfn > end_user_pfn)
                end_pfn = end_user_pfn;
        if (end_pfn > end_pfn_map)
                end_pfn = end_pfn_map;

        return end_pfn;
}

/*
 * Mark e820 reserved areas as busy for the resource manager.
 */
void __init e820_reserve_resources(void)
{
        int i;
        for (i = 0; i < e820.nr_map; i++) {
                struct resource *res;
                if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
                        continue;
                res = alloc_bootmem_low(sizeof(struct resource));
                switch (e820.map[i].type) {
                case E820_RAM:  res->name = "System RAM"; break;
                case E820_ACPI: res->name = "ACPI Tables"; break;
                case E820_NVS:  res->name = "ACPI Non-volatile Storage"; break;
                default:        res->name = "reserved";
                }
                res->start = e820.map[i].addr;
                res->end = res->start + e820.map[i].size - 1;
                res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
                request_resource(&iomem_resource, res);
                if (e820.map[i].type == E820_RAM) {
                        /*
                         * We don't know which RAM region contains kernel data,
                         * so we try it repeatedly and let the resource manager
                         * test it.
                         */
                        request_resource(res, &code_resource);
                        request_resource(res, &data_resource);
                }
        }
}

void __init e820_print_map(char *who)
{
        int i;

        for (i = 0; i < e820.nr_map; i++) {
                printk(" %s: %016Lx - %016Lx ", who,
                        (unsigned long long) e820.map[i].addr,
                        (unsigned long long) (e820.map[i].addr + e820.map[i].size));
                switch (e820.map[i].type) {
                case E820_RAM:  printk("(usable)\n");
                                break;
                case E820_RESERVED:
                                printk("(reserved)\n");
                                break;
                case E820_ACPI:
                                printk("(ACPI data)\n");
                                break;
                case E820_NVS:
                                printk("(ACPI NVS)\n");
                                break;
                default:        printk("type %u\n", e820.map[i].type);
                                break;
                }
        }
}

/*
 * Sanitize the BIOS e820 map.
 *
 * Some e820 responses include overlapping entries. The following
 * replaces the original e820 map with a new one, removing overlaps.
 *
 */
static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
{
        struct change_member {
                struct e820entry *pbios; /* pointer to original bios entry */
                unsigned long long addr; /* address for this change point */
        };
        static struct change_member change_point_list[2*E820MAX] __initdata;
        static struct change_member *change_point[2*E820MAX] __initdata;
        static struct e820entry *overlap_list[E820MAX] __initdata;
        static struct e820entry new_bios[E820MAX] __initdata;
        struct change_member *change_tmp;
        unsigned long current_type, last_type;
        unsigned long long last_addr;
        int chgidx, still_changing;
        int overlap_entries;
        int new_bios_entry;
        int old_nr, new_nr, chg_nr;
        int i;

        /*
                Visually we're performing the following (1,2,3,4 = memory types)...

                Sample memory map (w/overlaps):
                   ____22__________________
                   ______________________4_
                   ____1111________________
                   _44_____________________
                   11111111________________
                   ____________________33__
                   ___________44___________
                   __________33333_________
                   ______________22________
                   ___________________2222_
                   _________111111111______
                   _____________________11_
                   _________________4______

                Sanitized equivalent (no overlap):
                   1_______________________
                   _44_____________________
                   ___1____________________
                   ____22__________________
                   ______11________________
                   _________1______________
                   __________3_____________
                   ___________44___________
                   _____________33_________
                   _______________2________
                   ________________1_______
                   _________________4______
                   ___________________2____
                   ____________________33__
                   ______________________4_
        */

        /* if there's only one memory region, don't bother */
        if (*pnr_map < 2)
                return -1;

        old_nr = *pnr_map;

        /* bail out if we find any unreasonable addresses in bios map */
        for (i=0; i<old_nr; i++)
                if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
                        return -1;

        /* create pointers for initial change-point information (for sorting) */
        for (i=0; i < 2*old_nr; i++)
                change_point[i] = &change_point_list[i];

        /* record all known change-points (starting and ending addresses),
           omitting those that are for empty memory regions */
        chgidx = 0;
        for (i=0; i < old_nr; i++) {
                if (biosmap[i].size != 0) {
                        change_point[chgidx]->addr = biosmap[i].addr;
                        change_point[chgidx++]->pbios = &biosmap[i];
                        change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
                        change_point[chgidx++]->pbios = &biosmap[i];
                }
        }
        chg_nr = chgidx;

        /* sort change-point list by memory addresses (low -> high) */
        still_changing = 1;
        while (still_changing) {
                still_changing = 0;
                for (i=1; i < chg_nr; i++) {
                        /* if <current_addr> > <last_addr>, swap */
                        /* or, if current=<start_addr> & last=<end_addr>, swap */
                        if ((change_point[i]->addr < change_point[i-1]->addr) ||
                            ((change_point[i]->addr == change_point[i-1]->addr) &&
                             (change_point[i]->addr == change_point[i]->pbios->addr) &&
                             (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
                           )
                        {
                                change_tmp = change_point[i];
                                change_point[i] = change_point[i-1];
                                change_point[i-1] = change_tmp;
                                still_changing=1;
                        }
                }
        }

        /* create a new bios memory map, removing overlaps */
        overlap_entries=0;      /* number of entries in the overlap table */
        new_bios_entry=0;       /* index for creating new bios map entries */
        last_type = 0;          /* start with undefined memory type */
        last_addr = 0;          /* start with 0 as last starting address */
        /* loop through change-points, determining effect on the new bios map */
        for (chgidx=0; chgidx < chg_nr; chgidx++)
        {
                /* keep track of all overlapping bios entries */
                if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
                {
                        /* add map entry to overlap list (> 1 entry implies an overlap) */
                        overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
                }
                else
                {
                        /* remove entry from list (order independent, so swap with last) */
                        for (i=0; i<overlap_entries; i++)
                        {
                                if (overlap_list[i] == change_point[chgidx]->pbios)
                                        overlap_list[i] = overlap_list[overlap_entries-1];
                        }
                        overlap_entries--;
                }
                /* if there are overlapping entries, decide which "type" to use */
                /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
                current_type = 0;
                for (i=0; i<overlap_entries; i++)
                        if (overlap_list[i]->type > current_type)
                                current_type = overlap_list[i]->type;
                /* continue building up new bios map based on this information */
                if (current_type != last_type) {
                        if (last_type != 0) {
                                new_bios[new_bios_entry].size =
                                        change_point[chgidx]->addr - last_addr;
                                /* move forward only if the new size was non-zero */
                                if (new_bios[new_bios_entry].size != 0)
                                        if (++new_bios_entry >= E820MAX)
                                                break;  /* no more space left for new bios entries */
                        }
                        if (current_type != 0) {
                                new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
                                new_bios[new_bios_entry].type = current_type;
                                last_addr=change_point[chgidx]->addr;
                        }
                        last_type = current_type;
                }
        }
        new_nr = new_bios_entry;        /* retain count for new bios entries */

        /* copy new bios mapping into original location */
        memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
        *pnr_map = new_nr;

        return 0;
}

/*
 * Copy the BIOS e820 map into a safe place.
 *
 * Sanity-check it while we're at it..
 *
 * If we're lucky and live on a modern system, the setup code
 * will have given us a memory map that we can use to properly
 * set up memory. If we aren't, we'll fake a memory map.
 *
 * We check to see that the memory map contains at least 2 elements
 * before we'll use it, because the detection code in setup.S may
 * not be perfect and most every PC known to man has two memory
 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
 * thinkpad 560x, for example, does not cooperate with the memory
 * detection code.)
 */
static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
{
        /* Only one memory region (or negative)? Ignore it */
        if (nr_map < 2)
                return -1;

        do {
                unsigned long start = biosmap->addr;
                unsigned long size = biosmap->size;
                unsigned long end = start + size;
                unsigned long type = biosmap->type;

                /* Overflow in 64 bits? Ignore the memory map. */
                if (start > end)
                        return -1;

                /*
                 * Some BIOSes claim RAM in the 640k - 1M region.
                 * Not right. Fix it up.
                 *
                 * This should be removed on Hammer which is supposed to not
                 * have non e820 covered ISA mappings there, but I had some strange
                 * problems so it stays for now. -AK
                 */
                if (type == E820_RAM) {
                        if (start < 0x100000ULL && end > 0xA0000ULL) {
                                if (start < 0xA0000ULL)
                                        add_memory_region(start, 0xA0000ULL-start, type);
                                if (end <= 0x100000ULL)
                                        continue;
                                start = 0x100000ULL;
                                size = end - start;
                        }
                }

                add_memory_region(start, size, type);
        } while (biosmap++,--nr_map);
        return 0;
}

void __init setup_memory_region(void)
{
        char *who = "BIOS-e820";

        /*
         * Try to copy the BIOS-supplied E820-map.
         *
         * Otherwise fake a memory map; one section from 0k->640k,
         * the next section from 1mb->appropriate_mem_k
         */
        sanitize_e820_map(E820_MAP, &E820_MAP_NR);
        if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
                unsigned long mem_size;

                /* compare results from other methods and take the greater */
                if (ALT_MEM_K < EXT_MEM_K) {
                        mem_size = EXT_MEM_K;
                        who = "BIOS-88";
                } else {
                        mem_size = ALT_MEM_K;
                        who = "BIOS-e801";
                }

                e820.nr_map = 0;
                add_memory_region(0, LOWMEMSIZE(), E820_RAM);
                add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
        }
        printk(KERN_INFO "BIOS-provided physical RAM map:\n");
        e820_print_map(who);
}

#else /* CONFIG_XEN */

extern unsigned long xen_override_max_pfn;
extern union xen_start_info_union xen_start_info_union;

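/*
 * On Xen the limit comes from the start-info page rather than an e820 map:
 * use the domain's initial allocation (nr_pages) plus some slack, unless a
 * larger limit was requested via xen_override_max_pfn (the "mem=" option).
 */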
unsigned long __init e820_end_of_ram(void)
{
        unsigned long max_end_pfn;

        if (xen_override_max_pfn == 0) {
                max_end_pfn = xen_start_info->nr_pages;
                /* Default 8MB slack (to balance backend allocations). */
                max_end_pfn += 8 << (20 - PAGE_SHIFT);
        } else if (xen_override_max_pfn > xen_start_info->nr_pages) {
                max_end_pfn = xen_override_max_pfn;
        } else {
                max_end_pfn = xen_start_info->nr_pages;
        }

        return max_end_pfn;
}

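/*
 * Dom0 only: fetch the machine memory map from the hypervisor, register it
 * with the resource manager, and remember the largest gap below 4GB for
 * dynamic PCI allocations.
 */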
void __init e820_reserve_resources(void)
{
        dom0_op_t op;
        struct dom0_memory_map_entry *map;
        unsigned long gapstart, gapsize, last;
        int i, found = 0;

        if (!(xen_start_info->flags & SIF_INITDOMAIN))
                return;

        map = alloc_bootmem_low_pages(PAGE_SIZE);
        op.cmd = DOM0_PHYSICAL_MEMORY_MAP;
        op.u.physical_memory_map.memory_map = map;
        op.u.physical_memory_map.max_map_entries =
                PAGE_SIZE / sizeof(struct dom0_memory_map_entry);
        BUG_ON(HYPERVISOR_dom0_op(&op));

        last = 0x100000000ULL;
        gapstart = 0x10000000;
        gapsize = 0x400000;

        for (i = op.u.physical_memory_map.nr_map_entries - 1; i >= 0; i--) {
                struct resource *res;

                if ((last > map[i].end) && ((last - map[i].end) > gapsize)) {
                        gapsize = last - map[i].end;
                        gapstart = map[i].end;
                        found = 1;
                }
                if (map[i].start < last)
                        last = map[i].start;

                if (map[i].end > 0x100000000ULL)
                        continue;
                res = alloc_bootmem_low(sizeof(struct resource));
                res->name = map[i].is_ram ? "System RAM" : "reserved";
                res->start = map[i].start;
                res->end = map[i].end - 1;
                res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
                request_resource(&iomem_resource, res);
        }

        free_bootmem(__pa(map), PAGE_SIZE);

        if (!found) {
                gapstart = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
                gapstart = (gapstart << PAGE_SHIFT) + 1024*1024;
                printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
                       KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
        }

        /*
         * Start allocating dynamic PCI memory a bit into the gap,
         * aligned up to the nearest megabyte.
         *
         * Question: should we try to pad it up a bit (do something
         * like " + (gapsize >> 3)" in there too?). We now have the
         * technology.
         */
        pci_mem_start = (gapstart + 0xfffff) & ~0xfffff;

        printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
                pci_mem_start, gapstart, gapsize);
}

#endif

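/* Handle the "mem=" boot option: cap the usable memory and record the Xen override. */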
void __init parse_memopt(char *p, char **from)
{
        end_user_pfn = memparse(p, from);
        end_user_pfn >>= PAGE_SHIFT;
        xen_override_max_pfn = (unsigned long) end_user_pfn;
}

/*
 * Search for the biggest gap in the low 32 bits of the e820
 * memory space. We pass this space to PCI to assign MMIO resources
 * for hotplug or unconfigured devices in.
 * Hopefully the BIOS left us enough space.
 */
__init void e820_setup_gap(void)
{
#ifndef CONFIG_XEN
        unsigned long gapstart, gapsize;
        unsigned long last;
        int i;
        int found = 0;

        last = 0x100000000ull;
        gapstart = 0x10000000;
        gapsize = 0x400000;
        i = e820.nr_map;
        while (--i >= 0) {
                unsigned long long start = e820.map[i].addr;
                unsigned long long end = start + e820.map[i].size;

                /*
                 * Since "last" is at most 4GB, we know we'll
                 * fit in 32 bits if this condition is true
                 */
                if (last > end) {
                        unsigned long gap = last - end;

                        if (gap > gapsize) {
                                gapsize = gap;
                                gapstart = end;
                                found = 1;
                        }
                }
                if (start < last)
                        last = start;
        }

        if (!found) {
                gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
                printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n"
                       KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n");
        }

        /*
         * Start allocating dynamic PCI memory a bit into the gap,
         * aligned up to the nearest megabyte.
         *
         * Question: should we try to pad it up a bit (do something
         * like " + (gapsize >> 3)" in there too?). We now have the
         * technology.
         */
        pci_mem_start = (gapstart + 0xfffff) & ~0xfffff;

        printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
                pci_mem_start, gapstart, gapsize);
#endif
}