ia64/xen-unstable

view linux-2.4.29-xen-sparse/mm/page_alloc.c @ 3602:9a9c5a491401

bitkeeper revision 1.1159.235.1 (42000d3dwcPyT8aY4VIPYGCfCAJuQQ)

More x86/64. Status: traps.c now included in the build, but actual building
of IDT doesn't happen, and we need some sort of entry.S. More page-table
building required so that arch_init_memory() can work. And there is something
odd with MP-table parsing; I currently suspect that __init sections are
causing problems.
Signed-off-by: keir.fraser@cl.cam.ac.uk
author kaf24@viper.(none)
date Tue Feb 01 23:14:05 2005 +0000 (2005-02-01)
parents 610068179f96
children 0a4b76b6b5a0
line source
1 /*
2 * linux/mm/page_alloc.c
3 *
4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c
6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 */
15 #include <linux/config.h>
16 #include <linux/mm.h>
17 #include <linux/swap.h>
18 #include <linux/swapctl.h>
19 #include <linux/interrupt.h>
20 #include <linux/pagemap.h>
21 #include <linux/bootmem.h>
22 #include <linux/slab.h>
23 #include <linux/module.h>
25 int nr_swap_pages;
26 int nr_active_pages;
27 int nr_inactive_pages;
28 LIST_HEAD(inactive_list);
29 LIST_HEAD(active_list);
30 pg_data_t *pgdat_list;
32 /*
33 *
34 * The zone_table array is used to look up the address of the
35 * struct zone corresponding to a given zone number (ZONE_DMA,
36 * ZONE_NORMAL, or ZONE_HIGHMEM).
37 */
38 zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
39 EXPORT_SYMBOL(zone_table);
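/*
 * Illustrative note (added for clarity, not part of the original file):
 * a zone is looked up as zone_table[nid * MAX_NR_ZONES + zone_nr], so
 * node 0's normal zone lives at zone_table[ZONE_NORMAL].  The index is
 * stored per page by set_page_zone() in free_area_init_core() below,
 * which is how page_zone(page) finds its way back to the right zone_t.
 */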
41 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
42 static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
43 static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
44 static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
45 static int lower_zone_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
47 int vm_gfp_debug = 0;
49 static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order));
51 static spinlock_t free_pages_ok_no_irq_lock = SPIN_LOCK_UNLOCKED;
52 struct page * free_pages_ok_no_irq_head;
54 static void do_free_pages_ok_no_irq(void * arg)
55 {
56 struct page * page, * __page;
58 spin_lock_irq(&free_pages_ok_no_irq_lock);
60 page = free_pages_ok_no_irq_head;
61 free_pages_ok_no_irq_head = NULL;
63 spin_unlock_irq(&free_pages_ok_no_irq_lock);
65 while (page) {
66 __page = page;
67 page = page->next_hash;
68 __free_pages_ok(__page, __page->index);
69 }
70 }
72 static struct tq_struct free_pages_ok_no_irq_task = {
73 .routine = do_free_pages_ok_no_irq,
74 };
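/*
 * Note added for clarity (not in the original source): the list headed
 * by free_pages_ok_no_irq_head and the task-queue entry above form a
 * deferred-free path.  __free_pages_ok() below must not pull a page off
 * the LRU lists from interrupt context, so such pages are chained
 * through page->next_hash (with the order stashed in page->index) and
 * schedule_task() hands them to do_free_pages_ok_no_irq(), which frees
 * them later from process context.
 */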
77 /*
78 * Temporary debugging check.
79 */
80 #define BAD_RANGE(zone, page) \
81 ( \
82 (((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size)) \
83 || (((page) - mem_map) < (zone)->zone_start_mapnr) \
84 || ((zone) != page_zone(page)) \
85 )
87 /*
88 * Freeing function for a buddy system allocator.
89 * Contrary to prior comments, this is *NOT* hairy, and there
90 * is no reason for anyone not to understand it.
91 *
92 * The concept of a buddy system is to maintain direct-mapped tables
93 * (containing bit values) for memory blocks of various "orders".
94 * The bottom level table contains the map for the smallest allocatable
95 * units of memory (here, pages), and each level above it describes
96 * pairs of units from the levels below, hence, "buddies".
97 * At a high level, all that happens here is marking the table entry
98 * at the bottom level available, and propagating the changes upward
99 * as necessary, plus some accounting needed to play nicely with other
100 * parts of the VM system.
101 * At each level, we keep one bit for each pair of blocks, which
102 * is set to 1 iff only one of the pair is allocated. So when we
103 * are allocating or freeing one, we can derive the state of the
104 * other. That is, if we allocate a small block, and both were
105 * free, the remainder of the region must be split into blocks.
106 * If a block is freed, and its buddy is also free, then this
107 * triggers coalescing into a block of larger size.
108 *
109 * -- wli
110 */
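/*
 * Worked example (added for illustration, not in the original file):
 * freeing a single page at page_idx 12 starts with order 0, so
 * mask == ~0UL and -mask == 1; the buddy is page_idx ^ 1 == 13 and the
 * pair bit is index 12 >> 1 == 6 in the order-0 map.  If
 * __test_and_change_bit() finds that bit set (exactly one of the pair
 * was in use), page 13 is free: it is unlinked, mask becomes ~0UL << 1,
 * page_idx stays rounded down at 12, and the merged order-1 block is
 * retried against its own buddy at 12 ^ 2 == 14, and so on up to
 * MAX_ORDER-1.
 */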
112 static void fastcall __free_pages_ok (struct page *page, unsigned int order)
113 {
114 unsigned long index, page_idx, mask, flags;
115 free_area_t *area;
116 struct page *base;
117 zone_t *zone;
119 if (PageForeign(page))
120 return (PageForeignDestructor(page))(page);
122 /*
123 * Yes, think what happens when other parts of the kernel take
124 * a reference to a page in order to pin it for io. -ben
125 */
126 if (PageLRU(page)) {
127 if (unlikely(in_interrupt())) {
128 unsigned long flags;
130 spin_lock_irqsave(&free_pages_ok_no_irq_lock, flags);
131 page->next_hash = free_pages_ok_no_irq_head;
132 free_pages_ok_no_irq_head = page;
133 page->index = order;
135 spin_unlock_irqrestore(&free_pages_ok_no_irq_lock, flags);
137 schedule_task(&free_pages_ok_no_irq_task);
138 return;
139 }
141 lru_cache_del(page);
142 }
144 if (page->buffers)
145 BUG();
146 if (page->mapping)
147 BUG();
148 if (!VALID_PAGE(page))
149 BUG();
150 if (PageLocked(page))
151 BUG();
152 if (PageActive(page))
153 BUG();
154 ClearPageReferenced(page);
155 ClearPageDirty(page);
157 if (current->flags & PF_FREE_PAGES)
158 goto local_freelist;
159 back_local_freelist:
161 zone = page_zone(page);
163 mask = (~0UL) << order;
164 base = zone->zone_mem_map;
165 page_idx = page - base;
166 if (page_idx & ~mask)
167 BUG();
168 index = page_idx >> (1 + order);
170 area = zone->free_area + order;
172 spin_lock_irqsave(&zone->lock, flags);
174 zone->free_pages -= mask;
176 while (mask + (1 << (MAX_ORDER-1))) {
177 struct page *buddy1, *buddy2;
179 if (area >= zone->free_area + MAX_ORDER)
180 BUG();
181 if (!__test_and_change_bit(index, area->map))
182 /*
183 * the buddy page is still allocated.
184 */
185 break;
186 /*
187 * Move the buddy up one level.
188 * This code is taking advantage of the identity:
189 * -mask = 1+~mask
190 */
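/* (Illustration added for clarity: with order == 2, mask is
 * ...11111100b, so -mask == 1 + ~mask == 100b == 1 << order, and
 * page_idx ^ -mask flips exactly the bit that separates the two
 * order-2 buddies.) */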
191 buddy1 = base + (page_idx ^ -mask);
192 buddy2 = base + page_idx;
193 if (BAD_RANGE(zone,buddy1))
194 BUG();
195 if (BAD_RANGE(zone,buddy2))
196 BUG();
198 list_del(&buddy1->list);
199 mask <<= 1;
200 area++;
201 index >>= 1;
202 page_idx &= mask;
203 }
204 list_add(&(base + page_idx)->list, &area->free_list);
206 spin_unlock_irqrestore(&zone->lock, flags);
207 return;
209 local_freelist:
210 if (current->nr_local_pages)
211 goto back_local_freelist;
212 if (in_interrupt())
213 goto back_local_freelist;
215 list_add(&page->list, &current->local_pages);
216 page->index = order;
217 current->nr_local_pages++;
218 }
220 #define MARK_USED(index, order, area) \
221 __change_bit((index) >> (1+(order)), (area)->map)
223 static inline struct page * expand (zone_t *zone, struct page *page,
224 unsigned long index, int low, int high, free_area_t * area)
225 {
226 unsigned long size = 1 << high;
228 while (high > low) {
229 if (BAD_RANGE(zone,page))
230 BUG();
231 area--;
232 high--;
233 size >>= 1;
234 list_add(&(page)->list, &(area)->free_list);
235 MARK_USED(index, high, area);
236 index += size;
237 page += size;
238 }
239 if (BAD_RANGE(zone,page))
240 BUG();
241 return page;
242 }
244 static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
245 static struct page * fastcall rmqueue(zone_t *zone, unsigned int order)
246 {
247 free_area_t * area = zone->free_area + order;
248 unsigned int curr_order = order;
249 struct list_head *head, *curr;
250 unsigned long flags;
251 struct page *page;
253 spin_lock_irqsave(&zone->lock, flags);
254 do {
255 head = &area->free_list;
256 curr = head->next;
258 if (curr != head) {
259 unsigned int index;
261 page = list_entry(curr, struct page, list);
262 if (BAD_RANGE(zone,page))
263 BUG();
264 list_del(curr);
265 index = page - zone->zone_mem_map;
266 if (curr_order != MAX_ORDER-1)
267 MARK_USED(index, curr_order, area);
268 zone->free_pages -= 1UL << order;
270 page = expand(zone, page, index, order, curr_order, area);
271 spin_unlock_irqrestore(&zone->lock, flags);
273 set_page_count(page, 1);
274 if (BAD_RANGE(zone,page))
275 BUG();
276 if (PageLRU(page))
277 BUG();
278 if (PageActive(page))
279 BUG();
280 return page;
281 }
282 curr_order++;
283 area++;
284 } while (curr_order < MAX_ORDER);
285 spin_unlock_irqrestore(&zone->lock, flags);
287 return NULL;
288 }
290 #ifndef CONFIG_DISCONTIGMEM
291 struct page * fastcall _alloc_pages(unsigned int gfp_mask, unsigned int order)
292 {
293 return __alloc_pages(gfp_mask, order,
294 contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
295 }
296 #endif
298 static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
299 static struct page * fastcall balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
300 {
301 struct page * page = NULL;
302 int __freed;
304 if (in_interrupt())
305 BUG();
307 current->allocation_order = order;
308 current->flags |= PF_MEMALLOC | PF_FREE_PAGES;
310 __freed = try_to_free_pages_zone(classzone, gfp_mask);
312 current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);
314 if (current->nr_local_pages) {
315 struct list_head * entry, * local_pages;
316 struct page * tmp;
317 int nr_pages;
319 local_pages = &current->local_pages;
321 if (likely(__freed)) {
322 /* pick from the last inserted so we're lifo */
323 entry = local_pages->next;
324 do {
325 tmp = list_entry(entry, struct page, list);
326 if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
327 list_del(entry);
328 current->nr_local_pages--;
329 set_page_count(tmp, 1);
330 page = tmp;
332 if (page->buffers)
333 BUG();
334 if (page->mapping)
335 BUG();
336 if (!VALID_PAGE(page))
337 BUG();
338 if (PageLocked(page))
339 BUG();
340 if (PageLRU(page))
341 BUG();
342 if (PageActive(page))
343 BUG();
344 if (PageDirty(page))
345 BUG();
347 break;
348 }
349 } while ((entry = entry->next) != local_pages);
350 }
352 nr_pages = current->nr_local_pages;
353 /* free in reverse order so that the global order will be lifo */
354 while ((entry = local_pages->prev) != local_pages) {
355 list_del(entry);
356 tmp = list_entry(entry, struct page, list);
357 __free_pages_ok(tmp, tmp->index);
358 if (!nr_pages--)
359 BUG();
360 }
361 current->nr_local_pages = 0;
362 }
364 *freed = __freed;
365 return page;
366 }
368 static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order)
369 {
370 long free = zone->free_pages - (1UL << order);
371 return free >= 0 ? free : 0;
372 }
374 /*
375 * This is the 'heart' of the zoned buddy allocator:
376 */
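/*
 * Overview (comment added for clarity, not part of the original file):
 * the fast path walks the fallback zonelist against each zone's "low"
 * watermark, then wakes kswapd and retries against "min" (quartered
 * for !__GFP_WAIT callers).  PF_MEMALLOC tasks may then take pages
 * regardless of watermarks.  Atomic allocations give up at that point;
 * everyone else calls balance_classzone() to reclaim memory and loops
 * back to "rebalance" while progress is being made, with one last pass
 * against the "high" watermark in case another task was OOM-killed and
 * released memory in the meantime.
 */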
377 struct page * fastcall __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
378 {
379 zone_t **zone, * classzone;
380 struct page * page;
381 int freed, class_idx;
383 zone = zonelist->zones;
384 classzone = *zone;
385 class_idx = zone_idx(classzone);
387 for (;;) {
388 zone_t *z = *(zone++);
389 if (!z)
390 break;
392 if (zone_free_pages(z, order) > z->watermarks[class_idx].low) {
393 page = rmqueue(z, order);
394 if (page)
395 return page;
396 }
397 }
399 classzone->need_balance = 1;
400 mb();
401 if (waitqueue_active(&kswapd_wait))
402 wake_up_interruptible(&kswapd_wait);
404 zone = zonelist->zones;
405 for (;;) {
406 unsigned long min;
407 zone_t *z = *(zone++);
408 if (!z)
409 break;
411 min = z->watermarks[class_idx].min;
412 if (!(gfp_mask & __GFP_WAIT))
413 min >>= 2;
414 if (zone_free_pages(z, order) > min) {
415 page = rmqueue(z, order);
416 if (page)
417 return page;
418 }
419 }
421 /* here we're in the low-on-memory slow path */
423 if ((current->flags & PF_MEMALLOC) &&
424 (!in_interrupt() || (current->flags & PF_MEMDIE))) {
425 zone = zonelist->zones;
426 for (;;) {
427 zone_t *z = *(zone++);
428 if (!z)
429 break;
431 page = rmqueue(z, order);
432 if (page)
433 return page;
434 }
435 return NULL;
436 }
438 /* Atomic allocations - we can't balance anything */
439 if (!(gfp_mask & __GFP_WAIT))
440 goto out;
442 rebalance:
443 page = balance_classzone(classzone, gfp_mask, order, &freed);
444 if (page)
445 return page;
447 zone = zonelist->zones;
448 if (likely(freed)) {
449 for (;;) {
450 zone_t *z = *(zone++);
451 if (!z)
452 break;
454 if (zone_free_pages(z, order) > z->watermarks[class_idx].min) {
455 page = rmqueue(z, order);
456 if (page)
457 return page;
458 }
459 }
460 goto rebalance;
461 } else {
462 /*
463 * Check whether another task has been killed in the meantime;
464 * in that case we can succeed the allocation.
465 */
466 for (;;) {
467 zone_t *z = *(zone++);
468 if (!z)
469 break;
471 if (zone_free_pages(z, order) > z->watermarks[class_idx].high) {
472 page = rmqueue(z, order);
473 if (page)
474 return page;
475 }
476 }
477 }
479 out:
480 printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n",
481 order, gfp_mask, !!(current->flags & PF_MEMALLOC));
482 if (unlikely(vm_gfp_debug))
483 dump_stack();
484 return NULL;
485 }
487 /*
488 * Common helper functions.
489 */
490 fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
491 {
492 struct page * page;
494 page = alloc_pages(gfp_mask, order);
495 if (!page)
496 return 0;
497 return (unsigned long) page_address(page);
498 }
500 fastcall unsigned long get_zeroed_page(unsigned int gfp_mask)
501 {
502 struct page * page;
504 page = alloc_pages(gfp_mask, 0);
505 if (page) {
506 void *address = page_address(page);
507 clear_page(address);
508 return (unsigned long) address;
509 }
510 return 0;
511 }
513 fastcall void __free_pages(struct page *page, unsigned int order)
514 {
515 if (!PageReserved(page) && put_page_testzero(page))
516 __free_pages_ok(page, order);
517 }
519 fastcall void free_pages(unsigned long addr, unsigned int order)
520 {
521 if (addr != 0)
522 __free_pages(virt_to_page(addr), order);
523 }
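/*
 * Usage sketch (added for illustration only; not part of page_alloc.c).
 * It shows how the helpers above are typically paired in 2.4-era code;
 * the function name and the orders chosen are made up for the example.
 */
static int example_buffer_user(void)
{
	unsigned long buf;
	struct page *page;

	/* Four contiguous pages, already mapped at a kernel virtual address. */
	buf = __get_free_pages(GFP_KERNEL, 2);
	if (!buf)
		return -ENOMEM;

	/* A single page managed as a struct page. */
	page = alloc_pages(GFP_KERNEL, 0);
	if (!page) {
		free_pages(buf, 2);
		return -ENOMEM;
	}
	clear_page(page_address(page));

	/* ... use the memory ... */

	__free_pages(page, 0);	/* drops the reference set by rmqueue() */
	free_pages(buf, 2);	/* the order must match the allocation */
	return 0;
}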
525 /*
526 * Total amount of free (allocatable) RAM:
527 */
528 unsigned int nr_free_pages (void)
529 {
530 unsigned int sum = 0;
531 zone_t *zone;
533 for_each_zone(zone)
534 sum += zone->free_pages;
536 return sum;
537 }
539 /*
540 * Amount of free RAM allocatable as buffer memory:
541 */
542 unsigned int nr_free_buffer_pages (void)
543 {
544 pg_data_t *pgdat;
545 unsigned int sum = 0;
546 zonelist_t *zonelist;
547 zone_t **zonep, *zone;
549 for_each_pgdat(pgdat) {
550 int class_idx;
551 zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK);
552 zonep = zonelist->zones;
553 zone = *zonep;
554 class_idx = zone_idx(zone);
556 sum += zone->nr_cache_pages;
557 for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
558 int free = zone->free_pages - zone->watermarks[class_idx].high;
559 if (free <= 0)
560 continue;
561 sum += free;
562 }
563 }
565 return sum;
566 }
568 #if CONFIG_HIGHMEM
569 unsigned int nr_free_highpages (void)
570 {
571 pg_data_t *pgdat;
572 unsigned int pages = 0;
574 for_each_pgdat(pgdat)
575 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
577 return pages;
578 }
580 unsigned int freeable_lowmem(void)
581 {
582 unsigned int pages = 0;
583 pg_data_t *pgdat;
585 for_each_pgdat(pgdat) {
586 pages += pgdat->node_zones[ZONE_DMA].free_pages;
587 pages += pgdat->node_zones[ZONE_DMA].nr_active_pages;
588 pages += pgdat->node_zones[ZONE_DMA].nr_inactive_pages;
589 pages += pgdat->node_zones[ZONE_NORMAL].free_pages;
590 pages += pgdat->node_zones[ZONE_NORMAL].nr_active_pages;
591 pages += pgdat->node_zones[ZONE_NORMAL].nr_inactive_pages;
592 }
594 return pages;
595 }
596 #endif
598 #define K(x) ((x) << (PAGE_SHIFT-10))
600 /*
601 * Show free area list (used inside shift_scroll-lock stuff)
602 * We also calculate the percentage fragmentation. We do this by counting the
603 * memory on each free list with the exception of the first item on the list.
604 */
605 void show_free_areas_core(pg_data_t *pgdat)
606 {
607 unsigned int order;
608 unsigned type;
609 pg_data_t *tmpdat = pgdat;
611 printk("Free pages: %6dkB (%6dkB HighMem)\n",
612 K(nr_free_pages()),
613 K(nr_free_highpages()));
615 while (tmpdat) {
616 zone_t *zone;
617 for (zone = tmpdat->node_zones;
618 zone < tmpdat->node_zones + MAX_NR_ZONES; zone++)
619 printk("Zone:%s freepages:%6lukB\n",
620 zone->name,
621 K(zone->free_pages));
623 tmpdat = tmpdat->node_next;
624 }
626 printk("( Active: %d, inactive: %d, free: %d )\n",
627 nr_active_pages,
628 nr_inactive_pages,
629 nr_free_pages());
631 for (type = 0; type < MAX_NR_ZONES; type++) {
632 struct list_head *head, *curr;
633 zone_t *zone = pgdat->node_zones + type;
634 unsigned long nr, total, flags;
636 total = 0;
637 if (zone->size) {
638 spin_lock_irqsave(&zone->lock, flags);
639 for (order = 0; order < MAX_ORDER; order++) {
640 head = &(zone->free_area + order)->free_list;
641 curr = head;
642 nr = 0;
643 for (;;) {
644 if ((curr = curr->next) == head)
645 break;
646 nr++;
647 }
648 total += nr * (1 << order);
649 printk("%lu*%lukB ", nr, K(1UL) << order);
650 }
651 spin_unlock_irqrestore(&zone->lock, flags);
652 }
653 printk("= %lukB)\n", K(total));
654 }
656 #ifdef SWAP_CACHE_INFO
657 show_swap_cache_info();
658 #endif
659 }
661 void show_free_areas(void)
662 {
663 show_free_areas_core(pgdat_list);
664 }
666 /*
667 * Builds allocation fallback zone lists.
668 */
669 static inline void build_zonelists(pg_data_t *pgdat)
670 {
671 int i, j, k;
673 for (i = 0; i <= GFP_ZONEMASK; i++) {
674 zonelist_t *zonelist;
675 zone_t *zone;
677 zonelist = pgdat->node_zonelists + i;
678 memset(zonelist, 0, sizeof(*zonelist));
680 j = 0;
681 k = ZONE_NORMAL;
682 if (i & __GFP_HIGHMEM)
683 k = ZONE_HIGHMEM;
684 if (i & __GFP_DMA)
685 k = ZONE_DMA;
687 switch (k) {
688 default:
689 BUG();
690 /*
691 * fallthrough:
692 */
693 case ZONE_HIGHMEM:
694 zone = pgdat->node_zones + ZONE_HIGHMEM;
695 if (zone->size) {
696 #ifndef CONFIG_HIGHMEM
697 BUG();
698 #endif
699 zonelist->zones[j++] = zone;
700 }
701 case ZONE_NORMAL:
702 zone = pgdat->node_zones + ZONE_NORMAL;
703 if (zone->size)
704 zonelist->zones[j++] = zone;
705 case ZONE_DMA:
706 zone = pgdat->node_zones + ZONE_DMA;
707 if (zone->size)
708 zonelist->zones[j++] = zone;
709 }
710 zonelist->zones[j++] = NULL;
711 }
712 }
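/*
 * Resulting fallback orders (illustrative comment, not in the original
 * file), assuming all three zones are populated:
 *   __GFP_DMA set:              DMA only
 *   default (e.g. GFP_KERNEL):  Normal -> DMA
 *   __GFP_HIGHMEM set:          HighMem -> Normal -> DMA
 * Empty zones are skipped and each list is NULL-terminated, which is
 * what the zonelist walks in __alloc_pages() rely on.
 */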
714 /*
715 * Helper functions to size the waitqueue hash table.
716 * Essentially these want to choose hash table sizes sufficiently
717 * large so that collisions trying to wait on pages are rare.
718 * But in fact, the number of active page waitqueues on typical
719 * systems is ridiculously low, less than 200. So this is even
720 * conservative, even though it seems large.
721 *
722 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
723 * waitqueues, i.e. the size of the waitq table given the number of pages.
724 */
725 #define PAGES_PER_WAITQUEUE 256
727 static inline unsigned long wait_table_size(unsigned long pages)
728 {
729 unsigned long size = 1;
731 pages /= PAGES_PER_WAITQUEUE;
733 while (size < pages)
734 size <<= 1;
736 /*
737 * Once we have dozens or even hundreds of threads sleeping
738 * on IO we've got bigger problems than wait queue collision.
739 * Limit the size of the wait table to a reasonable size.
740 */
741 size = min(size, 4096UL);
743 return size;
744 }
746 /*
747 * This is an integer logarithm so that shifts can be used later
748 * to extract the more random high bits from the multiplicative
749 * hash function before the remainder is taken.
750 */
751 static inline unsigned long wait_table_bits(unsigned long size)
752 {
753 return ffz(~size);
754 }
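/*
 * Worked example (added for illustration): a 512MB zone with 4KB pages
 * holds 131072 pages; 131072 / PAGES_PER_WAITQUEUE == 512, so
 * wait_table_size() returns 512 (already a power of two and below the
 * 4096 cap).  wait_table_bits(512) == ffz(~512) == 9, and
 * free_area_init_core() then sets wait_table_shift to BITS_PER_LONG - 9
 * so that the top bits of the multiplicative page hash select one of
 * the 512 waitqueue heads.
 */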
756 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
758 /*
759 * Set up the zone data structures:
760 * - mark all pages reserved
761 * - mark all memory queues empty
762 * - clear the memory bitmaps
763 */
764 void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
765 unsigned long *zones_size, unsigned long zone_start_paddr,
766 unsigned long *zholes_size, struct page *lmem_map)
767 {
768 unsigned long i, j;
769 unsigned long map_size;
770 unsigned long totalpages, offset, realtotalpages;
771 const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
773 if (zone_start_paddr & ~PAGE_MASK)
774 BUG();
776 totalpages = 0;
777 for (i = 0; i < MAX_NR_ZONES; i++) {
778 unsigned long size = zones_size[i];
779 totalpages += size;
780 }
781 realtotalpages = totalpages;
782 if (zholes_size)
783 for (i = 0; i < MAX_NR_ZONES; i++)
784 realtotalpages -= zholes_size[i];
786 printk("On node %d totalpages: %lu\n", nid, realtotalpages);
788 /*
789 * Some architectures (with lots of mem and discontiguous memory
790 * maps) have to search for a good mem_map area:
791 * For discontigmem, the conceptual mem map array starts from
792 * PAGE_OFFSET, we need to align the actual array onto a mem map
793 * boundary, so that MAP_NR works.
794 */
795 map_size = (totalpages + 1)*sizeof(struct page);
796 if (lmem_map == (struct page *)0) {
797 lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
798 lmem_map = (struct page *)(PAGE_OFFSET +
799 MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
800 }
801 *gmap = pgdat->node_mem_map = lmem_map;
802 pgdat->node_size = totalpages;
803 pgdat->node_start_paddr = zone_start_paddr;
804 pgdat->node_start_mapnr = (lmem_map - mem_map);
805 pgdat->nr_zones = 0;
807 offset = lmem_map - mem_map;
808 for (j = 0; j < MAX_NR_ZONES; j++) {
809 zone_t *zone = pgdat->node_zones + j;
810 unsigned long mask;
811 unsigned long size, realsize;
812 int idx;
814 zone_table[nid * MAX_NR_ZONES + j] = zone;
815 realsize = size = zones_size[j];
816 if (zholes_size)
817 realsize -= zholes_size[j];
819 printk("zone(%lu): %lu pages.\n", j, size);
820 zone->size = size;
821 zone->realsize = realsize;
822 zone->name = zone_names[j];
823 zone->lock = SPIN_LOCK_UNLOCKED;
824 zone->zone_pgdat = pgdat;
825 zone->free_pages = 0;
826 zone->need_balance = 0;
827 zone->nr_active_pages = zone->nr_inactive_pages = 0;
830 if (!size)
831 continue;
833 /*
834 * The per-page waitqueue mechanism uses hashed waitqueues
835 * per zone.
836 */
837 zone->wait_table_size = wait_table_size(size);
838 zone->wait_table_shift =
839 BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
840 zone->wait_table = (wait_queue_head_t *)
841 alloc_bootmem_node(pgdat, zone->wait_table_size
842 * sizeof(wait_queue_head_t));
844 for(i = 0; i < zone->wait_table_size; ++i)
845 init_waitqueue_head(zone->wait_table + i);
847 pgdat->nr_zones = j+1;
849 mask = (realsize / zone_balance_ratio[j]);
850 if (mask < zone_balance_min[j])
851 mask = zone_balance_min[j];
852 else if (mask > zone_balance_max[j])
853 mask = zone_balance_max[j];
854 zone->watermarks[j].min = mask;
855 zone->watermarks[j].low = mask*2;
856 zone->watermarks[j].high = mask*3;
857 /* now set the watermarks of the lower zones in the "j" classzone */
858 for (idx = j-1; idx >= 0; idx--) {
859 zone_t * lower_zone = pgdat->node_zones + idx;
860 unsigned long lower_zone_reserve;
861 if (!lower_zone->size)
862 continue;
864 mask = lower_zone->watermarks[idx].min;
865 lower_zone->watermarks[j].min = mask;
866 lower_zone->watermarks[j].low = mask*2;
867 lower_zone->watermarks[j].high = mask*3;
869 /* now the trickier part: factor in the lower-zone reserve */
870 lower_zone_reserve = realsize / lower_zone_reserve_ratio[idx];
871 lower_zone->watermarks[j].min += lower_zone_reserve;
872 lower_zone->watermarks[j].low += lower_zone_reserve;
873 lower_zone->watermarks[j].high += lower_zone_reserve;
875 realsize += lower_zone->realsize;
876 }
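/*
 * Worked example (added for illustration): a 128MB ZONE_NORMAL has
 * realsize == 32768 pages; with the default zone_balance_ratio of 128
 * the mask is 256, clipped to zone_balance_max == 255, giving this
 * zone min/low/high watermarks of 255/510/765 pages.  The loop above
 * then raises the DMA zone's watermarks for the Normal classzone by
 * 32768 / lower_zone_reserve_ratio[ZONE_DMA] == 32768 / 256 == 128
 * pages, keeping some low memory in reserve for DMA allocations.
 */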
878 zone->zone_mem_map = mem_map + offset;
879 zone->zone_start_mapnr = offset;
880 zone->zone_start_paddr = zone_start_paddr;
882 if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1))
883 printk("BUG: wrong zone alignment, it will crash\n");
885 /*
886 * Initially all pages are reserved - free ones are freed
887 * up by free_all_bootmem() once the early boot process is
888 * done. Non-atomic initialization, single-pass.
889 */
890 for (i = 0; i < size; i++) {
891 struct page *page = mem_map + offset + i;
892 set_page_zone(page, nid * MAX_NR_ZONES + j);
893 set_page_count(page, 0);
894 SetPageReserved(page);
895 INIT_LIST_HEAD(&page->list);
896 if (j != ZONE_HIGHMEM)
897 set_page_address(page, __va(zone_start_paddr));
898 zone_start_paddr += PAGE_SIZE;
899 }
901 offset += size;
902 for (i = 0; ; i++) {
903 unsigned long bitmap_size;
905 INIT_LIST_HEAD(&zone->free_area[i].free_list);
906 if (i == MAX_ORDER-1) {
907 zone->free_area[i].map = NULL;
908 break;
909 }
911 /*
912 * Page buddy system uses "index >> (i+1)",
913 * where "index" is at most "size-1".
914 *
915 * The extra "+3" is to round down to byte
916 * size (8 bits per byte assumption). Thus
917 * we get "(size-1) >> (i+4)" as the last byte
918 * we can access.
919 *
920 * The "+1" is because we want to round the
921 * byte allocation up rather than down. So
922 * we should have had a "+7" before we shifted
923 * down by three. Also, we have to add one as
924 * we actually _use_ the last bit (it's [0,n]
925 * inclusive, not [0,n[).
926 *
927 * So we actually had +7+1 before we shift
928 * down by 3. But (n+8) >> 3 == (n >> 3) + 1
929 * (modulo overflows, which we do not have).
930 *
931 * Finally, we LONG_ALIGN because all bitmap
932 * operations are on longs.
933 */
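/* Worked numbers (added for illustration): for a zone of size == 4096
 * pages at order i == 0, (size-1) >> (i+4) == 255, plus one and
 * LONG_ALIGNed gives a 256-byte map == 2048 bits, one per pair of
 * single pages.  At i == 1 the formula yields 128 bytes for the 1024
 * pairs of order-1 blocks, and so on; the top order gets no map at
 * all (see above). */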
934 bitmap_size = (size-1) >> (i+4);
935 bitmap_size = LONG_ALIGN(bitmap_size+1);
936 zone->free_area[i].map =
937 (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
938 }
939 }
940 build_zonelists(pgdat);
941 }
943 void __init free_area_init(unsigned long *zones_size)
944 {
945 free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
946 }
948 static int __init setup_mem_frac(char *str)
949 {
950 int j = 0;
952 while (get_option(&str, &zone_balance_ratio[j++]) == 2);
953 printk("setup_mem_frac: ");
954 for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]);
955 printk("\n");
956 return 1;
957 }
959 __setup("memfrac=", setup_mem_frac);
961 static int __init setup_lower_zone_reserve(char *str)
962 {
963 int j = 0;
965 while (get_option(&str, &lower_zone_reserve_ratio[j++]) == 2);
966 printk("setup_lower_zone_reserve: ");
967 for (j = 0; j < MAX_NR_ZONES-1; j++) printk("%d ", lower_zone_reserve_ratio[j]);
968 printk("\n");
969 return 1;
970 }
972 __setup("lower_zone_reserve=", setup_lower_zone_reserve);