ia64/xen-unstable

view linux-2.4.26-xen-sparse/mm/page_alloc.c @ 1527:a815a43920c0

bitkeeper revision 1.994 (40d6ed9ePUmxTwjKFv1vprN2-xFpmQ)

Install kernel modules with 'make install'
author iap10@labyrinth.cl.cam.ac.uk
date Mon Jun 21 14:15:58 2004 +0000 (2004-06-21)
parents f3123052268f
children cbbe40349d37 0f47aec8946e b1347b2eb538
line source
1 /*
2 * linux/mm/page_alloc.c
3 *
4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c
6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 */
15 #include <linux/config.h>
16 #include <linux/mm.h>
17 #include <linux/swap.h>
18 #include <linux/swapctl.h>
19 #include <linux/interrupt.h>
20 #include <linux/pagemap.h>
21 #include <linux/bootmem.h>
22 #include <linux/slab.h>
23 #include <linux/module.h>
25 int nr_swap_pages;
26 int nr_active_pages;
27 int nr_inactive_pages;
28 LIST_HEAD(inactive_list);
29 LIST_HEAD(active_list);
30 pg_data_t *pgdat_list;
32 /*
33 *
34 * The zone_table array is used to look up the address of the
35 * struct zone corresponding to a given zone number (ZONE_DMA,
36 * ZONE_NORMAL, or ZONE_HIGHMEM).
37 */
38 zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
39 EXPORT_SYMBOL(zone_table);
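/*
 * Lookup sketch: free_area_init_core() below fills the table with
 * "zone_table[nid * MAX_NR_ZONES + j] = zone", so the zone for a given
 * node/zone pair is e.g. zone_table[nid * MAX_NR_ZONES + ZONE_NORMAL];
 * page_zone(page) recovers the same pointer from the index recorded by
 * set_page_zone() when the page was initialised.
 */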
41 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
42 static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
43 static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
44 static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
45 static int lower_zone_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
47 int vm_gfp_debug = 0;
49 /*
50 * Temporary debugging check.
51 */
52 #define BAD_RANGE(zone, page) \
53 ( \
54 (((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size)) \
55 || (((page) - mem_map) < (zone)->zone_start_mapnr) \
56 || ((zone) != page_zone(page)) \
57 )
59 /*
60 * Freeing function for a buddy system allocator.
61 * Contrary to prior comments, this is *NOT* hairy, and there
62 * is no reason for anyone not to understand it.
63 *
64 * The concept of a buddy system is to maintain direct-mapped tables
65 * (containing bit values) for memory blocks of various "orders".
66 * The bottom level table contains the map for the smallest allocatable
67 * units of memory (here, pages), and each level above it describes
68 * pairs of units from the levels below, hence, "buddies".
69 * At a high level, all that happens here is marking the table entry
70 * at the bottom level available, and propagating the changes upward
71 * as necessary, plus some accounting needed to play nicely with other
72 * parts of the VM system.
73 * At each level, we keep one bit for each pair of blocks, which
74 * is set to 1 iff only one of the pair is allocated. So when we
75 * are allocating or freeing one, we can derive the state of the
76 * other. That is, if we allocate a small block, and both were
77 * free, the remainder of the region must be split into blocks.
78 * If a block is freed, and its buddy is also free, then this
79 * triggers coalescing into a block of larger size.
80 *
81 * -- wli
82 */
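/*
 * Worked example (order-0 free): freeing the page at page_idx 12 gives
 * index = 12 >> 1 = 6 in the order-0 map.  Toggling bit 6 returns its old
 * value; if it was 1 the buddy (page_idx 12 ^ 1 = 13) is already free, so
 * it is unlinked from the order-0 list and the merged 2-page block
 * starting at 12 & ~1 = 12 is considered at order 1, and so on upwards.
 */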
84 static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order));
85 static void __free_pages_ok (struct page *page, unsigned int order)
86 {
87 unsigned long index, page_idx, mask, flags;
88 free_area_t *area;
89 struct page *base;
90 zone_t *zone;
92 /*
93 * Yes, think what happens when other parts of the kernel take
94 * a reference to a page in order to pin it for io. -ben
95 */
96 if (PageLRU(page)) {
97 if (unlikely(in_interrupt()))
98 BUG();
99 lru_cache_del(page);
100 }
102 if (page->buffers)
103 BUG();
104 if (page->mapping)
105 return (*(void(*)(struct page *))page->mapping)(page);
106 if (!VALID_PAGE(page))
107 BUG();
108 if (PageLocked(page))
109 BUG();
110 if (PageActive(page))
111 BUG();
112 ClearPageReferenced(page);
113 ClearPageDirty(page);
115 if (current->flags & PF_FREE_PAGES)
116 goto local_freelist;
117 back_local_freelist:
119 zone = page_zone(page);
121 mask = (~0UL) << order;
122 base = zone->zone_mem_map;
123 page_idx = page - base;
124 if (page_idx & ~mask)
125 BUG();
126 index = page_idx >> (1 + order);
128 area = zone->free_area + order;
130 spin_lock_irqsave(&zone->lock, flags);
132 zone->free_pages -= mask;
134 while (mask + (1 << (MAX_ORDER-1))) {
135 struct page *buddy1, *buddy2;
137 if (area >= zone->free_area + MAX_ORDER)
138 BUG();
139 if (!__test_and_change_bit(index, area->map))
140 /*
141 * the buddy page is still allocated.
142 */
143 break;
144 /*
145 * Move the buddy up one level.
146 * This code is taking advantage of the identity:
147 * -mask = 1+~mask
148 */
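/*
 * At this level mask == ~0UL << order, hence -mask == 1UL << order, so
 * page_idx ^ -mask simply flips the order'th bit of page_idx and yields
 * the index of the buddy block.
 */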
149 buddy1 = base + (page_idx ^ -mask);
150 buddy2 = base + page_idx;
151 if (BAD_RANGE(zone,buddy1))
152 BUG();
153 if (BAD_RANGE(zone,buddy2))
154 BUG();
156 list_del(&buddy1->list);
157 mask <<= 1;
158 area++;
159 index >>= 1;
160 page_idx &= mask;
161 }
162 list_add(&(base + page_idx)->list, &area->free_list);
164 spin_unlock_irqrestore(&zone->lock, flags);
165 return;
167 local_freelist:
168 if (current->nr_local_pages)
169 goto back_local_freelist;
170 if (in_interrupt())
171 goto back_local_freelist;
173 list_add(&page->list, &current->local_pages);
174 page->index = order;
175 current->nr_local_pages++;
176 }
178 #define MARK_USED(index, order, area) \
179 __change_bit((index) >> (1+(order)), (area)->map)
181 static inline struct page * expand (zone_t *zone, struct page *page,
182 unsigned long index, int low, int high, free_area_t * area)
183 {
184 unsigned long size = 1 << high;
186 while (high > low) {
187 if (BAD_RANGE(zone,page))
188 BUG();
189 area--;
190 high--;
191 size >>= 1;
192 list_add(&(page)->list, &(area)->free_list);
193 MARK_USED(index, high, area);
194 index += size;
195 page += size;
196 }
197 if (BAD_RANGE(zone,page))
198 BUG();
199 return page;
200 }
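/*
 * Worked example for expand(): an order-0 request satisfied from an
 * order-2 block covering pages {8,9,10,11} puts {8,9} back on the
 * order-1 free list, {10} on the order-0 free list, and returns page 11.
 */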
202 static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
203 static struct page * rmqueue(zone_t *zone, unsigned int order)
204 {
205 free_area_t * area = zone->free_area + order;
206 unsigned int curr_order = order;
207 struct list_head *head, *curr;
208 unsigned long flags;
209 struct page *page;
211 spin_lock_irqsave(&zone->lock, flags);
212 do {
213 head = &area->free_list;
214 curr = head->next;
216 if (curr != head) {
217 unsigned int index;
219 page = list_entry(curr, struct page, list);
220 if (BAD_RANGE(zone,page))
221 BUG();
222 list_del(curr);
223 index = page - zone->zone_mem_map;
224 if (curr_order != MAX_ORDER-1)
225 MARK_USED(index, curr_order, area);
226 zone->free_pages -= 1UL << order;
228 page = expand(zone, page, index, order, curr_order, area);
229 spin_unlock_irqrestore(&zone->lock, flags);
231 set_page_count(page, 1);
232 if (BAD_RANGE(zone,page))
233 BUG();
234 if (PageLRU(page))
235 BUG();
236 if (PageActive(page))
237 BUG();
238 return page;
239 }
240 curr_order++;
241 area++;
242 } while (curr_order < MAX_ORDER);
243 spin_unlock_irqrestore(&zone->lock, flags);
245 return NULL;
246 }
248 #ifndef CONFIG_DISCONTIGMEM
249 struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order)
250 {
251 return __alloc_pages(gfp_mask, order,
252 contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
253 }
254 #endif
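/*
 * Zonelist selection sketch: gfp_mask & GFP_ZONEMASK keeps only the zone
 * bits of the mask, so build_zonelists() below gives __GFP_DMA callers the
 * DMA-only fallback list, __GFP_HIGHMEM callers the HighMem-first list,
 * and everything else (e.g. GFP_KERNEL) the default list rooted at
 * ZONE_NORMAL.
 */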
256 static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
257 static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
258 {
259 struct page * page = NULL;
260 int __freed;
262 if (in_interrupt())
263 BUG();
265 current->allocation_order = order;
266 current->flags |= PF_MEMALLOC | PF_FREE_PAGES;
268 __freed = try_to_free_pages_zone(classzone, gfp_mask);
270 current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);
272 if (current->nr_local_pages) {
273 struct list_head * entry, * local_pages;
274 struct page * tmp;
275 int nr_pages;
277 local_pages = &current->local_pages;
279 if (likely(__freed)) {
280 /* pick from the last inserted so we're lifo */
281 entry = local_pages->next;
282 do {
283 tmp = list_entry(entry, struct page, list);
284 if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
285 list_del(entry);
286 current->nr_local_pages--;
287 set_page_count(tmp, 1);
288 page = tmp;
290 if (page->buffers)
291 BUG();
292 if (page->mapping)
293 BUG();
294 if (!VALID_PAGE(page))
295 BUG();
296 if (PageLocked(page))
297 BUG();
298 if (PageLRU(page))
299 BUG();
300 if (PageActive(page))
301 BUG();
302 if (PageDirty(page))
303 BUG();
305 break;
306 }
307 } while ((entry = entry->next) != local_pages);
308 }
310 nr_pages = current->nr_local_pages;
311 /* free in reverse order so that the global order will be lifo */
312 while ((entry = local_pages->prev) != local_pages) {
313 list_del(entry);
314 tmp = list_entry(entry, struct page, list);
315 __free_pages_ok(tmp, tmp->index);
316 if (!nr_pages--)
317 BUG();
318 }
319 current->nr_local_pages = 0;
320 }
322 *freed = __freed;
323 return page;
324 }
326 static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order)
327 {
328 long free = zone->free_pages - (1UL << order);
329 return free >= 0 ? free : 0;
330 }
332 /*
333 * This is the 'heart' of the zoned buddy allocator:
334 */
335 struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
336 {
337 zone_t **zone, * classzone;
338 struct page * page;
339 int freed, class_idx;
341 zone = zonelist->zones;
342 classzone = *zone;
343 class_idx = zone_idx(classzone);
345 for (;;) {
346 zone_t *z = *(zone++);
347 if (!z)
348 break;
350 if (zone_free_pages(z, order) > z->watermarks[class_idx].low) {
351 page = rmqueue(z, order);
352 if (page)
353 return page;
354 }
355 }
357 classzone->need_balance = 1;
358 mb();
359 if (waitqueue_active(&kswapd_wait))
360 wake_up_interruptible(&kswapd_wait);
362 zone = zonelist->zones;
363 for (;;) {
364 unsigned long min;
365 zone_t *z = *(zone++);
366 if (!z)
367 break;
369 min = z->watermarks[class_idx].min;
370 if (!(gfp_mask & __GFP_WAIT))
371 min >>= 2;
372 if (zone_free_pages(z, order) > min) {
373 page = rmqueue(z, order);
374 if (page)
375 return page;
376 }
377 }
379 /* here we're in the low on memory slow path */
381 if ((current->flags & PF_MEMALLOC) &&
382 (!in_interrupt() || (current->flags & PF_MEMDIE))) {
383 zone = zonelist->zones;
384 for (;;) {
385 zone_t *z = *(zone++);
386 if (!z)
387 break;
389 page = rmqueue(z, order);
390 if (page)
391 return page;
392 }
393 return NULL;
394 }
396 /* Atomic allocations - we can't balance anything */
397 if (!(gfp_mask & __GFP_WAIT))
398 goto out;
400 rebalance:
401 page = balance_classzone(classzone, gfp_mask, order, &freed);
402 if (page)
403 return page;
405 zone = zonelist->zones;
406 if (likely(freed)) {
407 for (;;) {
408 zone_t *z = *(zone++);
409 if (!z)
410 break;
412 if (zone_free_pages(z, order) > z->watermarks[class_idx].min) {
413 page = rmqueue(z, order);
414 if (page)
415 return page;
416 }
417 }
418 goto rebalance;
419 } else {
420 /*
421 * Check that no other task has been killed meanwhile;
422 * in such a case we can satisfy the allocation.
423 */
424 for (;;) {
425 zone_t *z = *(zone++);
426 if (!z)
427 break;
429 if (zone_free_pages(z, order) > z->watermarks[class_idx].high) {
430 page = rmqueue(z, order);
431 if (page)
432 return page;
433 }
434 }
435 }
437 out:
438 printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n",
439 order, gfp_mask, !!(current->flags & PF_MEMALLOC));
440 if (unlikely(vm_gfp_debug))
441 dump_stack();
442 return NULL;
443 }
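/*
 * Allocation policy above, in order: (1) any zone in the fallback list
 * with free pages above the classzone's "low" watermark; (2) after waking
 * kswapd, any zone above "min", with the threshold quartered for atomic
 * (!__GFP_WAIT) callers; (3) PF_MEMALLOC callers (and PF_MEMDIE ones even
 * in interrupt context) ignore the watermarks entirely; (4) __GFP_WAIT
 * callers reclaim synchronously via balance_classzone() and retry against
 * "min", falling back to a pass against "high" only if nothing was freed,
 * which catches memory released by a task OOM-killed in the meantime.
 */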
445 /*
446 * Common helper functions.
447 */
448 unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
449 {
450 struct page * page;
452 page = alloc_pages(gfp_mask, order);
453 if (!page)
454 return 0;
455 return (unsigned long) page_address(page);
456 }
458 unsigned long get_zeroed_page(unsigned int gfp_mask)
459 {
460 struct page * page;
462 page = alloc_pages(gfp_mask, 0);
463 if (page) {
464 void *address = page_address(page);
465 clear_page(address);
466 return (unsigned long) address;
467 }
468 return 0;
469 }
471 void __free_pages(struct page *page, unsigned int order)
472 {
473 if (!PageReserved(page) && put_page_testzero(page))
474 __free_pages_ok(page, order);
475 }
477 void free_pages(unsigned long addr, unsigned int order)
478 {
479 if (addr != 0)
480 __free_pages(virt_to_page(addr), order);
481 }
483 /*
484 * Total amount of free (allocatable) RAM:
485 */
486 unsigned int nr_free_pages (void)
487 {
488 unsigned int sum = 0;
489 zone_t *zone;
491 for_each_zone(zone)
492 sum += zone->free_pages;
494 return sum;
495 }
497 /*
498 * Amount of free RAM allocatable as buffer memory:
499 */
500 unsigned int nr_free_buffer_pages (void)
501 {
502 pg_data_t *pgdat;
503 unsigned int sum = 0;
504 zonelist_t *zonelist;
505 zone_t **zonep, *zone;
507 for_each_pgdat(pgdat) {
508 int class_idx;
509 zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK);
510 zonep = zonelist->zones;
511 zone = *zonep;
512 class_idx = zone_idx(zone);
514 sum += zone->nr_cache_pages;
515 for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
516 int free = zone->free_pages - zone->watermarks[class_idx].high;
517 if (free <= 0)
518 continue;
519 sum += free;
520 }
521 }
523 return sum;
524 }
526 #if CONFIG_HIGHMEM
527 unsigned int nr_free_highpages (void)
528 {
529 pg_data_t *pgdat;
530 unsigned int pages = 0;
532 for_each_pgdat(pgdat)
533 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
535 return pages;
536 }
538 unsigned int freeable_lowmem(void)
539 {
540 unsigned int pages = 0;
541 pg_data_t *pgdat;
543 for_each_pgdat(pgdat) {
544 pages += pgdat->node_zones[ZONE_DMA].free_pages;
545 pages += pgdat->node_zones[ZONE_DMA].nr_active_pages;
546 pages += pgdat->node_zones[ZONE_DMA].nr_inactive_pages;
547 pages += pgdat->node_zones[ZONE_NORMAL].free_pages;
548 pages += pgdat->node_zones[ZONE_NORMAL].nr_active_pages;
549 pages += pgdat->node_zones[ZONE_NORMAL].nr_inactive_pages;
550 }
552 return pages;
553 }
554 #endif
556 #define K(x) ((x) << (PAGE_SHIFT-10))
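/* With 4 kB pages (PAGE_SHIFT == 12), K(x) == x * 4, i.e. pages -> kB. */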
558 /*
559 * Show free area list (used inside shift_scroll-lock stuff)
560 * We also calculate the percentage fragmentation. We do this by counting the
561 * memory on each free list with the exception of the first item on the list.
562 */
563 void show_free_areas_core(pg_data_t *pgdat)
564 {
565 unsigned int order;
566 unsigned type;
567 pg_data_t *tmpdat = pgdat;
569 printk("Free pages: %6dkB (%6dkB HighMem)\n",
570 K(nr_free_pages()),
571 K(nr_free_highpages()));
573 while (tmpdat) {
574 zone_t *zone;
575 for (zone = tmpdat->node_zones;
576 zone < tmpdat->node_zones + MAX_NR_ZONES; zone++)
577 printk("Zone:%s freepages:%6lukB\n",
578 zone->name,
579 K(zone->free_pages));
581 tmpdat = tmpdat->node_next;
582 }
584 printk("( Active: %d, inactive: %d, free: %d )\n",
585 nr_active_pages,
586 nr_inactive_pages,
587 nr_free_pages());
589 for (type = 0; type < MAX_NR_ZONES; type++) {
590 struct list_head *head, *curr;
591 zone_t *zone = pgdat->node_zones + type;
592 unsigned long nr, total, flags;
594 total = 0;
595 if (zone->size) {
596 spin_lock_irqsave(&zone->lock, flags);
597 for (order = 0; order < MAX_ORDER; order++) {
598 head = &(zone->free_area + order)->free_list;
599 curr = head;
600 nr = 0;
601 for (;;) {
602 if ((curr = curr->next) == head)
603 break;
604 nr++;
605 }
606 total += nr * (1 << order);
607 printk("%lu*%lukB ", nr, K(1UL) << order);
608 }
609 spin_unlock_irqrestore(&zone->lock, flags);
610 }
611 printk("= %lukB)\n", K(total));
612 }
614 #ifdef SWAP_CACHE_INFO
615 show_swap_cache_info();
616 #endif
617 }
619 void show_free_areas(void)
620 {
621 show_free_areas_core(pgdat_list);
622 }
624 /*
625 * Builds allocation fallback zone lists.
626 */
627 static inline void build_zonelists(pg_data_t *pgdat)
628 {
629 int i, j, k;
631 for (i = 0; i <= GFP_ZONEMASK; i++) {
632 zonelist_t *zonelist;
633 zone_t *zone;
635 zonelist = pgdat->node_zonelists + i;
636 memset(zonelist, 0, sizeof(*zonelist));
638 j = 0;
639 k = ZONE_NORMAL;
640 if (i & __GFP_HIGHMEM)
641 k = ZONE_HIGHMEM;
642 if (i & __GFP_DMA)
643 k = ZONE_DMA;
645 switch (k) {
646 default:
647 BUG();
648 /*
649 * fallthrough:
650 */
651 case ZONE_HIGHMEM:
652 zone = pgdat->node_zones + ZONE_HIGHMEM;
653 if (zone->size) {
654 #ifndef CONFIG_HIGHMEM
655 BUG();
656 #endif
657 zonelist->zones[j++] = zone;
658 }
659 case ZONE_NORMAL:
660 zone = pgdat->node_zones + ZONE_NORMAL;
661 if (zone->size)
662 zonelist->zones[j++] = zone;
663 case ZONE_DMA:
664 zone = pgdat->node_zones + ZONE_DMA;
665 if (zone->size)
666 zonelist->zones[j++] = zone;
667 }
668 zonelist->zones[j++] = NULL;
669 }
670 }
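/*
 * Resulting fallback lists (when every zone has nonzero size, and with
 * __GFP_DMA taking precedence if both zone bits are set):
 *	__GFP_DMA     -> { ZONE_DMA, NULL }
 *	__GFP_HIGHMEM -> { ZONE_HIGHMEM, ZONE_NORMAL, ZONE_DMA, NULL }
 *	otherwise     -> { ZONE_NORMAL, ZONE_DMA, NULL }
 */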
672 /*
673 * Helper functions to size the waitqueue hash table.
674 * Essentially these want to choose hash table sizes sufficiently
675 * large so that collisions trying to wait on pages are rare.
676 * But in fact, the number of active page waitqueues on typical
677 * systems is ridiculously low, less than 200. So this is even
678 * conservative, even though it seems large.
679 *
680 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
681 * waitqueues, i.e. the size of the waitq table given the number of pages.
682 */
683 #define PAGES_PER_WAITQUEUE 256
685 static inline unsigned long wait_table_size(unsigned long pages)
686 {
687 unsigned long size = 1;
689 pages /= PAGES_PER_WAITQUEUE;
691 while (size < pages)
692 size <<= 1;
694 /*
695 * Once we have dozens or even hundreds of threads sleeping
696 * on IO we've got bigger problems than wait queue collision.
697 * Limit the size of the wait table to a reasonable size.
698 */
699 size = min(size, 4096UL);
701 return size;
702 }
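/*
 * Example: a 512 MB zone with 4 kB pages has 131072 pages, so
 * pages / PAGES_PER_WAITQUEUE = 512 and the table gets 512 entries;
 * zones beyond roughly a million pages hit the min(size, 4096UL) cap.
 */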
704 /*
705 * This is an integer logarithm so that shifts can be used later
706 * to extract the more random high bits from the multiplicative
707 * hash function before the remainder is taken.
708 */
709 static inline unsigned long wait_table_bits(unsigned long size)
710 {
711 return ffz(~size);
712 }
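/*
 * For a power-of-two size, ffz(~size) is the position of its single set
 * bit, i.e. log2(size): a 4096-entry table gives wait_table_bits() == 12,
 * so free_area_init_core() sets wait_table_shift to BITS_PER_LONG - 12.
 */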
714 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
716 /*
717 * Set up the zone data structures:
718 * - mark all pages reserved
719 * - mark all memory queues empty
720 * - clear the memory bitmaps
721 */
722 void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
723 unsigned long *zones_size, unsigned long zone_start_paddr,
724 unsigned long *zholes_size, struct page *lmem_map)
725 {
726 unsigned long i, j;
727 unsigned long map_size;
728 unsigned long totalpages, offset, realtotalpages;
729 const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
731 if (zone_start_paddr & ~PAGE_MASK)
732 BUG();
734 totalpages = 0;
735 for (i = 0; i < MAX_NR_ZONES; i++) {
736 unsigned long size = zones_size[i];
737 totalpages += size;
738 }
739 realtotalpages = totalpages;
740 if (zholes_size)
741 for (i = 0; i < MAX_NR_ZONES; i++)
742 realtotalpages -= zholes_size[i];
744 printk("On node %d totalpages: %lu\n", nid, realtotalpages);
746 /*
747 * Some architectures (with lots of mem and discontiguous memory
748 * maps) have to search for a good mem_map area:
749 * For discontigmem, the conceptual mem map array starts from
750 * PAGE_OFFSET, we need to align the actual array onto a mem map
751 * boundary, so that MAP_NR works.
752 */
753 map_size = (totalpages + 1)*sizeof(struct page);
754 if (lmem_map == (struct page *)0) {
755 lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
756 lmem_map = (struct page *)(PAGE_OFFSET +
757 MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
758 }
759 *gmap = pgdat->node_mem_map = lmem_map;
760 pgdat->node_size = totalpages;
761 pgdat->node_start_paddr = zone_start_paddr;
762 pgdat->node_start_mapnr = (lmem_map - mem_map);
763 pgdat->nr_zones = 0;
765 offset = lmem_map - mem_map;
766 for (j = 0; j < MAX_NR_ZONES; j++) {
767 zone_t *zone = pgdat->node_zones + j;
768 unsigned long mask;
769 unsigned long size, realsize;
770 int idx;
772 zone_table[nid * MAX_NR_ZONES + j] = zone;
773 realsize = size = zones_size[j];
774 if (zholes_size)
775 realsize -= zholes_size[j];
777 printk("zone(%lu): %lu pages.\n", j, size);
778 zone->size = size;
779 zone->realsize = realsize;
780 zone->name = zone_names[j];
781 zone->lock = SPIN_LOCK_UNLOCKED;
782 zone->zone_pgdat = pgdat;
783 zone->free_pages = 0;
784 zone->need_balance = 0;
785 zone->nr_active_pages = zone->nr_inactive_pages = 0;
788 if (!size)
789 continue;
791 /*
792 * The per-page waitqueue mechanism uses hashed waitqueues
793 * per zone.
794 */
795 zone->wait_table_size = wait_table_size(size);
796 zone->wait_table_shift =
797 BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
798 zone->wait_table = (wait_queue_head_t *)
799 alloc_bootmem_node(pgdat, zone->wait_table_size
800 * sizeof(wait_queue_head_t));
802 for(i = 0; i < zone->wait_table_size; ++i)
803 init_waitqueue_head(zone->wait_table + i);
805 pgdat->nr_zones = j+1;
807 mask = (realsize / zone_balance_ratio[j]);
808 if (mask < zone_balance_min[j])
809 mask = zone_balance_min[j];
810 else if (mask > zone_balance_max[j])
811 mask = zone_balance_max[j];
812 zone->watermarks[j].min = mask;
813 zone->watermarks[j].low = mask*2;
814 zone->watermarks[j].high = mask*3;
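/*
 * Example with the defaults above (and 4 kB pages): a 256 MB ZONE_NORMAL
 * has realsize = 65536 pages, so realsize / zone_balance_ratio[j] = 512,
 * which is clamped to zone_balance_max[j] = 255; the zone's own
 * watermarks become min = 255, low = 510, high = 765 pages.
 */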
815 /* now set the watermarks of the lower zones in the "j" classzone */
816 for (idx = j-1; idx >= 0; idx--) {
817 zone_t * lower_zone = pgdat->node_zones + idx;
818 unsigned long lower_zone_reserve;
819 if (!lower_zone->size)
820 continue;
822 mask = lower_zone->watermarks[idx].min;
823 lower_zone->watermarks[j].min = mask;
824 lower_zone->watermarks[j].low = mask*2;
825 lower_zone->watermarks[j].high = mask*3;
827 /* now the trickier part: add the lower-zone reserve */
828 lower_zone_reserve = realsize / lower_zone_reserve_ratio[idx];
829 lower_zone->watermarks[j].min += lower_zone_reserve;
830 lower_zone->watermarks[j].low += lower_zone_reserve;
831 lower_zone->watermarks[j].high += lower_zone_reserve;
833 realsize += lower_zone->realsize;
834 }
836 zone->zone_mem_map = mem_map + offset;
837 zone->zone_start_mapnr = offset;
838 zone->zone_start_paddr = zone_start_paddr;
840 if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1))
841 printk("BUG: wrong zone alignment, it will crash\n");
843 /*
844 * Initially all pages are reserved - free ones are freed
845 * up by free_all_bootmem() once the early boot process is
846 * done. Non-atomic initialization, single-pass.
847 */
848 for (i = 0; i < size; i++) {
849 struct page *page = mem_map + offset + i;
850 set_page_zone(page, nid * MAX_NR_ZONES + j);
851 set_page_count(page, 0);
852 SetPageReserved(page);
853 INIT_LIST_HEAD(&page->list);
854 if (j != ZONE_HIGHMEM)
855 set_page_address(page, __va(zone_start_paddr));
856 zone_start_paddr += PAGE_SIZE;
857 }
859 offset += size;
860 for (i = 0; ; i++) {
861 unsigned long bitmap_size;
863 INIT_LIST_HEAD(&zone->free_area[i].free_list);
864 if (i == MAX_ORDER-1) {
865 zone->free_area[i].map = NULL;
866 break;
867 }
869 /*
870 * Page buddy system uses "index >> (i+1)",
871 * where "index" is at most "size-1".
872 *
873 * The extra "+3" is to round down to byte
874 * size (8 bits per byte assumption). Thus
875 * we get "(size-1) >> (i+4)" as the last byte
876 * we can access.
877 *
878 * The "+1" is because we want to round the
879 * byte allocation up rather than down. So
880 * we should have had a "+7" before we shifted
881 * down by three. Also, we have to add one as
882 * we actually _use_ the last bit (it's [0,n]
883 * inclusive, not [0,n[).
884 *
885 * So we actually had +7+1 before we shift
886 * down by 3. But (n+8) >> 3 == (n >> 3) + 1
887 * (modulo overflows, which we do not have).
888 *
889 * Finally, we LONG_ALIGN because all bitmap
890 * operations are on longs.
891 */
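/*
 * Worked example: size = 4096 pages at order i = 0 gives
 * (4096-1) >> (0+4) = 255, +1 and LONG_ALIGN() -> a 256-byte map,
 * i.e. 2048 bits, one per order-0 buddy pair (4096 / 2 pairs).
 */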
892 bitmap_size = (size-1) >> (i+4);
893 bitmap_size = LONG_ALIGN(bitmap_size+1);
894 zone->free_area[i].map =
895 (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
896 }
897 }
898 build_zonelists(pgdat);
899 }
901 void __init free_area_init(unsigned long *zones_size)
902 {
903 free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
904 }
906 static int __init setup_mem_frac(char *str)
907 {
908 int j = 0;
910 while (get_option(&str, &zone_balance_ratio[j++]) == 2);
911 printk("setup_mem_frac: ");
912 for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]);
913 printk("\n");
914 return 1;
915 }
917 __setup("memfrac=", setup_mem_frac);
919 static int __init setup_lower_zone_reserve(char *str)
920 {
921 int j = 0;
923 while (get_option(&str, &lower_zone_reserve_ratio[j++]) == 2);
924 printk("setup_lower_zone_reserve: ");
925 for (j = 0; j < MAX_NR_ZONES-1; j++) printk("%d ", lower_zone_reserve_ratio[j]);
926 printk("\n");
927 return 1;
928 }
930 __setup("lower_zone_reserve=", setup_lower_zone_reserve);