ia64/xen-unstable

view xen/common/page_alloc.c @ 4434:88fed5c27d6d

bitkeeper revision 1.1159.258.87 (424d8879K9lhGxxSZd9bVE9LZZ2YDw)

Fix page scrubbing when fewer than 16 pages remain in the scrub list.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Fri Apr 01 17:44:25 2005 +0000 (2005-04-01)
parents 1c0462767898
children 185169ecc5e0 0dc3b8b8c298
line source
/******************************************************************************
 * page_alloc.c
 *
 * Simple buddy heap allocator for Xen.
 *
 * Copyright (c) 2002-2004 K A Fraser
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/types.h>
#include <xen/lib.h>
#include <asm/page.h>
#include <xen/spinlock.h>
#include <xen/slab.h>
#include <xen/irq.h>
#include <xen/softirq.h>
#include <asm/domain_page.h>

/*
 * Comma-separated list of hexadecimal page numbers containing bad bytes.
 * e.g. 'badpage=0x3f45,0x8a321'.
 */
static char opt_badpage[100] = "";
string_param("badpage", opt_badpage);

#define round_pgdown(_p) ((_p)&PAGE_MASK)
#define round_pgup(_p)   (((_p)+(PAGE_SIZE-1))&PAGE_MASK)

static spinlock_t page_scrub_lock;
struct list_head page_scrub_list;

/*********************
 * ALLOCATION BITMAP
 *  One bit per page of memory. Bit set => page is allocated.
 */

static unsigned long bitmap_size; /* in bytes */
static unsigned long *alloc_bitmap;
#define PAGES_PER_MAPWORD (sizeof(unsigned long) * 8)

#define allocated_in_map(_pn) \
    (alloc_bitmap[(_pn)/PAGES_PER_MAPWORD] & (1<<((_pn)&(PAGES_PER_MAPWORD-1))))
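
/*
 * Example (hypothetical numbers, assuming a 32-bit build where
 * PAGES_PER_MAPWORD == 32): page 70 is tracked by alloc_bitmap[2], bit 6,
 * since 70/32 == 2 and 70&31 == 6.
 */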

/*
 * Hint regarding bitwise arithmetic in map_{alloc,free}:
 *  -(1<<n) sets all bits >= n.
 *  (1<<n)-1 sets all bits < n.
 * Variable names in map_{alloc,free}:
 *  *_idx == Index into `alloc_bitmap' array.
 *  *_off == Bit offset within an element of the `alloc_bitmap' array.
 */
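
/*
 * Worked example (hypothetical offsets): in map_alloc()'s same-word case,
 * with start_off == 3 and end_off == 9, ((1<<9)-1) & -(1<<3) == 0x1f8,
 * i.e. bits 3..8, which are exactly the six pages being allocated.
 */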

static void map_alloc(unsigned long first_page, unsigned long nr_pages)
{
    unsigned long start_off, end_off, curr_idx, end_idx;

#ifndef NDEBUG
    unsigned long i;
    /* Check that the block isn't already allocated. */
    for ( i = 0; i < nr_pages; i++ )
        ASSERT(!allocated_in_map(first_page + i));
#endif

    curr_idx  = first_page / PAGES_PER_MAPWORD;
    start_off = first_page & (PAGES_PER_MAPWORD-1);
    end_idx   = (first_page + nr_pages) / PAGES_PER_MAPWORD;
    end_off   = (first_page + nr_pages) & (PAGES_PER_MAPWORD-1);

    if ( curr_idx == end_idx )
    {
        alloc_bitmap[curr_idx] |= ((1<<end_off)-1) & -(1<<start_off);
    }
    else
    {
        alloc_bitmap[curr_idx] |= -(1<<start_off);
        while ( ++curr_idx < end_idx ) alloc_bitmap[curr_idx] = ~0L;
        alloc_bitmap[curr_idx] |= (1<<end_off)-1;
    }
}


static void map_free(unsigned long first_page, unsigned long nr_pages)
{
    unsigned long start_off, end_off, curr_idx, end_idx;

#ifndef NDEBUG
    unsigned long i;
    /* Check that the block isn't already freed. */
    for ( i = 0; i < nr_pages; i++ )
        ASSERT(allocated_in_map(first_page + i));
#endif

    curr_idx  = first_page / PAGES_PER_MAPWORD;
    start_off = first_page & (PAGES_PER_MAPWORD-1);
    end_idx   = (first_page + nr_pages) / PAGES_PER_MAPWORD;
    end_off   = (first_page + nr_pages) & (PAGES_PER_MAPWORD-1);

    if ( curr_idx == end_idx )
    {
        alloc_bitmap[curr_idx] &= -(1<<end_off) | ((1<<start_off)-1);
    }
    else
    {
        alloc_bitmap[curr_idx] &= (1<<start_off)-1;
        while ( ++curr_idx != end_idx ) alloc_bitmap[curr_idx] = 0;
        alloc_bitmap[curr_idx] &= -(1<<end_off);
    }
}


/*************************
 * BOOT-TIME ALLOCATOR
 */

/* Initialise allocator to handle up to @max_page pages. */
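/*
 * Example (assuming 4kB pages): with max_page == 0x100000 (4GB of RAM) the
 * bitmap occupies max_page/8 == 128kB, i.e. 32 pages, which round_pgup()
 * leaves unchanged since it is already page-aligned.
 */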
unsigned long init_boot_allocator(unsigned long bitmap_start)
{
    bitmap_start = round_pgup(bitmap_start);

    /* Allocate space for the allocation bitmap. */
    bitmap_size  = max_page / 8;
    bitmap_size  = round_pgup(bitmap_size);
    alloc_bitmap = (unsigned long *)phys_to_virt(bitmap_start);

    /* All allocated by default. */
    memset(alloc_bitmap, ~0, bitmap_size);

    return bitmap_start + bitmap_size;
}

void init_boot_pages(unsigned long ps, unsigned long pe)
{
    unsigned long bad_pfn;
    char *p;

    ps = round_pgup(ps);
    pe = round_pgdown(pe);

    map_free(ps >> PAGE_SHIFT, (pe - ps) >> PAGE_SHIFT);

    /* Check new pages against the bad-page list. */
    p = opt_badpage;
    while ( *p != '\0' )
    {
        bad_pfn = simple_strtoul(p, &p, 0);

        if ( *p == ',' )
            p++;
        else if ( *p != '\0' )
            break;

        if ( (bad_pfn < (bitmap_size*8)) && !allocated_in_map(bad_pfn) )
        {
            printk("Marking page %08lx as bad\n", bad_pfn);
            map_alloc(bad_pfn, 1);
        }
    }
}
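
/*
 * alloc_boot_pages() below is a simple first-fit scan of the bitmap: it
 * tries candidate start pfns at multiples of @align and claims the first run
 * of @size free pages, returning its physical address (0 on failure). For
 * example (hypothetical call), alloc_boot_pages(3*PAGE_SIZE, 4*PAGE_SIZE)
 * examines pfns 0, 4, 8, ... until three consecutive free pages are found.
 */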
unsigned long alloc_boot_pages(unsigned long size, unsigned long align)
{
    unsigned long pg, i;

    size  = round_pgup(size) >> PAGE_SHIFT;
    align = round_pgup(align) >> PAGE_SHIFT;

    for ( pg = 0; (pg + size) < (bitmap_size*8); pg += align )
    {
        for ( i = 0; i < size; i++ )
            if ( allocated_in_map(pg + i) )
                break;

        if ( i == size )
        {
            map_alloc(pg, size);
            return pg << PAGE_SHIFT;
        }
    }

    return 0;
}


/*************************
 * BINARY BUDDY ALLOCATOR
 */

#define MEMZONE_XEN 0
#define MEMZONE_DOM 1
#define NR_ZONES    2

/* Up to 2^10 pages can be allocated at once. */
#define MAX_ORDER 10
static struct list_head heap[NR_ZONES][MAX_ORDER+1];

static unsigned long avail[NR_ZONES];

static spinlock_t heap_lock = SPIN_LOCK_UNLOCKED;
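
/*
 * end_boot_allocator() hands pages left free by the boot allocator to the
 * buddy heap. Each free page is released at order 0, with its successor
 * temporarily marked allocated so that free_heap_pages() does not merge
 * upwards into pages the loop has not transferred yet. Buddy coalescing then
 * rebuilds the higher-order free lists automatically.
 */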
void end_boot_allocator(void)
{
    unsigned long i, j;
    int curr_free = 0, next_free = 0;

    memset(avail, 0, sizeof(avail));

    for ( i = 0; i < NR_ZONES; i++ )
        for ( j = 0; j <= MAX_ORDER; j++ )
            INIT_LIST_HEAD(&heap[i][j]);

    /* Pages that are free now go to the domain sub-allocator. */
    for ( i = 0; i < max_page; i++ )
    {
        curr_free = next_free;
        next_free = !allocated_in_map(i+1);
        if ( next_free )
            map_alloc(i+1, 1); /* prevent merging in free_heap_pages() */
        if ( curr_free )
            free_heap_pages(MEMZONE_DOM, pfn_to_page(i), 0);
    }
}

/* Hand the specified arbitrary page range to the specified heap zone. */
void init_heap_pages(
    unsigned int zone, struct pfn_info *pg, unsigned long nr_pages)
{
    unsigned long i;

    ASSERT(zone < NR_ZONES);

    for ( i = 0; i < nr_pages; i++ )
        free_heap_pages(zone, pg+i, 0);
}
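
/*
 * Splitting example for alloc_heap_pages() below (hypothetical sizes): a
 * request for order 2 (four pages) satisfied from the order-4 free list
 * halves the 16-page chunk repeatedly; an 8-page block goes back on the
 * order-3 list, a 4-page block on the order-2 list, and the remaining four
 * pages are returned to the caller.
 */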
/* Allocate 2^@order contiguous pages. */
struct pfn_info *alloc_heap_pages(unsigned int zone, unsigned int order)
{
    int i;
    struct pfn_info *pg;

    ASSERT(zone < NR_ZONES);

    if ( unlikely(order > MAX_ORDER) )
        return NULL;

    spin_lock(&heap_lock);

    /* Find smallest order which can satisfy the request. */
    for ( i = order; i <= MAX_ORDER; i++ )
        if ( !list_empty(&heap[zone][i]) )
            goto found;

    /* No suitable memory blocks. Fail the request. */
    spin_unlock(&heap_lock);
    return NULL;

 found:
    pg = list_entry(heap[zone][i].next, struct pfn_info, list);
    list_del(&pg->list);

    /* We may have to halve the chunk a number of times. */
    while ( i != order )
    {
        PFN_ORDER(pg) = --i;
        list_add_tail(&pg->list, &heap[zone][i]);
        pg += 1 << i;
    }

    map_alloc(page_to_pfn(pg), 1 << order);
    avail[zone] -= 1 << order;

    spin_unlock(&heap_lock);

    return pg;
}
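
/*
 * On free, a chunk repeatedly merges with its buddy: the equal-sized block
 * whose first pfn differs only in bit 'order' (i.e. the block at
 * pfn ^ (1<<order)), provided that buddy is itself free and of the same
 * order. For instance, freeing an order-0 page at pfn 13 first considers
 * pfn 12, then (if merged) the order-1 block at pfn 14, and so on.
 */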
/* Free 2^@order set of pages. */
void free_heap_pages(
    unsigned int zone, struct pfn_info *pg, unsigned int order)
{
    unsigned long mask;

    ASSERT(zone < NR_ZONES);
    ASSERT(order <= MAX_ORDER);

    spin_lock(&heap_lock);

    map_free(page_to_pfn(pg), 1 << order);
    avail[zone] += 1 << order;

    /* Merge chunks as far as possible. */
    while ( order < MAX_ORDER )
    {
        mask = 1 << order;

        if ( (page_to_pfn(pg) & mask) )
        {
            /* Merge with predecessor block? */
            if ( allocated_in_map(page_to_pfn(pg)-mask) ||
                 (PFN_ORDER(pg-mask) != order) )
                break;
            list_del(&(pg-mask)->list);
            pg -= mask;
        }
        else
        {
            /* Merge with successor block? */
            if ( allocated_in_map(page_to_pfn(pg)+mask) ||
                 (PFN_ORDER(pg+mask) != order) )
                break;
            list_del(&(pg+mask)->list);
        }

        order++;
    }

    PFN_ORDER(pg) = order;
    list_add_tail(&pg->list, &heap[zone][order]);

    spin_unlock(&heap_lock);
}


/*
 * Scrub all unallocated pages in all heap zones. This function is more
 * convoluted than appears necessary because we do not want to continuously
 * hold the lock or disable interrupts while scrubbing very large memory areas.
 */
void scrub_heap_pages(void)
{
    void *p;
    unsigned long pfn, flags;

    printk("Scrubbing Free RAM: ");

    for ( pfn = 0; pfn < (bitmap_size * 8); pfn++ )
    {
        /* Every 100MB, print a progress dot and appease the watchdog. */
        if ( (pfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
        {
            printk(".");
            touch_nmi_watchdog();
        }

        /* Quick lock-free check. */
        if ( allocated_in_map(pfn) )
            continue;

        spin_lock_irqsave(&heap_lock, flags);

        /* Re-check page status with lock held. */
        if ( !allocated_in_map(pfn) )
        {
            p = map_domain_mem(pfn << PAGE_SHIFT);
            clear_page(p);
            unmap_domain_mem(p);
        }

        spin_unlock_irqrestore(&heap_lock, flags);
    }

    printk("done.\n");
}


/*************************
 * XEN-HEAP SUB-ALLOCATOR
 */

void init_xenheap_pages(unsigned long ps, unsigned long pe)
{
    unsigned long flags;

    ps = round_pgup(ps);
    pe = round_pgdown(pe);

    memguard_guard_range(__va(ps), pe - ps);

    /*
     * Yuk! Ensure there is a one-page buffer between Xen and Dom zones, to
     * prevent merging of power-of-two blocks across the zone boundary.
     */
    if ( !IS_XEN_HEAP_FRAME(phys_to_page(pe)) )
        pe -= PAGE_SIZE;

    local_irq_save(flags);
    init_heap_pages(MEMZONE_XEN, phys_to_page(ps), (pe - ps) >> PAGE_SHIFT);
    local_irq_restore(flags);
}
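
/*
 * If the Xen heap is exhausted, alloc_xenheap_pages() asks the slab layer to
 * release cached objects via xmem_cache_reap() and retries, up to eight
 * times, before giving up and dumping slab statistics for diagnosis.
 */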
unsigned long alloc_xenheap_pages(unsigned int order)
{
    unsigned long flags;
    struct pfn_info *pg;
    int i, attempts = 0;

 retry:
    local_irq_save(flags);
    pg = alloc_heap_pages(MEMZONE_XEN, order);
    local_irq_restore(flags);

    if ( unlikely(pg == NULL) )
        goto no_memory;

    memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT));

    for ( i = 0; i < (1 << order); i++ )
    {
        pg[i].count_info        = 0;
        pg[i].u.inuse.domain    = NULL;
        pg[i].u.inuse.type_info = 0;
    }

    return (unsigned long)page_to_virt(pg);

 no_memory:
    if ( attempts++ < 8 )
    {
        xmem_cache_reap();
        goto retry;
    }

    printk("Cannot handle page request order %d!\n", order);
    dump_slabinfo();
    return 0;
}

void free_xenheap_pages(unsigned long p, unsigned int order)
{
    unsigned long flags;

    memguard_guard_range((void *)p, 1 << (order + PAGE_SHIFT));

    local_irq_save(flags);
    free_heap_pages(MEMZONE_XEN, virt_to_page(p), order);
    local_irq_restore(flags);
}


/*************************
 * DOMAIN-HEAP SUB-ALLOCATOR
 */

void init_domheap_pages(unsigned long ps, unsigned long pe)
{
    ASSERT(!in_irq());

    ps = round_pgup(ps);
    pe = round_pgdown(pe);

    init_heap_pages(MEMZONE_DOM, phys_to_page(ps), (pe - ps) >> PAGE_SHIFT);
}
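
/*
 * Before handing pages to a domain, alloc_domheap_pages() checks each page's
 * free-time cpu_mask: any CPU whose last TLB flush predates the page's
 * tlbflush_timestamp may still hold stale TLB entries for it, so exactly
 * that set of CPUs is flushed (and remembered in flushed_mask to avoid
 * repeated flushes).
 */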
struct pfn_info *alloc_domheap_pages(struct domain *d, unsigned int order)
{
    struct pfn_info *pg;
    unsigned long mask, flushed_mask, pfn_stamp, cpu_stamp;
    int i, j;

    ASSERT(!in_irq());

    if ( unlikely((pg = alloc_heap_pages(MEMZONE_DOM, order)) == NULL) )
        return NULL;

    flushed_mask = 0;
    for ( i = 0; i < (1 << order); i++ )
    {
        if ( (mask = (pg[i].u.free.cpu_mask & ~flushed_mask)) != 0 )
        {
            pfn_stamp = pg[i].tlbflush_timestamp;
            for ( j = 0; (mask != 0) && (j < smp_num_cpus); j++ )
            {
                if ( mask & (1<<j) )
                {
                    cpu_stamp = tlbflush_time[j];
                    if ( !NEED_FLUSH(cpu_stamp, pfn_stamp) )
                        mask &= ~(1<<j);
                }
            }

            if ( unlikely(mask != 0) )
            {
                flush_tlb_mask(mask);
                perfc_incrc(need_flush_tlb_flush);
                flushed_mask |= mask;
            }
        }

        pg[i].count_info        = 0;
        pg[i].u.inuse.domain    = NULL;
        pg[i].u.inuse.type_info = 0;
    }

    if ( d == NULL )
        return pg;

    spin_lock(&d->page_alloc_lock);

    if ( unlikely(test_bit(DF_DYING, &d->flags)) ||
         unlikely((d->tot_pages + (1 << order)) > d->max_pages) )
    {
        DPRINTK("Over-allocation for domain %u: %u > %u\n",
                d->id, d->tot_pages + (1 << order), d->max_pages);
        DPRINTK("...or the domain is dying (%d)\n",
                !!test_bit(DF_DYING, &d->flags));
        spin_unlock(&d->page_alloc_lock);
        free_heap_pages(MEMZONE_DOM, pg, order);
        return NULL;
    }

    if ( unlikely(d->tot_pages == 0) )
        get_knownalive_domain(d);

    d->tot_pages += 1 << order;

    for ( i = 0; i < (1 << order); i++ )
    {
        pg[i].u.inuse.domain = d;
        wmb(); /* Domain pointer must be visible before updating refcnt. */
        pg[i].count_info |= PGC_allocated | 1;
        list_add_tail(&pg[i].list, &d->page_list);
    }

    spin_unlock(&d->page_alloc_lock);

    return pg;
}
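
/*
 * Freeing distinguishes three cases: Xen-heap frames still charged to a
 * domain (only the xenheap_pages accounting changes here), ordinary domain
 * pages (returned to the buddy heap, or queued on page_scrub_list for later
 * scrubbing if the owner is dying), and anonymous pages with no owning
 * domain.
 */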
void free_domheap_pages(struct pfn_info *pg, unsigned int order)
{
    int i, drop_dom_ref;
    struct domain *d = pg->u.inuse.domain;

    ASSERT(!in_irq());

    if ( unlikely(IS_XEN_HEAP_FRAME(pg)) )
    {
        /* NB. May recursively lock from domain_relinquish_memory(). */
        spin_lock_recursive(&d->page_alloc_lock);

        for ( i = 0; i < (1 << order); i++ )
            list_del(&pg[i].list);

        d->xenheap_pages -= 1 << order;
        drop_dom_ref = (d->xenheap_pages == 0);

        spin_unlock_recursive(&d->page_alloc_lock);
    }
    else if ( likely(d != NULL) )
    {
        /* NB. May recursively lock from domain_relinquish_memory(). */
        spin_lock_recursive(&d->page_alloc_lock);

        for ( i = 0; i < (1 << order); i++ )
        {
            ASSERT((pg[i].u.inuse.type_info & PGT_count_mask) == 0);
            pg[i].tlbflush_timestamp = tlbflush_current_time();
            pg[i].u.free.cpu_mask    = 1 << d->processor;
            list_del(&pg[i].list);
        }

        d->tot_pages -= 1 << order;
        drop_dom_ref = (d->tot_pages == 0);

        spin_unlock_recursive(&d->page_alloc_lock);

        if ( likely(!test_bit(DF_DYING, &d->flags)) )
        {
            free_heap_pages(MEMZONE_DOM, pg, order);
        }
        else
        {
            /*
             * Normally we expect a domain to clear pages before freeing them,
             * if it cares about the secrecy of their contents. However, after
             * a domain has died we assume responsibility for erasure.
             */
            for ( i = 0; i < (1 << order); i++ )
            {
                spin_lock(&page_scrub_lock);
                list_add(&pg[i].list, &page_scrub_list);
                spin_unlock(&page_scrub_lock);
            }
        }
    }
    else
    {
        /* Freeing an anonymous domain-heap page. */
        free_heap_pages(MEMZONE_DOM, pg, order);
        drop_dom_ref = 0;
    }

    if ( drop_dom_ref )
        put_domain(d);
}


unsigned long avail_domheap_pages(void)
{
    return avail[MEMZONE_DOM];
}


/*************************
 * PAGE SCRUBBING
 */
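
/*
 * page_scrub_softirq() peels up to 16 pages at a time off page_scrub_list
 * and scrubs them with the lock dropped. 'ent' is left pointing at the last
 * node actually peeled, so the list head is re-linked correctly even when
 * fewer than 16 pages remain (the case fixed by the changeset described
 * above). Each softirq invocation does roughly 1ms of work.
 */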
static void page_scrub_softirq(void)
{
    struct list_head *ent;
    struct pfn_info *pg;
    void *p;
    int i;
    s_time_t start = NOW();

    /* Aim to do 1ms of work (ten percent of a 10ms jiffy). */
    do {
        spin_lock(&page_scrub_lock);

        if ( unlikely((ent = page_scrub_list.next) == &page_scrub_list) )
        {
            spin_unlock(&page_scrub_lock);
            return;
        }

        /* Peel up to 16 pages from the list. */
        for ( i = 0; i < 16; i++ )
        {
            if ( ent->next == &page_scrub_list )
                break;
            ent = ent->next;
        }

        /* Remove peeled pages from the list. */
        ent->next->prev = &page_scrub_list;
        page_scrub_list.next = ent->next;

        spin_unlock(&page_scrub_lock);

        /* Working backwards, scrub each page in turn. */
        while ( ent != &page_scrub_list )
        {
            pg = list_entry(ent, struct pfn_info, list);
            ent = ent->prev;
            p = map_domain_mem(page_to_phys(pg));
            clear_page(p);
            unmap_domain_mem(p);
            free_heap_pages(MEMZONE_DOM, pg, 0);
        }
    } while ( (NOW() - start) < MILLISECS(1) );
}

static __init int page_scrub_init(void)
{
    spin_lock_init(&page_scrub_lock);
    INIT_LIST_HEAD(&page_scrub_list);
    open_softirq(PAGE_SCRUB_SOFTIRQ, page_scrub_softirq);
    return 0;
}
__initcall(page_scrub_init);