direct-io.hg

view xen/common/page_alloc.c @ 5517:10e9028c8e3d

bitkeeper revision 1.1718.1.10 (42b7b19aqOS_1M8I4pIOFjiTPYWV-g)

Merge bk://xenbits.xensource.com/xen-unstable.bk
into spot.cl.cam.ac.uk:C:/Documents and Settings/iap10/xen-unstable.bk
author iap10@spot.cl.cam.ac.uk
date Tue Jun 21 06:20:10 2005 +0000 (2005-06-21)
parents 849b58da37b7
children 43564304cf94 b53a65034532
/******************************************************************************
 * page_alloc.c
 *
 * Simple buddy heap allocator for Xen.
 *
 * Copyright (c) 2002-2004 K A Fraser
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/types.h>
#include <xen/lib.h>
#include <xen/perfc.h>
#include <xen/sched.h>
#include <xen/spinlock.h>
#include <xen/mm.h>
#include <xen/irq.h>
#include <xen/softirq.h>
#include <xen/shadow.h>
#include <xen/domain_page.h>
#include <asm/page.h>

/*
 * Comma-separated list of hexadecimal page numbers containing bad bytes.
 * e.g. 'badpage=0x3f45,0x8a321'.
 */
static char opt_badpage[100] = "";
string_param("badpage", opt_badpage);

#define round_pgdown(_p) ((_p)&PAGE_MASK)
#define round_pgup(_p)   (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
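
/*
 * Illustration (assumes 4kB pages, i.e. PAGE_SHIFT == 12):
 *   round_pgdown(0x12345) == 0x12000, round_pgup(0x12345) == 0x13000,
 * i.e. addresses are aligned down/up to the nearest page boundary.
 */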

static spinlock_t page_scrub_lock = SPIN_LOCK_UNLOCKED;
LIST_HEAD(page_scrub_list);

/*********************
 * ALLOCATION BITMAP
 *  One bit per page of memory. Bit set => page is allocated.
 */

static unsigned long bitmap_size; /* in bytes */
static unsigned long *alloc_bitmap;
#define PAGES_PER_MAPWORD (sizeof(unsigned long) * 8)

#define allocated_in_map(_pn)                 \
( !! (alloc_bitmap[(_pn)/PAGES_PER_MAPWORD] & \
      (1UL<<((_pn)&(PAGES_PER_MAPWORD-1)))) )

/*
 * Hint regarding bitwise arithmetic in map_{alloc,free}:
 *  -(1<<n) sets all bits >= n.
 *  (1<<n)-1 sets all bits < n.
 * Variable names in map_{alloc,free}:
 *  *_idx == Index into `alloc_bitmap' array.
 *  *_off == Bit offset within an element of the `alloc_bitmap' array.
 */
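/*
 * Worked example (illustrative only; assumes a 64-bit unsigned long, so
 * PAGES_PER_MAPWORD == 64): map_alloc(70, 4) yields curr_idx == end_idx == 1,
 * start_off == 6, end_off == 10, so ((1UL<<10)-1) & -(1UL<<6) is ORed into
 * alloc_bitmap[1], setting bits 6..9 and thus marking pages 70..73 allocated.
 */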

static void map_alloc(unsigned long first_page, unsigned long nr_pages)
{
    unsigned long start_off, end_off, curr_idx, end_idx;

#ifndef NDEBUG
    unsigned long i;
    /* Check that the block isn't already allocated. */
    for ( i = 0; i < nr_pages; i++ )
        ASSERT(!allocated_in_map(first_page + i));
#endif

    curr_idx  = first_page / PAGES_PER_MAPWORD;
    start_off = first_page & (PAGES_PER_MAPWORD-1);
    end_idx   = (first_page + nr_pages) / PAGES_PER_MAPWORD;
    end_off   = (first_page + nr_pages) & (PAGES_PER_MAPWORD-1);

    if ( curr_idx == end_idx )
    {
        alloc_bitmap[curr_idx] |= ((1UL<<end_off)-1) & -(1UL<<start_off);
    }
    else
    {
        alloc_bitmap[curr_idx] |= -(1UL<<start_off);
        while ( ++curr_idx < end_idx ) alloc_bitmap[curr_idx] = ~0UL;
        alloc_bitmap[curr_idx] |= (1UL<<end_off)-1;
    }
}


static void map_free(unsigned long first_page, unsigned long nr_pages)
{
    unsigned long start_off, end_off, curr_idx, end_idx;

#ifndef NDEBUG
    unsigned long i;
    /* Check that the block isn't already freed. */
    for ( i = 0; i < nr_pages; i++ )
        ASSERT(allocated_in_map(first_page + i));
#endif

    curr_idx  = first_page / PAGES_PER_MAPWORD;
    start_off = first_page & (PAGES_PER_MAPWORD-1);
    end_idx   = (first_page + nr_pages) / PAGES_PER_MAPWORD;
    end_off   = (first_page + nr_pages) & (PAGES_PER_MAPWORD-1);

    if ( curr_idx == end_idx )
    {
        alloc_bitmap[curr_idx] &= -(1UL<<end_off) | ((1UL<<start_off)-1);
    }
    else
    {
        alloc_bitmap[curr_idx] &= (1UL<<start_off)-1;
        while ( ++curr_idx != end_idx ) alloc_bitmap[curr_idx] = 0;
        alloc_bitmap[curr_idx] &= -(1UL<<end_off);
    }
}


/*************************
 * BOOT-TIME ALLOCATOR
 */

/* Initialise allocator to handle up to @max_page pages. */
physaddr_t init_boot_allocator(physaddr_t bitmap_start)
{
    bitmap_start = round_pgup(bitmap_start);

    /* Allocate space for the allocation bitmap. */
    bitmap_size  = max_page / 8;
    bitmap_size  = round_pgup(bitmap_size);
    alloc_bitmap = (unsigned long *)phys_to_virt(bitmap_start);

    /* All allocated by default. */
    memset(alloc_bitmap, ~0, bitmap_size);

    return bitmap_start + bitmap_size;
}

void init_boot_pages(physaddr_t ps, physaddr_t pe)
{
    unsigned long bad_pfn;
    char *p;

    ps = round_pgup(ps);
    pe = round_pgdown(pe);
    if ( pe <= ps )
        return;

    map_free(ps >> PAGE_SHIFT, (pe - ps) >> PAGE_SHIFT);

    /* Check new pages against the bad-page list. */
    p = opt_badpage;
    while ( *p != '\0' )
    {
        bad_pfn = simple_strtoul(p, &p, 0);

        if ( *p == ',' )
            p++;
        else if ( *p != '\0' )
            break;

        if ( (bad_pfn < (bitmap_size*8)) && !allocated_in_map(bad_pfn) )
        {
            printk("Marking page %lx as bad\n", bad_pfn);
            map_alloc(bad_pfn, 1);
        }
    }
}

unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
{
    unsigned long pg, i;

    for ( pg = 0; (pg + nr_pfns) < (bitmap_size*8); pg += pfn_align )
    {
        for ( i = 0; i < nr_pfns; i++ )
            if ( allocated_in_map(pg + i) )
                break;

        if ( i == nr_pfns )
        {
            map_alloc(pg, nr_pfns);
            return pg;
        }
    }

    return 0;
}


/*************************
 * BINARY BUDDY ALLOCATOR
 */

#define MEMZONE_XEN 0
#define MEMZONE_DOM 1
#define NR_ZONES    2

/* Up to 2^20 pages can be allocated at once. */
#define MAX_ORDER 20
static struct list_head heap[NR_ZONES][MAX_ORDER+1];

static unsigned long avail[NR_ZONES];
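
/*
 * heap[zone][order] chains the free blocks of 2^order contiguous pages in
 * each zone; avail[zone] counts the total number of free pages in that zone.
 */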

static spinlock_t heap_lock = SPIN_LOCK_UNLOCKED;

void end_boot_allocator(void)
{
    unsigned long i, j;
    int curr_free = 0, next_free = 0;

    memset(avail, 0, sizeof(avail));

    for ( i = 0; i < NR_ZONES; i++ )
        for ( j = 0; j <= MAX_ORDER; j++ )
            INIT_LIST_HEAD(&heap[i][j]);

    /* Pages that are free now go to the domain sub-allocator. */
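    /*
     * Marking page i+1 as allocated before freeing page i stops
     * free_heap_pages() from buddy-merging with pages that have not yet been
     * handed over (their heap metadata is still uninitialised).
     */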
    for ( i = 0; i < max_page; i++ )
    {
        curr_free = next_free;
        next_free = !allocated_in_map(i+1);
        if ( next_free )
            map_alloc(i+1, 1); /* prevent merging in free_heap_pages() */
        if ( curr_free )
            free_heap_pages(MEMZONE_DOM, pfn_to_page(i), 0);
    }
}

/* Hand the specified arbitrary page range to the specified heap zone. */
void init_heap_pages(
    unsigned int zone, struct pfn_info *pg, unsigned long nr_pages)
{
    unsigned long i;

    ASSERT(zone < NR_ZONES);

    for ( i = 0; i < nr_pages; i++ )
        free_heap_pages(zone, pg+i, 0);
}

/* Allocate 2^@order contiguous pages. */
struct pfn_info *alloc_heap_pages(unsigned int zone, unsigned int order)
{
    int i;
    struct pfn_info *pg;

    ASSERT(zone < NR_ZONES);

    if ( unlikely(order > MAX_ORDER) )
        return NULL;

    spin_lock(&heap_lock);

    /* Find smallest order which can satisfy the request. */
    for ( i = order; i <= MAX_ORDER; i++ )
        if ( !list_empty(&heap[zone][i]) )
            goto found;

    /* No suitable memory blocks. Fail the request. */
    spin_unlock(&heap_lock);
    return NULL;

 found:
    pg = list_entry(heap[zone][i].next, struct pfn_info, list);
    list_del(&pg->list);

    /* We may have to halve the chunk a number of times. */
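    /*
     * Example: an order-0 request satisfied from an order-2 block at pfn P
     * returns the lower half (order 1, at P) and then the next lower half
     * (order 0, at P+2) to the free lists, and finally hands out page P+3.
     */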
    while ( i != order )
    {
        PFN_ORDER(pg) = --i;
        list_add_tail(&pg->list, &heap[zone][i]);
        pg += 1 << i;
    }

    map_alloc(page_to_pfn(pg), 1 << order);
    avail[zone] -= 1 << order;

    spin_unlock(&heap_lock);

    return pg;
}


/* Free 2^@order set of pages. */
void free_heap_pages(
    unsigned int zone, struct pfn_info *pg, unsigned int order)
{
    unsigned long mask;

    ASSERT(zone < NR_ZONES);
    ASSERT(order <= MAX_ORDER);

    spin_lock(&heap_lock);

    map_free(page_to_pfn(pg), 1 << order);
    avail[zone] += 1 << order;

    /* Merge chunks as far as possible. */
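    /*
     * At each order the candidate buddy is the equal-sized block at
     * page_to_pfn(pg) ^ (1 << order): the predecessor when that bit of the
     * pfn is set, the successor otherwise. Merging stops once the buddy is
     * allocated or is free but of a different order.
     */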
    while ( order < MAX_ORDER )
    {
        mask = 1 << order;

        if ( (page_to_pfn(pg) & mask) )
        {
            /* Merge with predecessor block? */
            if ( allocated_in_map(page_to_pfn(pg)-mask) ||
                 (PFN_ORDER(pg-mask) != order) )
                break;
            list_del(&(pg-mask)->list);
            pg -= mask;
        }
        else
        {
            /* Merge with successor block? */
            if ( allocated_in_map(page_to_pfn(pg)+mask) ||
                 (PFN_ORDER(pg+mask) != order) )
                break;
            list_del(&(pg+mask)->list);
        }

        order++;
    }

    PFN_ORDER(pg) = order;
    list_add_tail(&pg->list, &heap[zone][order]);

    spin_unlock(&heap_lock);
}


/*
 * Scrub all unallocated pages in all heap zones. This function is more
 * convoluted than appears necessary because we do not want to continuously
 * hold the lock or disable interrupts while scrubbing very large memory areas.
 */
void scrub_heap_pages(void)
{
    void *p;
    unsigned long pfn, flags;

    printk("Scrubbing Free RAM: ");
    watchdog_disable();

    for ( pfn = 0; pfn < (bitmap_size * 8); pfn++ )
    {
        /* Every 100MB, print a progress dot. */
        if ( (pfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
            printk(".");

        /* Quick lock-free check. */
        if ( allocated_in_map(pfn) )
            continue;

        spin_lock_irqsave(&heap_lock, flags);

        /* Re-check page status with lock held. */
        if ( !allocated_in_map(pfn) )
        {
            if ( IS_XEN_HEAP_FRAME(pfn_to_page(pfn)) )
            {
                p = page_to_virt(pfn_to_page(pfn));
                memguard_unguard_range(p, PAGE_SIZE);
                clear_page(p);
                memguard_guard_range(p, PAGE_SIZE);
            }
            else
            {
                p = map_domain_page(pfn);
                clear_page(p);
                unmap_domain_page(p);
            }
        }

        spin_unlock_irqrestore(&heap_lock, flags);
    }

    watchdog_enable();
    printk("done.\n");
}


/*************************
 * XEN-HEAP SUB-ALLOCATOR
 */

void init_xenheap_pages(physaddr_t ps, physaddr_t pe)
{
    unsigned long flags;

    ps = round_pgup(ps);
    pe = round_pgdown(pe);

    memguard_guard_range(phys_to_virt(ps), pe - ps);

    /*
     * Yuk! Ensure there is a one-page buffer between Xen and Dom zones, to
     * prevent merging of power-of-two blocks across the zone boundary.
     */
    if ( !IS_XEN_HEAP_FRAME(phys_to_page(pe)) )
        pe -= PAGE_SIZE;

    local_irq_save(flags);
    init_heap_pages(MEMZONE_XEN, phys_to_page(ps), (pe - ps) >> PAGE_SHIFT);
    local_irq_restore(flags);
}


void *alloc_xenheap_pages(unsigned int order)
{
    unsigned long flags;
    struct pfn_info *pg;
    int i;

    local_irq_save(flags);
    pg = alloc_heap_pages(MEMZONE_XEN, order);
    local_irq_restore(flags);

    if ( unlikely(pg == NULL) )
        goto no_memory;

    memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT));

    for ( i = 0; i < (1 << order); i++ )
    {
        pg[i].count_info        = 0;
        pg[i].u.inuse._domain   = 0;
        pg[i].u.inuse.type_info = 0;
    }

    return page_to_virt(pg);

 no_memory:
    printk("Cannot handle page request order %d!\n", order);
    return NULL;
}


void free_xenheap_pages(void *v, unsigned int order)
{
    unsigned long flags;

    memguard_guard_range(v, 1 << (order + PAGE_SHIFT));

    local_irq_save(flags);
    free_heap_pages(MEMZONE_XEN, virt_to_page(v), order);
    local_irq_restore(flags);
}


/*************************
 * DOMAIN-HEAP SUB-ALLOCATOR
 */

void init_domheap_pages(physaddr_t ps, physaddr_t pe)
{
    ASSERT(!in_irq());

    ps = round_pgup(ps);
    pe = round_pgdown(pe);

    init_heap_pages(MEMZONE_DOM, phys_to_page(ps), (pe - ps) >> PAGE_SHIFT);
}


struct pfn_info *alloc_domheap_pages(struct domain *d, unsigned int order)
{
    struct pfn_info *pg;
    cpumask_t mask;
    int i;

    ASSERT(!in_irq());

    if ( unlikely((pg = alloc_heap_pages(MEMZONE_DOM, order)) == NULL) )
        return NULL;
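
    /*
     * pg->u.free.cpumask records the CPUs that may still hold stale TLB
     * entries for this page from its previous owner; tlbflush_filter() is
     * expected to drop those whose TLBs have been flushed since the page's
     * timestamp, so 'mask' ends up with only the CPUs that still need it.
     */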
    mask = pg->u.free.cpumask;
    tlbflush_filter(mask, pg->tlbflush_timestamp);

    pg->count_info        = 0;
    pg->u.inuse._domain   = 0;
    pg->u.inuse.type_info = 0;

    for ( i = 1; i < (1 << order); i++ )
    {
        /* Add in any extra CPUs that need flushing because of this page. */
        cpumask_t extra_cpus_mask;
        cpus_andnot(extra_cpus_mask, pg[i].u.free.cpumask, mask);
        tlbflush_filter(extra_cpus_mask, pg[i].tlbflush_timestamp);
        cpus_or(mask, mask, extra_cpus_mask);

        pg[i].count_info        = 0;
        pg[i].u.inuse._domain   = 0;
        pg[i].u.inuse.type_info = 0;
    }

    if ( unlikely(!cpus_empty(mask)) )
    {
        perfc_incrc(need_flush_tlb_flush);
        flush_tlb_mask(mask);
    }

    if ( d == NULL )
        return pg;

    spin_lock(&d->page_alloc_lock);

    if ( unlikely(test_bit(_DOMF_dying, &d->domain_flags)) ||
         unlikely((d->tot_pages + (1 << order)) > d->max_pages) )
    {
        DPRINTK("Over-allocation for domain %u: %u > %u\n",
                d->domain_id, d->tot_pages + (1 << order), d->max_pages);
        DPRINTK("...or the domain is dying (%d)\n",
                !!test_bit(_DOMF_dying, &d->domain_flags));
        spin_unlock(&d->page_alloc_lock);
        free_heap_pages(MEMZONE_DOM, pg, order);
        return NULL;
    }

    if ( unlikely(d->tot_pages == 0) )
        get_knownalive_domain(d);

    d->tot_pages += 1 << order;

    for ( i = 0; i < (1 << order); i++ )
    {
        page_set_owner(&pg[i], d);
        wmb(); /* Domain pointer must be visible before updating refcnt. */
        pg[i].count_info |= PGC_allocated | 1;
        list_add_tail(&pg[i].list, &d->page_list);
    }

    spin_unlock(&d->page_alloc_lock);

    return pg;
}


void free_domheap_pages(struct pfn_info *pg, unsigned int order)
{
    int i, drop_dom_ref;
    struct domain *d = page_get_owner(pg);

    ASSERT(!in_irq());

    if ( unlikely(IS_XEN_HEAP_FRAME(pg)) )
    {
        /* NB. May recursively lock from relinquish_memory(). */
        spin_lock_recursive(&d->page_alloc_lock);

        for ( i = 0; i < (1 << order); i++ )
            list_del(&pg[i].list);

        d->xenheap_pages -= 1 << order;
        drop_dom_ref = (d->xenheap_pages == 0);

        spin_unlock_recursive(&d->page_alloc_lock);
    }
    else if ( likely(d != NULL) )
    {
        /* NB. May recursively lock from relinquish_memory(). */
        spin_lock_recursive(&d->page_alloc_lock);

        for ( i = 0; i < (1 << order); i++ )
        {
            shadow_drop_references(d, &pg[i]);
            ASSERT(((pg[i].u.inuse.type_info & PGT_count_mask) == 0) ||
                   shadow_tainted_refcnts(d));
            pg[i].tlbflush_timestamp = tlbflush_current_time();
            pg[i].u.free.cpumask     = d->cpumask;
            list_del(&pg[i].list);
        }

        d->tot_pages -= 1 << order;
        drop_dom_ref = (d->tot_pages == 0);

        spin_unlock_recursive(&d->page_alloc_lock);

        if ( likely(!test_bit(_DOMF_dying, &d->domain_flags)) )
        {
            free_heap_pages(MEMZONE_DOM, pg, order);
        }
        else
        {
            /*
             * Normally we expect a domain to clear pages before freeing them,
             * if it cares about the secrecy of their contents. However, after
             * a domain has died we assume responsibility for erasure.
             */
            for ( i = 0; i < (1 << order); i++ )
            {
                spin_lock(&page_scrub_lock);
                list_add(&pg[i].list, &page_scrub_list);
                spin_unlock(&page_scrub_lock);
            }
        }
    }
    else
    {
        /* Freeing an anonymous domain-heap page. */
        free_heap_pages(MEMZONE_DOM, pg, order);
        drop_dom_ref = 0;
    }

    if ( drop_dom_ref )
        put_domain(d);
}


unsigned long avail_domheap_pages(void)
{
    return avail[MEMZONE_DOM];
}


/*************************
 * PAGE SCRUBBING
 */

static void page_scrub_softirq(void)
{
    struct list_head *ent;
    struct pfn_info *pg;
    void *p;
    int i;
    s_time_t start = NOW();

    /* Aim to do 1ms of work (ten percent of a 10ms jiffy). */
    do {
        spin_lock(&page_scrub_lock);

        if ( unlikely((ent = page_scrub_list.next) == &page_scrub_list) )
        {
            spin_unlock(&page_scrub_lock);
            return;
        }

        /* Peel up to 16 pages from the list. */
        for ( i = 0; i < 16; i++ )
        {
            if ( ent->next == &page_scrub_list )
                break;
            ent = ent->next;
        }

        /* Remove peeled pages from the list. */
        ent->next->prev = &page_scrub_list;
        page_scrub_list.next = ent->next;
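        /*
         * The peeled entries now form a chain detached from the list head and
         * reachable only via 'ent'; it is walked backwards (via ->prev)
         * below, after the lock has been dropped.
         */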

        spin_unlock(&page_scrub_lock);

        /* Working backwards, scrub each page in turn. */
        while ( ent != &page_scrub_list )
        {
            pg = list_entry(ent, struct pfn_info, list);
            ent = ent->prev;
            p = map_domain_page(page_to_pfn(pg));
            clear_page(p);
            unmap_domain_page(p);
            free_heap_pages(MEMZONE_DOM, pg, 0);
        }
    } while ( (NOW() - start) < MILLISECS(1) );
}

static __init int page_scrub_init(void)
{
    open_softirq(PAGE_SCRUB_SOFTIRQ, page_scrub_softirq);
    return 0;
}
__initcall(page_scrub_init);

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */