ia64/xen-unstable

view xen/common/page_alloc.c @ 19164:de853e901b5c

Remove cpumask from page_info struct.

This makes TLB flushing on page allocation more conservative, but the
flush clock should still save us most of the time (page freeing and
alloc'ing tends to happen in batches, and not necessarily close
together). We could add some optimisations to the flush filter if this
does turn out to be a significant overhead for some (useful)
workloads.

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Feb 04 15:29:51 2009 +0000 (2009-02-04)
parents 86159a906bec
children 416197f0292b
/******************************************************************************
 * page_alloc.c
 *
 * Simple buddy heap allocator for Xen.
 *
 * Copyright (c) 2002-2004 K A Fraser
 * Copyright (c) 2006 IBM Ryan Harper <ryanh@us.ibm.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/types.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/spinlock.h>
#include <xen/mm.h>
#include <xen/irq.h>
#include <xen/softirq.h>
#include <xen/domain_page.h>
#include <xen/keyhandler.h>
#include <xen/perfc.h>
#include <xen/numa.h>
#include <xen/nodemask.h>
#include <asm/page.h>
#include <asm/numa.h>
#include <asm/flushtlb.h>

/*
 * Comma-separated list of hexadecimal page numbers containing bad bytes.
 * e.g. 'badpage=0x3f45,0x8a321'.
 */
static char opt_badpage[100] = "";
string_param("badpage", opt_badpage);

/*
 * no-bootscrub -> Free pages are not zeroed during boot.
 */
static int opt_bootscrub __initdata = 1;
boolean_param("bootscrub", opt_bootscrub);

/*
 * Bit width of the DMA heap -- used to override the NUMA-node-first
 * allocation strategy, which can otherwise exhaust low memory.
 */
static unsigned int dma_bitsize;
integer_param("dma_bits", dma_bitsize);

#define round_pgdown(_p)  ((_p)&PAGE_MASK)
#define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)

#ifndef NDEBUG
/* Avoid callers relying on allocations returning zeroed pages. */
#define scrub_page(p) memset((p), 0xc2, PAGE_SIZE)
#else
/* For a production build, clear_page() is the fastest way to scrub. */
#define scrub_page(p) clear_page(p)
#endif

static DEFINE_SPINLOCK(page_scrub_lock);
PAGE_LIST_HEAD(page_scrub_list);
static unsigned long scrub_pages;

/*********************
 * ALLOCATION BITMAP
 *  One bit per page of memory. Bit set => page is allocated.
 */

unsigned long *alloc_bitmap;
#define PAGES_PER_MAPWORD (sizeof(unsigned long) * 8)

#define allocated_in_map(_pn)                        \
({  unsigned long ___pn = (_pn);                     \
    !!(alloc_bitmap[___pn/PAGES_PER_MAPWORD] &       \
       (1UL<<(___pn&(PAGES_PER_MAPWORD-1)))); })

/*
 * Hint regarding bitwise arithmetic in map_{alloc,free}:
 *  -(1<<n) sets all bits >= n.
 *  (1<<n)-1 sets all bits < n.
 * Variable names in map_{alloc,free}:
 *  *_idx == Index into `alloc_bitmap' array.
 *  *_off == Bit offset within an element of the `alloc_bitmap' array.
 */
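
/*
 * Worked example (assuming 64-bit map words): marking pages 3..5 within a
 * single word touches bits 3, 4 and 5 only:
 *   -(1UL<<3)    == ...11111000   (all bits >= 3)
 *   (1UL<<6)-1   == ...00111111   (all bits <  6)
 *   ANDed:          ...00111000   (bits 3..5)
 * This is exactly the mask that map_alloc() ORs in for a single-word range,
 * and whose complement map_free() ANDs out.
 */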

static void map_alloc(unsigned long first_page, unsigned long nr_pages)
{
    unsigned long start_off, end_off, curr_idx, end_idx;

#ifndef NDEBUG
    unsigned long i;
    /* Check that the block isn't already allocated. */
    for ( i = 0; i < nr_pages; i++ )
        ASSERT(!allocated_in_map(first_page + i));
#endif

    curr_idx  = first_page / PAGES_PER_MAPWORD;
    start_off = first_page & (PAGES_PER_MAPWORD-1);
    end_idx   = (first_page + nr_pages) / PAGES_PER_MAPWORD;
    end_off   = (first_page + nr_pages) & (PAGES_PER_MAPWORD-1);

    if ( curr_idx == end_idx )
    {
        alloc_bitmap[curr_idx] |= ((1UL<<end_off)-1) & -(1UL<<start_off);
    }
    else
    {
        alloc_bitmap[curr_idx] |= -(1UL<<start_off);
        while ( ++curr_idx < end_idx ) alloc_bitmap[curr_idx] = ~0UL;
        alloc_bitmap[curr_idx] |= (1UL<<end_off)-1;
    }
}

static void map_free(unsigned long first_page, unsigned long nr_pages)
{
    unsigned long start_off, end_off, curr_idx, end_idx;

#ifndef NDEBUG
    unsigned long i;
    /* Check that the block isn't already freed. */
    for ( i = 0; i < nr_pages; i++ )
        ASSERT(allocated_in_map(first_page + i));
#endif

    curr_idx  = first_page / PAGES_PER_MAPWORD;
    start_off = first_page & (PAGES_PER_MAPWORD-1);
    end_idx   = (first_page + nr_pages) / PAGES_PER_MAPWORD;
    end_off   = (first_page + nr_pages) & (PAGES_PER_MAPWORD-1);

    if ( curr_idx == end_idx )
    {
        alloc_bitmap[curr_idx] &= -(1UL<<end_off) | ((1UL<<start_off)-1);
    }
    else
    {
        alloc_bitmap[curr_idx] &= (1UL<<start_off)-1;
        while ( ++curr_idx != end_idx ) alloc_bitmap[curr_idx] = 0;
        alloc_bitmap[curr_idx] &= -(1UL<<end_off);
    }
}



/*************************
 * BOOT-TIME ALLOCATOR
 */

static unsigned long first_valid_mfn = ~0UL;

/* Initialise allocator to handle up to @max_page pages. */
paddr_t __init init_boot_allocator(paddr_t bitmap_start)
{
    unsigned long bitmap_size;

    bitmap_start = round_pgup(bitmap_start);

    /*
     * Allocate space for the allocation bitmap. Include an extra longword
     * of padding for possible overrun in map_alloc and map_free.
     */
    bitmap_size  = max_page / 8;
    bitmap_size += sizeof(unsigned long);
    bitmap_size  = round_pgup(bitmap_size);
    alloc_bitmap = (unsigned long *)maddr_to_virt(bitmap_start);

    /* All allocated by default. */
    memset(alloc_bitmap, ~0, bitmap_size);

    return bitmap_start + bitmap_size;
}

void __init init_boot_pages(paddr_t ps, paddr_t pe)
{
    unsigned long bad_spfn, bad_epfn, i;
    const char *p;

    ps = round_pgup(ps);
    pe = round_pgdown(pe);
    if ( pe <= ps )
        return;

    first_valid_mfn = min_t(unsigned long, ps >> PAGE_SHIFT, first_valid_mfn);

    map_free(ps >> PAGE_SHIFT, (pe - ps) >> PAGE_SHIFT);

    /* Check new pages against the bad-page list. */
    p = opt_badpage;
    while ( *p != '\0' )
    {
        bad_spfn = simple_strtoul(p, &p, 0);
        bad_epfn = bad_spfn;

        if ( *p == '-' )
        {
            p++;
            bad_epfn = simple_strtoul(p, &p, 0);
            if ( bad_epfn < bad_spfn )
                bad_epfn = bad_spfn;
        }

        if ( *p == ',' )
            p++;
        else if ( *p != '\0' )
            break;

        if ( bad_epfn == bad_spfn )
            printk("Marking page %lx as bad\n", bad_spfn);
        else
            printk("Marking pages %lx through %lx as bad\n",
                   bad_spfn, bad_epfn);

        for ( i = bad_spfn; i <= bad_epfn; i++ )
            if ( (i < max_page) && !allocated_in_map(i) )
                map_alloc(i, 1);
    }
}

unsigned long __init alloc_boot_pages(
    unsigned long nr_pfns, unsigned long pfn_align)
{
    unsigned long pg, i;

    /* Search backwards to obtain highest available range. */
    for ( pg = (max_page - nr_pfns) & ~(pfn_align - 1);
          pg >= first_valid_mfn;
          pg = (pg + i - nr_pfns) & ~(pfn_align - 1) )
    {
        for ( i = 0; i < nr_pfns; i++ )
            if ( allocated_in_map(pg+i) )
                break;
        if ( i == nr_pfns )
        {
            map_alloc(pg, nr_pfns);
            return pg;
        }
    }

    return 0;
}
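
/*
 * Note that, because the search above runs downwards from max_page,
 * alloc_boot_pages() always returns the highest-addressed free range that
 * satisfies @nr_pfns and @pfn_align, or 0 on failure.
 */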


/*************************
 * BINARY BUDDY ALLOCATOR
 */

#define MEMZONE_XEN 0
#define NR_ZONES    (PADDR_BITS - PAGE_SHIFT)

#define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 0 : ((b) - PAGE_SHIFT - 1))
#define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN :  \
                          (fls(page_to_mfn(pg)) - 1))
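
/*
 * Worked example (with PAGE_SHIFT == 12): an allocation that must fit below
 * 2^32 bytes maps to bits_to_zone(32) == 32 - 12 - 1 == 19, and a page at
 * mfn 0x80000 (physical address 2GB) has fls(0x80000) - 1 == 19, so it sits
 * in the highest zone still usable for such 32-bit-addressable allocations.
 */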

typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
#define heap(node, zone, order) ((*_heap[node])[zone][order])

static unsigned long *avail[MAX_NUMNODES];

static DEFINE_SPINLOCK(heap_lock);
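
/*
 * init_node_heap() places the per-node heap metadata (_heap[node] and
 * avail[node]) using the first applicable strategy below: the static
 * buffers for the first node discovered, a carve-out from the node's own
 * pages when they are direct-mapped, a xenheap page allocation, or plain
 * xmalloc(). The return value is the number of the node's own pages
 * consumed as metadata (non-zero only in the carve-out case).
 */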

static unsigned long init_node_heap(int node, unsigned long mfn,
                                    unsigned long nr)
{
    /* First node to be discovered has its heap metadata statically alloced. */
    static heap_by_zone_and_order_t _heap_static;
    static unsigned long avail_static[NR_ZONES];
    static int first_node_initialised;
    unsigned long needed = (sizeof(**_heap) +
                            sizeof(**avail) * NR_ZONES +
                            PAGE_SIZE - 1) >> PAGE_SHIFT;
    int i, j;

    if ( !first_node_initialised )
    {
        _heap[node] = &_heap_static;
        avail[node] = avail_static;
        first_node_initialised = 1;
        needed = 0;
    }
#ifdef DIRECTMAP_VIRT_END
    else if ( nr >= needed &&
              mfn + needed <= virt_to_mfn(DIRECTMAP_VIRT_END) )
    {
        _heap[node] = mfn_to_virt(mfn);
        avail[node] = mfn_to_virt(mfn + needed) - sizeof(**avail) * NR_ZONES;
    }
#endif
    else if ( get_order_from_bytes(sizeof(**_heap)) ==
              get_order_from_pages(needed) )
    {
        _heap[node] = alloc_xenheap_pages(get_order_from_pages(needed), 0);
        BUG_ON(!_heap[node]);
        avail[node] = (void *)_heap[node] + (needed << PAGE_SHIFT) -
                      sizeof(**avail) * NR_ZONES;
        needed = 0;
    }
    else
    {
        _heap[node] = xmalloc(heap_by_zone_and_order_t);
        avail[node] = xmalloc_array(unsigned long, NR_ZONES);
        BUG_ON(!_heap[node] || !avail[node]);
        needed = 0;
    }

    memset(avail[node], 0, NR_ZONES * sizeof(long));

    for ( i = 0; i < NR_ZONES; i++ )
        for ( j = 0; j <= MAX_ORDER; j++ )
            INIT_PAGE_LIST_HEAD(&(*_heap[node])[i][j]);

    return needed;
}

/* Allocate 2^@order contiguous pages. */
static struct page_info *alloc_heap_pages(
    unsigned int zone_lo, unsigned int zone_hi,
    unsigned int node, unsigned int order)
{
    unsigned int i, j, zone;
    unsigned int num_nodes = num_online_nodes();
    unsigned long request = 1UL << order;
    cpumask_t extra_cpus_mask, mask;
    struct page_info *pg;

    if ( node == NUMA_NO_NODE )
        node = cpu_to_node(smp_processor_id());

    ASSERT(node >= 0);
    ASSERT(node < num_nodes);
    ASSERT(zone_lo <= zone_hi);
    ASSERT(zone_hi < NR_ZONES);

    if ( unlikely(order > MAX_ORDER) )
        return NULL;

    spin_lock(&heap_lock);

    /*
     * Start with the requested node, but exhaust all node memory in the
     * requested zone before failing. Only compute a new node value if we
     * fail to find memory in the target node; this avoids needless
     * computation on the fast path.
     */
    for ( i = 0; i < num_nodes; i++ )
    {
        zone = zone_hi;
        do {
            /* Check if target node can support the allocation. */
            if ( !avail[node] || (avail[node][zone] < request) )
                continue;

            /* Find smallest order which can satisfy the request. */
            for ( j = order; j <= MAX_ORDER; j++ )
                if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
                    goto found;
        } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */

        /* Pick next node, wrapping around if needed. */
        if ( ++node == num_nodes )
            node = 0;
    }

    /* No suitable memory blocks. Fail the request. */
    spin_unlock(&heap_lock);
    return NULL;

 found:
    /* We may have to halve the chunk a number of times. */
    while ( j != order )
    {
        PFN_ORDER(pg) = --j;
        page_list_add_tail(pg, &heap(node, zone, j));
        pg += 1 << j;
    }

    map_alloc(page_to_mfn(pg), request);
    ASSERT(avail[node][zone] >= request);
    avail[node][zone] -= request;

    spin_unlock(&heap_lock);
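
    /*
     * struct page_info no longer records which CPUs may hold stale TLB
     * entries for a page, so the loop below conservatively starts from all
     * online CPUs and relies on tlbflush_filter() and each page's
     * tlbflush_timestamp to drop CPUs whose TLBs have been flushed since
     * the page was freed.
     */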
    cpus_clear(mask);

    for ( i = 0; i < (1 << order); i++ )
    {
        /* Reference count must continuously be zero for free pages. */
        BUG_ON(pg[i].count_info != 0);

        /* Add in any extra CPUs that need flushing because of this page. */
        cpus_andnot(extra_cpus_mask, cpu_online_map, mask);
        tlbflush_filter(extra_cpus_mask, pg[i].tlbflush_timestamp);
        cpus_or(mask, mask, extra_cpus_mask);

        /* Initialise fields which have other uses for free pages. */
        pg[i].u.inuse.type_info = 0;
        page_set_owner(&pg[i], NULL);
    }

    if ( unlikely(!cpus_empty(mask)) )
    {
        perfc_incr(need_flush_tlb_flush);
        flush_tlb_mask(mask);
    }

    return pg;
}

/* Free 2^@order set of pages. */
static void free_heap_pages(
    struct page_info *pg, unsigned int order)
{
    unsigned long mask;
    unsigned int i, node = phys_to_nid(page_to_maddr(pg));
    unsigned int zone = page_to_zone(pg);

    ASSERT(order <= MAX_ORDER);
    ASSERT(node >= 0);
    ASSERT(node < num_online_nodes());

    for ( i = 0; i < (1 << order); i++ )
    {
        /*
         * Cannot assume that count_info == 0, as there are some corner cases
         * where it isn't the case and yet it isn't a bug:
         *  1. page_get_owner() is NULL
         *  2. page_get_owner() is a domain that was never accessible by
         *     its domid (e.g., failed to fully construct the domain).
         *  3. page was never addressable by the guest (e.g., it's an
         *     auto-translate-physmap guest and the page was never included
         *     in its pseudophysical address space).
         * In all the above cases there can be no guest mappings of this page.
         */
        pg[i].count_info = 0;

        /* If a page has no owner it will need no safety TLB flush. */
        pg[i].tlbflush_timestamp =
            page_get_owner(&pg[i]) ? tlbflush_current_time() : 0;
    }

    spin_lock(&heap_lock);

    map_free(page_to_mfn(pg), 1 << order);
    avail[node][zone] += 1 << order;

    /* Merge chunks as far as possible. */
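    /*
     * For illustration: freeing an order-0 page at mfn 0x1235 first looks at
     * mfn 0x1234 (mask == 1 and bit 0 of the mfn is set, so the buddy is the
     * predecessor); if that buddy is free and of the same order, the pair
     * becomes an order-1 block at 0x1234 whose buddy is 0x1236, and merging
     * continues until a buddy is allocated, of a different order, or
     * MAX_ORDER is reached.
     */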
    while ( order < MAX_ORDER )
    {
        mask = 1UL << order;

        if ( (page_to_mfn(pg) & mask) )
        {
            /* Merge with predecessor block? */
            if ( allocated_in_map(page_to_mfn(pg)-mask) ||
                 (PFN_ORDER(pg-mask) != order) )
                break;
            pg -= mask;
            page_list_del(pg, &heap(node, zone, order));
        }
        else
        {
            /* Merge with successor block? */
            if ( allocated_in_map(page_to_mfn(pg)+mask) ||
                 (PFN_ORDER(pg+mask) != order) )
                break;
            page_list_del(pg + mask, &heap(node, zone, order));
        }

        order++;

        /* After merging, pg should remain in the same node. */
        ASSERT(phys_to_nid(page_to_maddr(pg)) == node);
    }

    PFN_ORDER(pg) = order;
    page_list_add_tail(pg, &heap(node, zone, order));

    spin_unlock(&heap_lock);
}

/*
 * Hand the specified arbitrary page range to the specified heap zone,
 * checking each page's node_id against that of the previous page. If they
 * differ and the page is not on a MAX_ORDER boundary, we reserve the page
 * by not freeing it to the buddy allocator.
 */
static void init_heap_pages(
    struct page_info *pg, unsigned long nr_pages)
{
    unsigned int nid_curr, nid_prev;
    unsigned long i;

    nid_prev = phys_to_nid(page_to_maddr(pg-1));

    for ( i = 0; i < nr_pages; nid_prev = nid_curr, i++ )
    {
        nid_curr = phys_to_nid(page_to_maddr(pg+i));

        if ( unlikely(!avail[nid_curr]) )
        {
            unsigned long n;

            n = init_node_heap(nid_curr, page_to_mfn(pg+i), nr_pages - i);
            if ( n )
            {
                BUG_ON(i + n > nr_pages);
                i += n - 1;
                continue;
            }
        }

        /*
         * Free pages of the same node, or if they differ, but are on a
         * MAX_ORDER alignment boundary (which already get reserved).
         */
        if ( (nid_curr == nid_prev) ||
             !(page_to_mfn(pg+i) & ((1UL << MAX_ORDER) - 1)) )
            free_heap_pages(pg+i, 0);
        else
            printk("Reserving non-aligned node boundary @ mfn %#lx\n",
                   page_to_mfn(pg+i));
    }
}

static unsigned long avail_heap_pages(
    unsigned int zone_lo, unsigned int zone_hi, unsigned int node)
{
    unsigned int i, zone, num_nodes = num_online_nodes();
    unsigned long free_pages = 0;

    if ( zone_hi >= NR_ZONES )
        zone_hi = NR_ZONES - 1;

    for ( i = 0; i < num_nodes; i++ )
    {
        if ( !avail[i] )
            continue;
        for ( zone = zone_lo; zone <= zone_hi; zone++ )
            if ( (node == -1) || (node == i) )
                free_pages += avail[i][zone];
    }

    return free_pages;
}

#define avail_for_domheap(mfn) !(allocated_in_map(mfn) || is_xen_heap_mfn(mfn))
void __init end_boot_allocator(void)
{
    unsigned long i, nr = 0;
    int curr_free, next_free;

    /* Pages that are free now go to the domain sub-allocator. */
    if ( (curr_free = next_free = avail_for_domheap(first_valid_mfn)) )
        map_alloc(first_valid_mfn, 1);
    for ( i = first_valid_mfn; i < max_page; i++ )
    {
        curr_free = next_free;
        next_free = avail_for_domheap(i+1);
        if ( next_free )
            map_alloc(i+1, 1); /* prevent merging in free_heap_pages() */
        if ( curr_free )
            ++nr;
        else if ( nr )
        {
            init_heap_pages(mfn_to_page(i - nr), nr);
            nr = 0;
        }
    }
    if ( nr )
        init_heap_pages(mfn_to_page(i - nr), nr);

    if ( !dma_bitsize && (num_online_nodes() > 1) )
    {
#ifdef CONFIG_X86
        dma_bitsize = min_t(unsigned int,
                            fls(NODE_DATA(0)->node_spanned_pages) - 1
                            + PAGE_SHIFT - 2,
                            32);
#else
        dma_bitsize = 32;
#endif
    }
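
    /*
     * Worked example (4kB pages): a first node spanning 4GB has
     * node_spanned_pages == 0x100000, so fls() - 1 == 20 and the default
     * DMA width becomes 20 + 12 - 2 == 30 bits (roughly a quarter of that
     * node), capped at 32 bits.
     */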

    printk("Domain heap initialised");
    if ( dma_bitsize )
        printk(" DMA width %u bits", dma_bitsize);
    printk("\n");
}
#undef avail_for_domheap

/*
 * Scrub all unallocated pages in all heap zones. This function is more
 * convoluted than appears necessary because we do not want to continuously
 * hold the lock while scrubbing very large memory areas.
 */
void __init scrub_heap_pages(void)
{
    void *p;
    unsigned long mfn;

    if ( !opt_bootscrub )
        return;

    printk("Scrubbing Free RAM: ");

    for ( mfn = first_valid_mfn; mfn < max_page; mfn++ )
    {
        process_pending_timers();

        /* Quick lock-free check. */
        if ( allocated_in_map(mfn) )
            continue;

        /* Every 100MB, print a progress dot. */
        if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
            printk(".");

        spin_lock(&heap_lock);

        /* Re-check page status with lock held. */
        if ( !allocated_in_map(mfn) )
        {
            if ( is_xen_heap_mfn(mfn) )
            {
                p = page_to_virt(mfn_to_page(mfn));
                memguard_unguard_range(p, PAGE_SIZE);
                scrub_page(p);
                memguard_guard_range(p, PAGE_SIZE);
            }
            else
            {
                p = map_domain_page(mfn);
                scrub_page(p);
                unmap_domain_page(p);
            }
        }

        spin_unlock(&heap_lock);
    }

    printk("done.\n");
}



/*************************
 * XEN-HEAP SUB-ALLOCATOR
 */

#if !defined(__x86_64__) && !defined(__ia64__)
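
/*
 * In this configuration (neither x86-64 nor ia64) the Xen heap is a separate
 * zone (MEMZONE_XEN) with memguard protection around free pages. The
 * alternative implementation further below instead draws xenheap pages from
 * the domain heap and tags them with PGC_xen_heap.
 */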

void init_xenheap_pages(paddr_t ps, paddr_t pe)
{
    ps = round_pgup(ps);
    pe = round_pgdown(pe);
    if ( pe <= ps )
        return;

    memguard_guard_range(maddr_to_virt(ps), pe - ps);

    /*
     * Yuk! Ensure there is a one-page buffer between Xen and Dom zones, to
     * prevent merging of power-of-two blocks across the zone boundary.
     */
    if ( ps && !is_xen_heap_mfn(paddr_to_pfn(ps)-1) )
        ps += PAGE_SIZE;
    if ( !is_xen_heap_mfn(paddr_to_pfn(pe)) )
        pe -= PAGE_SIZE;

    init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT);
}


void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
{
    struct page_info *pg;

    ASSERT(!in_irq());

    pg = alloc_heap_pages(
        MEMZONE_XEN, MEMZONE_XEN, cpu_to_node(smp_processor_id()), order);
    if ( unlikely(pg == NULL) )
        return NULL;

    memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT));

    return page_to_virt(pg);
}


void free_xenheap_pages(void *v, unsigned int order)
{
    ASSERT(!in_irq());

    if ( v == NULL )
        return;

    memguard_guard_range(v, 1 << (order + PAGE_SHIFT));

    free_heap_pages(virt_to_page(v), order);
}

#else

void init_xenheap_pages(paddr_t ps, paddr_t pe)
{
    init_domheap_pages(ps, pe);
}

void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
{
    struct page_info *pg;
    unsigned int i;

    ASSERT(!in_irq());

    pg = alloc_domheap_pages(NULL, order, memflags);
    if ( unlikely(pg == NULL) )
        return NULL;

    for ( i = 0; i < (1u << order); i++ )
        pg[i].count_info |= PGC_xen_heap;

    return page_to_virt(pg);
}

void free_xenheap_pages(void *v, unsigned int order)
{
    struct page_info *pg;
    unsigned int i;

    ASSERT(!in_irq());

    if ( v == NULL )
        return;

    pg = virt_to_page(v);

    for ( i = 0; i < (1u << order); i++ )
        pg[i].count_info &= ~PGC_xen_heap;

    free_heap_pages(pg, order);
}

#endif



/*************************
 * DOMAIN-HEAP SUB-ALLOCATOR
 */

void init_domheap_pages(paddr_t ps, paddr_t pe)
{
    unsigned long smfn, emfn;

    ASSERT(!in_irq());

    smfn = round_pgup(ps) >> PAGE_SHIFT;
    emfn = round_pgdown(pe) >> PAGE_SHIFT;

    init_heap_pages(mfn_to_page(smfn), emfn - smfn);
}


int assign_pages(
    struct domain *d,
    struct page_info *pg,
    unsigned int order,
    unsigned int memflags)
{
    unsigned long i;

    spin_lock(&d->page_alloc_lock);

    if ( unlikely(d->is_dying) )
    {
        gdprintk(XENLOG_INFO, "Cannot assign page to domain%d -- dying.\n",
                 d->domain_id);
        goto fail;
    }

    if ( !(memflags & MEMF_no_refcount) )
    {
        if ( unlikely((d->tot_pages + (1 << order)) > d->max_pages) )
        {
            gdprintk(XENLOG_INFO, "Over-allocation for domain %u: %u > %u\n",
                     d->domain_id, d->tot_pages + (1 << order), d->max_pages);
            goto fail;
        }

        if ( unlikely(d->tot_pages == 0) )
            get_knownalive_domain(d);

        d->tot_pages += 1 << order;
    }

    for ( i = 0; i < (1 << order); i++ )
    {
        ASSERT(page_get_owner(&pg[i]) == NULL);
        ASSERT((pg[i].count_info & ~(PGC_allocated | 1)) == 0);
        page_set_owner(&pg[i], d);
        wmb(); /* Domain pointer must be visible before updating refcnt. */
        pg[i].count_info = PGC_allocated | 1;
        page_list_add_tail(&pg[i], &d->page_list);
    }

    spin_unlock(&d->page_alloc_lock);
    return 0;

 fail:
    spin_unlock(&d->page_alloc_lock);
    return -1;
}
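
/*
 * The expected calling pattern is the one used by alloc_domheap_pages()
 * below: take pages from alloc_heap_pages(), assign_pages() them to the
 * domain, and hand them back via free_heap_pages() if the assignment fails.
 */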

struct page_info *alloc_domheap_pages(
    struct domain *d, unsigned int order, unsigned int memflags)
{
    struct page_info *pg = NULL;
    unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
    unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1), dma_zone;

    ASSERT(!in_irq());

    if ( (node == NUMA_NO_NODE) && (d != NULL) )
        node = domain_to_node(d);

    bits = domain_clamp_alloc_bitsize(d, bits ? : (BITS_PER_LONG+PAGE_SHIFT));
    if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
        return NULL;
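
    /*
     * First try to satisfy the request entirely above the DMA zone so that
     * low memory is preserved; only if that fails does the second attempt
     * widen the search to every zone above MEMZONE_XEN, including the DMA
     * zone itself.
     */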
    if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
        pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order);

    if ( (pg == NULL) &&
         ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi,
                                 node, order)) == NULL) )
        return NULL;

    if ( (d != NULL) && assign_pages(d, pg, order, memflags) )
    {
        free_heap_pages(pg, order);
        return NULL;
    }

    return pg;
}

void free_domheap_pages(struct page_info *pg, unsigned int order)
{
    int i, drop_dom_ref;
    struct domain *d = page_get_owner(pg);

    ASSERT(!in_irq());

    if ( unlikely(is_xen_heap_page(pg)) )
    {
        /* NB. May recursively lock from relinquish_memory(). */
        spin_lock_recursive(&d->page_alloc_lock);

        for ( i = 0; i < (1 << order); i++ )
            page_list_del2(&pg[i], &d->xenpage_list, &d->arch.relmem_list);

        d->xenheap_pages -= 1 << order;
        drop_dom_ref = (d->xenheap_pages == 0);

        spin_unlock_recursive(&d->page_alloc_lock);
    }
    else if ( likely(d != NULL) )
    {
        /* NB. May recursively lock from relinquish_memory(). */
        spin_lock_recursive(&d->page_alloc_lock);

        for ( i = 0; i < (1 << order); i++ )
        {
            BUG_ON((pg[i].u.inuse.type_info & PGT_count_mask) != 0);
            page_list_del2(&pg[i], &d->page_list, &d->arch.relmem_list);
        }

        d->tot_pages -= 1 << order;
        drop_dom_ref = (d->tot_pages == 0);

        spin_unlock_recursive(&d->page_alloc_lock);

        if ( likely(!d->is_dying) )
        {
            free_heap_pages(pg, order);
        }
        else
        {
            /*
             * Normally we expect a domain to clear pages before freeing them,
             * if it cares about the secrecy of their contents. However, after
             * a domain has died we assume responsibility for erasure.
             */
            for ( i = 0; i < (1 << order); i++ )
            {
                page_set_owner(&pg[i], NULL);
                spin_lock(&page_scrub_lock);
                page_list_add(&pg[i], &page_scrub_list);
                scrub_pages++;
                spin_unlock(&page_scrub_lock);
            }
        }
    }
    else
    {
        /* Freeing anonymous domain-heap pages. */
        free_heap_pages(pg, order);
        drop_dom_ref = 0;
    }

    if ( drop_dom_ref )
        put_domain(d);
}

unsigned long avail_domheap_pages_region(
    unsigned int node, unsigned int min_width, unsigned int max_width)
{
    int zone_lo, zone_hi;

    zone_lo = min_width ? bits_to_zone(min_width) : (MEMZONE_XEN + 1);
    zone_lo = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_lo));

    zone_hi = max_width ? bits_to_zone(max_width) : (NR_ZONES - 1);
    zone_hi = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_hi));

    return avail_heap_pages(zone_lo, zone_hi, node);
}

unsigned long avail_domheap_pages(void)
{
    return avail_heap_pages(MEMZONE_XEN + 1,
                            NR_ZONES - 1,
                            -1);
}

static void pagealloc_keyhandler(unsigned char key)
{
    unsigned int zone = MEMZONE_XEN;
    unsigned long n, total = 0;

    printk("Physical memory information:\n");
    printk(" Xen heap: %lukB free\n",
           avail_heap_pages(zone, zone, -1) << (PAGE_SHIFT-10));

    while ( ++zone < NR_ZONES )
    {
        if ( (zone + PAGE_SHIFT) == dma_bitsize )
        {
            printk(" DMA heap: %lukB free\n", total << (PAGE_SHIFT-10));
            total = 0;
        }

        if ( (n = avail_heap_pages(zone, zone, -1)) != 0 )
        {
            total += n;
            printk(" heap[%02u]: %lukB free\n", zone, n << (PAGE_SHIFT-10));
        }
    }

    printk(" Dom heap: %lukB free\n", total << (PAGE_SHIFT-10));
}


static __init int pagealloc_keyhandler_init(void)
{
    register_keyhandler('m', pagealloc_keyhandler, "memory info");
    return 0;
}
__initcall(pagealloc_keyhandler_init);



/*************************
 * PAGE SCRUBBING
 */

static DEFINE_PER_CPU(struct timer, page_scrub_timer);

static void page_scrub_softirq(void)
{
    PAGE_LIST_HEAD(list);
    struct page_info *pg;
    void *p;
    int i;
    s_time_t start = NOW();
    static spinlock_t serialise_lock = SPIN_LOCK_UNLOCKED;

    /* free_heap_pages() does not parallelise well. Serialise this function. */
    if ( !spin_trylock(&serialise_lock) )
    {
        set_timer(&this_cpu(page_scrub_timer), NOW() + MILLISECS(1));
        return;
    }

    /* Aim to do 1ms of work every 10ms. */
    do {
        spin_lock(&page_scrub_lock);

        /* Peel up to 16 pages from the list. */
        for ( i = 0; i < 16; i++ )
        {
            if ( !(pg = page_list_remove_head(&page_scrub_list)) )
                break;
            page_list_add_tail(pg, &list);
        }

        if ( unlikely(i == 0) )
        {
            spin_unlock(&page_scrub_lock);
            goto out;
        }

        scrub_pages -= i;

        spin_unlock(&page_scrub_lock);

        /* Scrub each page in turn. */
        while ( (pg = page_list_remove_head(&list)) ) {
            p = map_domain_page(page_to_mfn(pg));
            scrub_page(p);
            unmap_domain_page(p);
            free_heap_pages(pg, 0);
        }
    } while ( (NOW() - start) < MILLISECS(1) );

    set_timer(&this_cpu(page_scrub_timer), NOW() + MILLISECS(10));

 out:
    spin_unlock(&serialise_lock);
}

static void page_scrub_timer_fn(void *unused)
{
    page_scrub_schedule_work();
}

unsigned long avail_scrub_pages(void)
{
    return scrub_pages;
}

static void dump_heap(unsigned char key)
{
    s_time_t now = NOW();
    int i, j;

    printk("'%c' pressed -> dumping heap info (now-0x%X:%08X)\n", key,
           (u32)(now>>32), (u32)now);

    for ( i = 0; i < MAX_NUMNODES; i++ )
    {
        if ( !avail[i] )
            continue;
        for ( j = 0; j < NR_ZONES; j++ )
            printk("heap[node=%d][zone=%d] -> %lu pages\n",
                   i, j, avail[i][j]);
    }
}

static __init int register_heap_trigger(void)
{
    register_keyhandler('H', dump_heap, "dump heap info");
    return 0;
}
__initcall(register_heap_trigger);


static __init int page_scrub_init(void)
{
    int cpu;
    for_each_cpu ( cpu )
        init_timer(&per_cpu(page_scrub_timer, cpu),
                   page_scrub_timer_fn, NULL, cpu);
    open_softirq(PAGE_SCRUB_SOFTIRQ, page_scrub_softirq);
    return 0;
}
__initcall(page_scrub_init);

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */