ia64/xen-unstable

view xen/common/page_alloc.c @ 19835:edfdeb150f27

Fix buildsystem to detect udev > version 124

udev removed the udevinfo symlink from versions higher than 123 and
xen's build-system could not detect if udev is in place and has the
required version.

Signed-off-by: Marc-A. Dahlhaus <mad@wol.de>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Jun 25 13:02:37 2009 +0100 (2009-06-25)
parents 822ea2bf0c54
1 /******************************************************************************
2 * page_alloc.c
3 *
4 * Simple buddy heap allocator for Xen.
5 *
6 * Copyright (c) 2002-2004 K A Fraser
7 * Copyright (c) 2006 IBM Ryan Harper <ryanh@us.ibm.com>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/init.h>
26 #include <xen/types.h>
27 #include <xen/lib.h>
28 #include <xen/sched.h>
29 #include <xen/spinlock.h>
30 #include <xen/mm.h>
31 #include <xen/irq.h>
32 #include <xen/softirq.h>
33 #include <xen/domain_page.h>
34 #include <xen/keyhandler.h>
35 #include <xen/perfc.h>
36 #include <xen/numa.h>
37 #include <xen/nodemask.h>
38 #include <xen/tmem.h>
39 #include <public/sysctl.h>
40 #include <asm/page.h>
41 #include <asm/numa.h>
42 #include <asm/flushtlb.h>
44 /*
45 * Comma-separated list of hexadecimal page numbers containing bad bytes.
46 * e.g. 'badpage=0x3f45,0x8a321'.
47 */
48 static char opt_badpage[100] = "";
49 string_param("badpage", opt_badpage);
51 /*
52 * no-bootscrub -> Free pages are not zeroed during boot.
53 */
54 static int opt_bootscrub __initdata = 1;
55 boolean_param("bootscrub", opt_bootscrub);
57 /*
58 * Bit width of the DMA heap -- used to override NUMA-node-first
59 * allocation strategy, which can otherwise exhaust low memory.
60 */
61 static unsigned int dma_bitsize;
62 integer_param("dma_bits", dma_bitsize);
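/*
 * For example (assuming 4KiB pages), booting with "dma_bits=30" makes
 * alloc_domheap_pages() first try zones above the low 1GiB and fall back to
 * the full range only if that fails, preserving low memory for DMA use.
 */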
64 #define round_pgdown(_p) ((_p)&PAGE_MASK)
65 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
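/*
 * e.g. assuming 4KiB pages (PAGE_SIZE 0x1000): round_pgdown(0x12345) is
 * 0x12000 and round_pgup(0x12345) is 0x13000.
 */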
67 #ifndef NDEBUG
68 /* Avoid callers relying on allocations returning zeroed pages. */
69 #define scrub_page(p) memset((p), 0xc2, PAGE_SIZE)
70 #else
71 /* For a production build, clear_page() is the fastest way to scrub. */
72 #define scrub_page(p) clear_page(p)
73 #endif
75 static DEFINE_SPINLOCK(page_scrub_lock);
76 PAGE_LIST_HEAD(page_scrub_list);
77 static unsigned long scrub_pages;
79 /* Offlined page list, protected by heap_lock. */
80 PAGE_LIST_HEAD(page_offlined_list);
81 /* Broken page list, protected by heap_lock. */
82 PAGE_LIST_HEAD(page_broken_list);
84 /*********************
85 * ALLOCATION BITMAP
86 * One bit per page of memory. Bit set => page is allocated.
87 */
89 unsigned long *alloc_bitmap;
90 #define PAGES_PER_MAPWORD (sizeof(unsigned long) * 8)
92 #define allocated_in_map(_pn) \
93 ({ unsigned long ___pn = (_pn); \
94 !!(alloc_bitmap[___pn/PAGES_PER_MAPWORD] & \
95 (1UL<<(___pn&(PAGES_PER_MAPWORD-1)))); })
97 /*
98 * Hint regarding bitwise arithmetic in map_{alloc,free}:
99 * -(1<<n) sets all bits >= n.
100 * (1<<n)-1 sets all bits < n.
101 * Variable names in map_{alloc,free}:
102 * *_idx == Index into `alloc_bitmap' array.
103 * *_off == Bit offset within an element of the `alloc_bitmap' array.
104 */
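/*
 * Worked example for a single 64-bit map word: with start_off 3 and
 * end_off 6, -(1UL<<3) has bits 3..63 set and (1UL<<6)-1 has bits 0..5 set,
 * so their AND marks exactly bits 3..5 -- i.e. pages 3, 4 and 5 of that word.
 */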
106 static void map_alloc(unsigned long first_page, unsigned long nr_pages)
107 {
108 unsigned long start_off, end_off, curr_idx, end_idx;
110 #ifndef NDEBUG
111 unsigned long i;
112 /* Check that the block isn't already allocated. */
113 for ( i = 0; i < nr_pages; i++ )
114 ASSERT(!allocated_in_map(first_page + i));
115 #endif
117 curr_idx = first_page / PAGES_PER_MAPWORD;
118 start_off = first_page & (PAGES_PER_MAPWORD-1);
119 end_idx = (first_page + nr_pages) / PAGES_PER_MAPWORD;
120 end_off = (first_page + nr_pages) & (PAGES_PER_MAPWORD-1);
122 if ( curr_idx == end_idx )
123 {
124 alloc_bitmap[curr_idx] |= ((1UL<<end_off)-1) & -(1UL<<start_off);
125 }
126 else
127 {
128 alloc_bitmap[curr_idx] |= -(1UL<<start_off);
129 while ( ++curr_idx < end_idx ) alloc_bitmap[curr_idx] = ~0UL;
130 alloc_bitmap[curr_idx] |= (1UL<<end_off)-1;
131 }
132 }
134 static void map_free(unsigned long first_page, unsigned long nr_pages)
135 {
136 unsigned long start_off, end_off, curr_idx, end_idx;
138 #ifndef NDEBUG
139 unsigned long i;
140 /* Check that the block isn't already freed. */
141 for ( i = 0; i < nr_pages; i++ )
142 ASSERT(allocated_in_map(first_page + i));
143 #endif
145 curr_idx = first_page / PAGES_PER_MAPWORD;
146 start_off = first_page & (PAGES_PER_MAPWORD-1);
147 end_idx = (first_page + nr_pages) / PAGES_PER_MAPWORD;
148 end_off = (first_page + nr_pages) & (PAGES_PER_MAPWORD-1);
150 if ( curr_idx == end_idx )
151 {
152 alloc_bitmap[curr_idx] &= -(1UL<<end_off) | ((1UL<<start_off)-1);
153 }
154 else
155 {
156 alloc_bitmap[curr_idx] &= (1UL<<start_off)-1;
157 while ( ++curr_idx != end_idx ) alloc_bitmap[curr_idx] = 0;
158 alloc_bitmap[curr_idx] &= -(1UL<<end_off);
159 }
160 }
164 /*************************
165 * BOOT-TIME ALLOCATOR
166 */
168 static unsigned long first_valid_mfn = ~0UL;
170 /* Initialise allocator to handle up to @max_page pages. */
171 paddr_t __init init_boot_allocator(paddr_t bitmap_start)
172 {
173 unsigned long bitmap_size;
175 bitmap_start = round_pgup(bitmap_start);
177 /*
178 * Allocate space for the allocation bitmap. Include an extra longword
179 * of padding for possible overrun in map_alloc and map_free.
180 */
181 bitmap_size = max_page / 8;
182 bitmap_size += sizeof(unsigned long);
183 bitmap_size = round_pgup(bitmap_size);
184 alloc_bitmap = (unsigned long *)maddr_to_virt(bitmap_start);
186 /* All allocated by default. */
187 memset(alloc_bitmap, ~0, bitmap_size);
189 return bitmap_start + bitmap_size;
190 }
192 void __init init_boot_pages(paddr_t ps, paddr_t pe)
193 {
194 unsigned long bad_spfn, bad_epfn, i;
195 const char *p;
197 ps = round_pgup(ps);
198 pe = round_pgdown(pe);
199 if ( pe <= ps )
200 return;
202 first_valid_mfn = min_t(unsigned long, ps >> PAGE_SHIFT, first_valid_mfn);
204 map_free(ps >> PAGE_SHIFT, (pe - ps) >> PAGE_SHIFT);
206 /* Check new pages against the bad-page list. */
207 p = opt_badpage;
208 while ( *p != '\0' )
209 {
210 bad_spfn = simple_strtoul(p, &p, 0);
211 bad_epfn = bad_spfn;
213 if ( *p == '-' )
214 {
215 p++;
216 bad_epfn = simple_strtoul(p, &p, 0);
217 if ( bad_epfn < bad_spfn )
218 bad_epfn = bad_spfn;
219 }
221 if ( *p == ',' )
222 p++;
223 else if ( *p != '\0' )
224 break;
226 if ( bad_epfn == bad_spfn )
227 printk("Marking page %lx as bad\n", bad_spfn);
228 else
229 printk("Marking pages %lx through %lx as bad\n",
230 bad_spfn, bad_epfn);
232 for ( i = bad_spfn; i <= bad_epfn; i++ )
233 if ( (i < max_page) && !allocated_in_map(i) )
234 map_alloc(i, 1);
235 }
236 }
238 unsigned long __init alloc_boot_pages(
239 unsigned long nr_pfns, unsigned long pfn_align)
240 {
241 unsigned long pg, i;
243 /* Search backwards to obtain highest available range. */
244 for ( pg = (max_page - nr_pfns) & ~(pfn_align - 1);
245 pg >= first_valid_mfn;
246 pg = (pg + i - nr_pfns) & ~(pfn_align - 1) )
247 {
248 for ( i = 0; i < nr_pfns; i++ )
249 if ( allocated_in_map(pg+i) )
250 break;
251 if ( i == nr_pfns )
252 {
253 map_alloc(pg, nr_pfns);
254 return pg;
255 }
256 }
258 return 0;
259 }
263 /*************************
264 * BINARY BUDDY ALLOCATOR
265 */
267 #define MEMZONE_XEN 0
268 #define NR_ZONES (PADDR_BITS - PAGE_SHIFT)
270 #define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 0 : ((b) - PAGE_SHIFT - 1))
271 #define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN : \
272 (fls(page_to_mfn(pg)) - 1))
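/*
 * e.g. assuming 4KiB pages (PAGE_SHIFT 12): bits_to_zone(32) is 19, and zone
 * 19 holds pages whose MFN's top set bit is bit 19, so a 32-bit-limited
 * allocation (zones 1..19) stays below the 4GiB machine-address boundary.
 */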
274 typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
275 static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
276 #define heap(node, zone, order) ((*_heap[node])[zone][order])
278 static unsigned long *avail[MAX_NUMNODES];
280 static DEFINE_SPINLOCK(heap_lock);
282 static unsigned long init_node_heap(int node, unsigned long mfn,
283 unsigned long nr)
284 {
285 /* First node to be discovered has its heap metadata statically alloced. */
286 static heap_by_zone_and_order_t _heap_static;
287 static unsigned long avail_static[NR_ZONES];
288 static int first_node_initialised;
289 unsigned long needed = (sizeof(**_heap) +
290 sizeof(**avail) * NR_ZONES +
291 PAGE_SIZE - 1) >> PAGE_SHIFT;
292 int i, j;
294 if ( !first_node_initialised )
295 {
296 _heap[node] = &_heap_static;
297 avail[node] = avail_static;
298 first_node_initialised = 1;
299 needed = 0;
300 }
301 #ifdef DIRECTMAP_VIRT_END
302 else if ( nr >= needed &&
303 (mfn + needed) <= (virt_to_mfn(DIRECTMAP_VIRT_END - 1) + 1) )
304 {
305 _heap[node] = mfn_to_virt(mfn);
306 avail[node] = mfn_to_virt(mfn + needed - 1) +
307 PAGE_SIZE - sizeof(**avail) * NR_ZONES;
308 }
309 #endif
310 else if ( get_order_from_bytes(sizeof(**_heap)) ==
311 get_order_from_pages(needed) )
312 {
313 _heap[node] = alloc_xenheap_pages(get_order_from_pages(needed), 0);
314 BUG_ON(!_heap[node]);
315 avail[node] = (void *)_heap[node] + (needed << PAGE_SHIFT) -
316 sizeof(**avail) * NR_ZONES;
317 needed = 0;
318 }
319 else
320 {
321 _heap[node] = xmalloc(heap_by_zone_and_order_t);
322 avail[node] = xmalloc_array(unsigned long, NR_ZONES);
323 BUG_ON(!_heap[node] || !avail[node]);
324 needed = 0;
325 }
327 memset(avail[node], 0, NR_ZONES * sizeof(long));
329 for ( i = 0; i < NR_ZONES; i++ )
330 for ( j = 0; j <= MAX_ORDER; j++ )
331 INIT_PAGE_LIST_HEAD(&(*_heap[node])[i][j]);
333 return needed;
334 }
336 /* Allocate 2^@order contiguous pages. */
337 static struct page_info *alloc_heap_pages(
338 unsigned int zone_lo, unsigned int zone_hi,
339 unsigned int node, unsigned int order, unsigned int memflags)
340 {
341 unsigned int i, j, zone = 0;
342 unsigned int num_nodes = num_online_nodes();
343 unsigned long request = 1UL << order;
344 cpumask_t extra_cpus_mask, mask;
345 struct page_info *pg;
347 if ( node == NUMA_NO_NODE )
348 node = cpu_to_node(smp_processor_id());
350 ASSERT(node >= 0);
351 ASSERT(node < num_nodes);
352 ASSERT(zone_lo <= zone_hi);
353 ASSERT(zone_hi < NR_ZONES);
355 if ( unlikely(order > MAX_ORDER) )
356 return NULL;
358 spin_lock(&heap_lock);
360 /*
361 * Start with the requested node, but exhaust all of that node's memory in
362 * the requested zone before failing. Only compute a new node value if no
363 * memory is found in the target node; this avoids work on the fast path.
364 */
365 for ( i = 0; i < num_nodes; i++ )
366 {
367 zone = zone_hi;
368 do {
369 /* Check if target node can support the allocation. */
370 if ( !avail[node] || (avail[node][zone] < request) )
371 continue;
373 /* Find smallest order which can satisfy the request. */
374 for ( j = order; j <= MAX_ORDER; j++ )
375 if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
376 goto found;
377 } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
379 /* Pick next node, wrapping around if needed. */
380 if ( ++node == num_nodes )
381 node = 0;
382 }
384 /* Try to free memory from tmem */
385 if ( (pg = tmem_relinquish_pages(order,memflags)) != NULL )
386 {
387 /* reassigning an already allocated anonymous heap page */
388 spin_unlock(&heap_lock);
389 return pg;
390 }
392 /* No suitable memory blocks. Fail the request. */
393 spin_unlock(&heap_lock);
394 return NULL;
396 found:
397 /* We may have to halve the chunk a number of times. */
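/*
 * e.g. an order-0 request satisfied from an order-2 chunk returns an
 * order-1 buddy and then an order-0 buddy to the free lists, leaving the
 * final order-0 block to be allocated below.
 */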
398 while ( j != order )
399 {
400 PFN_ORDER(pg) = --j;
401 page_list_add_tail(pg, &heap(node, zone, j));
402 pg += 1 << j;
403 }
405 map_alloc(page_to_mfn(pg), request);
406 ASSERT(avail[node][zone] >= request);
407 avail[node][zone] -= request;
409 spin_unlock(&heap_lock);
411 cpus_clear(mask);
413 for ( i = 0; i < (1 << order); i++ )
414 {
415 /* Reference count must continuously be zero for free pages. */
416 BUG_ON(pg[i].count_info != 0);
418 if ( pg[i].u.free.need_tlbflush )
419 {
420 /* Add in extra CPUs that need flushing because of this page. */
421 cpus_andnot(extra_cpus_mask, cpu_online_map, mask);
422 tlbflush_filter(extra_cpus_mask, pg[i].tlbflush_timestamp);
423 cpus_or(mask, mask, extra_cpus_mask);
424 }
426 /* Initialise fields which have other uses for free pages. */
427 pg[i].u.inuse.type_info = 0;
428 page_set_owner(&pg[i], NULL);
429 }
431 if ( unlikely(!cpus_empty(mask)) )
432 {
433 perfc_incr(need_flush_tlb_flush);
434 flush_tlb_mask(&mask);
435 }
437 return pg;
438 }
440 /* Remove any offlined page in the buddy pointed to by head. */
441 static int reserve_offlined_page(struct page_info *head)
442 {
443 unsigned int node = phys_to_nid(page_to_maddr(head));
444 int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
445 struct page_info *cur_head;
446 int cur_order;
448 ASSERT(spin_is_locked(&heap_lock));
450 cur_head = head;
452 page_list_del(head, &heap(node, zone, head_order));
454 while ( cur_head < (head + (1 << head_order)) )
455 {
456 struct page_info *pg;
457 int next_order;
459 if ( test_bit(_PGC_offlined, &cur_head->count_info) )
460 {
461 cur_head++;
462 continue;
463 }
465 next_order = cur_order = 0;
467 while ( cur_order < head_order )
468 {
469 next_order = cur_order + 1;
471 if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order)) )
472 goto merge;
474 for ( i = (1 << cur_order), pg = cur_head + (1 << cur_order );
475 i < (1 << next_order);
476 i++, pg++ )
477 if ( test_bit(_PGC_offlined, &pg->count_info) )
478 break;
479 if ( i == ( 1 << next_order) )
480 {
481 cur_order = next_order;
482 continue;
483 }
484 else
485 {
486 merge:
487 /* We don't consider merging outside the head_order. */
488 page_list_add_tail(cur_head, &heap(node, zone, cur_order));
489 PFN_ORDER(cur_head) = cur_order;
490 cur_head += (1 << cur_order);
491 break;
492 }
493 }
494 }
496 for ( cur_head = head; cur_head < head + ( 1UL << head_order); cur_head++ )
497 {
498 if ( !test_bit(_PGC_offlined, &cur_head->count_info) )
499 continue;
501 avail[node][zone]--;
503 map_alloc(page_to_mfn(cur_head), 1);
505 page_list_add_tail(cur_head,
506 test_bit(_PGC_broken, &cur_head->count_info) ?
507 &page_broken_list : &page_offlined_list);
509 count++;
510 }
512 return count;
513 }
515 /* Free 2^@order set of pages. */
516 static void free_heap_pages(
517 struct page_info *pg, unsigned int order)
518 {
519 unsigned long mask;
520 unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
521 unsigned int zone = page_to_zone(pg);
523 ASSERT(order <= MAX_ORDER);
524 ASSERT(node >= 0);
525 ASSERT(node < num_online_nodes());
527 for ( i = 0; i < (1 << order); i++ )
528 {
529 /*
530 * Cannot assume that count_info == 0, as there are some corner cases
531 * where it isn't the case and yet it isn't a bug:
532 * 1. page_get_owner() is NULL
533 * 2. page_get_owner() is a domain that was never accessible by
534 * its domid (e.g., failed to fully construct the domain).
535 * 3. page was never addressable by the guest (e.g., it's an
536 * auto-translate-physmap guest and the page was never included
537 * in its pseudophysical address space).
538 * In all the above cases there can be no guest mappings of this page.
539 */
540 ASSERT(!(pg[i].count_info & PGC_offlined));
541 pg[i].count_info &= PGC_offlining | PGC_broken;
542 if ( pg[i].count_info & PGC_offlining )
543 {
544 pg[i].count_info &= ~PGC_offlining;
545 pg[i].count_info |= PGC_offlined;
546 tainted = 1;
547 }
549 /* If a page has no owner it will need no safety TLB flush. */
550 pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
551 if ( pg[i].u.free.need_tlbflush )
552 pg[i].tlbflush_timestamp = tlbflush_current_time();
553 }
555 spin_lock(&heap_lock);
557 map_free(page_to_mfn(pg), 1 << order);
558 avail[node][zone] += 1 << order;
560 /* Merge chunks as far as possible. */
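/*
 * A free block of order o can only merge with the buddy whose MFN differs
 * from its own in bit o alone, so merged blocks stay naturally aligned.
 */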
561 while ( order < MAX_ORDER )
562 {
563 mask = 1UL << order;
565 if ( (page_to_mfn(pg) & mask) )
566 {
567 /* Merge with predecessor block? */
568 if ( allocated_in_map(page_to_mfn(pg)-mask) ||
569 (PFN_ORDER(pg-mask) != order) )
570 break;
571 pg -= mask;
572 page_list_del(pg, &heap(node, zone, order));
573 }
574 else
575 {
576 /* Merge with successor block? */
577 if ( allocated_in_map(page_to_mfn(pg)+mask) ||
578 (PFN_ORDER(pg+mask) != order) )
579 break;
580 page_list_del(pg + mask, &heap(node, zone, order));
581 }
583 order++;
585 /* After merging, pg should remain in the same node. */
586 ASSERT(phys_to_nid(page_to_maddr(pg)) == node);
587 }
589 PFN_ORDER(pg) = order;
590 page_list_add_tail(pg, &heap(node, zone, order));
592 if ( tainted )
593 reserve_offlined_page(pg);
595 spin_unlock(&heap_lock);
596 }
599 /*
600 * A page may be in one of the following states:
601 * free and online; free and offlined; free and offlined and broken;
602 * assigned and online; assigned and offlining; assigned and offlining and broken.
603 *
604 * The following rules apply to page offlining:
605 * Once a page is broken, it can never be assigned again.
606 * A page is moved to the offlined state only while it is free.
607 *
608 * Returns the original count_info.
609 */
610 static unsigned long mark_page_offline(struct page_info *pg, int broken)
611 {
612 unsigned long nx, x, y = pg->count_info;
614 ASSERT(page_is_ram_type(page_to_mfn(pg), RAM_TYPE_CONVENTIONAL));
615 ASSERT(spin_is_locked(&heap_lock));
617 do {
618 nx = x = y;
620 if ( ((x & PGC_offlined_broken) == PGC_offlined_broken) )
621 return y;
623 if ( x & PGC_offlined )
624 {
625 /* PGC_offlined means it is a free page. */
626 if ( broken && !(nx & PGC_broken) )
627 nx |= PGC_broken;
628 else
629 return y;
630 }
631 else
632 {
633 /* The page is not offlined and not a reserved page. */
634 nx |= (allocated_in_map(page_to_mfn(pg)) ?
635 PGC_offlining : PGC_offlined);
636 }
638 if ( broken )
639 nx |= PGC_broken;
640 } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
642 return y;
643 }
645 static int reserve_heap_page(struct page_info *pg)
646 {
647 struct page_info *head = NULL;
648 unsigned int i, node = phys_to_nid(page_to_maddr(pg));
649 unsigned int zone = page_to_zone(pg);
651 for ( i = 0; i <= MAX_ORDER; i++ )
652 {
653 struct page_info *tmp;
655 if ( page_list_empty(&heap(node, zone, i)) )
656 continue;
658 page_list_for_each_safe ( head, tmp, &heap(node, zone, i) )
659 {
660 if ( (head <= pg) &&
661 (head + (1UL << i) > pg) )
662 return reserve_offlined_page(head);
663 }
664 }
666 return -EINVAL;
668 }
670 int offline_page(unsigned long mfn, int broken, uint32_t *status)
671 {
672 unsigned long old_info = 0;
673 struct domain *owner;
674 int ret = 0;
675 struct page_info *pg;
677 if ( mfn > max_page )
678 {
679 dprintk(XENLOG_WARNING,
680 "try to offline page out of range %lx\n", mfn);
681 return -EINVAL;
682 }
684 *status = 0;
685 pg = mfn_to_page(mfn);
687 #if defined(__x86_64__)
688 /* Xen's txt mfn in x86_64 is reserved in e820 */
689 if ( is_xen_fixed_mfn(mfn) )
690 #elif defined(__i386__)
691 if ( is_xen_heap_mfn(mfn) )
692 #endif
693 {
694 *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED |
695 (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
696 return -EPERM;
697 }
699 /*
700 * N.B. Xen's TXT range on x86_64 is marked reserved and already handled.
701 * The kexec range is reserved as well.
702 */
703 if ( !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
704 {
705 *status = PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM;
706 return -EINVAL;
707 }
709 spin_lock(&heap_lock);
711 old_info = mark_page_offline(pg, broken);
713 if ( !allocated_in_map(mfn) )
714 {
715 /* Free pages are reserved directly. */
716 reserve_heap_page(pg);
717 *status = PG_OFFLINE_OFFLINED;
718 }
719 else if ( test_bit(_PGC_offlined, &pg->count_info) )
720 {
721 *status = PG_OFFLINE_OFFLINED;
722 }
723 else if ( (owner = page_get_owner_and_reference(pg)) )
724 {
725 *status = PG_OFFLINE_OWNED | PG_OFFLINE_PENDING |
726 (owner->domain_id << PG_OFFLINE_OWNER_SHIFT);
727 /* Release the reference since it will not be allocated anymore */
728 put_page(pg);
729 }
730 else if ( old_info & PGC_xen_heap)
731 {
732 *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_PENDING |
733 (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
734 }
735 else
736 {
737 /*
738 * assign_pages does not hold heap_lock, so there is a small window in which
739 * the owner may still be set later. Note that the owner can only change from
740 * NULL to non-NULL, never the reverse, since the page is now offlining.
741 * There is no such window if called from the #MC handler, since all CPUs are
742 * in softirq context. If called from user space (e.g. CE handling), tools
743 * can simply wait a while and call again.
744 */
745 *status = PG_OFFLINE_ANONYMOUS | PG_OFFLINE_FAILED |
746 (DOMID_INVALID << PG_OFFLINE_OWNER_SHIFT );
747 }
749 if ( broken )
750 *status |= PG_OFFLINE_BROKEN;
752 spin_unlock(&heap_lock);
754 return ret;
755 }
757 /*
758 * Online the memory.
759 * The caller should make sure end_pfn <= max_page,
760 * if not, expand_pages() should be called prior to online_page().
761 */
762 unsigned int online_page(unsigned long mfn, uint32_t *status)
763 {
764 struct page_info *pg;
765 int ret = 0, free = 0;
767 if ( mfn > max_page )
768 {
769 dprintk(XENLOG_WARNING, "call expand_pages() first\n");
770 return -EINVAL;
771 }
773 pg = mfn_to_page(mfn);
775 *status = 0;
777 spin_lock(&heap_lock);
779 if ( unlikely(is_page_broken(pg)) )
780 {
781 ret = -EINVAL;
782 *status = PG_ONLINE_FAILED |PG_ONLINE_BROKEN;
783 }
784 else if ( pg->count_info & PGC_offlined )
785 {
786 clear_bit(_PGC_offlined, &pg->count_info);
787 page_list_del(pg, &page_offlined_list);
788 *status = PG_ONLINE_ONLINED;
789 free = 1;
790 }
791 else if ( pg->count_info & PGC_offlining )
792 {
793 clear_bit(_PGC_offlining, &pg->count_info);
794 *status = PG_ONLINE_ONLINED;
795 }
796 spin_unlock(&heap_lock);
798 if ( free )
799 free_heap_pages(pg, 0);
801 return ret;
802 }
804 int query_page_offline(unsigned long mfn, uint32_t *status)
805 {
806 struct page_info *pg;
808 if ( (mfn > max_page) || !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
809 {
810 dprintk(XENLOG_WARNING, "call expand_pages() first\n");
811 return -EINVAL;
812 }
814 *status = 0;
815 spin_lock(&heap_lock);
817 pg = mfn_to_page(mfn);
819 if (pg->count_info & PGC_offlining)
820 *status |= PG_OFFLINE_STATUS_OFFLINE_PENDING;
821 if (pg->count_info & PGC_broken)
822 *status |= PG_OFFLINE_STATUS_BROKEN;
823 if (pg->count_info & PGC_offlined)
824 *status |= PG_OFFLINE_STATUS_OFFLINED;
826 spin_unlock(&heap_lock);
828 return 0;
829 }
831 /*
832 * Hand the specified arbitrary page range to the specified heap zone
833 * checking the node_id of the previous page. If they differ and the
834 * latter is not on a MAX_ORDER boundary, then we reserve the page by
835 * not freeing it to the buddy allocator.
836 */
837 static void init_heap_pages(
838 struct page_info *pg, unsigned long nr_pages)
839 {
840 unsigned int nid_curr, nid_prev;
841 unsigned long i;
843 nid_prev = phys_to_nid(page_to_maddr(pg-1));
845 for ( i = 0; i < nr_pages; nid_prev = nid_curr, i++ )
846 {
847 nid_curr = phys_to_nid(page_to_maddr(pg+i));
849 if ( unlikely(!avail[nid_curr]) )
850 {
851 unsigned long n;
853 n = init_node_heap(nid_curr, page_to_mfn(pg+i), nr_pages - i);
854 if ( n )
855 {
856 BUG_ON(i + n > nr_pages);
857 i += n - 1;
858 continue;
859 }
860 }
862 /*
863 * Free pages on the same node as the previous page, or, if the nodes
864 * differ, pages on a MAX_ORDER boundary (across which buddies never merge).
865 */
866 if ( (nid_curr == nid_prev) ||
867 !(page_to_mfn(pg+i) & ((1UL << MAX_ORDER) - 1)) )
868 free_heap_pages(pg+i, 0);
869 else
870 printk("Reserving non-aligned node boundary @ mfn %#lx\n",
871 page_to_mfn(pg+i));
872 }
873 }
875 static unsigned long avail_heap_pages(
876 unsigned int zone_lo, unsigned int zone_hi, unsigned int node)
877 {
878 unsigned int i, zone, num_nodes = num_online_nodes();
879 unsigned long free_pages = 0;
881 if ( zone_hi >= NR_ZONES )
882 zone_hi = NR_ZONES - 1;
884 for ( i = 0; i < num_nodes; i++ )
885 {
886 if ( !avail[i] )
887 continue;
888 for ( zone = zone_lo; zone <= zone_hi; zone++ )
889 if ( (node == -1) || (node == i) )
890 free_pages += avail[i][zone];
891 }
893 return free_pages;
894 }
896 #define avail_for_domheap(mfn) !(allocated_in_map(mfn) || is_xen_heap_mfn(mfn))
897 void __init end_boot_allocator(void)
898 {
899 unsigned long i, nr = 0;
900 int curr_free, next_free;
902 /* Pages that are free now go to the domain sub-allocator. */
903 if ( (curr_free = next_free = avail_for_domheap(first_valid_mfn)) )
904 map_alloc(first_valid_mfn, 1);
905 for ( i = first_valid_mfn; i < max_page; i++ )
906 {
907 curr_free = next_free;
908 next_free = avail_for_domheap(i+1);
909 if ( next_free )
910 map_alloc(i+1, 1); /* prevent merging in free_heap_pages() */
911 if ( curr_free )
912 ++nr;
913 else if ( nr )
914 {
915 init_heap_pages(mfn_to_page(i - nr), nr);
916 nr = 0;
917 }
918 }
919 if ( nr )
920 init_heap_pages(mfn_to_page(i - nr), nr);
922 if ( !dma_bitsize && (num_online_nodes() > 1) )
923 {
924 #ifdef CONFIG_X86
925 dma_bitsize = min_t(unsigned int,
926 fls(NODE_DATA(0)->node_spanned_pages) - 1
927 + PAGE_SHIFT - 2,
928 32);
929 #else
930 dma_bitsize = 32;
931 #endif
932 }
934 printk("Domain heap initialised");
935 if ( dma_bitsize )
936 printk(" DMA width %u bits", dma_bitsize);
937 printk("\n");
938 }
939 #undef avail_for_domheap
941 /*
942 * Scrub all unallocated pages in all heap zones. This function is more
943 * convoluted than appears necessary because we do not want to continuously
944 * hold the lock while scrubbing very large memory areas.
945 */
946 void __init scrub_heap_pages(void)
947 {
948 void *p;
949 unsigned long mfn;
951 if ( !opt_bootscrub )
952 return;
954 printk("Scrubbing Free RAM: ");
956 for ( mfn = first_valid_mfn; mfn < max_page; mfn++ )
957 {
958 process_pending_timers();
960 /* Quick lock-free check. */
961 if ( allocated_in_map(mfn) )
962 continue;
964 /* Every 100MB, print a progress dot. */
965 if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
966 printk(".");
968 spin_lock(&heap_lock);
970 /* Re-check page status with lock held. */
971 if ( !allocated_in_map(mfn) )
972 {
973 if ( is_xen_heap_mfn(mfn) )
974 {
975 p = page_to_virt(mfn_to_page(mfn));
976 memguard_unguard_range(p, PAGE_SIZE);
977 scrub_page(p);
978 memguard_guard_range(p, PAGE_SIZE);
979 }
980 else
981 {
982 p = map_domain_page(mfn);
983 scrub_page(p);
984 unmap_domain_page(p);
985 }
986 }
988 spin_unlock(&heap_lock);
989 }
991 printk("done.\n");
992 }
996 /*************************
997 * XEN-HEAP SUB-ALLOCATOR
998 */
1000 #if !defined(__x86_64__) && !defined(__ia64__)
1002 void init_xenheap_pages(paddr_t ps, paddr_t pe)
1003 {
1004 ps = round_pgup(ps);
1005 pe = round_pgdown(pe);
1006 if ( pe <= ps )
1007 return;
1009 memguard_guard_range(maddr_to_virt(ps), pe - ps);
1011 /*
1012 * Yuk! Ensure there is a one-page buffer between Xen and Dom zones, to
1013 * prevent merging of power-of-two blocks across the zone boundary.
1014 */
1015 if ( ps && !is_xen_heap_mfn(paddr_to_pfn(ps)-1) )
1016 ps += PAGE_SIZE;
1017 if ( !is_xen_heap_mfn(paddr_to_pfn(pe)) )
1018 pe -= PAGE_SIZE;
1020 init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT);
1021 }
1024 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
1025 {
1026 struct page_info *pg;
1028 ASSERT(!in_irq());
1030 pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
1031 cpu_to_node(smp_processor_id()), order, memflags);
1032 if ( unlikely(pg == NULL) )
1033 return NULL;
1035 memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT));
1037 return page_to_virt(pg);
1038 }
1041 void free_xenheap_pages(void *v, unsigned int order)
1042 {
1043 ASSERT(!in_irq());
1045 if ( v == NULL )
1046 return;
1048 memguard_guard_range(v, 1 << (order + PAGE_SHIFT));
1050 free_heap_pages(virt_to_page(v), order);
1051 }
1053 #else
1055 void init_xenheap_pages(paddr_t ps, paddr_t pe)
1056 {
1057 init_domheap_pages(ps, pe);
1058 }
1060 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
1061 {
1062 struct page_info *pg;
1063 unsigned int i;
1065 ASSERT(!in_irq());
1067 pg = alloc_domheap_pages(NULL, order, memflags);
1068 if ( unlikely(pg == NULL) )
1069 return NULL;
1071 for ( i = 0; i < (1u << order); i++ )
1072 pg[i].count_info |= PGC_xen_heap;
1074 return page_to_virt(pg);
1075 }
1077 void free_xenheap_pages(void *v, unsigned int order)
1078 {
1079 struct page_info *pg;
1080 unsigned int i;
1082 ASSERT(!in_irq());
1084 if ( v == NULL )
1085 return;
1087 pg = virt_to_page(v);
1089 for ( i = 0; i < (1u << order); i++ )
1090 pg[i].count_info &= ~PGC_xen_heap;
1092 free_heap_pages(pg, order);
1093 }
1095 #endif
1099 /*************************
1100 * DOMAIN-HEAP SUB-ALLOCATOR
1101 */
1103 void init_domheap_pages(paddr_t ps, paddr_t pe)
1104 {
1105 unsigned long smfn, emfn;
1107 ASSERT(!in_irq());
1109 smfn = round_pgup(ps) >> PAGE_SHIFT;
1110 emfn = round_pgdown(pe) >> PAGE_SHIFT;
1112 init_heap_pages(mfn_to_page(smfn), emfn - smfn);
1113 }
1116 int assign_pages(
1117 struct domain *d,
1118 struct page_info *pg,
1119 unsigned int order,
1120 unsigned int memflags)
1121 {
1122 unsigned long i;
1124 spin_lock(&d->page_alloc_lock);
1126 if ( unlikely(d->is_dying) )
1127 {
1128 gdprintk(XENLOG_INFO, "Cannot assign page to domain%d -- dying.\n",
1129 d->domain_id);
1130 goto fail;
1131 }
1133 if ( !(memflags & MEMF_no_refcount) )
1134 {
1135 if ( unlikely((d->tot_pages + (1 << order)) > d->max_pages) )
1136 {
1137 gdprintk(XENLOG_INFO, "Over-allocation for domain %u: %u > %u\n",
1138 d->domain_id, d->tot_pages + (1 << order), d->max_pages);
1139 goto fail;
1140 }
1142 if ( unlikely(d->tot_pages == 0) )
1143 get_knownalive_domain(d);
1145 d->tot_pages += 1 << order;
1146 }
1148 for ( i = 0; i < (1 << order); i++ )
1149 {
1150 ASSERT(page_get_owner(&pg[i]) == NULL);
1151 ASSERT((pg[i].count_info & ~(PGC_allocated | 1)) == 0);
1152 page_set_owner(&pg[i], d);
1153 wmb(); /* Domain pointer must be visible before updating refcnt. */
1154 pg[i].count_info = PGC_allocated | 1;
1155 page_list_add_tail(&pg[i], &d->page_list);
1156 }
1158 spin_unlock(&d->page_alloc_lock);
1159 return 0;
1161 fail:
1162 spin_unlock(&d->page_alloc_lock);
1163 return -1;
1164 }
1167 struct page_info *alloc_domheap_pages(
1168 struct domain *d, unsigned int order, unsigned int memflags)
1169 {
1170 struct page_info *pg = NULL;
1171 unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
1172 unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1), dma_zone;
1174 ASSERT(!in_irq());
1176 if ( (node == NUMA_NO_NODE) && (d != NULL) )
1177 node = domain_to_node(d);
1179 bits = domain_clamp_alloc_bitsize(d, bits ? : (BITS_PER_LONG+PAGE_SHIFT));
1180 if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
1181 return NULL;
1183 if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
1184 pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order, memflags);
1186 if ( (pg == NULL) &&
1187 ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi,
1188 node, order, memflags)) == NULL) )
1189 return NULL;
1191 if ( (d != NULL) && assign_pages(d, pg, order, memflags) )
1192 {
1193 free_heap_pages(pg, order);
1194 return NULL;
1195 }
1197 return pg;
1198 }
1200 void free_domheap_pages(struct page_info *pg, unsigned int order)
1201 {
1202 int i, drop_dom_ref;
1203 struct domain *d = page_get_owner(pg);
1205 ASSERT(!in_irq());
1207 if ( unlikely(is_xen_heap_page(pg)) )
1208 {
1209 /* NB. May recursively lock from relinquish_memory(). */
1210 spin_lock_recursive(&d->page_alloc_lock);
1212 for ( i = 0; i < (1 << order); i++ )
1213 page_list_del2(&pg[i], &d->xenpage_list, &d->arch.relmem_list);
1215 d->xenheap_pages -= 1 << order;
1216 drop_dom_ref = (d->xenheap_pages == 0);
1218 spin_unlock_recursive(&d->page_alloc_lock);
1219 }
1220 else if ( likely(d != NULL) )
1221 {
1222 /* NB. May recursively lock from relinquish_memory(). */
1223 spin_lock_recursive(&d->page_alloc_lock);
1225 for ( i = 0; i < (1 << order); i++ )
1226 {
1227 BUG_ON((pg[i].u.inuse.type_info & PGT_count_mask) != 0);
1228 page_list_del2(&pg[i], &d->page_list, &d->arch.relmem_list);
1229 }
1231 d->tot_pages -= 1 << order;
1232 drop_dom_ref = (d->tot_pages == 0);
1234 spin_unlock_recursive(&d->page_alloc_lock);
1236 if ( likely(!d->is_dying) )
1237 {
1238 free_heap_pages(pg, order);
1239 }
1240 else
1241 {
1242 /*
1243 * Normally we expect a domain to clear pages before freeing them,
1244 * if it cares about the secrecy of their contents. However, after
1245 * a domain has died we assume responsibility for erasure.
1246 */
1247 for ( i = 0; i < (1 << order); i++ )
1248 {
1249 page_set_owner(&pg[i], NULL);
1250 spin_lock(&page_scrub_lock);
1251 page_list_add(&pg[i], &page_scrub_list);
1252 scrub_pages++;
1253 spin_unlock(&page_scrub_lock);
1254 }
1255 }
1256 }
1257 else
1258 {
1259 /* Freeing anonymous domain-heap pages. */
1260 free_heap_pages(pg, order);
1261 drop_dom_ref = 0;
1262 }
1264 if ( drop_dom_ref )
1265 put_domain(d);
1266 }
1268 unsigned long avail_domheap_pages_region(
1269 unsigned int node, unsigned int min_width, unsigned int max_width)
1270 {
1271 int zone_lo, zone_hi;
1273 zone_lo = min_width ? bits_to_zone(min_width) : (MEMZONE_XEN + 1);
1274 zone_lo = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_lo));
1276 zone_hi = max_width ? bits_to_zone(max_width) : (NR_ZONES - 1);
1277 zone_hi = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_hi));
1279 return avail_heap_pages(zone_lo, zone_hi, node);
1280 }
1282 unsigned long avail_domheap_pages(void)
1283 {
1284 return avail_heap_pages(MEMZONE_XEN + 1,
1285 NR_ZONES - 1,
1286 -1);
1287 }
1289 static void pagealloc_keyhandler(unsigned char key)
1290 {
1291 unsigned int zone = MEMZONE_XEN;
1292 unsigned long n, total = 0;
1294 printk("Physical memory information:\n");
1295 printk(" Xen heap: %lukB free\n",
1296 avail_heap_pages(zone, zone, -1) << (PAGE_SHIFT-10));
1298 while ( ++zone < NR_ZONES )
1299 {
1300 if ( (zone + PAGE_SHIFT) == dma_bitsize )
1301 {
1302 printk(" DMA heap: %lukB free\n", total << (PAGE_SHIFT-10));
1303 total = 0;
1304 }
1306 if ( (n = avail_heap_pages(zone, zone, -1)) != 0 )
1307 {
1308 total += n;
1309 printk(" heap[%02u]: %lukB free\n", zone, n << (PAGE_SHIFT-10));
1310 }
1311 }
1313 printk(" Dom heap: %lukB free\n", total << (PAGE_SHIFT-10));
1314 }
1317 static __init int pagealloc_keyhandler_init(void)
1318 {
1319 register_keyhandler('m', pagealloc_keyhandler, "memory info");
1320 return 0;
1321 }
1322 __initcall(pagealloc_keyhandler_init);
1326 /*************************
1327 * PAGE SCRUBBING
1328 */
1330 static DEFINE_PER_CPU(struct timer, page_scrub_timer);
1332 static void page_scrub_softirq(void)
1333 {
1334 PAGE_LIST_HEAD(list);
1335 struct page_info *pg;
1336 void *p;
1337 int i;
1338 s_time_t start = NOW();
1339 static spinlock_t serialise_lock = SPIN_LOCK_UNLOCKED;
1341 /* free_heap_pages() does not parallelise well. Serialise this function. */
1342 if ( !spin_trylock(&serialise_lock) )
1343 {
1344 set_timer(&this_cpu(page_scrub_timer), NOW() + MILLISECS(1));
1345 return;
1346 }
1348 /* Aim to do 1ms of work every 10ms. */
1349 do {
1350 spin_lock(&page_scrub_lock);
1352 /* Peel up to 16 pages from the list. */
1353 for ( i = 0; i < 16; i++ )
1354 {
1355 if ( !(pg = page_list_remove_head(&page_scrub_list)) )
1356 break;
1357 page_list_add_tail(pg, &list);
1358 }
1360 if ( unlikely(i == 0) )
1361 {
1362 spin_unlock(&page_scrub_lock);
1363 goto out;
1364 }
1366 scrub_pages -= i;
1368 spin_unlock(&page_scrub_lock);
1370 /* Scrub each page in turn. */
1371 while ( (pg = page_list_remove_head(&list)) ) {
1372 p = map_domain_page(page_to_mfn(pg));
1373 scrub_page(p);
1374 unmap_domain_page(p);
1375 free_heap_pages(pg, 0);
1376 }
1377 } while ( (NOW() - start) < MILLISECS(1) );
1379 set_timer(&this_cpu(page_scrub_timer), NOW() + MILLISECS(10));
1381 out:
1382 spin_unlock(&serialise_lock);
1383 }
1385 void scrub_list_splice(struct page_list_head *list)
1386 {
1387 spin_lock(&page_scrub_lock);
1388 page_list_splice(list, &page_scrub_list);
1389 spin_unlock(&page_scrub_lock);
1390 }
1392 void scrub_list_add(struct page_info *pg)
1393 {
1394 spin_lock(&page_scrub_lock);
1395 page_list_add(pg, &page_scrub_list);
1396 spin_unlock(&page_scrub_lock);
1397 }
1399 void scrub_one_page(struct page_info *pg)
1400 {
1401 void *p = map_domain_page(page_to_mfn(pg));
1403 scrub_page(p);
1404 unmap_domain_page(p);
1405 }
1407 static void page_scrub_timer_fn(void *unused)
1408 {
1409 page_scrub_schedule_work();
1410 }
1412 unsigned long avail_scrub_pages(void)
1413 {
1414 return scrub_pages;
1415 }
1417 static void dump_heap(unsigned char key)
1418 {
1419 s_time_t now = NOW();
1420 int i, j;
1422 printk("'%c' pressed -> dumping heap info (now-0x%X:%08X)\n", key,
1423 (u32)(now>>32), (u32)now);
1425 for ( i = 0; i < MAX_NUMNODES; i++ )
1426 {
1427 if ( !avail[i] )
1428 continue;
1429 for ( j = 0; j < NR_ZONES; j++ )
1430 printk("heap[node=%d][zone=%d] -> %lu pages\n",
1431 i, j, avail[i][j]);
1432 }
1433 }
1435 static __init int register_heap_trigger(void)
1436 {
1437 register_keyhandler('H', dump_heap, "dump heap info");
1438 return 0;
1439 }
1440 __initcall(register_heap_trigger);
1443 static __init int page_scrub_init(void)
1444 {
1445 int cpu;
1446 for_each_cpu ( cpu )
1447 init_timer(&per_cpu(page_scrub_timer, cpu),
1448 page_scrub_timer_fn, NULL, cpu);
1449 open_softirq(PAGE_SCRUB_SOFTIRQ, page_scrub_softirq);
1450 return 0;
1451 }
1452 __initcall(page_scrub_init);
1454 /*
1455 * Local variables:
1456 * mode: C
1457 * c-set-style: "BSD"
1458 * c-basic-offset: 4
1459 * tab-width: 4
1460 * indent-tabs-mode: nil
1461 * End:
1462 */