ia64/xen-unstable

view xen/common/page_alloc.c @ 19348:dd3219cd019a

Code cleanups after page offline patch.

Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Mar 12 15:31:36 2009 +0000 (2009-03-12)
parents dd489125a2e7
children 97f78142cd4c
line source
1 /******************************************************************************
2 * page_alloc.c
3 *
4 * Simple buddy heap allocator for Xen.
5 *
6 * Copyright (c) 2002-2004 K A Fraser
7 * Copyright (c) 2006 IBM Ryan Harper <ryanh@us.ibm.com>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
24 #include <xen/config.h>
25 #include <xen/init.h>
26 #include <xen/types.h>
27 #include <xen/lib.h>
28 #include <xen/sched.h>
29 #include <xen/spinlock.h>
30 #include <xen/mm.h>
31 #include <xen/irq.h>
32 #include <xen/softirq.h>
33 #include <xen/domain_page.h>
34 #include <xen/keyhandler.h>
35 #include <xen/perfc.h>
36 #include <xen/numa.h>
37 #include <xen/nodemask.h>
38 #include <public/sysctl.h>
39 #include <asm/page.h>
40 #include <asm/numa.h>
41 #include <asm/flushtlb.h>
43 /*
44 * Comma-separated list of hexadecimal page numbers containing bad bytes.
45 * e.g. 'badpage=0x3f45,0x8a321'.
46 */
47 static char opt_badpage[100] = "";
48 string_param("badpage", opt_badpage);
50 /*
51 * no-bootscrub -> Free pages are not zeroed during boot.
52 */
53 static int opt_bootscrub __initdata = 1;
54 boolean_param("bootscrub", opt_bootscrub);
56 /*
57 * Bit width of the DMA heap -- used to override NUMA-node-first
58 * allocation strategy, which can otherwise exhaust low memory.
59 */
60 static unsigned int dma_bitsize;
61 integer_param("dma_bits", dma_bitsize);
63 #define round_pgdown(_p) ((_p)&PAGE_MASK)
64 #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
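/*
 * Worked example of the rounding macros above (assuming the usual 4kB pages,
 * i.e. PAGE_SHIFT == 12):
 *   round_pgdown(0x12345) == 0x12000
 *   round_pgup(0x12345)   == 0x13000
 *   round_pgup(0x13000)   == 0x13000   (already page-aligned)
 */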
66 #ifndef NDEBUG
67 /* Avoid callers relying on allocations returning zeroed pages. */
68 #define scrub_page(p) memset((p), 0xc2, PAGE_SIZE)
69 #else
70 /* For a production build, clear_page() is the fastest way to scrub. */
71 #define scrub_page(p) clear_page(p)
72 #endif
74 static DEFINE_SPINLOCK(page_scrub_lock);
75 PAGE_LIST_HEAD(page_scrub_list);
76 static unsigned long scrub_pages;
78 /* Offlined page list, protected by heap_lock. */
79 PAGE_LIST_HEAD(page_offlined_list);
80 /* Broken page list, protected by heap_lock. */
81 PAGE_LIST_HEAD(page_broken_list);
83 /*********************
84 * ALLOCATION BITMAP
85 * One bit per page of memory. Bit set => page is allocated.
86 */
88 unsigned long *alloc_bitmap;
89 #define PAGES_PER_MAPWORD (sizeof(unsigned long) * 8)
91 #define allocated_in_map(_pn) \
92 ({ unsigned long ___pn = (_pn); \
93 !!(alloc_bitmap[___pn/PAGES_PER_MAPWORD] & \
94 (1UL<<(___pn&(PAGES_PER_MAPWORD-1)))); })
96 /*
97 * Hint regarding bitwise arithmetic in map_{alloc,free}:
98 * -(1<<n) sets all bits >= n.
99 * (1<<n)-1 sets all bits < n.
100 * Variable names in map_{alloc,free}:
101 * *_idx == Index into `alloc_bitmap' array.
102 * *_off == Bit offset within an element of the `alloc_bitmap' array.
103 */
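/*
 * Worked example of the hint above, as used by map_alloc() below (assuming
 * 64-bit map words): map_alloc(3, 7) stays within one word, with
 * start_off == 3 and end_off == 10, so the mask is
 *   ((1UL<<10)-1) & -(1UL<<3) == 0x3f8,
 * i.e. exactly bits 3..9, and pages 3..9 become marked as allocated.
 */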
105 static void map_alloc(unsigned long first_page, unsigned long nr_pages)
106 {
107 unsigned long start_off, end_off, curr_idx, end_idx;
109 #ifndef NDEBUG
110 unsigned long i;
111 /* Check that the block isn't already allocated. */
112 for ( i = 0; i < nr_pages; i++ )
113 ASSERT(!allocated_in_map(first_page + i));
114 #endif
116 curr_idx = first_page / PAGES_PER_MAPWORD;
117 start_off = first_page & (PAGES_PER_MAPWORD-1);
118 end_idx = (first_page + nr_pages) / PAGES_PER_MAPWORD;
119 end_off = (first_page + nr_pages) & (PAGES_PER_MAPWORD-1);
121 if ( curr_idx == end_idx )
122 {
123 alloc_bitmap[curr_idx] |= ((1UL<<end_off)-1) & -(1UL<<start_off);
124 }
125 else
126 {
127 alloc_bitmap[curr_idx] |= -(1UL<<start_off);
128 while ( ++curr_idx < end_idx ) alloc_bitmap[curr_idx] = ~0UL;
129 alloc_bitmap[curr_idx] |= (1UL<<end_off)-1;
130 }
131 }
133 static void map_free(unsigned long first_page, unsigned long nr_pages)
134 {
135 unsigned long start_off, end_off, curr_idx, end_idx;
137 #ifndef NDEBUG
138 unsigned long i;
139 /* Check that the block isn't already freed. */
140 for ( i = 0; i < nr_pages; i++ )
141 ASSERT(allocated_in_map(first_page + i));
142 #endif
144 curr_idx = first_page / PAGES_PER_MAPWORD;
145 start_off = first_page & (PAGES_PER_MAPWORD-1);
146 end_idx = (first_page + nr_pages) / PAGES_PER_MAPWORD;
147 end_off = (first_page + nr_pages) & (PAGES_PER_MAPWORD-1);
149 if ( curr_idx == end_idx )
150 {
151 alloc_bitmap[curr_idx] &= -(1UL<<end_off) | ((1UL<<start_off)-1);
152 }
153 else
154 {
155 alloc_bitmap[curr_idx] &= (1UL<<start_off)-1;
156 while ( ++curr_idx != end_idx ) alloc_bitmap[curr_idx] = 0;
157 alloc_bitmap[curr_idx] &= -(1UL<<end_off);
158 }
159 }
163 /*************************
164 * BOOT-TIME ALLOCATOR
165 */
167 static unsigned long first_valid_mfn = ~0UL;
169 /* Initialise allocator to handle up to @max_page pages. */
170 paddr_t __init init_boot_allocator(paddr_t bitmap_start)
171 {
172 unsigned long bitmap_size;
174 bitmap_start = round_pgup(bitmap_start);
176 /*
177 * Allocate space for the allocation bitmap. Include an extra longword
178 * of padding for possible overrun in map_alloc and map_free.
179 */
180 bitmap_size = max_page / 8;
181 bitmap_size += sizeof(unsigned long);
182 bitmap_size = round_pgup(bitmap_size);
183 alloc_bitmap = (unsigned long *)maddr_to_virt(bitmap_start);
185 /* All allocated by default. */
186 memset(alloc_bitmap, ~0, bitmap_size);
188 return bitmap_start + bitmap_size;
189 }
191 void __init init_boot_pages(paddr_t ps, paddr_t pe)
192 {
193 unsigned long bad_spfn, bad_epfn, i;
194 const char *p;
196 ps = round_pgup(ps);
197 pe = round_pgdown(pe);
198 if ( pe <= ps )
199 return;
201 first_valid_mfn = min_t(unsigned long, ps >> PAGE_SHIFT, first_valid_mfn);
203 map_free(ps >> PAGE_SHIFT, (pe - ps) >> PAGE_SHIFT);
205 /* Check new pages against the bad-page list. */
206 p = opt_badpage;
207 while ( *p != '\0' )
208 {
209 bad_spfn = simple_strtoul(p, &p, 0);
210 bad_epfn = bad_spfn;
212 if ( *p == '-' )
213 {
214 p++;
215 bad_epfn = simple_strtoul(p, &p, 0);
216 if ( bad_epfn < bad_spfn )
217 bad_epfn = bad_spfn;
218 }
220 if ( *p == ',' )
221 p++;
222 else if ( *p != '\0' )
223 break;
225 if ( bad_epfn == bad_spfn )
226 printk("Marking page %lx as bad\n", bad_spfn);
227 else
228 printk("Marking pages %lx through %lx as bad\n",
229 bad_spfn, bad_epfn);
231 for ( i = bad_spfn; i <= bad_epfn; i++ )
232 if ( (i < max_page) && !allocated_in_map(i) )
233 map_alloc(i, 1);
234 }
235 }
237 unsigned long __init alloc_boot_pages(
238 unsigned long nr_pfns, unsigned long pfn_align)
239 {
240 unsigned long pg, i;
242 /* Search backwards to obtain highest available range. */
243 for ( pg = (max_page - nr_pfns) & ~(pfn_align - 1);
244 pg >= first_valid_mfn;
245 pg = (pg + i - nr_pfns) & ~(pfn_align - 1) )
246 {
247 for ( i = 0; i < nr_pfns; i++ )
248 if ( allocated_in_map(pg+i) )
249 break;
250 if ( i == nr_pfns )
251 {
252 map_alloc(pg, nr_pfns);
253 return pg;
254 }
255 }
257 return 0;
258 }
262 /*************************
263 * BINARY BUDDY ALLOCATOR
264 */
266 #define MEMZONE_XEN 0
267 #define NR_ZONES (PADDR_BITS - PAGE_SHIFT)
269 #define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 0 : ((b) - PAGE_SHIFT - 1))
270 #define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN : \
271 (fls(page_to_mfn(pg)) - 1))
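/*
 * Worked example of the zone macros (assuming PAGE_SHIFT == 12): a domheap
 * page at MFN 0x12345 has fls(0x12345) == 17, so page_to_zone() places it in
 * zone 16; bits_to_zone(32) == 32 - 12 - 1 == 19, the highest zone whose
 * pages are still addressable with 32-bit physical addresses.
 */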
273 typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
274 static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
275 #define heap(node, zone, order) ((*_heap[node])[zone][order])
277 static unsigned long *avail[MAX_NUMNODES];
279 static DEFINE_SPINLOCK(heap_lock);
281 static unsigned long init_node_heap(int node, unsigned long mfn,
282 unsigned long nr)
283 {
284 /* First node to be discovered has its heap metadata statically alloced. */
285 static heap_by_zone_and_order_t _heap_static;
286 static unsigned long avail_static[NR_ZONES];
287 static int first_node_initialised;
288 unsigned long needed = (sizeof(**_heap) +
289 sizeof(**avail) * NR_ZONES +
290 PAGE_SIZE - 1) >> PAGE_SHIFT;
291 int i, j;
293 if ( !first_node_initialised )
294 {
295 _heap[node] = &_heap_static;
296 avail[node] = avail_static;
297 first_node_initialised = 1;
298 needed = 0;
299 }
300 #ifdef DIRECTMAP_VIRT_END
301 else if ( nr >= needed &&
302 mfn + needed <= virt_to_mfn(DIRECTMAP_VIRT_END) )
303 {
304 _heap[node] = mfn_to_virt(mfn);
305 avail[node] = mfn_to_virt(mfn + needed) - sizeof(**avail) * NR_ZONES;
306 }
307 #endif
308 else if ( get_order_from_bytes(sizeof(**_heap)) ==
309 get_order_from_pages(needed) )
310 {
311 _heap[node] = alloc_xenheap_pages(get_order_from_pages(needed), 0);
312 BUG_ON(!_heap[node]);
313 avail[node] = (void *)_heap[node] + (needed << PAGE_SHIFT) -
314 sizeof(**avail) * NR_ZONES;
315 needed = 0;
316 }
317 else
318 {
319 _heap[node] = xmalloc(heap_by_zone_and_order_t);
320 avail[node] = xmalloc_array(unsigned long, NR_ZONES);
321 BUG_ON(!_heap[node] || !avail[node]);
322 needed = 0;
323 }
325 memset(avail[node], 0, NR_ZONES * sizeof(long));
327 for ( i = 0; i < NR_ZONES; i++ )
328 for ( j = 0; j <= MAX_ORDER; j++ )
329 INIT_PAGE_LIST_HEAD(&(*_heap[node])[i][j]);
331 return needed;
332 }
334 /* Allocate 2^@order contiguous pages. */
335 static struct page_info *alloc_heap_pages(
336 unsigned int zone_lo, unsigned int zone_hi,
337 unsigned int node, unsigned int order)
338 {
339 unsigned int i, j, zone;
340 unsigned int num_nodes = num_online_nodes();
341 unsigned long request = 1UL << order;
342 cpumask_t extra_cpus_mask, mask;
343 struct page_info *pg;
345 if ( node == NUMA_NO_NODE )
346 node = cpu_to_node(smp_processor_id());
348 ASSERT(node >= 0);
349 ASSERT(node < num_nodes);
350 ASSERT(zone_lo <= zone_hi);
351 ASSERT(zone_hi < NR_ZONES);
353 if ( unlikely(order > MAX_ORDER) )
354 return NULL;
356 spin_lock(&heap_lock);
358 /*
359 * Start with the requested node, but exhaust all node memory in the
360 * requested zone before failing. Only compute a new node if we fail to find
361 * memory in the target node; this avoids needless computation on the fast path.
362 */
363 for ( i = 0; i < num_nodes; i++ )
364 {
365 zone = zone_hi;
366 do {
367 /* Check if target node can support the allocation. */
368 if ( !avail[node] || (avail[node][zone] < request) )
369 continue;
371 /* Find smallest order which can satisfy the request. */
372 for ( j = order; j <= MAX_ORDER; j++ )
373 if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
374 goto found;
375 } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
377 /* Pick next node, wrapping around if needed. */
378 if ( ++node == num_nodes )
379 node = 0;
380 }
382 /* No suitable memory blocks. Fail the request. */
383 spin_unlock(&heap_lock);
384 return NULL;
386 found:
387 /* We may have to halve the chunk a number of times. */
388 while ( j != order )
389 {
390 PFN_ORDER(pg) = --j;
391 page_list_add_tail(pg, &heap(node, zone, j));
392 pg += 1 << j;
393 }
395 map_alloc(page_to_mfn(pg), request);
396 ASSERT(avail[node][zone] >= request);
397 avail[node][zone] -= request;
399 spin_unlock(&heap_lock);
401 cpus_clear(mask);
403 for ( i = 0; i < (1 << order); i++ )
404 {
405 /* Reference count must continuously be zero for free pages. */
406 BUG_ON(pg[i].count_info != 0);
408 if ( pg[i].u.free.need_tlbflush )
409 {
410 /* Add in extra CPUs that need flushing because of this page. */
411 cpus_andnot(extra_cpus_mask, cpu_online_map, mask);
412 tlbflush_filter(extra_cpus_mask, pg[i].tlbflush_timestamp);
413 cpus_or(mask, mask, extra_cpus_mask);
414 }
416 /* Initialise fields which have other uses for free pages. */
417 pg[i].u.inuse.type_info = 0;
418 page_set_owner(&pg[i], NULL);
419 }
421 if ( unlikely(!cpus_empty(mask)) )
422 {
423 perfc_incr(need_flush_tlb_flush);
424 flush_tlb_mask(mask);
425 }
427 return pg;
428 }
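/*
 * Sketch of the splitting loop above: if an order-2 request is satisfied
 * from an order-5 chunk (j == 5 at 'found:'), the chunk is halved three
 * times; the lower-addressed order-4, order-3 and order-2 buddies go back
 * on the free lists and the final, highest-addressed order-2 piece is
 * returned to the caller.
 */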
430 /* Remove any offlined page in the buddy pointed to by head. */
431 static int reserve_offlined_page(struct page_info *head)
432 {
433 unsigned int node = phys_to_nid(page_to_maddr(head));
434 int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
435 struct page_info *cur_head;
436 int cur_order;
438 ASSERT(spin_is_locked(&heap_lock));
440 cur_head = head;
442 page_list_del(head, &heap(node, zone, head_order));
444 while ( cur_head < (head + (1 << head_order)) )
445 {
446 struct page_info *pg;
447 int next_order;
449 if ( test_bit(_PGC_offlined, &cur_head->count_info) )
450 {
451 cur_head++;
452 continue;
453 }
455 next_order = cur_order = 0;
457 while ( cur_order < head_order )
458 {
459 next_order = cur_order + 1;
461 if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order)) )
462 goto merge;
464 for ( i = (1 << cur_order), pg = cur_head + (1 << cur_order );
465 i < (1 << next_order);
466 i++, pg++ )
467 if ( test_bit(_PGC_offlined, &pg->count_info) )
468 break;
469 if ( i == ( 1 << next_order) )
470 {
471 cur_order = next_order;
472 continue;
473 }
474 else
475 {
476 merge:
477 /* We don't consider merging outside the head_order. */
478 page_list_add_tail(cur_head, &heap(node, zone, cur_order));
479 PFN_ORDER(cur_head) = cur_order;
480 cur_head += (1 << cur_order);
481 break;
482 }
483 }
484 }
486 for ( cur_head = head; cur_head < head + ( 1UL << head_order); cur_head++ )
487 {
488 if ( !test_bit(_PGC_offlined, &cur_head->count_info) )
489 continue;
491 avail[node][zone]--;
493 map_alloc(page_to_mfn(cur_head), 1);
495 page_list_add_tail(cur_head,
496 test_bit(_PGC_broken, &cur_head->count_info) ?
497 &page_broken_list : &page_offlined_list);
499 count++;
500 }
502 return count;
503 }
505 /* Free 2^@order set of pages. */
506 static void free_heap_pages(
507 struct page_info *pg, unsigned int order)
508 {
509 unsigned long mask;
510 unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
511 unsigned int zone = page_to_zone(pg);
513 ASSERT(order <= MAX_ORDER);
514 ASSERT(node >= 0);
515 ASSERT(node < num_online_nodes());
517 for ( i = 0; i < (1 << order); i++ )
518 {
519 /*
520 * Cannot assume that count_info == 0, as there are some corner cases
521 * where it isn't the case and yet it isn't a bug:
522 * 1. page_get_owner() is NULL
523 * 2. page_get_owner() is a domain that was never accessible by
524 * its domid (e.g., failed to fully construct the domain).
525 * 3. page was never addressable by the guest (e.g., it's an
526 * auto-translate-physmap guest and the page was never included
527 * in its pseudophysical address space).
528 * In all the above cases there can be no guest mappings of this page.
529 */
530 ASSERT(!(pg[i].count_info & PGC_offlined));
531 pg[i].count_info &= PGC_offlining | PGC_broken;
532 if ( pg[i].count_info & PGC_offlining )
533 {
534 pg[i].count_info &= ~PGC_offlining;
535 pg[i].count_info |= PGC_offlined;
536 tainted = 1;
537 }
539 /* If a page has no owner it will need no safety TLB flush. */
540 pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
541 if ( pg[i].u.free.need_tlbflush )
542 pg[i].tlbflush_timestamp = tlbflush_current_time();
543 }
545 spin_lock(&heap_lock);
547 map_free(page_to_mfn(pg), 1 << order);
548 avail[node][zone] += 1 << order;
550 /* Merge chunks as far as possible. */
551 while ( order < MAX_ORDER )
552 {
553 mask = 1UL << order;
555 if ( (page_to_mfn(pg) & mask) )
556 {
557 /* Merge with predecessor block? */
558 if ( allocated_in_map(page_to_mfn(pg)-mask) ||
559 (PFN_ORDER(pg-mask) != order) )
560 break;
561 pg -= mask;
562 page_list_del(pg, &heap(node, zone, order));
563 }
564 else
565 {
566 /* Merge with successor block? */
567 if ( allocated_in_map(page_to_mfn(pg)+mask) ||
568 (PFN_ORDER(pg+mask) != order) )
569 break;
570 page_list_del(pg + mask, &heap(node, zone, order));
571 }
573 order++;
575 /* After merging, pg should remain in the same node. */
576 ASSERT(phys_to_nid(page_to_maddr(pg)) == node);
577 }
579 PFN_ORDER(pg) = order;
580 page_list_add_tail(pg, &heap(node, zone, order));
582 if ( tainted )
583 reserve_offlined_page(pg);
585 spin_unlock(&heap_lock);
586 }
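/*
 * Sketch of the merge loop above: freeing an order-0 page at MFN 0x1001
 * (bit 0 set, so its buddy is the predecessor at 0x1000) merges with that
 * buddy if it is free and of the same order, yielding an order-1 block at
 * 0x1000; that block's buddy is then 0x1002, and so on up to MAX_ORDER or
 * until a buddy is found allocated or of a different order.
 */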
589 /*
590 * Possible states for a page:
591 * free and online; free and offlined; free and offlined and broken;
592 * assigned and online; assigned and offlining; assigned and offlining and broken.
593 *
594 * The following rules apply to page offlining:
595 * Once a page is broken, it can never be assigned again.
596 * A page will be offlined only if it is free.
597 *
598 * mark_page_offline() returns the original count_info.
599 */
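/*
 * Example of the rules above: offlining a page that is currently assigned
 * to a domain only sets PGC_offlining; when free_heap_pages() later sees
 * PGC_offlining it converts it to PGC_offlined, and reserve_offlined_page()
 * then moves the page to page_offlined_list (or to page_broken_list if
 * PGC_broken is also set) instead of returning it to the buddy free lists.
 */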
600 static unsigned long mark_page_offline(struct page_info *pg, int broken)
601 {
602 unsigned long nx, x, y = pg->count_info;
604 ASSERT(page_is_ram_type(page_to_mfn(pg), RAM_TYPE_CONVENTIONAL));
605 ASSERT(spin_is_locked(&heap_lock));
607 do {
608 nx = x = y;
610 if ( ((x & PGC_offlined_broken) == PGC_offlined_broken) )
611 return y;
613 if ( x & PGC_offlined )
614 {
615 /* PGC_offlined means it is a free page. */
616 if ( broken && !(nx & PGC_broken) )
617 nx |= PGC_broken;
618 else
619 return y;
620 }
621 else
622 {
623 /* Neither an offlined page nor a reserved page. */
624 nx |= (allocated_in_map(page_to_mfn(pg)) ?
625 PGC_offlining : PGC_offlined);
626 }
628 if ( broken )
629 nx |= PGC_broken;
630 } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
632 return y;
633 }
635 static int reserve_heap_page(struct page_info *pg)
636 {
637 struct page_info *head = NULL;
638 unsigned int i, node = phys_to_nid(page_to_maddr(pg));
639 unsigned int zone = page_to_zone(pg);
641 for ( i = 0; i <= MAX_ORDER; i++ )
642 {
643 struct page_info *tmp;
645 if ( page_list_empty(&heap(node, zone, i)) )
646 continue;
648 page_list_for_each_safe ( head, tmp, &heap(node, zone, i) )
649 {
650 if ( (head <= pg) &&
651 (head + (1UL << i) > pg) )
652 return reserve_offlined_page(head);
653 }
654 }
656 return -EINVAL;
658 }
660 int offline_page(unsigned long mfn, int broken, uint32_t *status)
661 {
662 unsigned long old_info = 0;
663 struct domain *owner;
664 int ret = 0;
665 struct page_info *pg;
667 if ( mfn > max_page )
668 {
669 dprintk(XENLOG_WARNING,
670 "try to offline page out of range %lx\n", mfn);
671 return -EINVAL;
672 }
674 *status = 0;
675 pg = mfn_to_page(mfn);
677 #if defined(__x86_64__)
678 /* Xen's TXT MFNs on x86_64 are reserved in the e820 map. */
679 if ( is_xen_fixed_mfn(mfn) )
680 #elif defined(__i386__)
681 if ( is_xen_heap_mfn(mfn) )
682 #endif
683 {
684 *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED |
685 (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
686 return -EPERM;
687 }
689 /*
690 * N.B. Xen's TXT range on x86_64 is marked reserved and already handled above.
691 * The kexec range is reserved as well.
692 */
693 if ( !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
694 {
695 *status = PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM;
696 return -EINVAL;
697 }
699 spin_lock(&heap_lock);
701 old_info = mark_page_offline(pg, broken);
703 if ( !allocated_in_map(mfn) )
704 {
705 /* Free pages are reserved directly. */
706 reserve_heap_page(pg);
707 *status = PG_OFFLINE_OFFLINED;
708 }
709 else if ( test_bit(_PGC_offlined, &pg->count_info) )
710 {
711 *status = PG_OFFLINE_OFFLINED;
712 }
713 else if ( (owner = page_get_owner_and_reference(pg)) )
714 {
715 *status = PG_OFFLINE_OWNED | PG_OFFLINE_PENDING |
716 (owner->domain_id << PG_OFFLINE_OWNER_SHIFT);
717 /* Release the reference since it will not be allocated anymore */
718 put_page(pg);
719 }
720 else if ( old_info & PGC_xen_heap)
721 {
722 *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_PENDING |
723 (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
724 }
725 else
726 {
727 /*
728 * assign_pages() does not hold heap_lock, so there is a small window in
729 * which the owner may be set later. Note that the owner can only change
730 * from NULL to non-NULL, never the reverse, since the page is being offlined.
731 * There is no such window if called from the #MC handler, since all CPUs
732 * are in softirq context. If called from user space (e.g. CE handling),
733 * tools can wait a while and then call again.
734 */
735 *status = PG_OFFLINE_ANONYMOUS | PG_OFFLINE_FAILED |
736 (DOMID_INVALID << PG_OFFLINE_OWNER_SHIFT );
737 }
739 if ( broken )
740 *status |= PG_OFFLINE_BROKEN;
742 spin_unlock(&heap_lock);
744 return ret;
745 }
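/*
 * Hypothetical usage sketch (not part of this file): a caller such as a
 * machine-check handler or a tools-side helper could invoke offline_page()
 * and decode *status along these lines, assuming the PG_OFFLINE_* layout
 * from public/sysctl.h:
 *
 *     uint32_t status;
 *     if ( (offline_page(mfn, 0, &status) == 0) &&
 *          (status & PG_OFFLINE_PENDING) )
 *         owner_id = status >> PG_OFFLINE_OWNER_SHIFT;
 */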
747 /*
748 * Online the memory.
749 * The caller should make sure that end_pfn <= max_page;
750 * if not, expand_pages() should be called prior to online_page().
751 */
752 unsigned int online_page(unsigned long mfn, uint32_t *status)
753 {
754 struct page_info *pg;
755 int ret = 0, free = 0;
757 if ( mfn > max_page )
758 {
759 dprintk(XENLOG_WARNING, "call expand_pages() first\n");
760 return -EINVAL;
761 }
763 pg = mfn_to_page(mfn);
765 *status = 0;
767 spin_lock(&heap_lock);
769 if ( unlikely(is_page_broken(pg)) )
770 {
771 ret = -EINVAL;
772 *status = PG_ONLINE_FAILED | PG_ONLINE_BROKEN;
773 }
774 else if ( pg->count_info & PGC_offlined )
775 {
776 clear_bit(_PGC_offlined, &pg->count_info);
777 page_list_del(pg, &page_offlined_list);
778 *status = PG_ONLINE_ONLINED;
779 free = 1;
780 }
781 else if ( pg->count_info & PGC_offlining )
782 {
783 clear_bit(_PGC_offlining, &pg->count_info);
784 *status = PG_ONLINE_ONLINED;
785 }
786 spin_unlock(&heap_lock);
788 if ( free )
789 free_heap_pages(pg, 0);
791 return ret;
792 }
794 int query_page_offline(unsigned long mfn, uint32_t *status)
795 {
796 struct page_info *pg;
798 if ( (mfn > max_page) || !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
799 {
800 dprintk(XENLOG_WARNING, "call expand_pages() first\n");
801 return -EINVAL;
802 }
804 *status = 0;
805 spin_lock(&heap_lock);
807 pg = mfn_to_page(mfn);
809 if ( pg->count_info & PGC_offlining )
810 *status |= PG_OFFLINE_STATUS_OFFLINE_PENDING;
811 if ( pg->count_info & PGC_broken )
812 *status |= PG_OFFLINE_STATUS_BROKEN;
813 if ( pg->count_info & PGC_offlined )
814 *status |= PG_OFFLINE_STATUS_OFFLINED;
816 spin_unlock(&heap_lock);
818 return 0;
819 }
821 /*
822 * Hand the specified arbitrary page range to the specified heap zone,
823 * checking the node_id of the previous page. If the current page's node
824 * differs and the page is not on a MAX_ORDER boundary, then we reserve
825 * the page by not freeing it to the buddy allocator.
826 */
827 static void init_heap_pages(
828 struct page_info *pg, unsigned long nr_pages)
829 {
830 unsigned int nid_curr, nid_prev;
831 unsigned long i;
833 nid_prev = phys_to_nid(page_to_maddr(pg-1));
835 for ( i = 0; i < nr_pages; nid_prev = nid_curr, i++ )
836 {
837 nid_curr = phys_to_nid(page_to_maddr(pg+i));
839 if ( unlikely(!avail[nid_curr]) )
840 {
841 unsigned long n;
843 n = init_node_heap(nid_curr, page_to_mfn(pg+i), nr_pages - i);
844 if ( n )
845 {
846 BUG_ON(i + n > nr_pages);
847 i += n - 1;
848 continue;
849 }
850 }
852 /*
853 * Free pages of the same node, or, if the nodes differ, pages that are
854 * on a MAX_ORDER alignment boundary (which already get reserved).
855 */
856 if ( (nid_curr == nid_prev) ||
857 !(page_to_mfn(pg+i) & ((1UL << MAX_ORDER) - 1)) )
858 free_heap_pages(pg+i, 0);
859 else
860 printk("Reserving non-aligned node boundary @ mfn %#lx\n",
861 page_to_mfn(pg+i));
862 }
863 }
865 static unsigned long avail_heap_pages(
866 unsigned int zone_lo, unsigned int zone_hi, unsigned int node)
867 {
868 unsigned int i, zone, num_nodes = num_online_nodes();
869 unsigned long free_pages = 0;
871 if ( zone_hi >= NR_ZONES )
872 zone_hi = NR_ZONES - 1;
874 for ( i = 0; i < num_nodes; i++ )
875 {
876 if ( !avail[i] )
877 continue;
878 for ( zone = zone_lo; zone <= zone_hi; zone++ )
879 if ( (node == -1) || (node == i) )
880 free_pages += avail[i][zone];
881 }
883 return free_pages;
884 }
886 #define avail_for_domheap(mfn) !(allocated_in_map(mfn) || is_xen_heap_mfn(mfn))
887 void __init end_boot_allocator(void)
888 {
889 unsigned long i, nr = 0;
890 int curr_free, next_free;
892 /* Pages that are free now go to the domain sub-allocator. */
893 if ( (curr_free = next_free = avail_for_domheap(first_valid_mfn)) )
894 map_alloc(first_valid_mfn, 1);
895 for ( i = first_valid_mfn; i < max_page; i++ )
896 {
897 curr_free = next_free;
898 next_free = avail_for_domheap(i+1);
899 if ( next_free )
900 map_alloc(i+1, 1); /* prevent merging in free_heap_pages() */
901 if ( curr_free )
902 ++nr;
903 else if ( nr )
904 {
905 init_heap_pages(mfn_to_page(i - nr), nr);
906 nr = 0;
907 }
908 }
909 if ( nr )
910 init_heap_pages(mfn_to_page(i - nr), nr);
912 if ( !dma_bitsize && (num_online_nodes() > 1) )
913 {
914 #ifdef CONFIG_X86
915 dma_bitsize = min_t(unsigned int,
916 fls(NODE_DATA(0)->node_spanned_pages) - 1
917 + PAGE_SHIFT - 2,
918 32);
919 #else
920 dma_bitsize = 32;
921 #endif
922 }
924 printk("Domain heap initialised");
925 if ( dma_bitsize )
926 printk(" DMA width %u bits", dma_bitsize);
927 printk("\n");
928 }
929 #undef avail_for_domheap
931 /*
932 * Scrub all unallocated pages in all heap zones. This function is more
933 * convoluted than appears necessary because we do not want to continuously
934 * hold the lock while scrubbing very large memory areas.
935 */
936 void __init scrub_heap_pages(void)
937 {
938 void *p;
939 unsigned long mfn;
941 if ( !opt_bootscrub )
942 return;
944 printk("Scrubbing Free RAM: ");
946 for ( mfn = first_valid_mfn; mfn < max_page; mfn++ )
947 {
948 process_pending_timers();
950 /* Quick lock-free check. */
951 if ( allocated_in_map(mfn) )
952 continue;
954 /* Every 100MB, print a progress dot. */
955 if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
956 printk(".");
958 spin_lock(&heap_lock);
960 /* Re-check page status with lock held. */
961 if ( !allocated_in_map(mfn) )
962 {
963 if ( is_xen_heap_mfn(mfn) )
964 {
965 p = page_to_virt(mfn_to_page(mfn));
966 memguard_unguard_range(p, PAGE_SIZE);
967 scrub_page(p);
968 memguard_guard_range(p, PAGE_SIZE);
969 }
970 else
971 {
972 p = map_domain_page(mfn);
973 scrub_page(p);
974 unmap_domain_page(p);
975 }
976 }
978 spin_unlock(&heap_lock);
979 }
981 printk("done.\n");
982 }
986 /*************************
987 * XEN-HEAP SUB-ALLOCATOR
988 */
990 #if !defined(__x86_64__) && !defined(__ia64__)
992 void init_xenheap_pages(paddr_t ps, paddr_t pe)
993 {
994 ps = round_pgup(ps);
995 pe = round_pgdown(pe);
996 if ( pe <= ps )
997 return;
999 memguard_guard_range(maddr_to_virt(ps), pe - ps);
1001 /*
1002 * Yuk! Ensure there is a one-page buffer between Xen and Dom zones, to
1003 * prevent merging of power-of-two blocks across the zone boundary.
1004 */
1005 if ( ps && !is_xen_heap_mfn(paddr_to_pfn(ps)-1) )
1006 ps += PAGE_SIZE;
1007 if ( !is_xen_heap_mfn(paddr_to_pfn(pe)) )
1008 pe -= PAGE_SIZE;
1010 init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT);
1011 }
1014 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
1015 {
1016 struct page_info *pg;
1018 ASSERT(!in_irq());
1020 pg = alloc_heap_pages(
1021 MEMZONE_XEN, MEMZONE_XEN, cpu_to_node(smp_processor_id()), order);
1022 if ( unlikely(pg == NULL) )
1023 return NULL;
1025 memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT));
1027 return page_to_virt(pg);
1028 }
1031 void free_xenheap_pages(void *v, unsigned int order)
1032 {
1033 ASSERT(!in_irq());
1035 if ( v == NULL )
1036 return;
1038 memguard_guard_range(v, 1 << (order + PAGE_SHIFT));
1040 free_heap_pages(virt_to_page(v), order);
1041 }
1043 #else
1045 void init_xenheap_pages(paddr_t ps, paddr_t pe)
1046 {
1047 init_domheap_pages(ps, pe);
1048 }
1050 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
1051 {
1052 struct page_info *pg;
1053 unsigned int i;
1055 ASSERT(!in_irq());
1057 pg = alloc_domheap_pages(NULL, order, memflags);
1058 if ( unlikely(pg == NULL) )
1059 return NULL;
1061 for ( i = 0; i < (1u << order); i++ )
1062 pg[i].count_info |= PGC_xen_heap;
1064 return page_to_virt(pg);
1065 }
1067 void free_xenheap_pages(void *v, unsigned int order)
1068 {
1069 struct page_info *pg;
1070 unsigned int i;
1072 ASSERT(!in_irq());
1074 if ( v == NULL )
1075 return;
1077 pg = virt_to_page(v);
1079 for ( i = 0; i < (1u << order); i++ )
1080 pg[i].count_info &= ~PGC_xen_heap;
1082 free_heap_pages(pg, order);
1083 }
1085 #endif
1089 /*************************
1090 * DOMAIN-HEAP SUB-ALLOCATOR
1091 */
1093 void init_domheap_pages(paddr_t ps, paddr_t pe)
1094 {
1095 unsigned long smfn, emfn;
1097 ASSERT(!in_irq());
1099 smfn = round_pgup(ps) >> PAGE_SHIFT;
1100 emfn = round_pgdown(pe) >> PAGE_SHIFT;
1102 init_heap_pages(mfn_to_page(smfn), emfn - smfn);
1103 }
1106 int assign_pages(
1107 struct domain *d,
1108 struct page_info *pg,
1109 unsigned int order,
1110 unsigned int memflags)
1111 {
1112 unsigned long i;
1114 spin_lock(&d->page_alloc_lock);
1116 if ( unlikely(d->is_dying) )
1117 {
1118 gdprintk(XENLOG_INFO, "Cannot assign page to domain%d -- dying.\n",
1119 d->domain_id);
1120 goto fail;
1121 }
1123 if ( !(memflags & MEMF_no_refcount) )
1124 {
1125 if ( unlikely((d->tot_pages + (1 << order)) > d->max_pages) )
1126 {
1127 gdprintk(XENLOG_INFO, "Over-allocation for domain %u: %u > %u\n",
1128 d->domain_id, d->tot_pages + (1 << order), d->max_pages);
1129 goto fail;
1130 }
1132 if ( unlikely(d->tot_pages == 0) )
1133 get_knownalive_domain(d);
1135 d->tot_pages += 1 << order;
1136 }
1138 for ( i = 0; i < (1 << order); i++ )
1139 {
1140 ASSERT(page_get_owner(&pg[i]) == NULL);
1141 ASSERT((pg[i].count_info & ~(PGC_allocated | 1)) == 0);
1142 page_set_owner(&pg[i], d);
1143 wmb(); /* Domain pointer must be visible before updating refcnt. */
1144 pg[i].count_info = PGC_allocated | 1;
1145 page_list_add_tail(&pg[i], &d->page_list);
1146 }
1148 spin_unlock(&d->page_alloc_lock);
1149 return 0;
1151 fail:
1152 spin_unlock(&d->page_alloc_lock);
1153 return -1;
1154 }
1157 struct page_info *alloc_domheap_pages(
1158 struct domain *d, unsigned int order, unsigned int memflags)
1159 {
1160 struct page_info *pg = NULL;
1161 unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
1162 unsigned int node = (uint8_t)((memflags >> _MEMF_node) - 1), dma_zone;
1164 ASSERT(!in_irq());
1166 if ( (node == NUMA_NO_NODE) && (d != NULL) )
1167 node = domain_to_node(d);
1169 bits = domain_clamp_alloc_bitsize(d, bits ? : (BITS_PER_LONG+PAGE_SHIFT));
1170 if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
1171 return NULL;
1173 if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
1174 pg = alloc_heap_pages(dma_zone + 1, zone_hi, node, order);
1176 if ( (pg == NULL) &&
1177 ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi,
1178 node, order)) == NULL) )
1179 return NULL;
1181 if ( (d != NULL) && assign_pages(d, pg, order, memflags) )
1182 {
1183 free_heap_pages(pg, order);
1184 return NULL;
1185 }
1187 return pg;
1188 }
1190 void free_domheap_pages(struct page_info *pg, unsigned int order)
1191 {
1192 int i, drop_dom_ref;
1193 struct domain *d = page_get_owner(pg);
1195 ASSERT(!in_irq());
1197 if ( unlikely(is_xen_heap_page(pg)) )
1198 {
1199 /* NB. May recursively lock from relinquish_memory(). */
1200 spin_lock_recursive(&d->page_alloc_lock);
1202 for ( i = 0; i < (1 << order); i++ )
1203 page_list_del2(&pg[i], &d->xenpage_list, &d->arch.relmem_list);
1205 d->xenheap_pages -= 1 << order;
1206 drop_dom_ref = (d->xenheap_pages == 0);
1208 spin_unlock_recursive(&d->page_alloc_lock);
1209 }
1210 else if ( likely(d != NULL) )
1211 {
1212 /* NB. May recursively lock from relinquish_memory(). */
1213 spin_lock_recursive(&d->page_alloc_lock);
1215 for ( i = 0; i < (1 << order); i++ )
1216 {
1217 BUG_ON((pg[i].u.inuse.type_info & PGT_count_mask) != 0);
1218 page_list_del2(&pg[i], &d->page_list, &d->arch.relmem_list);
1219 }
1221 d->tot_pages -= 1 << order;
1222 drop_dom_ref = (d->tot_pages == 0);
1224 spin_unlock_recursive(&d->page_alloc_lock);
1226 if ( likely(!d->is_dying) )
1227 {
1228 free_heap_pages(pg, order);
1229 }
1230 else
1231 {
1232 /*
1233 * Normally we expect a domain to clear pages before freeing them,
1234 * if it cares about the secrecy of their contents. However, after
1235 * a domain has died we assume responsibility for erasure.
1236 */
1237 for ( i = 0; i < (1 << order); i++ )
1238 {
1239 page_set_owner(&pg[i], NULL);
1240 spin_lock(&page_scrub_lock);
1241 page_list_add(&pg[i], &page_scrub_list);
1242 scrub_pages++;
1243 spin_unlock(&page_scrub_lock);
1244 }
1245 }
1246 }
1247 else
1248 {
1249 /* Freeing anonymous domain-heap pages. */
1250 free_heap_pages(pg, order);
1251 drop_dom_ref = 0;
1252 }
1254 if ( drop_dom_ref )
1255 put_domain(d);
1256 }
1258 unsigned long avail_domheap_pages_region(
1259 unsigned int node, unsigned int min_width, unsigned int max_width)
1260 {
1261 int zone_lo, zone_hi;
1263 zone_lo = min_width ? bits_to_zone(min_width) : (MEMZONE_XEN + 1);
1264 zone_lo = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_lo));
1266 zone_hi = max_width ? bits_to_zone(max_width) : (NR_ZONES - 1);
1267 zone_hi = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_hi));
1269 return avail_heap_pages(zone_lo, zone_hi, node);
1270 }
1272 unsigned long avail_domheap_pages(void)
1273 {
1274 return avail_heap_pages(MEMZONE_XEN + 1,
1275 NR_ZONES - 1,
1276 -1);
1277 }
1279 static void pagealloc_keyhandler(unsigned char key)
1280 {
1281 unsigned int zone = MEMZONE_XEN;
1282 unsigned long n, total = 0;
1284 printk("Physical memory information:\n");
1285 printk(" Xen heap: %lukB free\n",
1286 avail_heap_pages(zone, zone, -1) << (PAGE_SHIFT-10));
1288 while ( ++zone < NR_ZONES )
1289 {
1290 if ( (zone + PAGE_SHIFT) == dma_bitsize )
1291 {
1292 printk(" DMA heap: %lukB free\n", total << (PAGE_SHIFT-10));
1293 total = 0;
1294 }
1296 if ( (n = avail_heap_pages(zone, zone, -1)) != 0 )
1297 {
1298 total += n;
1299 printk(" heap[%02u]: %lukB free\n", zone, n << (PAGE_SHIFT-10));
1303 printk(" Dom heap: %lukB free\n", total << (PAGE_SHIFT-10));
1307 static __init int pagealloc_keyhandler_init(void)
1308 {
1309 register_keyhandler('m', pagealloc_keyhandler, "memory info");
1310 return 0;
1311 }
1312 __initcall(pagealloc_keyhandler_init);
1316 /*************************
1317 * PAGE SCRUBBING
1318 */
1320 static DEFINE_PER_CPU(struct timer, page_scrub_timer);
1322 static void page_scrub_softirq(void)
1323 {
1324 PAGE_LIST_HEAD(list);
1325 struct page_info *pg;
1326 void *p;
1327 int i;
1328 s_time_t start = NOW();
1329 static spinlock_t serialise_lock = SPIN_LOCK_UNLOCKED;
1331 /* free_heap_pages() does not parallelise well. Serialise this function. */
1332 if ( !spin_trylock(&serialise_lock) )
1333 {
1334 set_timer(&this_cpu(page_scrub_timer), NOW() + MILLISECS(1));
1335 return;
1336 }
1338 /* Aim to do 1ms of work every 10ms. */
1339 do {
1340 spin_lock(&page_scrub_lock);
1342 /* Peel up to 16 pages from the list. */
1343 for ( i = 0; i < 16; i++ )
1344 {
1345 if ( !(pg = page_list_remove_head(&page_scrub_list)) )
1346 break;
1347 page_list_add_tail(pg, &list);
1348 }
1350 if ( unlikely(i == 0) )
1351 {
1352 spin_unlock(&page_scrub_lock);
1353 goto out;
1354 }
1356 scrub_pages -= i;
1358 spin_unlock(&page_scrub_lock);
1360 /* Scrub each page in turn. */
1361 while ( (pg = page_list_remove_head(&list)) ) {
1362 p = map_domain_page(page_to_mfn(pg));
1363 scrub_page(p);
1364 unmap_domain_page(p);
1365 free_heap_pages(pg, 0);
1366 }
1367 } while ( (NOW() - start) < MILLISECS(1) );
1369 set_timer(&this_cpu(page_scrub_timer), NOW() + MILLISECS(10));
1371 out:
1372 spin_unlock(&serialise_lock);
1373 }
1375 static void page_scrub_timer_fn(void *unused)
1376 {
1377 page_scrub_schedule_work();
1378 }
1380 unsigned long avail_scrub_pages(void)
1381 {
1382 return scrub_pages;
1383 }
1385 static void dump_heap(unsigned char key)
1386 {
1387 s_time_t now = NOW();
1388 int i, j;
1390 printk("'%c' pressed -> dumping heap info (now-0x%X:%08X)\n", key,
1391 (u32)(now>>32), (u32)now);
1393 for ( i = 0; i < MAX_NUMNODES; i++ )
1394 {
1395 if ( !avail[i] )
1396 continue;
1397 for ( j = 0; j < NR_ZONES; j++ )
1398 printk("heap[node=%d][zone=%d] -> %lu pages\n",
1399 i, j, avail[i][j]);
1400 }
1401 }
1403 static __init int register_heap_trigger(void)
1404 {
1405 register_keyhandler('H', dump_heap, "dump heap info");
1406 return 0;
1407 }
1408 __initcall(register_heap_trigger);
1411 static __init int page_scrub_init(void)
1412 {
1413 int cpu;
1414 for_each_cpu ( cpu )
1415 init_timer(&per_cpu(page_scrub_timer, cpu),
1416 page_scrub_timer_fn, NULL, cpu);
1417 open_softirq(PAGE_SCRUB_SOFTIRQ, page_scrub_softirq);
1418 return 0;
1419 }
1420 __initcall(page_scrub_init);
1422 /*
1423 * Local variables:
1424 * mode: C
1425 * c-set-style: "BSD"
1426 * c-basic-offset: 4
1427 * tab-width: 4
1428 * indent-tabs-mode: nil
1429 * End:
1430 */