xen/common/page_alloc.c -- ia64/xen-unstable, changeset 6832:5959fae4722a
(changeset "Set NE bit for VMX guest CR0", Signed-off-by: Chengyuan Li <chengyuan.li@intel.com>,
 committed by kaf24@firebug.cl.cam.ac.uk, Wed Sep 14 13:37:50 2005 +0000)

/******************************************************************************
 * page_alloc.c
 *
 * Simple buddy heap allocator for Xen.
 *
 * Copyright (c) 2002-2004 K A Fraser
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/types.h>
#include <xen/lib.h>
#include <xen/perfc.h>
#include <xen/sched.h>
#include <xen/spinlock.h>
#include <xen/mm.h>
#include <xen/irq.h>
#include <xen/softirq.h>
#include <xen/shadow.h>
#include <xen/domain_page.h>
#include <asm/page.h>

/*
 * Comma-separated list of hexadecimal page numbers containing bad bytes.
 * e.g. 'badpage=0x3f45,0x8a321'.
 */
static char opt_badpage[100] = "";
string_param("badpage", opt_badpage);

#define round_pgdown(_p)  ((_p)&PAGE_MASK)
#define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)

static spinlock_t page_scrub_lock = SPIN_LOCK_UNLOCKED;
LIST_HEAD(page_scrub_list);

/*********************
 * ALLOCATION BITMAP
 *  One bit per page of memory. Bit set => page is allocated.
 */

static unsigned long *alloc_bitmap;
#define PAGES_PER_MAPWORD (sizeof(unsigned long) * 8)

#define allocated_in_map(_pn)                 \
( !! (alloc_bitmap[(_pn)/PAGES_PER_MAPWORD] & \
      (1UL<<((_pn)&(PAGES_PER_MAPWORD-1)))) )

/*
 * Hint regarding bitwise arithmetic in map_{alloc,free}:
 *  -(1<<n) sets all bits >= n.
 *  (1<<n)-1 sets all bits < n.
 * Variable names in map_{alloc,free}:
 *  *_idx == Index into `alloc_bitmap' array.
 *  *_off == Bit offset within an element of the `alloc_bitmap' array.
 */
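
/*
 * Worked example (added for illustration; not in the original source):
 * on a 64-bit build PAGES_PER_MAPWORD is 64. Marking pages 3..8 of one
 * map word as allocated uses start_off=3, end_off=9, so map_alloc ORs in
 *     ((1UL<<9)-1) & -(1UL<<3)  ==  0x1f8    (bits 3..8 set)
 * while map_free clears the same region with
 *     -(1UL<<9) | ((1UL<<3)-1)  ==  ~0x1f8UL (every bit except 3..8).
 */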

static void map_alloc(unsigned long first_page, unsigned long nr_pages)
{
    unsigned long start_off, end_off, curr_idx, end_idx;

#ifndef NDEBUG
    unsigned long i;
    /* Check that the block isn't already allocated. */
    for ( i = 0; i < nr_pages; i++ )
        ASSERT(!allocated_in_map(first_page + i));
#endif

    curr_idx  = first_page / PAGES_PER_MAPWORD;
    start_off = first_page & (PAGES_PER_MAPWORD-1);
    end_idx   = (first_page + nr_pages) / PAGES_PER_MAPWORD;
    end_off   = (first_page + nr_pages) & (PAGES_PER_MAPWORD-1);

    if ( curr_idx == end_idx )
    {
        alloc_bitmap[curr_idx] |= ((1UL<<end_off)-1) & -(1UL<<start_off);
    }
    else
    {
        alloc_bitmap[curr_idx] |= -(1UL<<start_off);
        while ( ++curr_idx < end_idx ) alloc_bitmap[curr_idx] = ~0UL;
        alloc_bitmap[curr_idx] |= (1UL<<end_off)-1;
    }
}


static void map_free(unsigned long first_page, unsigned long nr_pages)
{
    unsigned long start_off, end_off, curr_idx, end_idx;

#ifndef NDEBUG
    unsigned long i;
    /* Check that the block isn't already freed. */
    for ( i = 0; i < nr_pages; i++ )
        ASSERT(allocated_in_map(first_page + i));
#endif

    curr_idx  = first_page / PAGES_PER_MAPWORD;
    start_off = first_page & (PAGES_PER_MAPWORD-1);
    end_idx   = (first_page + nr_pages) / PAGES_PER_MAPWORD;
    end_off   = (first_page + nr_pages) & (PAGES_PER_MAPWORD-1);

    if ( curr_idx == end_idx )
    {
        alloc_bitmap[curr_idx] &= -(1UL<<end_off) | ((1UL<<start_off)-1);
    }
    else
    {
        alloc_bitmap[curr_idx] &= (1UL<<start_off)-1;
        while ( ++curr_idx != end_idx ) alloc_bitmap[curr_idx] = 0;
        alloc_bitmap[curr_idx] &= -(1UL<<end_off);
    }
}

/*************************
 * BOOT-TIME ALLOCATOR
 */

/* Initialise allocator to handle up to @max_page pages. */
physaddr_t init_boot_allocator(physaddr_t bitmap_start)
{
    unsigned long bitmap_size;

    bitmap_start = round_pgup(bitmap_start);

    /*
     * Allocate space for the allocation bitmap. Include an extra longword
     * of padding for possible overrun in map_alloc and map_free.
     */
    bitmap_size  = max_page / 8;
    bitmap_size += sizeof(unsigned long);
    bitmap_size  = round_pgup(bitmap_size);
    alloc_bitmap = (unsigned long *)phys_to_virt(bitmap_start);

    /* All allocated by default. */
    memset(alloc_bitmap, ~0, bitmap_size);

    return bitmap_start + bitmap_size;
}
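
/*
 * Sizing example (added for illustration; not in the original source):
 * with 4KB pages and max_page = 0x100000 (4GB of RAM), the bitmap needs
 * 0x100000/8 = 128KB plus one longword of padding, which round_pgup()
 * turns into 33 pages reserved immediately above bitmap_start.
 */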

void init_boot_pages(physaddr_t ps, physaddr_t pe)
{
    unsigned long bad_pfn;
    char *p;

    ps = round_pgup(ps);
    pe = round_pgdown(pe);
    if ( pe <= ps )
        return;

    map_free(ps >> PAGE_SHIFT, (pe - ps) >> PAGE_SHIFT);

    /* Check new pages against the bad-page list. */
    p = opt_badpage;
    while ( *p != '\0' )
    {
        bad_pfn = simple_strtoul(p, &p, 0);

        if ( *p == ',' )
            p++;
        else if ( *p != '\0' )
            break;

        if ( (bad_pfn < max_page) && !allocated_in_map(bad_pfn) )
        {
            printk("Marking page %lx as bad\n", bad_pfn);
            map_alloc(bad_pfn, 1);
        }
    }
}

unsigned long alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
{
    unsigned long pg, i;

    for ( pg = 0; (pg + nr_pfns) < max_page; pg += pfn_align )
    {
        for ( i = 0; i < nr_pfns; i++ )
            if ( allocated_in_map(pg + i) )
                break;

        if ( i == nr_pfns )
        {
            map_alloc(pg, nr_pfns);
            return pg;
        }
    }

    return 0;
}
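
/*
 * Usage sketch (added for illustration; not in the original source):
 * alloc_boot_pages(1, 1) returns the first pfn whose bit is clear in
 * alloc_bitmap and marks it allocated; a request that cannot be satisfied
 * returns 0, so callers treat a zero return as failure.
 */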

/*************************
 * BINARY BUDDY ALLOCATOR
 */

#define MEMZONE_XEN    0
#define MEMZONE_DOM    1
#define MEMZONE_DMADOM 2
#define NR_ZONES       3

#define MAX_DMADOM_PFN 0x7FFFFUL /* 31 addressable bits */
#define pfn_dom_zone_type(_pfn)                                 \
    (((_pfn) <= MAX_DMADOM_PFN) ? MEMZONE_DMADOM : MEMZONE_DOM)
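
/*
 * Note (added for illustration): with 4KB pages, MAX_DMADOM_PFN = 0x7FFFF
 * covers physical addresses up to 0x7FFFFFFF, i.e. everything reachable
 * with 31 address bits, which is why such frames land in MEMZONE_DMADOM.
 */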

/* Up to 2^20 pages can be allocated at once. */
#define MAX_ORDER 20
static struct list_head heap[NR_ZONES][MAX_ORDER+1];

static unsigned long avail[NR_ZONES];

static spinlock_t heap_lock = SPIN_LOCK_UNLOCKED;

void end_boot_allocator(void)
{
    unsigned long i, j;
    int curr_free = 0, next_free = 0;

    memset(avail, 0, sizeof(avail));

    for ( i = 0; i < NR_ZONES; i++ )
        for ( j = 0; j <= MAX_ORDER; j++ )
            INIT_LIST_HEAD(&heap[i][j]);

    /* Pages that are free now go to the domain sub-allocator. */
    for ( i = 0; i < max_page; i++ )
    {
        curr_free = next_free;
        next_free = !allocated_in_map(i+1);
        if ( next_free )
            map_alloc(i+1, 1); /* prevent merging in free_heap_pages() */
        if ( curr_free )
            free_heap_pages(pfn_dom_zone_type(i), pfn_to_page(i), 0);
    }
}
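
/*
 * Note (added for illustration): the curr_free/next_free look-ahead above
 * marks page i+1 as allocated before page i is passed to free_heap_pages(),
 * so the buddy merger cannot coalesce into a page that has not yet been
 * handed over to the domain sub-allocator.
 */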

/* Hand the specified arbitrary page range to the specified heap zone. */
void init_heap_pages(
    unsigned int zone, struct pfn_info *pg, unsigned long nr_pages)
{
    unsigned long i;

    ASSERT(zone < NR_ZONES);

    for ( i = 0; i < nr_pages; i++ )
        free_heap_pages(zone, pg+i, 0);
}


/* Allocate 2^@order contiguous pages. */
struct pfn_info *alloc_heap_pages(unsigned int zone, unsigned int order)
{
    int i;
    struct pfn_info *pg;

    ASSERT(zone < NR_ZONES);

    if ( unlikely(order > MAX_ORDER) )
        return NULL;

    spin_lock(&heap_lock);

    /* Find smallest order which can satisfy the request. */
    for ( i = order; i <= MAX_ORDER; i++ )
        if ( !list_empty(&heap[zone][i]) )
            goto found;

    /* No suitable memory blocks. Fail the request. */
    spin_unlock(&heap_lock);
    return NULL;

 found:
    pg = list_entry(heap[zone][i].next, struct pfn_info, list);
    list_del(&pg->list);

    /* We may have to halve the chunk a number of times. */
    while ( i != order )
    {
        PFN_ORDER(pg) = --i;
        list_add_tail(&pg->list, &heap[zone][i]);
        pg += 1 << i;
    }

    map_alloc(page_to_pfn(pg), 1 << order);
    avail[zone] -= 1 << order;

    spin_unlock(&heap_lock);

    return pg;
}
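
/*
 * Splitting example (added for illustration; not in the original source):
 * a request for order 2 (4 pages) served from an order-4 chunk at pfn 0
 * leaves pfns 0..7 on heap[zone][3] and pfns 8..11 on heap[zone][2], and
 * returns the final quarter (pfns 12..15) to the caller.
 */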

/* Free 2^@order set of pages. */
void free_heap_pages(
    unsigned int zone, struct pfn_info *pg, unsigned int order)
{
    unsigned long mask;

    ASSERT(zone < NR_ZONES);
    ASSERT(order <= MAX_ORDER);

    spin_lock(&heap_lock);

    map_free(page_to_pfn(pg), 1 << order);
    avail[zone] += 1 << order;

    /* Merge chunks as far as possible. */
    while ( order < MAX_ORDER )
    {
        mask = 1 << order;

        if ( (page_to_pfn(pg) & mask) )
        {
            /* Merge with predecessor block? */
            if ( allocated_in_map(page_to_pfn(pg)-mask) ||
                 (PFN_ORDER(pg-mask) != order) )
                break;
            list_del(&(pg-mask)->list);
            pg -= mask;
        }
        else
        {
            /* Merge with successor block? */
            if ( allocated_in_map(page_to_pfn(pg)+mask) ||
                 (PFN_ORDER(pg+mask) != order) )
                break;
            list_del(&(pg+mask)->list);
        }

        order++;
    }

    PFN_ORDER(pg) = order;
    list_add_tail(&pg->list, &heap[zone][order]);

    spin_unlock(&heap_lock);
}
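
/*
 * Merging example (added for illustration; not in the original source):
 * freeing an order-2 block at pfn 12 finds bit 2 of the pfn set, so its
 * buddy is the predecessor at pfn 8; if that block is free and also of
 * order 2, the two coalesce into an order-3 block at pfn 8 and the loop
 * retries at the next order.
 */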

/*
 * Scrub all unallocated pages in all heap zones. This function is more
 * convoluted than appears necessary because we do not want to continuously
 * hold the lock or disable interrupts while scrubbing very large memory areas.
 */
void scrub_heap_pages(void)
{
    void *p;
    unsigned long pfn;
    int cpu = smp_processor_id();

    printk("Scrubbing Free RAM: ");

    for ( pfn = 0; pfn < max_page; pfn++ )
    {
        /* Every 100MB, print a progress dot. */
        if ( (pfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
            printk(".");

        if ( unlikely(softirq_pending(cpu)) )
            do_softirq();

        /* Quick lock-free check. */
        if ( allocated_in_map(pfn) )
            continue;

        spin_lock_irq(&heap_lock);

        /* Re-check page status with lock held. */
        if ( !allocated_in_map(pfn) )
        {
            if ( IS_XEN_HEAP_FRAME(pfn_to_page(pfn)) )
            {
                p = page_to_virt(pfn_to_page(pfn));
                memguard_unguard_range(p, PAGE_SIZE);
                clear_page(p);
                memguard_guard_range(p, PAGE_SIZE);
            }
            else
            {
                p = map_domain_page(pfn);
                clear_page(p);
                unmap_domain_page(p);
            }
        }

        spin_unlock_irq(&heap_lock);
    }

    printk("done.\n");
}

/*************************
 * XEN-HEAP SUB-ALLOCATOR
 */

void init_xenheap_pages(physaddr_t ps, physaddr_t pe)
{
    unsigned long flags;

    ps = round_pgup(ps);
    pe = round_pgdown(pe);
    if ( pe <= ps )
        return;

    memguard_guard_range(phys_to_virt(ps), pe - ps);

    /*
     * Yuk! Ensure there is a one-page buffer between Xen and Dom zones, to
     * prevent merging of power-of-two blocks across the zone boundary.
     */
    if ( !IS_XEN_HEAP_FRAME(phys_to_page(pe)) )
        pe -= PAGE_SIZE;

    local_irq_save(flags);
    init_heap_pages(MEMZONE_XEN, phys_to_page(ps), (pe - ps) >> PAGE_SHIFT);
    local_irq_restore(flags);
}

void *alloc_xenheap_pages(unsigned int order)
{
    unsigned long flags;
    struct pfn_info *pg;
    int i;

    local_irq_save(flags);
    pg = alloc_heap_pages(MEMZONE_XEN, order);
    local_irq_restore(flags);

    if ( unlikely(pg == NULL) )
        goto no_memory;

    memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT));

    for ( i = 0; i < (1 << order); i++ )
    {
        pg[i].count_info        = 0;
        pg[i].u.inuse._domain   = 0;
        pg[i].u.inuse.type_info = 0;
    }

    return page_to_virt(pg);

 no_memory:
    printk("Cannot handle page request order %d!\n", order);
    return NULL;
}

void free_xenheap_pages(void *v, unsigned int order)
{
    unsigned long flags;

    memguard_guard_range(v, 1 << (order + PAGE_SHIFT));

    local_irq_save(flags);
    free_heap_pages(MEMZONE_XEN, virt_to_page(v), order);
    local_irq_restore(flags);
}


/*************************
 * DOMAIN-HEAP SUB-ALLOCATOR
 */

void init_domheap_pages(physaddr_t ps, physaddr_t pe)
{
    unsigned long s_tot, e_tot, s_dma, e_dma, s_nrm, e_nrm;

    ASSERT(!in_irq());

    s_tot = round_pgup(ps) >> PAGE_SHIFT;
    e_tot = round_pgdown(pe) >> PAGE_SHIFT;

    s_dma = min(s_tot, MAX_DMADOM_PFN + 1);
    e_dma = min(e_tot, MAX_DMADOM_PFN + 1);
    if ( s_dma < e_dma )
        init_heap_pages(MEMZONE_DMADOM, pfn_to_page(s_dma), e_dma - s_dma);

    s_nrm = max(s_tot, MAX_DMADOM_PFN + 1);
    e_nrm = max(e_tot, MAX_DMADOM_PFN + 1);
    if ( s_nrm < e_nrm )
        init_heap_pages(MEMZONE_DOM, pfn_to_page(s_nrm), e_nrm - s_nrm);
}
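
/*
 * Split example (added for illustration; not in the original source):
 * handing the range [1GB, 3GB) to the domain heap places [1GB, 2GB) in
 * MEMZONE_DMADOM (pfns up to MAX_DMADOM_PFN) and [2GB, 3GB) in
 * MEMZONE_DOM; a range entirely on one side of the boundary feeds only
 * one zone.
 */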

struct pfn_info *alloc_domheap_pages(
    struct domain *d, unsigned int order, unsigned int flags)
{
    struct pfn_info *pg = NULL;
    cpumask_t mask;
    int i;

    ASSERT(!in_irq());

    if ( !(flags & ALLOC_DOM_DMA) )
        pg = alloc_heap_pages(MEMZONE_DOM, order);

    if ( pg == NULL )
        if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, order)) == NULL )
            return NULL;

    mask = pg->u.free.cpumask;
    tlbflush_filter(mask, pg->tlbflush_timestamp);

    pg->count_info        = 0;
    pg->u.inuse._domain   = 0;
    pg->u.inuse.type_info = 0;

    for ( i = 1; i < (1 << order); i++ )
    {
        /* Add in any extra CPUs that need flushing because of this page. */
        cpumask_t extra_cpus_mask;
        cpus_andnot(extra_cpus_mask, pg[i].u.free.cpumask, mask);
        tlbflush_filter(extra_cpus_mask, pg[i].tlbflush_timestamp);
        cpus_or(mask, mask, extra_cpus_mask);

        pg[i].count_info        = 0;
        pg[i].u.inuse._domain   = 0;
        pg[i].u.inuse.type_info = 0;
    }

    if ( unlikely(!cpus_empty(mask)) )
    {
        perfc_incrc(need_flush_tlb_flush);
        flush_tlb_mask(mask);
    }

    if ( d == NULL )
        return pg;

    spin_lock(&d->page_alloc_lock);

    if ( unlikely(test_bit(_DOMF_dying, &d->domain_flags)) ||
         unlikely((d->tot_pages + (1 << order)) > d->max_pages) )
    {
        DPRINTK("Over-allocation for domain %u: %u > %u\n",
                d->domain_id, d->tot_pages + (1 << order), d->max_pages);
        DPRINTK("...or the domain is dying (%d)\n",
                !!test_bit(_DOMF_dying, &d->domain_flags));
        spin_unlock(&d->page_alloc_lock);
        free_heap_pages(pfn_dom_zone_type(page_to_pfn(pg)), pg, order);
        return NULL;
    }

    if ( unlikely(d->tot_pages == 0) )
        get_knownalive_domain(d);

    d->tot_pages += 1 << order;

    for ( i = 0; i < (1 << order); i++ )
    {
        page_set_owner(&pg[i], d);
        wmb(); /* Domain pointer must be visible before updating refcnt. */
        pg[i].count_info |= PGC_allocated | 1;
        list_add_tail(&pg[i].list, &d->page_list);
    }

    spin_unlock(&d->page_alloc_lock);

    return pg;
}
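
/*
 * Usage sketch (added for illustration; not in the original source):
 * alloc_domheap_pages(d, 0, 0) gives domain d one page, trying MEMZONE_DOM
 * first and falling back to MEMZONE_DMADOM; passing ALLOC_DOM_DMA skips
 * the normal zone, and d == NULL returns an anonymous page with no owner
 * accounting performed.
 */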

void free_domheap_pages(struct pfn_info *pg, unsigned int order)
{
    int            i, drop_dom_ref;
    struct domain *d = page_get_owner(pg);

    ASSERT(!in_irq());

    if ( unlikely(IS_XEN_HEAP_FRAME(pg)) )
    {
        /* NB. May recursively lock from relinquish_memory(). */
        spin_lock_recursive(&d->page_alloc_lock);

        for ( i = 0; i < (1 << order); i++ )
            list_del(&pg[i].list);

        d->xenheap_pages -= 1 << order;
        drop_dom_ref = (d->xenheap_pages == 0);

        spin_unlock_recursive(&d->page_alloc_lock);
    }
    else if ( likely(d != NULL) )
    {
        /* NB. May recursively lock from relinquish_memory(). */
        spin_lock_recursive(&d->page_alloc_lock);

        for ( i = 0; i < (1 << order); i++ )
        {
            shadow_drop_references(d, &pg[i]);
            ASSERT(((pg[i].u.inuse.type_info & PGT_count_mask) == 0) ||
                   shadow_tainted_refcnts(d));
            pg[i].tlbflush_timestamp  = tlbflush_current_time();
            pg[i].u.free.cpumask      = d->cpumask;
            list_del(&pg[i].list);
        }

        d->tot_pages -= 1 << order;
        drop_dom_ref = (d->tot_pages == 0);

        spin_unlock_recursive(&d->page_alloc_lock);

        if ( likely(!test_bit(_DOMF_dying, &d->domain_flags)) )
        {
            free_heap_pages(pfn_dom_zone_type(page_to_pfn(pg)), pg, order);
        }
        else
        {
            /*
             * Normally we expect a domain to clear pages before freeing them,
             * if it cares about the secrecy of their contents. However, after
             * a domain has died we assume responsibility for erasure.
             */
            for ( i = 0; i < (1 << order); i++ )
            {
                spin_lock(&page_scrub_lock);
                list_add(&pg[i].list, &page_scrub_list);
                spin_unlock(&page_scrub_lock);
            }
        }
    }
    else
    {
        /* Freeing an anonymous domain-heap page. */
        free_heap_pages(pfn_dom_zone_type(page_to_pfn(pg)), pg, order);
        drop_dom_ref = 0;
    }

    if ( drop_dom_ref )
        put_domain(d);
}

unsigned long avail_domheap_pages(void)
{
    return avail[MEMZONE_DOM] + avail[MEMZONE_DMADOM];
}

/*************************
 * PAGE SCRUBBING
 */

static void page_scrub_softirq(void)
{
    struct list_head *ent;
    struct pfn_info  *pg;
    void             *p;
    int               i;
    s_time_t          start = NOW();

    /* Aim to do 1ms of work (ten percent of a 10ms jiffy). */
    do {
        spin_lock(&page_scrub_lock);

        if ( unlikely((ent = page_scrub_list.next) == &page_scrub_list) )
        {
            spin_unlock(&page_scrub_lock);
            return;
        }

        /* Peel up to 16 pages from the list. */
        for ( i = 0; i < 16; i++ )
        {
            if ( ent->next == &page_scrub_list )
                break;
            ent = ent->next;
        }

        /* Remove peeled pages from the list. */
        ent->next->prev = &page_scrub_list;
        page_scrub_list.next = ent->next;

        spin_unlock(&page_scrub_lock);

        /* Working backwards, scrub each page in turn. */
        while ( ent != &page_scrub_list )
        {
            pg = list_entry(ent, struct pfn_info, list);
            ent = ent->prev;
            p = map_domain_page(page_to_pfn(pg));
            clear_page(p);
            unmap_domain_page(p);
            free_heap_pages(pfn_dom_zone_type(page_to_pfn(pg)), pg, 0);
        }
    } while ( (NOW() - start) < MILLISECS(1) );
}
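
/*
 * Note (added for illustration): pages arrive on page_scrub_list via
 * free_domheap_pages() when their owner is already dying; this softirq
 * then zeroes them in batches of at most 16 and stops after roughly 1ms
 * of work per invocation.
 */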

static __init int page_scrub_init(void)
{
    open_softirq(PAGE_SCRUB_SOFTIRQ, page_scrub_softirq);
    return 0;
}
__initcall(page_scrub_init);

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */