ia64/xen-unstable

changeset 11972:cf95c3218a70

[XEN] NUMA-ify Xen heap and page allocator

This patch adds per-node buckets to the heap structure in Xen.
During heap initialization the patch determines into which bucket each
range of memory should be placed. We reserve a guard page at any node
boundary that is not already guarded by a MAX_ORDER boundary, to
prevent the buddy allocator from merging pages across nodes.

Signed-off-by: Ryan Harper <ryanh@us.ibm.com>
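
For illustration: the structural change is that heap[NR_ZONES][MAX_ORDER+1]
and avail[NR_ZONES] gain a MAX_NUMNODES dimension, and init_heap_pages()
decides per page whether to free it to the buddy allocator or withhold it as
a guard page. The standalone C sketch below (not part of the patch) mimics
only that guard-page decision, using made-up constants and a toy pfn_to_nid()
mapping; it tests MAX_ORDER alignment of the page frame number, which is the
behaviour the description above intends, rather than the patch's exact
address test.

    #include <stdio.h>

    /* Toy constants for the sketch; the names and values here are
     * illustrative, not Xen's. */
    #define MAX_ORDER       6                    /* toy value for the sketch */
    #define MAX_ORDER_PAGES (1UL << MAX_ORDER)   /* pages per MAX_ORDER block */

    /* Toy node map: node 0 owns pfns [0, 1000), node 1 owns [1000, 2000).
     * 1000 is not a multiple of 64, so the first page of node 1 must be
     * withheld as a guard page. */
    static int pfn_to_nid(unsigned long pfn)
    {
        return (pfn < 1000) ? 0 : 1;
    }

    int main(void)
    {
        unsigned long pfn, reserved = 0;
        int nid_prev = pfn_to_nid(0), nid_curr;

        for ( pfn = 1; pfn < 2000; pfn++ )
        {
            nid_curr = pfn_to_nid(pfn);

            /* A page is handed to the buddy allocator unless it starts a
             * new node *and* does not sit on a MAX_ORDER-aligned boundary,
             * where merging across the node split could otherwise occur. */
            if ( (nid_curr != nid_prev) && ((pfn & (MAX_ORDER_PAGES - 1)) != 0) )
            {
                printf("Reserving non-aligned node boundary @ pfn %lu\n", pfn);
                reserved++;
            }

            nid_prev = nid_curr;
        }

        printf("%lu guard page(s) reserved\n", reserved);
        return 0;
    }

Under these toy assumptions only the page at pfn 1000 is withheld; a boundary
at pfn 1024 would need no guard, since a MAX_ORDER-aligned boundary already
stops the buddy allocator from merging across it.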
author kfraser@localhost.localdomain
date Wed Oct 25 12:28:46 2006 +0100 (2006-10-25)
parents f312c2d01d8b
children 041507e2754c
files xen/common/page_alloc.c xen/include/xen/mm.h
line diff
     1.1 --- a/xen/common/page_alloc.c	Wed Oct 25 12:25:54 2006 +0100
     1.2 +++ b/xen/common/page_alloc.c	Wed Oct 25 12:28:46 2006 +0100
     1.3 @@ -4,6 +4,7 @@
     1.4   * Simple buddy heap allocator for Xen.
     1.5   * 
     1.6   * Copyright (c) 2002-2004 K A Fraser
     1.7 + * Copyright (c) 2006 IBM Ryan Harper <ryanh@us.ibm.com>
     1.8   * 
     1.9   * This program is free software; you can redistribute it and/or modify
    1.10   * it under the terms of the GNU General Public License as published by
    1.11 @@ -34,6 +35,8 @@
    1.12  #include <xen/keyhandler.h>
    1.13  #include <xen/perfc.h>
    1.14  #include <asm/page.h>
    1.15 +#include <asm/numa.h>
    1.16 +#include <asm/topology.h>
    1.17  
    1.18  /*
    1.19   * Comma-separated list of hexadecimal page numbers containing bad bytes.
    1.20 @@ -247,22 +250,23 @@ unsigned long alloc_boot_pages(unsigned 
    1.21  #define pfn_dom_zone_type(_pfn)                                 \
    1.22      (((_pfn) <= MAX_DMADOM_PFN) ? MEMZONE_DMADOM : MEMZONE_DOM)
    1.23  
    1.24 -static struct list_head heap[NR_ZONES][MAX_ORDER+1];
    1.25 +static struct list_head heap[NR_ZONES][MAX_NUMNODES][MAX_ORDER+1];
    1.26  
    1.27 -static unsigned long avail[NR_ZONES];
    1.28 +static unsigned long avail[NR_ZONES][MAX_NUMNODES];
    1.29  
    1.30  static DEFINE_SPINLOCK(heap_lock);
    1.31  
    1.32  void end_boot_allocator(void)
    1.33  {
    1.34 -    unsigned long i, j;
    1.35 +    unsigned long i, j, k;
    1.36      int curr_free = 0, next_free = 0;
    1.37  
    1.38      memset(avail, 0, sizeof(avail));
    1.39  
    1.40      for ( i = 0; i < NR_ZONES; i++ )
    1.41 -        for ( j = 0; j <= MAX_ORDER; j++ )
    1.42 -            INIT_LIST_HEAD(&heap[i][j]);
    1.43 +        for ( j = 0; j < MAX_NUMNODES; j++ )
    1.44 +            for ( k = 0; k <= MAX_ORDER; k++ )
    1.45 +                INIT_LIST_HEAD(&heap[i][j][k]);
    1.46  
    1.47      /* Pages that are free now go to the domain sub-allocator. */
    1.48      for ( i = 0; i < max_page; i++ )
    1.49 @@ -272,29 +276,59 @@ void end_boot_allocator(void)
    1.50          if ( next_free )
    1.51              map_alloc(i+1, 1); /* prevent merging in free_heap_pages() */
    1.52          if ( curr_free )
    1.53 -            free_heap_pages(pfn_dom_zone_type(i), mfn_to_page(i), 0);
    1.54 +            init_heap_pages(pfn_dom_zone_type(i), mfn_to_page(i), 1);
    1.55      }
    1.56  }
    1.57  
    1.58 -/* Hand the specified arbitrary page range to the specified heap zone. */
     1.59 +/*
     1.60 + * Hand the specified arbitrary page range to the specified heap zone,
     1.61 + * checking each page's node_id against that of the previous page.  If
     1.62 + * they differ and the current page is not on a MAX_ORDER boundary, we
     1.63 + * reserve the page by not freeing it to the buddy allocator.
     1.64 + */
    1.65 +#define MAX_ORDER_ALIGNED (1UL << (MAX_ORDER))
    1.66  void init_heap_pages(
    1.67      unsigned int zone, struct page_info *pg, unsigned long nr_pages)
    1.68  {
    1.69 +    unsigned int nid_curr,nid_prev;
    1.70      unsigned long i;
    1.71  
    1.72      ASSERT(zone < NR_ZONES);
    1.73  
    1.74 +    if ( likely(page_to_mfn(pg) != 0) )
    1.75 +        nid_prev = phys_to_nid(page_to_maddr(pg-1));
    1.76 +    else
    1.77 +        nid_prev = phys_to_nid(page_to_maddr(pg));
    1.78 +
    1.79      for ( i = 0; i < nr_pages; i++ )
    1.80 -        free_heap_pages(zone, pg+i, 0);
    1.81 +    {
    1.82 +        nid_curr = phys_to_nid(page_to_maddr(pg+i));
    1.83 +
    1.84 +        /*
     1.85 +         * Free pages of the same node, or, if the nodes differ, pages on
     1.86 +         * a MAX_ORDER alignment boundary (which is already guarded).
    1.87 +         */
    1.88 +         if ( (nid_curr == nid_prev) || (page_to_maddr(pg+i) &
    1.89 +                                         MAX_ORDER_ALIGNED) )
    1.90 +             free_heap_pages(zone, pg+i, 0);
    1.91 +         else
    1.92 +             printk("Reserving non-aligned node boundary @ mfn %lu\n",
    1.93 +                    page_to_mfn(pg+i));
    1.94 +
    1.95 +        nid_prev = nid_curr;
    1.96 +    }
    1.97  }
    1.98  
    1.99 -
   1.100  /* Allocate 2^@order contiguous pages. */
   1.101 -struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order)
   1.102 +struct page_info *alloc_heap_pages(unsigned int zone, unsigned int cpu,
   1.103 +                                   unsigned int order)
   1.104  {
   1.105 -    int i;
   1.106 +    unsigned int i,j, node = cpu_to_node(cpu), num_nodes = num_online_nodes();
   1.107 +    unsigned int request = (1UL << order);
   1.108      struct page_info *pg;
   1.109  
   1.110 +    ASSERT(node >= 0);
   1.111 +    ASSERT(node < num_nodes);
   1.112      ASSERT(zone < NR_ZONES);
   1.113  
   1.114      if ( unlikely(order > MAX_ORDER) )
   1.115 @@ -302,29 +336,46 @@ struct page_info *alloc_heap_pages(unsig
   1.116  
   1.117      spin_lock(&heap_lock);
   1.118  
   1.119 -    /* Find smallest order which can satisfy the request. */
   1.120 -    for ( i = order; i <= MAX_ORDER; i++ )
   1.121 -        if ( !list_empty(&heap[zone][i]) )
   1.122 -            goto found;
    1.123 +    /* Start with the requested node, but exhaust all node memory
    1.124 +     * in the requested zone before failing.  Only compute a new node
    1.125 +     * value if we fail to find memory in the target node; this avoids
    1.126 +     * needless computation on the fast path. */
   1.127 +    for ( i = 0; i < num_nodes; i++ )
   1.128 +    {
   1.129 +        /* check if target node can support the allocation */
   1.130 +        if ( avail[zone][node] >= request )
   1.131 +        {
   1.132 +            /* Find smallest order which can satisfy the request. */
   1.133 +            for ( j = order; j <= MAX_ORDER; j++ )
   1.134 +            {
   1.135 +                if ( !list_empty(&heap[zone][node][j]) )
   1.136 +                    goto found;
   1.137 +            }
   1.138 +        }
   1.139 +        /* pick next node, wrapping around if needed */
   1.140 +        if ( ++node == num_nodes )
   1.141 +            node = 0;
   1.142 +    }
   1.143  
   1.144      /* No suitable memory blocks. Fail the request. */
   1.145      spin_unlock(&heap_lock);
   1.146      return NULL;
   1.147  
   1.148   found: 
   1.149 -    pg = list_entry(heap[zone][i].next, struct page_info, list);
   1.150 +    pg = list_entry(heap[zone][node][j].next, struct page_info, list);
   1.151      list_del(&pg->list);
   1.152  
   1.153      /* We may have to halve the chunk a number of times. */
   1.154 -    while ( i != order )
   1.155 +    while ( j != order )
   1.156      {
   1.157 -        PFN_ORDER(pg) = --i;
   1.158 -        list_add_tail(&pg->list, &heap[zone][i]);
   1.159 -        pg += 1 << i;
   1.160 +        PFN_ORDER(pg) = --j;
   1.161 +        list_add_tail(&pg->list, &heap[zone][node][j]);
   1.162 +        pg += 1 << j;
   1.163      }
   1.164      
   1.165 -    map_alloc(page_to_mfn(pg), 1 << order);
   1.166 -    avail[zone] -= 1 << order;
   1.167 +    map_alloc(page_to_mfn(pg), request);
   1.168 +    ASSERT(avail[zone][node] >= request);
   1.169 +    avail[zone][node] -= request;
   1.170  
   1.171      spin_unlock(&heap_lock);
   1.172  
   1.173 @@ -337,14 +388,17 @@ void free_heap_pages(
   1.174      unsigned int zone, struct page_info *pg, unsigned int order)
   1.175  {
   1.176      unsigned long mask;
   1.177 +    int node = phys_to_nid(page_to_maddr(pg));
   1.178  
   1.179      ASSERT(zone < NR_ZONES);
   1.180      ASSERT(order <= MAX_ORDER);
   1.181 +    ASSERT(node >= 0);
   1.182 +    ASSERT(node < num_online_nodes());
   1.183  
   1.184      spin_lock(&heap_lock);
   1.185  
   1.186      map_free(page_to_mfn(pg), 1 << order);
   1.187 -    avail[zone] += 1 << order;
   1.188 +    avail[zone][node] += 1 << order;
   1.189      
   1.190      /* Merge chunks as far as possible. */
   1.191      while ( order < MAX_ORDER )
   1.192 @@ -370,10 +424,13 @@ void free_heap_pages(
   1.193          }
   1.194          
   1.195          order++;
   1.196 +
   1.197 +        /* after merging, pg should be in the same node */
   1.198 +        ASSERT(phys_to_nid(page_to_maddr(pg)) == node );
   1.199      }
   1.200  
   1.201      PFN_ORDER(pg) = order;
   1.202 -    list_add_tail(&pg->list, &heap[zone][order]);
   1.203 +    list_add_tail(&pg->list, &heap[zone][node][order]);
   1.204  
   1.205      spin_unlock(&heap_lock);
   1.206  }
   1.207 @@ -466,7 +523,7 @@ void *alloc_xenheap_pages(unsigned int o
   1.208      int i;
   1.209  
   1.210      local_irq_save(flags);
   1.211 -    pg = alloc_heap_pages(MEMZONE_XEN, order);
   1.212 +    pg = alloc_heap_pages(MEMZONE_XEN, smp_processor_id(), order);
   1.213      local_irq_restore(flags);
   1.214  
   1.215      if ( unlikely(pg == NULL) )
   1.216 @@ -580,8 +637,9 @@ int assign_pages(
   1.217  }
   1.218  
   1.219  
   1.220 -struct page_info *alloc_domheap_pages(
   1.221 -    struct domain *d, unsigned int order, unsigned int memflags)
   1.222 +struct page_info *__alloc_domheap_pages(
   1.223 +    struct domain *d, unsigned int cpu, unsigned int order, 
   1.224 +    unsigned int memflags)
   1.225  {
   1.226      struct page_info *pg = NULL;
   1.227      cpumask_t mask;
   1.228 @@ -591,17 +649,17 @@ struct page_info *alloc_domheap_pages(
   1.229  
   1.230      if ( !(memflags & MEMF_dma) )
   1.231      {
   1.232 -        pg = alloc_heap_pages(MEMZONE_DOM, order);
   1.233 +        pg = alloc_heap_pages(MEMZONE_DOM, cpu, order);
   1.234          /* Failure? Then check if we can fall back to the DMA pool. */
   1.235          if ( unlikely(pg == NULL) &&
   1.236               ((order > MAX_ORDER) ||
   1.237 -              (avail[MEMZONE_DMADOM] <
   1.238 +              (avail_heap_pages(MEMZONE_DMADOM,-1) <
   1.239                 (lowmem_emergency_pool_pages + (1UL << order)))) )
   1.240              return NULL;
   1.241      }
   1.242  
   1.243      if ( pg == NULL )
   1.244 -        if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, order)) == NULL )
   1.245 +        if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, cpu, order)) == NULL )
   1.246              return NULL;
   1.247  
   1.248      mask = pg->u.free.cpumask;
   1.249 @@ -640,6 +698,13 @@ struct page_info *alloc_domheap_pages(
   1.250      return pg;
   1.251  }
   1.252  
   1.253 +inline struct page_info *alloc_domheap_pages(
   1.254 +    struct domain *d, unsigned int order, unsigned int flags)
   1.255 +{
   1.256 +    return __alloc_domheap_pages(d, smp_processor_id(), order, flags);
   1.257 +
   1.258 +}
   1.259 +
   1.260  
   1.261  void free_domheap_pages(struct page_info *pg, unsigned int order)
   1.262  {
   1.263 @@ -714,13 +779,27 @@ void free_domheap_pages(struct page_info
   1.264  }
   1.265  
   1.266  
   1.267 +unsigned long avail_heap_pages(int zone, int node)
   1.268 +{
   1.269 +    int i,j, num_nodes = num_online_nodes();
   1.270 +    unsigned long free_pages = 0;
   1.271 +   
   1.272 +    for (i=0; i<NR_ZONES; i++)
   1.273 +        if ( (zone == -1) || (zone == i) )
   1.274 +            for (j=0; j < num_nodes; j++)
   1.275 +                if ( (node == -1) || (node == j) )
   1.276 +                    free_pages += avail[i][j];            
   1.277 +
   1.278 +    return free_pages;
   1.279 +}
   1.280 +
   1.281  unsigned long avail_domheap_pages(void)
   1.282  {
   1.283      unsigned long avail_nrm, avail_dma;
   1.284 +    
   1.285 +    avail_nrm = avail_heap_pages(MEMZONE_DOM,-1);
   1.286  
   1.287 -    avail_nrm = avail[MEMZONE_DOM];
   1.288 -
   1.289 -    avail_dma = avail[MEMZONE_DMADOM];
   1.290 +    avail_dma = avail_heap_pages(MEMZONE_DMADOM,-1);
   1.291      if ( avail_dma > lowmem_emergency_pool_pages )
   1.292          avail_dma -= lowmem_emergency_pool_pages;
   1.293      else
   1.294 @@ -729,6 +808,10 @@ unsigned long avail_domheap_pages(void)
   1.295      return avail_nrm + avail_dma;
   1.296  }
   1.297  
   1.298 +unsigned long avail_nodeheap_pages(int node)
   1.299 +{
   1.300 +    return avail_heap_pages(-1, node);
   1.301 +}
   1.302  
   1.303  static void pagealloc_keyhandler(unsigned char key)
   1.304  {
   1.305 @@ -736,9 +819,9 @@ static void pagealloc_keyhandler(unsigne
   1.306      printk("    Xen heap: %lukB free\n"
   1.307             "    DMA heap: %lukB free\n"
   1.308             "    Dom heap: %lukB free\n",
   1.309 -           avail[MEMZONE_XEN]<<(PAGE_SHIFT-10),
   1.310 -           avail[MEMZONE_DMADOM]<<(PAGE_SHIFT-10),
   1.311 -           avail[MEMZONE_DOM]<<(PAGE_SHIFT-10));
   1.312 +           avail_heap_pages(MEMZONE_XEN, -1) << (PAGE_SHIFT-10), 
   1.313 +           avail_heap_pages(MEMZONE_DMADOM, -1) <<(PAGE_SHIFT-10), 
   1.314 +           avail_heap_pages(MEMZONE_DOM, -1) <<(PAGE_SHIFT-10));
   1.315  }
   1.316  
   1.317  
   1.318 @@ -806,6 +889,46 @@ unsigned long avail_scrub_pages(void)
   1.319      return scrub_pages;
   1.320  }
   1.321  
   1.322 +static unsigned long count_bucket(struct list_head* l, int order)
   1.323 +{
   1.324 +    unsigned long total_pages = 0;
   1.325 +    int pages = 1 << order;
   1.326 +    struct page_info *pg;
   1.327 +
   1.328 +    list_for_each_entry(pg, l, list)
   1.329 +        total_pages += pages;
   1.330 +
   1.331 +    return total_pages;
   1.332 +}
   1.333 +
   1.334 +static void dump_heap(unsigned char key)
   1.335 +{
   1.336 +    s_time_t       now = NOW();
   1.337 +    int i,j,k;
   1.338 +    unsigned long total;
   1.339 +
   1.340 +    printk("'%c' pressed -> dumping heap info (now-0x%X:%08X)\n", key,
   1.341 +           (u32)(now>>32), (u32)now);
   1.342 +
   1.343 +    for (i=0; i<NR_ZONES; i++ )
   1.344 +        for (j=0;j<MAX_NUMNODES;j++)
   1.345 +            for (k=0;k<=MAX_ORDER;k++)
   1.346 +                if ( !list_empty(&heap[i][j][k]) )
   1.347 +                {
   1.348 +                    total = count_bucket(&heap[i][j][k], k);
   1.349 +                    printk("heap[%d][%d][%d]-> %lu pages\n",
   1.350 +                            i, j, k, total);
   1.351 +                }
   1.352 +}
   1.353 +
   1.354 +static __init int register_heap_trigger(void)
   1.355 +{
   1.356 +    register_keyhandler('H', dump_heap, "dump heap info");
   1.357 +    return 0;
   1.358 +}
   1.359 +__initcall(register_heap_trigger);
   1.360 +
   1.361 +
   1.362  static __init int page_scrub_init(void)
   1.363  {
   1.364      open_softirq(PAGE_SCRUB_SOFTIRQ, page_scrub_softirq);
     2.1 --- a/xen/include/xen/mm.h	Wed Oct 25 12:25:54 2006 +0100
     2.2 +++ b/xen/include/xen/mm.h	Wed Oct 25 12:28:46 2006 +0100
     2.3 @@ -45,7 +45,8 @@ void end_boot_allocator(void);
     2.4  /* Generic allocator. These functions are *not* interrupt-safe. */
     2.5  void init_heap_pages(
     2.6      unsigned int zone, struct page_info *pg, unsigned long nr_pages);
     2.7 -struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order);
     2.8 +struct page_info *alloc_heap_pages(
     2.9 +    unsigned int zone, unsigned int cpu, unsigned int order);
    2.10  void free_heap_pages(
    2.11      unsigned int zone, struct page_info *pg, unsigned int order);
    2.12  void scrub_heap_pages(void);
    2.13 @@ -61,8 +62,12 @@ void free_xenheap_pages(void *v, unsigne
    2.14  void init_domheap_pages(paddr_t ps, paddr_t pe);
    2.15  struct page_info *alloc_domheap_pages(
    2.16      struct domain *d, unsigned int order, unsigned int memflags);
    2.17 +struct page_info *__alloc_domheap_pages(
    2.18 +    struct domain *d, unsigned int cpu, unsigned int order, 
    2.19 +    unsigned int memflags);
    2.20  void free_domheap_pages(struct page_info *pg, unsigned int order);
    2.21  unsigned long avail_domheap_pages(void);
    2.22 +unsigned long avail_heap_pages(int zone, int node);
    2.23  #define alloc_domheap_page(d) (alloc_domheap_pages(d,0,0))
    2.24  #define free_domheap_page(p)  (free_domheap_pages(p,0))
    2.25