ia64/xen-unstable

changeset 18975:2090917489c5

PoD memory 7/9: Xen interface

Implement Xen interface to PoD functionality.
* Increase the number of MEMOP bits from 4 to 6 (increasing the number
of available memory operations from 16 to 64).
* Introduce XENMEMF_populate_on_demand, which will cause
populate_physmap() to fill a range with PoD entries rather than
backing it with RAM.
* Introduce XENMEM_[sg]et_pod_target operations to the memory
hypercall, to get and set the PoD cache size. set_pod_target() should
be called during domain creation, as well as after modifying the
memory target of any domain which may have outstanding PoD entries
(a caller sketch follows below).
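
For illustration, a minimal caller sketch for the set operation.  This
assumes a hypothetical hypercall wrapper issue_memory_op() (standing in
for whatever memory_op wrapper the toolstack provides); the struct
mirrors xen_pod_target as introduced in xen/include/public/memory.h
below:

    #include <stdint.h>

    /* Local mirror of struct xen_pod_target (see public/memory.h below). */
    struct xen_pod_target {
        uint64_t target_pages;    /* IN */
        uint64_t tot_pages;       /* OUT */
        uint64_t pod_cache_pages; /* OUT */
        uint64_t pod_entries;     /* OUT */
        uint16_t domid;           /* IN (domid_t) */
    };

    #define XENMEM_set_pod_target 16

    /* Hypothetical wrapper that issues the actual memory_op hypercall. */
    extern int issue_memory_op(int op, void *arg);

    /* Call at domain creation, and again after changing the memory
     * target of any domain with outstanding PoD entries. */
    static int set_pod_target(uint16_t domid, uint64_t target_pages)
    {
        struct xen_pod_target t = {
            .target_pages = target_pages,
            .domid        = domid,
        };
        return issue_memory_op(XENMEM_set_pod_target, &t);
    }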

Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Jan 05 10:45:48 2009 +0000 (2009-01-05)
parents ebe11a452393
children 2a8ae362a828
files xen/arch/x86/mm.c xen/arch/x86/mm/p2m.c xen/arch/x86/x86_64/compat/mm.c xen/common/memory.c xen/include/asm-x86/p2m.h xen/include/public/memory.h xen/include/xen/hypercall.h xen/include/xen/mm.h xen/include/xlat.lst
line diff
     1.1 --- a/xen/arch/x86/mm.c	Mon Jan 05 10:45:09 2009 +0000
     1.2 +++ b/xen/arch/x86/mm.c	Mon Jan 05 10:45:48 2009 +0000
     1.3 @@ -3976,6 +3976,49 @@ long arch_memory_op(int op, XEN_GUEST_HA
     1.4          return 0;
     1.5      }
     1.6  
     1.7 +    case XENMEM_set_pod_target:
     1.8 +    case XENMEM_get_pod_target:
     1.9 +    {
    1.10 +        xen_pod_target_t target;
    1.11 +        struct domain *d;
    1.12 +
    1.13 +        /* Support DOMID_SELF? */
    1.14 +        if ( !IS_PRIV(current->domain) )
    1.15 +            return -EINVAL;
    1.16 +
    1.17 +        if ( copy_from_guest(&target, arg, 1) )
    1.18 +            return -EFAULT;
    1.19 +
    1.20 +        rc = rcu_lock_target_domain_by_id(target.domid, &d);
    1.21 +        if ( rc != 0 )
    1.22 +            return rc;
    1.23 +
    1.24 +        if ( op == XENMEM_set_pod_target )
    1.25 +        {
    1.26 +            if ( target.target_pages > d->max_pages )
    1.27 +            {
    1.28 +                rc = -EINVAL;
    1.29 +                goto pod_target_out_unlock;
    1.30 +            }
    1.31 +            
    1.32 +            rc = p2m_pod_set_mem_target(d, target.target_pages);
    1.33 +        }
    1.34 +
    1.35 +        target.tot_pages       = d->tot_pages;
    1.36 +        target.pod_cache_pages = d->arch.p2m->pod.count;
    1.37 +        target.pod_entries     = d->arch.p2m->pod.entry_count;
    1.38 +
    1.39 +        if ( copy_to_guest(arg, &target, 1) )
    1.40 +        {
    1.41 +            rc = -EFAULT;
    1.42 +            goto pod_target_out_unlock;
    1.43 +        }
    1.44 +        
    1.45 +    pod_target_out_unlock:
    1.46 +        rcu_unlock_domain(d);
    1.47 +        return rc;
    1.48 +    }
    1.49 +
    1.50      default:
    1.51          return subarch_memory_op(op, arg);
    1.52      }
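
Note that the handler above fills in the three OUT fields and copies the
struct back to the guest for both the get and the set operation, so a
set call also reports the domain's resulting state.  Reusing the
hypothetical issue_memory_op() wrapper and the xen_pod_target mirror
from the sketch in the commit message, a toolstack might poll a domain's
PoD state like this:

    #define XENMEM_get_pod_target 17

    /* Read back a domain's PoD state; returns 0 on success.
     * target_pages is unused by the get operation. */
    static int get_pod_state(uint16_t domid, struct xen_pod_target *out)
    {
        out->domid = domid;
        return issue_memory_op(XENMEM_get_pod_target, out);
    }

    /* On success, out->pod_entries - out->pod_cache_pages is the number
     * of outstanding PoD entries not yet backed by cached pages. */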
     2.1 --- a/xen/arch/x86/mm/p2m.c	Mon Jan 05 10:45:09 2009 +0000
     2.2 +++ b/xen/arch/x86/mm/p2m.c	Mon Jan 05 10:45:48 2009 +0000
     2.3 @@ -387,6 +387,150 @@ static struct page_info * p2m_pod_cache_
     2.4      return p;
     2.5  }
     2.6  
     2.7 +/* Set the size of the cache, allocating or freeing as necessary. */
     2.8 +static int
     2.9 +p2m_pod_set_cache_target(struct domain *d, unsigned long pod_target)
    2.10 +{
    2.11 +    struct p2m_domain *p2md = d->arch.p2m;
    2.12 +    int ret = 0;
    2.13 +
    2.14 +    /* Increasing the target */
    2.15 +    while ( pod_target > p2md->pod.count )
    2.16 +    {
    2.17 +        struct page_info * page;
    2.18 +        int order;
    2.19 +
    2.20 +        if ( (pod_target - p2md->pod.count) >= (1<<9) )
    2.21 +            order = 9;
    2.22 +        else
    2.23 +            order = 0;
    2.24 +
    2.25 +        page = alloc_domheap_pages(d, order, 0);
    2.26 +        if ( unlikely(page == NULL) )
    2.27 +            goto out;
    2.28 +
    2.29 +        p2m_pod_cache_add(d, page, order);
    2.30 +    }
    2.31 +
    2.32 +    /* Decreasing the target */
    2.33 +    /* We hold the p2m lock here, so we don't need to worry about
    2.34 +     * cache disappearing under our feet. */
    2.35 +    while ( pod_target < p2md->pod.count )
    2.36 +    {
    2.37 +        struct page_info * page;
    2.38 +        int order, i;
    2.39 +
    2.40 +        /* Grab the lock before checking that pod.super is empty, or the last
    2.41 +         * entries may disappear before we grab the lock. */
    2.42 +        spin_lock(&d->page_alloc_lock);
    2.43 +
    2.44 +        if ( (p2md->pod.count - pod_target) > (1<<9)
    2.45 +             && !list_empty(&p2md->pod.super) )
    2.46 +            order = 9;
    2.47 +        else
    2.48 +            order = 0;
    2.49 +
    2.50 +        page = p2m_pod_cache_get(d, order);
    2.51 +
    2.52 +        ASSERT(page != NULL);
    2.53 +
    2.54 +        spin_unlock(&d->page_alloc_lock);
    2.55 +
    2.56 +        /* Then free them */
    2.57 +        for ( i = 0 ; i < (1 << order) ; i++ )
    2.58 +        {
    2.59 +            /* Copied from common/memory.c:guest_remove_page() */
    2.60 +            if ( unlikely(!get_page(page+i, d)) )
    2.61 +            {
    2.62 +                gdprintk(XENLOG_INFO, "Bad page free for domain %u\n", d->domain_id);
    2.63 +                ret = -EINVAL;
    2.64 +                goto out;
    2.65 +            }
    2.66 +
    2.67 +            if ( test_and_clear_bit(_PGT_pinned, &(page+i)->u.inuse.type_info) )
    2.68 +                put_page_and_type(page+i);
    2.69 +            
    2.70 +            if ( test_and_clear_bit(_PGC_allocated, &(page+i)->count_info) )
    2.71 +                put_page(page+i);
    2.72 +
    2.73 +            put_page(page+i);
    2.74 +        }
    2.75 +    }
    2.76 +
    2.77 +out:
    2.78 +    return ret;
    2.79 +}
    2.80 +
    2.81 +/*
    2.82 + * The "right behavior" here requires some careful thought.  First, some
    2.83 + * definitions:
    2.84 + * + M: static_max
    2.85 + * + B: number of pages the balloon driver has ballooned down to.
    2.86 + * + P: Number of populated pages. 
    2.87 + * + T: Old target
    2.88 + * + T': New target
    2.89 + *
    2.90 + * The following equations should hold:
    2.91 + *  0 <= P <= T <= B <= M
    2.92 + *  d->arch.p2m->pod.entry_count == B - P
    2.93 + *  d->tot_pages == P + d->arch.p2m->pod.count
    2.94 + *
    2.95 + * Now we have the following potential cases to cover:
    2.96 + *  B < T': Set the PoD cache size equal to the number of outstanding
    2.97 + *   PoD entries.  The balloon driver will deflate the balloon to give
    2.98 + *   the remainder of the RAM back to the guest OS.
    2.99 + *  T < T' < B: Increase the PoD cache size.
   2.100 + *  T' < T <= B: Here we have a choice.  We could decrease the size of the
   2.101 + *   cache and get the memory back right away.  However, that means that
   2.102 + *   every time we reduce the memory target we risk the guest attempting
   2.103 + *   to populate memory before the balloon driver has reached its new
   2.104 + *   target.  It is safer never to reduce the cache size here, and to do
   2.105 + *   so only when the balloon driver frees PoD ranges.
   2.106 + *
   2.107 + * If there are many zero pages, we could reach the target also by doing
   2.108 + * zero sweeps and marking the ranges PoD; but the balloon driver will have
   2.109 + * to free this memory eventually anyway, so we don't actually gain that much
   2.110 + * by doing so.
   2.111 + *
   2.112 + * NB that the B < T' case may require adjustment to the cache
   2.113 + * size as PoD pages are freed as well; i.e., freeing a PoD-backed
   2.114 + * entry when pod.entry_count == pod.count requires us to reduce both
   2.115 + * pod.entry_count and pod.count.
   2.116 + */
   2.117 +int
   2.118 +p2m_pod_set_mem_target(struct domain *d, unsigned long target)
   2.119 +{
   2.120 +    unsigned long pod_target;
   2.121 +    struct p2m_domain *p2md = d->arch.p2m;
   2.122 +    int ret = 0;
   2.123 +    unsigned long populated;
   2.124 +
   2.125 +    /* P == B: Nothing to do. */
   2.126 +    if ( p2md->pod.entry_count == 0 )
   2.127 +        goto out;
   2.128 +
   2.129 +    /* T' < B: Don't reduce the cache size; let the balloon driver
   2.130 +     * take care of it. */
   2.131 +    if ( target < d->tot_pages )
   2.132 +        goto out;
   2.133 +
   2.134 +    populated  = d->tot_pages - p2md->pod.count;
   2.135 +
   2.136 +    pod_target = target - populated;
   2.137 +
   2.138 +    /* B < T': Set the cache size equal to # of outstanding entries,
   2.139 +     * let the balloon driver fill in the rest. */
   2.140 +    if ( pod_target > p2md->pod.entry_count )
   2.141 +        pod_target = p2md->pod.entry_count;
   2.142 +
   2.143 +    ASSERT( pod_target >= p2md->pod.count );
   2.144 +
   2.145 +    ret = p2m_pod_set_cache_target(d, pod_target);
   2.146 +
   2.147 +out:
   2.148 +    return ret;
   2.149 +}
   2.150 +
   2.151  void
   2.152  p2m_pod_empty_cache(struct domain *d)
   2.153  {
   2.154 @@ -538,6 +682,13 @@ p2m_pod_decrease_reservation(struct doma
   2.155          }
   2.156      }    
   2.157  
   2.158 +    /* If we've reduced our "liabilities" beyond our "assets", free some */
   2.159 +    if ( p2md->pod.entry_count < p2md->pod.count )
   2.160 +    {
   2.161 +        printk("PoD: reducing cache to %d entries\n", p2md->pod.entry_count);
   2.162 +        p2m_pod_set_cache_target(d, p2md->pod.entry_count);
   2.163 +    }
   2.164 +
   2.165      /* If there are no more non-PoD entries, tell decrease_reservation() that
   2.166       * there's nothing left to do. */
   2.167      if ( nonpod == 0 )
   2.168 @@ -786,7 +937,7 @@ p2m_pod_emergency_sweep_super(struct dom
   2.169          /* Stop if we're past our limit and we have found *something*.
   2.170           *
   2.171           * NB that this is a zero-sum game; we're increasing our cache size
   2.172 -         * by re-increasing our 'debt'.  Since we hold the p2m lock,
   2.173 +         * by increasing our 'debt'.  Since we hold the p2m lock,
   2.174           * (entry_count - count) must remain the same. */
   2.175          if ( !list_empty(&p2md->pod.super) &&  i < limit )
   2.176              break;
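
A worked instance of the arithmetic documented above
p2m_pod_set_mem_target() (numbers invented for illustration): suppose
the guest has ballooned to B = 768 pages, P = 700 pages are populated,
and pod.count = 40, so tot_pages = 740 and entry_count = B - P = 68.  A
new target T' = 800 gives pod_target = 800 - (740 - 40) = 100; since
that exceeds entry_count (68), it is clipped to 68, and the cache grows
from 40 to 68 pages.  As a self-contained C restatement:

    #include <assert.h>

    /* Illustrative restatement of p2m_pod_set_mem_target()'s arithmetic;
     * not hypervisor code. */
    static unsigned long
    pod_target_for(unsigned long target, unsigned long tot_pages,
                   unsigned long pod_count, unsigned long entry_count)
    {
        unsigned long populated  = tot_pages - pod_count;
        unsigned long pod_target = target - populated;

        if ( pod_target > entry_count )  /* the B < T' case */
            pod_target = entry_count;
        return pod_target;
    }

    int main(void)
    {
        assert(pod_target_for(800, 740, 40, 68) == 68);
        return 0;
    }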
     3.1 --- a/xen/arch/x86/x86_64/compat/mm.c	Mon Jan 05 10:45:09 2009 +0000
     3.2 +++ b/xen/arch/x86/x86_64/compat/mm.c	Mon Jan 05 10:45:48 2009 +0000
     3.3 @@ -128,6 +128,29 @@ int compat_arch_memory_op(int op, XEN_GU
     3.4          break;
     3.5      }
     3.6  
     3.7 +    case XENMEM_set_pod_target:
     3.8 +    case XENMEM_get_pod_target:
     3.9 +    {
    3.10 +        struct compat_pod_target cmp;
    3.11 +        struct xen_pod_target *nat = (void *)COMPAT_ARG_XLAT_VIRT_BASE;
    3.12 +
    3.13 +        if ( copy_from_guest(&cmp, arg, 1) )
    3.14 +            return -EFAULT;
    3.15 +
    3.16 +        XLAT_pod_target(nat, &cmp);
    3.17 +
    3.18 +        rc = arch_memory_op(op, guest_handle_from_ptr(nat, void));
    3.19 +        if ( rc < 0 )
    3.20 +            break;
    3.21 +
    3.22 +        XLAT_pod_target(&cmp, nat);
    3.23 +
    3.24 +        if ( copy_to_guest(arg, &cmp, 1) )
    3.25 +            rc = -EFAULT;
    3.26 +
    3.27 +        break;
    3.28 +    }
    3.29 +
    3.30      case XENMEM_machphys_mapping:
    3.31      {
    3.32          struct domain *d = current->domain;
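
The XLAT_pod_target() macro used above is generated from the xlat.lst
entry added at the end of this changeset.  Since xen_pod_target contains
only fixed-width fields, the generated translation amounts to a
field-by-field copy, roughly like this (a sketch of the shape, not the
literal generated output):

    #define XLAT_pod_target(_d_, _s_) do {               \
        (_d_)->target_pages    = (_s_)->target_pages;    \
        (_d_)->tot_pages       = (_s_)->tot_pages;       \
        (_d_)->pod_cache_pages = (_s_)->pod_cache_pages; \
        (_d_)->pod_entries     = (_s_)->pod_entries;     \
        (_d_)->domid           = (_s_)->domid;           \
    } while (0)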
     4.1 --- a/xen/common/memory.c	Mon Jan 05 10:45:09 2009 +0000
     4.2 +++ b/xen/common/memory.c	Mon Jan 05 10:45:48 2009 +0000
     4.3 @@ -111,31 +111,40 @@ static void populate_physmap(struct memo
     4.4          if ( unlikely(__copy_from_guest_offset(&gpfn, a->extent_list, i, 1)) )
     4.5              goto out;
     4.6  
     4.7 -        page = alloc_domheap_pages(d, a->extent_order, a->memflags);
     4.8 -        if ( unlikely(page == NULL) ) 
     4.9 +        if ( a->memflags & MEMF_populate_on_demand )
    4.10 +        {
    4.11 +            if ( guest_physmap_mark_populate_on_demand(d, gpfn,
    4.12 +                                                       a->extent_order) < 0 )
    4.13 +                goto out;
    4.14 +        }
    4.15 +        else
    4.16          {
    4.17 -            gdprintk(XENLOG_INFO, "Could not allocate order=%d extent: "
    4.18 -                     "id=%d memflags=%x (%ld of %d)\n",
    4.19 -                     a->extent_order, d->domain_id, a->memflags,
    4.20 -                     i, a->nr_extents);
    4.21 -            goto out;
    4.22 -        }
    4.23 +            page = alloc_domheap_pages(d, a->extent_order, a->memflags);
    4.24 +            if ( unlikely(page == NULL) ) 
    4.25 +            {
    4.26 +                gdprintk(XENLOG_INFO, "Could not allocate order=%d extent: "
    4.27 +                         "id=%d memflags=%x (%ld of %d)\n",
    4.28 +                         a->extent_order, d->domain_id, a->memflags,
    4.29 +                         i, a->nr_extents);
    4.30 +                goto out;
    4.31 +            }
    4.32  
    4.33 -        mfn = page_to_mfn(page);
    4.34 -        guest_physmap_add_page(d, gpfn, mfn, a->extent_order);
    4.35 +            mfn = page_to_mfn(page);
    4.36 +            guest_physmap_add_page(d, gpfn, mfn, a->extent_order);
    4.37  
    4.38 -        if ( !paging_mode_translate(d) )
    4.39 -        {
    4.40 -            for ( j = 0; j < (1 << a->extent_order); j++ )
    4.41 -                set_gpfn_from_mfn(mfn + j, gpfn + j);
    4.42 +            if ( !paging_mode_translate(d) )
    4.43 +            {
    4.44 +                for ( j = 0; j < (1 << a->extent_order); j++ )
    4.45 +                    set_gpfn_from_mfn(mfn + j, gpfn + j);
    4.46  
    4.47 -            /* Inform the domain of the new page's machine address. */ 
    4.48 -            if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 1)) )
    4.49 -                goto out;
    4.50 +                /* Inform the domain of the new page's machine address. */ 
    4.51 +                if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 1)) )
    4.52 +                    goto out;
    4.53 +            }
    4.54          }
    4.55      }
    4.56  
    4.57 - out:
    4.58 +out:
    4.59      a->nr_done = i;
    4.60  }
    4.61  
    4.62 @@ -527,6 +536,10 @@ long do_memory_op(unsigned long cmd, XEN
    4.63  
    4.64          args.memflags |= MEMF_node(XENMEMF_get_node(reservation.mem_flags));
    4.65  
    4.66 +        if ( op == XENMEM_populate_physmap
    4.67 +             && (reservation.mem_flags & XENMEMF_populate_on_demand) )
    4.68 +            args.memflags |= MEMF_populate_on_demand;
    4.69 +
    4.70          if ( likely(reservation.domid == DOMID_SELF) )
    4.71          {
    4.72              d = rcu_lock_current_domain();
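
From the caller's side, a domain builder would request PoD entries by
setting the new flag in a populate_physmap reservation.  A hedged
sketch, using a simplified mirror of struct xen_memory_reservation (the
real struct holds a guest handle rather than a raw pointer) and the
hypothetical issue_memory_op() wrapper from earlier:

    #include <stdint.h>

    #define XENMEM_populate_physmap    6
    #define XENMEMF_populate_on_demand (1 << 16)

    struct memory_reservation {      /* simplified mirror */
        uint64_t *extent_start;      /* IN: base GPFN of each extent */
        uint64_t  nr_extents;        /* IN */
        uint32_t  extent_order;      /* IN: 2^order pages per extent */
        uint32_t  mem_flags;         /* IN */
        uint16_t  domid;             /* IN */
    };

    extern int issue_memory_op(int op, void *arg); /* hypothetical */

    /* Mark nr order-9 (2MB) extents populate-on-demand instead of
     * backing them with RAM at build time. */
    static int populate_pod(uint16_t domid, uint64_t *gpfns, uint64_t nr)
    {
        struct memory_reservation res = {
            .extent_start = gpfns,
            .nr_extents   = nr,
            .extent_order = 9,
            .mem_flags    = XENMEMF_populate_on_demand,
            .domid        = domid,
        };
        return issue_memory_op(XENMEM_populate_physmap, &res);
    }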
     5.1 --- a/xen/include/asm-x86/p2m.h	Mon Jan 05 10:45:09 2009 +0000
     5.2 +++ b/xen/include/asm-x86/p2m.h	Mon Jan 05 10:45:48 2009 +0000
     5.3 @@ -261,6 +261,10 @@ void p2m_pod_dump_data(struct domain *d)
     5.4   * (usually in preparation for domain destruction) */
     5.5  void p2m_pod_empty_cache(struct domain *d);
     5.6  
     5.7 +/* Set populate-on-demand cache size so that the total memory allocated to a
     5.8 + * domain matches target */
     5.9 +int p2m_pod_set_mem_target(struct domain *d, unsigned long target);
    5.10 +
    5.11  /* Call when decreasing memory reservation to handle PoD entries properly.
    5.12   * Will return '1' if all entries were handled and nothing more need be done.*/
    5.13  int
     6.1 --- a/xen/include/public/memory.h	Mon Jan 05 10:45:09 2009 +0000
     6.2 +++ b/xen/include/public/memory.h	Mon Jan 05 10:45:48 2009 +0000
     6.3 @@ -48,6 +48,8 @@
     6.4  /* NUMA node to allocate from. */
     6.5  #define XENMEMF_node(x)     (((x) + 1) << 8)
     6.6  #define XENMEMF_get_node(x) ((((x) >> 8) - 1) & 0xffu)
     6.7 +/* Flag to populate physmap with populate-on-demand entries */
     6.8 +#define XENMEMF_populate_on_demand (1<<16)
     6.9  #endif
    6.10  
    6.11  struct xen_memory_reservation {
    6.12 @@ -299,6 +301,19 @@ struct xen_foreign_memory_map {
    6.13  typedef struct xen_foreign_memory_map xen_foreign_memory_map_t;
    6.14  DEFINE_XEN_GUEST_HANDLE(xen_foreign_memory_map_t);
    6.15  
    6.16 +#define XENMEM_set_pod_target       16
    6.17 +#define XENMEM_get_pod_target       17
    6.18 +struct xen_pod_target {
    6.19 +    /* IN */
    6.20 +    uint64_t target_pages;
    6.21 +    /* OUT */
    6.22 +    uint64_t tot_pages;
    6.23 +    uint64_t pod_cache_pages;
    6.24 +    uint64_t pod_entries;
    6.25 +    /* IN */
    6.26 +    domid_t domid;
    6.27 +};
    6.28 +typedef struct xen_pod_target xen_pod_target_t;
    6.29  #endif /* __XEN_PUBLIC_MEMORY_H__ */
    6.30  
    6.31  /*
     7.1 --- a/xen/include/xen/hypercall.h	Mon Jan 05 10:45:09 2009 +0000
     7.2 +++ b/xen/include/xen/hypercall.h	Mon Jan 05 10:45:48 2009 +0000
     7.3 @@ -48,7 +48,7 @@ do_platform_op(
     7.4   * at what point in the page list to resume. For this purpose I steal the
     7.5   * high-order bits of the @cmd parameter, which are otherwise unused and zero.
     7.6   */
     7.7 -#define MEMOP_EXTENT_SHIFT 4 /* cmd[:4] == start_extent */
     7.8 +#define MEMOP_EXTENT_SHIFT 6 /* cmd[:6] == start_extent */
     7.9  #define MEMOP_CMD_MASK     ((1 << MEMOP_EXTENT_SHIFT) - 1)
    7.10  
    7.11  extern long
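
The widening is needed because the new commands (16 and 17) no longer
fit in the old 4-bit field, which topped out at command 15.  The
high-order bits of cmd carry the start extent when a preempted hypercall
is continued; a small sketch of the encoding implied by the definitions
above:

    #define MEMOP_EXTENT_SHIFT 6
    #define MEMOP_CMD_MASK     ((1 << MEMOP_EXTENT_SHIFT) - 1)

    /* cmd layout: bits 0-5 hold the memory op (now up to 63),
     * bits 6 and up hold start_extent for continuations. */
    static inline unsigned long
    memop_encode(unsigned int op, unsigned long start_extent)
    {
        return op | (start_extent << MEMOP_EXTENT_SHIFT);
    }
    /* decode: op = cmd & MEMOP_CMD_MASK;
     *         start_extent = cmd >> MEMOP_EXTENT_SHIFT; */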
     8.1 --- a/xen/include/xen/mm.h	Mon Jan 05 10:45:09 2009 +0000
     8.2 +++ b/xen/include/xen/mm.h	Mon Jan 05 10:45:48 2009 +0000
     8.3 @@ -72,6 +72,8 @@ int assign_pages(
     8.4  /* memflags: */
     8.5  #define _MEMF_no_refcount 0
     8.6  #define  MEMF_no_refcount (1U<<_MEMF_no_refcount)
     8.7 +#define _MEMF_populate_on_demand 1
     8.8 +#define  MEMF_populate_on_demand (1U<<_MEMF_populate_on_demand)
     8.9  #define _MEMF_node        8
    8.10  #define  MEMF_node(n)     ((((n)+1)&0xff)<<_MEMF_node)
    8.11  #define _MEMF_bits        24
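
Bit 1 was free between _MEMF_no_refcount (bit 0) and the node field
starting at bit 8, so the new flag composes with the existing
encodings, for example:

    /* Sketch: request PoD handling together with a NUMA node. */
    unsigned int memflags = MEMF_populate_on_demand | MEMF_node(2);
    /* bit 1 set for PoD; node 2 encoded as ((2 + 1) & 0xff) << 8 */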
     9.1 --- a/xen/include/xlat.lst	Mon Jan 05 10:45:09 2009 +0000
     9.2 +++ b/xen/include/xlat.lst	Mon Jan 05 10:45:48 2009 +0000
     9.3 @@ -38,6 +38,7 @@
     9.4  !	memory_exchange			memory.h
     9.5  !	memory_map			memory.h
     9.6  !	memory_reservation		memory.h
     9.7 +!	pod_target			memory.h
     9.8  !	translate_gpfn_list		memory.h
     9.9  !	sched_poll			sched.h
    9.10  ?	sched_remote_shutdown		sched.h