ia64/xen-unstable

changeset 18973:489f35400ef2

PoD memory 5/9: emergency scan


Implement "emergency scan" for zero pages, to deal with start-of-day
page scrubbers.

If the cache is running out, scan through memory looking for "zero
pages" that we can reclaim for the cache. This is necessary for
operating systems which have a start-of-day page scrubber which runs
before the balloon driver can balloon down to the target.

Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Jan 05 10:44:39 2009 +0000 (2009-01-05)
parents bd33ff263e2c
children ebe11a452393
files xen/arch/x86/mm/p2m.c xen/common/grant_table.c xen/include/asm-x86/p2m.h xen/include/xen/grant_table.h
line diff
     1.1 --- a/xen/arch/x86/mm/p2m.c	Mon Jan 05 10:43:50 2009 +0000
     1.2 +++ b/xen/arch/x86/mm/p2m.c	Mon Jan 05 10:44:39 2009 +0000
     1.3 @@ -496,6 +496,289 @@ p2m_pod_dump_data(struct domain *d)
     1.4             p2md->pod.entry_count, p2md->pod.count);
     1.5  }
     1.6  
     1.7 +#define superpage_aligned(_x)  (((_x)&((1<<9)-1))==0)
     1.8 +
     1.9 +/* Must be called w/ p2m lock held, page_alloc lock not held */
    1.10 +static int
    1.11 +p2m_pod_zero_check_superpage(struct domain *d, unsigned long gfn)
    1.12 +{
    1.13 +    mfn_t mfns[1<<9];
    1.14 +    p2m_type_t types[1<<9];
    1.15 +    unsigned long * map[1<<9] = { NULL };
    1.16 +    int ret=0, reset = 0, reset_max = 0;
    1.17 +    int i, j;
    1.18 +
    1.19 +    if ( !superpage_aligned(gfn) )
    1.20 +        goto out;
    1.21 +
    1.22 +    /* Look up the mfns, checking to make sure they're the same mfn
    1.23 +     * and aligned, and mapping them. */
    1.24 +    for ( i=0; i<(1<<9); i++ )
    1.25 +    {
    1.26 +        mfns[i] = gfn_to_mfn_query(d, gfn + i, types + i);
    1.27 +
    1.28 +        /* Conditions that must be met for superpage-superpage:
    1.29 +         * + All gfns are ram types
    1.30 +         * + All gfns have the same type
    1.31 +         * + None of the mfns are used as pagetables
    1.32 +         * + The first mfn is 2-meg aligned
    1.33 +         * + All the other mfns are in sequence
    1.34 +         */
    1.35 +        if ( p2m_is_ram(types[i])
    1.36 +             && types[i] == types[0]
    1.37 +             && ( (mfn_to_page(mfns[i])->count_info & PGC_page_table) == 0 )
    1.38 +             && ( ( i == 0 && superpage_aligned(mfn_x(mfns[0])) )
    1.39 +                  || ( i != 0 && mfn_x(mfns[i]) == mfn_x(mfns[0]) + i ) ) )
    1.40 +            map[i] = map_domain_page(mfn_x(mfns[i]));
    1.41 +        else
    1.42 +            goto out_unmap;
    1.43 +    }
    1.44 +
    1.45 +    /* Now, do a quick check to see if it may be zero before unmapping. */
    1.46 +    for ( i=0; i<(1<<9); i++ )
    1.47 +    {
    1.48 +        /* Quick zero-check */
    1.49 +        for ( j=0; j<16; j++ )
    1.50 +            if( *(map[i]+j) != 0 )
    1.51 +                break;
    1.52 +
    1.53 +        if ( j < 16 )
    1.54 +            goto out_unmap;
    1.55 +
    1.56 +    }
    1.57 +
    1.58 +    /* Try to remove the page, restoring old mapping if it fails. */
    1.59 +    reset_max = 1<<9;
    1.60 +    set_p2m_entry(d, gfn,
    1.61 +                  _mfn(POPULATE_ON_DEMAND_MFN), 9,
    1.62 +                  p2m_populate_on_demand);
    1.63 +
    1.64 +    if ( (mfn_to_page(mfns[0])->u.inuse.type_info & PGT_count_mask) != 0 )
    1.65 +    {
    1.66 +        reset = 1;
    1.67 +        goto out_reset;
    1.68 +    }
    1.69 +
    1.70 +    /* Timing here is important.  We need to make sure not to reclaim
    1.71 +     * a page which has been grant-mapped to another domain.  But we
    1.72 +     * can't grab the grant table lock, because we may be invoked from
    1.73 +     * the grant table code!  So we first remove the page from the
    1.74 +     * p2m, then check to see if the gpfn has been granted.  Once this
    1.75 +     * gpfn is marked PoD, any future gfn_to_mfn() call will block
    1.76 +     * waiting for the p2m lock.  If we find that it has been granted, we
    1.77 +     * simply restore the old value.
    1.78 +     */
    1.79 +    if ( gnttab_is_granted(d, gfn, 9) )
    1.80 +    {
    1.81 +        printk("gfn contains grant table %lx\n", gfn);
    1.82 +        reset = 1;
    1.83 +        goto out_reset;
    1.84 +    }
    1.85 +
    1.86 +    /* Finally, do a full zero-check */
    1.87 +    for ( i=0; i < (1<<9); i++ )
    1.88 +    {
    1.89 +        for ( j=0; j<PAGE_SIZE/sizeof(*map[i]); j++ )
    1.90 +            if( *(map[i]+j) != 0 )
    1.91 +            {
    1.92 +                reset = 1;
    1.93 +                break;
    1.94 +            }
    1.95 +
    1.96 +        if ( reset )
    1.97 +            goto out_reset;
    1.98 +    }
    1.99 +
   1.100 +    /* Finally!  We've passed all the checks, and can add the mfn superpage
   1.101 +     * back on the PoD cache, and account for the new p2m PoD entries */
   1.102 +    p2m_pod_cache_add(d, mfn_to_page(mfns[0]), 9);
   1.103 +    d->arch.p2m->pod.entry_count += (1<<9);
   1.104 +
   1.105 +out_reset:
   1.106 +    if ( reset )
   1.107 +    {
   1.108 +        if (reset_max == (1<<9) )
   1.109 +            set_p2m_entry(d, gfn, mfns[0], 9, types[0]);
   1.110 +        else
   1.111 +            for ( i=0; i<reset_max; i++)
   1.112 +                set_p2m_entry(d, gfn + i, mfns[i], 0, types[i]);
   1.113 +    }
   1.114 +    
   1.115 +out_unmap:
   1.116 +    for ( i=0; i<(1<<9); i++ )
   1.117 +        if ( map[i] )
   1.118 +            unmap_domain_page(map[i]);
   1.119 +out:
   1.120 +    return ret;
   1.121 +}
   1.122 +
   1.123 +static void
   1.124 +p2m_pod_zero_check(struct domain *d, unsigned long *gfns, int count)
   1.125 +{
   1.126 +    mfn_t mfns[count];
   1.127 +    p2m_type_t types[count];
   1.128 +    unsigned long * map[count];
   1.129 +
   1.130 +    int i, j;
   1.131 +
   1.132 +    /* First, get the gfn list, translate to mfns, and map the pages. */
   1.133 +    for ( i=0; i<count; i++ )
   1.134 +    {
   1.135 +        mfns[i] = gfn_to_mfn_query(d, gfns[i], types + i);
   1.136 +        /* If this is ram, and not a pagetable, map it; otherwise,
   1.137 +         * skip. */
   1.138 +        if ( p2m_is_ram(types[i])
   1.139 +             && ( (mfn_to_page(mfns[i])->count_info & PGC_page_table) == 0 ) )
   1.140 +            map[i] = map_domain_page(mfn_x(mfns[i]));
   1.141 +        else
   1.142 +            map[i] = NULL;
   1.143 +    }
   1.144 +
   1.145 +    /* Then, go through and check for zeroed pages, removing write permission
   1.146 +     * for those with zeroes. */
   1.147 +    for ( i=0; i<count; i++ )
   1.148 +    {
   1.149 +        if(!map[i])
   1.150 +            continue;
   1.151 +
   1.152 +        /* Quick zero-check */
   1.153 +        for ( j=0; j<16; j++ )
   1.154 +            if( *(map[i]+j) != 0 )
   1.155 +                break;
   1.156 +
   1.157 +        if ( j < 16 )
   1.158 +        {
   1.159 +            unmap_domain_page(map[i]);
   1.160 +            map[i] = NULL;
   1.161 +            continue;
   1.162 +        }
   1.163 +
   1.164 +        /* Try to remove the page, restoring old mapping if it fails. */
   1.165 +        set_p2m_entry(d, gfns[i],
   1.166 +                      _mfn(POPULATE_ON_DEMAND_MFN), 0,
   1.167 +                      p2m_populate_on_demand);
   1.168 +
   1.169 +        if ( (mfn_to_page(mfns[i])->u.inuse.type_info & PGT_count_mask) != 0 )
   1.170 +        {
   1.171 +            unmap_domain_page(map[i]);
   1.172 +            map[i] = NULL;
   1.173 +
   1.174 +            set_p2m_entry(d, gfns[i], mfns[i], 0, types[i]);
   1.175 +
   1.176 +            continue;
   1.177 +        }
   1.178 +    }
   1.179 +
   1.180 +    /* Now check each page for real */
   1.181 +    for ( i=0; i < count; i++ )
   1.182 +    {
   1.183 +        if(!map[i])
   1.184 +            continue;
   1.185 +
   1.186 +        for ( j=0; j<PAGE_SIZE/sizeof(*map[i]); j++ )
   1.187 +            if( *(map[i]+j) != 0 )
   1.188 +                break;
   1.189 +
   1.190 +        /* See comment in p2m_pod_zero_check_superpage() re gnttab
   1.191 +         * check timing.  */
   1.192 +        if ( j < PAGE_SIZE/sizeof(*map[i])
   1.193 +             || gnttab_is_granted(d, gfns[i], 0) )
   1.194 +        {
   1.195 +            set_p2m_entry(d, gfns[i], mfns[i], 0, types[i]);
   1.196 +            continue;
   1.197 +        }
   1.198 +        else
   1.199 +        {
   1.200 +            /* Add to cache, and account for the new p2m PoD entry */
   1.201 +            p2m_pod_cache_add(d, mfn_to_page(mfns[i]), 0);
   1.202 +            d->arch.p2m->pod.entry_count++;
   1.203 +        }
   1.204 +
   1.205 +        unmap_domain_page(map[i]);
   1.206 +        map[i] = NULL;
   1.207 +    }
   1.208 +    
   1.209 +}
   1.210 +
   1.211 +#define POD_SWEEP_LIMIT 1024
   1.212 +static void
   1.213 +p2m_pod_emergency_sweep_super(struct domain *d)
   1.214 +{
   1.215 +    struct p2m_domain *p2md = d->arch.p2m;
   1.216 +    unsigned long i, start, limit;
   1.217 +
   1.218 +    if ( p2md->pod.reclaim_super == 0 )
   1.219 +    {
   1.220 +        p2md->pod.reclaim_super = (p2md->pod.max_guest>>9)<<9;
   1.221 +        p2md->pod.reclaim_super -= (1<<9);
   1.222 +    }
   1.223 +    
   1.224 +    start = p2md->pod.reclaim_super;
   1.225 +    limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
   1.226 +
   1.227 +    for ( i=p2md->pod.reclaim_super ; i > 0 ; i-=(1<<9) )
   1.228 +    {
   1.229 +        p2m_pod_zero_check_superpage(d, i);
   1.230 +        /* Stop if we're past our limit and we have found *something*.
   1.231 +         *
   1.232 +         * NB that this is a zero-sum game; we're increasing our cache size
   1.233 +         * by re-increasing our 'debt'.  Since we hold the p2m lock,
   1.234 +         * (entry_count - count) must remain the same. */
   1.235 +        if ( !list_empty(&p2md->pod.super) &&  i < limit )
   1.236 +            break;
   1.237 +    }
   1.238 +
   1.239 +    p2md->pod.reclaim_super = i ? i - (1<<9) : 0;
   1.240 +
   1.241 +}
   1.242 +
   1.243 +#define POD_SWEEP_STRIDE  16
   1.244 +static void
   1.245 +p2m_pod_emergency_sweep(struct domain *d)
   1.246 +{
   1.247 +    struct p2m_domain *p2md = d->arch.p2m;
   1.248 +    unsigned long gfns[POD_SWEEP_STRIDE];
   1.249 +    unsigned long i, j=0, start, limit;
   1.250 +    p2m_type_t t;
   1.251 +
   1.252 +
   1.253 +    if ( p2md->pod.reclaim_single == 0 )
   1.254 +        p2md->pod.reclaim_single = p2md->pod.max_guest;
   1.255 +
   1.256 +    start = p2md->pod.reclaim_single;
   1.257 +    limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
   1.258 +
   1.259 +    /* FIXME: Figure out how to avoid superpages */
   1.260 +    for ( i=p2md->pod.reclaim_single ; i > 0 ; i-- )
   1.261 +    {
   1.262 +        gfn_to_mfn_query(d, i, &t );
   1.263 +        if ( p2m_is_ram(t) )
   1.264 +        {
   1.265 +            gfns[j] = i;
   1.266 +            j++;
   1.267 +            BUG_ON(j > POD_SWEEP_STRIDE);
   1.268 +            if ( j == POD_SWEEP_STRIDE )
   1.269 +            {
   1.270 +                p2m_pod_zero_check(d, gfns, j);
   1.271 +                j = 0;
   1.272 +            }
   1.273 +        }
   1.274 +        /* Stop if we're past our limit and we have found *something*.
   1.275 +         *
   1.276 +         * NB that this is a zero-sum game; we're increasing our cache size
   1.277 +         * by re-increasing our 'debt'.  Since we hold the p2m lock,
   1.278 +         * (entry_count - count) must remain the same. */
   1.279 +        if ( p2md->pod.count > 0 && i < limit )
   1.280 +            break;
   1.281 +    }
   1.282 +
   1.283 +    if ( j )
   1.284 +        p2m_pod_zero_check(d, gfns, j);
   1.285 +
   1.286 +    p2md->pod.reclaim_single = i ? i - 1 : i;
   1.287 +
   1.288 +}
   1.289 +
   1.290  static int
   1.291  p2m_pod_demand_populate(struct domain *d, unsigned long gfn,
   1.292                          mfn_t table_mfn,
   1.293 @@ -523,6 +806,19 @@ p2m_pod_demand_populate(struct domain *d
   1.294          return 0;
   1.295      }
   1.296  
   1.297 +    /* If we're low, start a sweep */
   1.298 +    if ( order == 9 && list_empty(&p2md->pod.super) )
   1.299 +        p2m_pod_emergency_sweep_super(d);
   1.300 +
   1.301 +    if ( list_empty(&p2md->pod.single) &&
   1.302 +         ( ( order == 0 )
   1.303 +           || (order == 9 && list_empty(&p2md->pod.super) ) ) )
   1.304 +        p2m_pod_emergency_sweep(d);
   1.305 +
   1.306 +    /* Keep track of the highest gfn demand-populated by a guest fault */
   1.307 +    if ( q == p2m_guest && gfn > p2md->pod.max_guest )
   1.308 +        p2md->pod.max_guest = gfn;
   1.309 +
   1.310      spin_lock(&d->page_alloc_lock);
   1.311  
   1.312      if ( p2md->pod.count == 0 )
     2.1 --- a/xen/common/grant_table.c	Mon Jan 05 10:43:50 2009 +0000
     2.2 +++ b/xen/common/grant_table.c	Mon Jan 05 10:44:39 2009 +0000
     2.3 @@ -111,6 +111,33 @@ static unsigned inline int max_nr_maptra
     2.4  #define active_entry(t, e) \
     2.5      ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
     2.6  
     2.7 +/* The p2m emergency sweep code should not reclaim a frame that is currently
     2.8 + * grant mapped by another domain.  That would involve checking all other
     2.9 + * domains' grant maps, which is impractical.  Instead, we check the active
    2.10 + * grant table for this domain to see if it's been granted.  Since this
    2.11 + * may be called as a result of a grant table op, we can't grab the lock. */
    2.12 +int
    2.13 +gnttab_is_granted(struct domain *d, xen_pfn_t gfn, int order)
    2.14 +{
    2.15 +    int i, found=0;
    2.16 +    struct active_grant_entry *act;
    2.17 +
    2.18 +    /* We need to compare with active grant entries to make sure that
    2.19 +     * pinned (== currently mapped) entries don't disappear under our
    2.20 +     * feet. */
    2.21 +    for ( i=0; i<nr_grant_entries(d->grant_table); i++ )
    2.22 +    {
    2.23 +        act = &active_entry(d->grant_table, i);
    2.24 +        if ( act->gfn >> order == gfn >> order )
    2.25 +        {
    2.26 +            found = 1;
    2.27 +            break;
    2.28 +        }
    2.29 +    }
    2.30 +
    2.31 +    return found;
    2.32 +}
    2.33 +
    2.34  static inline int
    2.35  __get_maptrack_handle(
    2.36      struct grant_table *t)
    2.37 @@ -317,6 +344,7 @@ static void
    2.38          if ( !act->pin )
    2.39          {
    2.40              act->domid = scombo.shorts.domid;
    2.41 +            act->gfn = sha->frame;
    2.42              act->frame = gmfn_to_mfn(rd, sha->frame);
    2.43          }
    2.44      }
    2.45 @@ -1335,6 +1363,7 @@ static int
    2.46          if ( !act->pin )
    2.47          {
    2.48              act->domid = scombo.shorts.domid;
    2.49 +            act->gfn = sha->frame;
    2.50              act->frame = gmfn_to_mfn(rd, sha->frame);
    2.51          }
    2.52      }
     3.1 --- a/xen/include/asm-x86/p2m.h	Mon Jan 05 10:43:50 2009 +0000
     3.2 +++ b/xen/include/asm-x86/p2m.h	Mon Jan 05 10:44:39 2009 +0000
     3.3 @@ -152,6 +152,9 @@ struct p2m_domain {
     3.4                           single;       /* Non-super lists                   */
     3.5          int              count,        /* # of pages in cache lists         */
     3.6                           entry_count;  /* # of pages in p2m marked pod      */
     3.7 +        unsigned         reclaim_super; /* Last gpfn of a scan */
     3.8 +        unsigned         reclaim_single; /* Last gpfn of a scan */
     3.9 +        unsigned         max_guest;    /* gpfn of max guest demand-populate */
    3.10      } pod;
    3.11  };
    3.12  
     4.1 --- a/xen/include/xen/grant_table.h	Mon Jan 05 10:43:50 2009 +0000
     4.2 +++ b/xen/include/xen/grant_table.h	Mon Jan 05 10:44:39 2009 +0000
     4.3 @@ -32,6 +32,7 @@
     4.4  struct active_grant_entry {
     4.5      u32           pin;    /* Reference count information.  */
     4.6      domid_t       domid;  /* Domain being granted access.  */
     4.7 +    unsigned long gfn;    /* Guest's idea of the frame being granted. */
     4.8      unsigned long frame;  /* Frame being granted.          */
     4.9  };
    4.10  
    4.11 @@ -146,4 +147,7 @@ nr_active_grant_frames(struct grant_tabl
    4.12      return num_act_frames_from_sha_frames(nr_grant_frames(gt));
    4.13  }
    4.14  
    4.15 +int
    4.16 +gnttab_is_granted(struct domain *d, xen_pfn_t gfn, int order);
    4.17 +
    4.18  #endif /* __XEN_GRANT_TABLE_H__ */