
changeset 18971:f2ba08549466

PoD memory 3/9: PoD core

Core of populate-on-demand functionality:
* Introduce a populate-on-demand type
* Call p2m_pod_demand_populate() when gfn_to_mfn() encounters PoD entries
* Return p2m memory to the domain list for freeing during domain destruction
* Teach audit_p2m() to check our PoD-entry reference counting
* Add PoD information to the 'q' debug key

Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Jan 05 10:43:19 2009 +0000 (2009-01-05)
parents 629f028d22f9
children bd33ff263e2c
files xen/arch/x86/domain.c xen/arch/x86/mm/p2m.c xen/arch/x86/mm/paging.c xen/arch/x86/mm/shadow/multi.c xen/include/asm-x86/p2m.h
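
How the pieces introduced here fit together: the toolstack gives the domain less host memory than the size of its guest physmap, seeds that memory into the per-domain PoD cache, and marks the whole physmap populate-on-demand; the first real access to such a gfn then reaches p2m_pod_demand_populate() via gfn_to_mfn() and gets backed from the cache. The caller-side wiring lands in later patches of this series, so the helper below is only a minimal illustrative sketch using the interfaces added by this patch -- pod_setup_sketch() is a hypothetical name, and alignment checks, fallback to 4k allocations and error unwinding are all omitted:

    /* Illustrative sketch, not part of this changeset.  Assumes xen/mm.h,
     * xen/errno.h and asm/p2m.h are in scope. */
    static int pod_setup_sketch(struct domain *d,
                                unsigned long nr_pod_gfns,    /* physmap size */
                                unsigned long nr_cache_pages) /* real pages   */
    {
        unsigned long gfn, i;

        /* 1. Hand real pages to the PoD cache, in 2MB chunks. */
        for ( i = 0; i < nr_cache_pages; i += (1UL << POD_PAGE_ORDER) )
        {
            struct page_info *pg = alloc_domheap_pages(d, POD_PAGE_ORDER, 0);
            if ( pg == NULL || p2m_pod_cache_add(d, pg, POD_PAGE_ORDER) )
                return -ENOMEM;
        }

        /* 2. Mark the guest physmap PoD; no memory beyond the cache seeded
         *    above is consumed until the guest actually touches these gfns. */
        for ( gfn = 0; gfn < nr_pod_gfns; gfn += (1UL << POD_PAGE_ORDER) )
            if ( guest_physmap_mark_populate_on_demand(d, gfn,
                                                       POD_PAGE_ORDER) )
                return -EINVAL;

        return 0;
    }

On the first non-query gfn_to_mfn() lookup of one of those gfns, p2m_pod_demand_populate() takes a page (or a 2MB superpage) off the cache lists under the page_alloc lock, installs a p2m_ram_rw entry under the p2m lock, and decrements pod.entry_count; whatever is left in the cache is handed back to the domain page list by p2m_pod_empty_cache() at destruction time.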
line diff
     1.1 --- a/xen/arch/x86/domain.c	Mon Jan 05 10:42:39 2009 +0000
     1.2 +++ b/xen/arch/x86/domain.c	Mon Jan 05 10:43:19 2009 +0000
     1.3 @@ -149,6 +149,11 @@ void dump_pageframe_info(struct domain *
     1.4          }
     1.5      }
     1.6  
     1.7 +    if ( is_hvm_domain(d) )
     1.8 +    {
     1.9 +        p2m_pod_dump_data(d);
    1.10 +    }
    1.11 +
    1.12      list_for_each_entry ( page, &d->xenpage_list, list )
    1.13      {
    1.14          printk("    XenPage %p: caf=%08x, taf=%" PRtype_info "\n",
     2.1 --- a/xen/arch/x86/mm/p2m.c	Mon Jan 05 10:42:39 2009 +0000
     2.2 +++ b/xen/arch/x86/mm/p2m.c	Mon Jan 05 10:43:19 2009 +0000
     2.3 @@ -118,9 +118,16 @@ static unsigned long p2m_type_to_flags(p
     2.4          return flags;
     2.5      case p2m_mmio_direct:
     2.6          return flags | P2M_BASE_FLAGS | _PAGE_RW | _PAGE_PCD;
     2.7 +    case p2m_populate_on_demand:
     2.8 +        return flags;
     2.9      }
    2.10  }
    2.11  
    2.12 +#if P2M_AUDIT
    2.13 +static void audit_p2m(struct domain *d);
    2.14 +#else
    2.15 +# define audit_p2m(_d) do { (void)(_d); } while(0)
    2.16 +#endif /* P2M_AUDIT */
    2.17  
    2.18  // Find the next level's P2M entry, checking for out-of-range gfn's...
    2.19  // Returns NULL on error.
    2.20 @@ -162,7 +169,8 @@ p2m_next_level(struct domain *d, mfn_t *
    2.21                                        shift, max)) )
    2.22          return 0;
    2.23  
    2.24 -    if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
    2.25 +    /* PoD: Not present doesn't imply empty. */
    2.26 +    if ( !l1e_get_flags(*p2m_entry) )
    2.27      {
    2.28          struct page_info *pg = d->arch.p2m->alloc_page(d);
    2.29          if ( pg == NULL )
    2.30 @@ -197,7 +205,7 @@ p2m_next_level(struct domain *d, mfn_t *
    2.31          }
    2.32      }
    2.33  
    2.34 -    ASSERT(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT);
    2.35 +    ASSERT(l1e_get_flags(*p2m_entry) & (_PAGE_PRESENT|_PAGE_PSE));
    2.36  
    2.37      /* split single large page into 4KB page in P2M table */
    2.38      if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
    2.39 @@ -242,6 +250,236 @@ p2m_next_level(struct domain *d, mfn_t *
    2.40      return 1;
    2.41  }
    2.42  
    2.43 +/*
    2.44 + * Populate-on-demand functionality
    2.45 + */
    2.46 +int
    2.47 +p2m_pod_cache_add(struct domain *d,
    2.48 +                  struct page_info *page,
    2.49 +                  unsigned long order)
    2.50 +{
    2.51 +    int i;
    2.52 +    struct page_info *p;
    2.53 +    struct p2m_domain *p2md = d->arch.p2m;
    2.54 +
    2.55 +#ifndef NDEBUG
    2.56 +    mfn_t mfn;
    2.57 +
    2.58 +    mfn = page_to_mfn(page);
    2.59 +
    2.60 +    /* Check to make sure this is a contiguous region */
    2.61 +    if( mfn_x(mfn) & ((1 << order) - 1) )
    2.62 +    {
    2.63 +        printk("%s: mfn %lx not aligned order %lu! (mask %lx)\n",
    2.64 +               __func__, mfn_x(mfn), order, ((1UL << order) - 1));
    2.65 +        return -1;
    2.66 +    }
    2.67 +    
    2.68 +    for(i=0; i < 1 << order ; i++) {
    2.69 +        struct domain * od;
    2.70 +
    2.71 +        p = mfn_to_page(_mfn(mfn_x(mfn) + i));
    2.72 +        od = page_get_owner(p);
    2.73 +        if(od != d)
    2.74 +        {
    2.75 +            printk("%s: mfn %lx expected owner d%d, got owner d%d!\n",
    2.76 +                   __func__, mfn_x(mfn), d->domain_id,
    2.77 +                   od?od->domain_id:-1);
    2.78 +            return -1;
    2.79 +        }
    2.80 +    }
    2.81 +#endif
    2.82 +
    2.83 +    spin_lock(&d->page_alloc_lock);
    2.84 +
    2.85 +    /* First, take all pages off the domain list */
    2.86 +    for(i=0; i < 1 << order ; i++)
    2.87 +    {
    2.88 +        p = page + i;
    2.89 +        list_del(&p->list);
    2.90 +    }
    2.91 +
    2.92 +    /* Then add the first one to the appropriate populate-on-demand list */
    2.93 +    switch(order)
    2.94 +    {
    2.95 +    case 9:
    2.96 +        list_add_tail(&page->list, &p2md->pod.super); /* lock: page_alloc */
    2.97 +        p2md->pod.count += 1 << order;
    2.98 +        break;
    2.99 +    case 0:
   2.100 +        list_add_tail(&page->list, &p2md->pod.single); /* lock: page_alloc */
   2.101 +        p2md->pod.count += 1 ;
   2.102 +        break;
   2.103 +    default:
   2.104 +        BUG();
   2.105 +    }
   2.106 +
   2.107 +    spin_unlock(&d->page_alloc_lock);
   2.108 +
   2.109 +    return 0;
   2.110 +}
   2.111 +
   2.112 +void
   2.113 +p2m_pod_empty_cache(struct domain *d)
   2.114 +{
   2.115 +    struct p2m_domain *p2md = d->arch.p2m;
   2.116 +    struct list_head *q, *p;
   2.117 +
   2.118 +    spin_lock(&d->page_alloc_lock);
   2.119 +
   2.120 +    list_for_each_safe(p, q, &p2md->pod.super) /* lock: page_alloc */
   2.121 +    {
   2.122 +        int i;
   2.123 +        struct page_info *page;
   2.124 +            
   2.125 +        list_del(p);
   2.126 +            
   2.127 +        page = list_entry(p, struct page_info, list);
   2.128 +
   2.129 +        for ( i = 0 ; i < (1 << 9) ; i++ )
   2.130 +        {
   2.131 +            BUG_ON(page_get_owner(page + i) != d);
   2.132 +            list_add_tail(&page[i].list, &d->page_list);
   2.133 +        }
   2.134 +
   2.135 +        p2md->pod.count -= 1<<9;
   2.136 +    }
   2.137 +
   2.138 +    list_for_each_safe(p, q, &p2md->pod.single)
   2.139 +    {
   2.140 +        struct page_info *page;
   2.141 +            
   2.142 +        list_del(p);
   2.143 +            
   2.144 +        page = list_entry(p, struct page_info, list);
   2.145 +
   2.146 +        BUG_ON(page_get_owner(page) != d);
   2.147 +        list_add_tail(&page->list, &d->page_list);
   2.148 +
   2.149 +        p2md->pod.count -= 1;
   2.150 +    }
   2.151 +
   2.152 +    BUG_ON(p2md->pod.count != 0);
   2.153 +
   2.154 +    spin_unlock(&d->page_alloc_lock);
   2.155 +}
   2.156 +
   2.157 +void
   2.158 +p2m_pod_dump_data(struct domain *d)
   2.159 +{
   2.160 +    struct p2m_domain *p2md = d->arch.p2m;
   2.161 +    
   2.162 +    printk("    PoD entries=%d cachesize=%d\n",
   2.163 +           p2md->pod.entry_count, p2md->pod.count);
   2.164 +}
   2.165 +
   2.166 +static int
   2.167 +p2m_pod_demand_populate(struct domain *d, unsigned long gfn,
   2.168 +                        mfn_t table_mfn,
   2.169 +                        l1_pgentry_t *p2m_entry,
   2.170 +                        unsigned int order,
   2.171 +                        p2m_query_t q)
   2.172 +{
   2.173 +    struct page_info *p = NULL; /* Compiler warnings */
   2.174 +    unsigned long gfn_aligned;
   2.175 +    mfn_t mfn;
   2.176 +    l1_pgentry_t entry_content = l1e_empty();
   2.177 +    struct p2m_domain *p2md = d->arch.p2m;
   2.178 +    int i;
   2.179 +
   2.180 +    /* We need to grab the p2m lock here and re-check the entry to make
   2.181 +     * sure that someone else hasn't populated it for us, then hold it
   2.182 +     * until we're done. */
   2.183 +    p2m_lock(p2md);
   2.184 +    audit_p2m(d);
   2.185 +
   2.186 +    /* Check to make sure this is still PoD */
   2.187 +    if ( p2m_flags_to_type(l1e_get_flags(*p2m_entry)) != p2m_populate_on_demand )
   2.188 +    {
   2.189 +        p2m_unlock(p2md);
   2.190 +        return 0;
   2.191 +    }
   2.192 +
   2.193 +    spin_lock(&d->page_alloc_lock);
   2.194 +
   2.195 +    if ( p2md->pod.count == 0 )
   2.196 +        goto out_of_memory;
   2.197 +
   2.198 +    /* FIXME -- use single pages / splinter superpages if need be */
   2.199 +    switch ( order )
   2.200 +    {
   2.201 +    case 9:
   2.202 +        BUG_ON( list_empty(&p2md->pod.super) );
   2.203 +        p = list_entry(p2md->pod.super.next, struct page_info, list); 
   2.204 +        p2md->pod.count -= 1 << order; /* Lock: page_alloc */
   2.205 +        break;
   2.206 +    case 0:
   2.207 +        BUG_ON( list_empty(&p2md->pod.single) );
   2.208 +        p = list_entry(p2md->pod.single.next, struct page_info, list);
   2.209 +        p2md->pod.count -= 1;
   2.210 +        break;
   2.211 +    default:
   2.212 +        BUG();
   2.213 +    }
   2.214 +        
   2.215 +    list_del(&p->list);
   2.216 +
   2.217 +    mfn = page_to_mfn(p);
   2.218 +
   2.219 +    BUG_ON((mfn_x(mfn) & ((1 << order)-1)) != 0);
   2.220 +
   2.221 +    /* Put the pages back on the domain page_list */
   2.222 +    for ( i = 0 ; i < (1 << order) ; i++ )
   2.223 +    {
   2.224 +        BUG_ON(page_get_owner(p + i) != d);
   2.225 +        list_add_tail(&p[i].list, &d->page_list);
   2.226 +    }
   2.227 +
   2.228 +    spin_unlock(&d->page_alloc_lock);
   2.229 +
   2.230 +    /* Fill in the entry in the p2m */
   2.231 +    switch ( order )
   2.232 +    {
   2.233 +    case 9:
   2.234 +    {
   2.235 +        l2_pgentry_t l2e_content;
   2.236 +        
   2.237 +        l2e_content = l2e_from_pfn(mfn_x(mfn),
   2.238 +                                   p2m_type_to_flags(p2m_ram_rw) | _PAGE_PSE);
   2.239 +
   2.240 +        entry_content.l1 = l2e_content.l2;
   2.241 +    }
   2.242 +    break;
   2.243 +    case 0:
   2.244 +        entry_content = l1e_from_pfn(mfn_x(mfn),
   2.245 +                                     p2m_type_to_flags(p2m_ram_rw));
   2.246 +        break;
   2.247 +        
   2.248 +    }
   2.249 +
   2.250 +    gfn_aligned = (gfn >> order) << order;
   2.251 +
   2.252 +    paging_write_p2m_entry(d, gfn_aligned, p2m_entry, table_mfn,
   2.253 +                           entry_content, (order==9)?2:1);
   2.254 +
   2.255 +    for( i = 0 ; i < (1UL << order) ; i++ )
   2.256 +        set_gpfn_from_mfn(mfn_x(mfn) + i, gfn_aligned + i);
   2.257 +    
   2.258 +    p2md->pod.entry_count -= (1 << order); /* Lock: p2m */
   2.259 +    BUG_ON(p2md->pod.entry_count < 0);
   2.260 +    audit_p2m(d);
   2.261 +    p2m_unlock(p2md);
   2.262 +
   2.263 +    return 0;
   2.264 +out_of_memory:
   2.265 +    spin_unlock(&d->page_alloc_lock);
   2.266 +    audit_p2m(d);
   2.267 +    p2m_unlock(p2md);
   2.268 +    printk("%s: Out of populate-on-demand memory!\n", __func__);
   2.269 +    domain_crash(d);
   2.270 +    return -1;
   2.271 +}
   2.272 +
   2.273  // Returns 0 on error (out of memory)
   2.274  static int
   2.275  p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, 
   2.276 @@ -303,6 +541,7 @@ p2m_set_entry(struct domain *d, unsigned
   2.277                                     L2_PAGETABLE_ENTRIES);
   2.278          ASSERT(p2m_entry);
   2.279          
   2.280 +        /* FIXME: Deal with 4k replaced by 2meg pages */
   2.281          if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
   2.282               !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
   2.283          {
   2.284 @@ -311,7 +550,7 @@ p2m_set_entry(struct domain *d, unsigned
   2.285              goto out;
   2.286          }
   2.287          
   2.288 -        if ( mfn_valid(mfn) )
   2.289 +        if ( mfn_valid(mfn) || p2m_is_magic(p2mt) )
   2.290              l2e_content = l2e_from_pfn(mfn_x(mfn),
   2.291                                         p2m_type_to_flags(p2mt) | _PAGE_PSE);
   2.292          else
   2.293 @@ -403,8 +642,21 @@ p2m_gfn_to_mfn(struct domain *d, unsigne
   2.294  
   2.295      l2e = map_domain_page(mfn_x(mfn));
   2.296      l2e += l2_table_offset(addr);
   2.297 +
   2.298 +pod_retry_l2:
   2.299      if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
   2.300      {
   2.301 +        /* PoD: Try to populate a 2-meg chunk */
   2.302 +        if ( p2m_flags_to_type(l2e_get_flags(*l2e)) == p2m_populate_on_demand )
   2.303 +        {
   2.304 +            if ( q != p2m_query ) {
   2.305 +                if( !p2m_pod_demand_populate(d, gfn, mfn,
   2.306 +                                             (l1_pgentry_t *)l2e, 9, q) )
   2.307 +                    goto pod_retry_l2;
   2.308 +            } else
   2.309 +                *t = p2m_populate_on_demand;
   2.310 +        }
   2.311 +    
   2.312          unmap_domain_page(l2e);
   2.313          return _mfn(INVALID_MFN);
   2.314      }
   2.315 @@ -423,8 +675,20 @@ p2m_gfn_to_mfn(struct domain *d, unsigne
   2.316  
   2.317      l1e = map_domain_page(mfn_x(mfn));
   2.318      l1e += l1_table_offset(addr);
   2.319 +pod_retry_l1:
   2.320      if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
   2.321      {
   2.322 +        /* PoD: Try to populate */
   2.323 +        if ( p2m_flags_to_type(l1e_get_flags(*l1e)) == p2m_populate_on_demand )
   2.324 +        {
   2.325 +            if ( q != p2m_query ) {
   2.326 +                if( !p2m_pod_demand_populate(d, gfn, mfn,
   2.327 +                                             (l1_pgentry_t *)l1e, 0, q) )
   2.328 +                    goto pod_retry_l1;
   2.329 +            } else
   2.330 +                *t = p2m_populate_on_demand;
   2.331 +        }
   2.332 +    
   2.333          unmap_domain_page(l1e);
   2.334          return _mfn(INVALID_MFN);
   2.335      }
   2.336 @@ -450,48 +714,114 @@ static mfn_t p2m_gfn_to_mfn_current(unsi
   2.337  
   2.338      if ( gfn <= current->domain->arch.p2m->max_mapped_pfn )
   2.339      {
   2.340 -        l1_pgentry_t l1e = l1e_empty();
   2.341 +        l1_pgentry_t l1e = l1e_empty(), *p2m_entry;
   2.342          l2_pgentry_t l2e = l2e_empty();
   2.343          int ret;
   2.344  
   2.345          ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) 
   2.346                 / sizeof(l1_pgentry_t));
   2.347  
   2.348 +        /*
   2.349 +         * Read & process L2
   2.350 +         */
   2.351 +        p2m_entry = &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START)
   2.352 +                                       + l2_linear_offset(addr)];
   2.353 +
   2.354 +    pod_retry_l2:
   2.355          ret = __copy_from_user(&l2e,
   2.356 -                               &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START) + l2_linear_offset(addr)],
   2.357 +                               p2m_entry,
   2.358                                 sizeof(l2e));
   2.359 +        if ( ret != 0
   2.360 +             || !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
   2.361 +        {
   2.362 +            if( (l2e_get_flags(l2e) & _PAGE_PSE)
   2.363 +                && ( p2m_flags_to_type(l2e_get_flags(l2e))
   2.364 +                     == p2m_populate_on_demand ) )
   2.365 +            {
   2.366 +                /* The read has succeeded, so we know that the mapping
    2.367 +                 * exists at this point.  */
   2.368 +                if ( q != p2m_query )
   2.369 +                {
   2.370 +                    if( !p2m_pod_demand_populate(current->domain, gfn, mfn,
   2.371 +                                                 p2m_entry, 9, q) )
   2.372 +                        goto pod_retry_l2;
   2.373 +
   2.374 +                    /* Allocate failed. */
   2.375 +                    p2mt = p2m_invalid;
   2.376 +                    printk("%s: Allocate failed!\n", __func__);
   2.377 +                    goto out;
   2.378 +                }
   2.379 +                else
   2.380 +                {
   2.381 +                    p2mt = p2m_populate_on_demand;
   2.382 +                    goto out;
   2.383 +                }
   2.384 +            }
   2.385 +
   2.386 +            goto pod_retry_l1;
   2.387 +        }
   2.388          
   2.389 -        if ( (ret == 0) && (l2e_get_flags(l2e) & _PAGE_PRESENT) && 
   2.390 -             (l2e_get_flags(l2e) & _PAGE_PSE) ) 
   2.391 +        if (l2e_get_flags(l2e) & _PAGE_PSE)
   2.392          {
   2.393              p2mt = p2m_flags_to_type(l2e_get_flags(l2e));
   2.394              ASSERT(l2e_get_pfn(l2e) != INVALID_MFN || !p2m_is_ram(p2mt));
   2.395 +
   2.396              if ( p2m_is_valid(p2mt) )
   2.397                  mfn = _mfn(l2e_get_pfn(l2e) + l1_table_offset(addr));
   2.398              else
   2.399                  p2mt = p2m_mmio_dm;
   2.400 +
   2.401 +            goto out;
   2.402          }
   2.403 -        else
   2.404 -        {
   2.405 -        
   2.406 -            /* Need to __copy_from_user because the p2m is sparse and this
   2.407 -             * part might not exist */
   2.408 -            ret = __copy_from_user(&l1e,
   2.409 -                                   &phys_to_machine_mapping[gfn],
   2.410 -                                   sizeof(l1e));
   2.411 +
   2.412 +        /*
   2.413 +         * Read and process L1
   2.414 +         */
   2.415 +
   2.416 +        /* Need to __copy_from_user because the p2m is sparse and this
   2.417 +         * part might not exist */
   2.418 +    pod_retry_l1:
   2.419 +        p2m_entry = &phys_to_machine_mapping[gfn];
   2.420 +
   2.421 +        ret = __copy_from_user(&l1e,
   2.422 +                               p2m_entry,
   2.423 +                               sizeof(l1e));
   2.424              
   2.425 -            if ( ret == 0 ) {
   2.426 -                p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
   2.427 -                ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
   2.428 -                if ( p2m_is_valid(p2mt) )
   2.429 -                    mfn = _mfn(l1e_get_pfn(l1e));
   2.430 -                else 
   2.431 -                    /* XXX see above */
   2.432 -                    p2mt = p2m_mmio_dm;
   2.433 +        if ( ret == 0 ) {
   2.434 +            p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
   2.435 +            ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
   2.436 +
   2.437 +            if ( p2m_flags_to_type(l1e_get_flags(l1e))
   2.438 +                 == p2m_populate_on_demand )
   2.439 +            {
   2.440 +                /* The read has succeeded, so we know that the mapping
    2.441 +                 * exists at this point.  */
   2.442 +                if ( q != p2m_query )
   2.443 +                {
   2.444 +                    if( !p2m_pod_demand_populate(current->domain, gfn, mfn,
   2.445 +                                                 (l1_pgentry_t *)p2m_entry, 0,
   2.446 +                                                 q) )
   2.447 +                        goto pod_retry_l1;
   2.448 +
   2.449 +                    /* Allocate failed. */
   2.450 +                    p2mt = p2m_invalid;
   2.451 +                    goto out;
   2.452 +                }
   2.453 +                else
   2.454 +                {
   2.455 +                    p2mt = p2m_populate_on_demand;
   2.456 +                    goto out;
   2.457 +                }
   2.458              }
   2.459 +
   2.460 +            if ( p2m_is_valid(p2mt) )
   2.461 +                mfn = _mfn(l1e_get_pfn(l1e));
   2.462 +            else 
   2.463 +                /* XXX see above */
   2.464 +                p2mt = p2m_mmio_dm;
   2.465          }
   2.466      }
   2.467 -
   2.468 +out:
   2.469      *t = p2mt;
   2.470      return mfn;
   2.471  }
   2.472 @@ -510,6 +840,8 @@ int p2m_init(struct domain *d)
   2.473      memset(p2m, 0, sizeof(*p2m));
   2.474      p2m_lock_init(p2m);
   2.475      INIT_LIST_HEAD(&p2m->pages);
   2.476 +    INIT_LIST_HEAD(&p2m->pod.super);
   2.477 +    INIT_LIST_HEAD(&p2m->pod.single);
   2.478  
   2.479      p2m->set_entry = p2m_set_entry;
   2.480      p2m->get_entry = p2m_gfn_to_mfn;
   2.481 @@ -680,6 +1012,7 @@ static void audit_p2m(struct domain *d)
   2.482      struct page_info *page;
   2.483      struct domain *od;
   2.484      unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
   2.485 +    int entry_count = 0;
   2.486      mfn_t p2mfn;
   2.487      unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
   2.488      int test_linear;
   2.489 @@ -805,6 +1138,10 @@ static void audit_p2m(struct domain *d)
   2.490                  {
   2.491                      if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
   2.492                      {
   2.493 +                        if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE)
   2.494 +                             && ( p2m_flags_to_type(l2e_get_flags(l2e[i2]))
   2.495 +                                  == p2m_populate_on_demand ) )
   2.496 +                            entry_count+=(1<<9);
   2.497                          gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
   2.498                          continue;
   2.499                      }
   2.500 @@ -835,13 +1172,20 @@ static void audit_p2m(struct domain *d)
   2.501                      for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
   2.502                      {
   2.503                          if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
   2.504 +                        {
   2.505 +                            if ( p2m_flags_to_type(l1e_get_flags(l1e[i1]))
   2.506 +                                 == p2m_populate_on_demand )
   2.507 +                            entry_count++;
   2.508                              continue;
   2.509 +                        }
   2.510                          mfn = l1e_get_pfn(l1e[i1]);
   2.511                          ASSERT(mfn_valid(_mfn(mfn)));
   2.512                          m2pfn = get_gpfn_from_mfn(mfn);
   2.513                          if ( m2pfn != gfn )
   2.514                          {
   2.515                              pmbad++;
   2.516 +                            printk("mismatch: gfn %#lx -> mfn %#lx"
   2.517 +                                   " -> gfn %#lx\n", gfn, mfn, m2pfn);
   2.518                              P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
   2.519                                         " -> gfn %#lx\n", gfn, mfn, m2pfn);
   2.520                              BUG();
   2.521 @@ -864,6 +1208,15 @@ static void audit_p2m(struct domain *d)
   2.522  
   2.523      }
   2.524  
   2.525 +    if ( entry_count != d->arch.p2m->pod.entry_count )
   2.526 +    {
   2.527 +        printk("%s: refcounted entry count %d, audit count %d!\n",
   2.528 +               __func__,
   2.529 +               d->arch.p2m->pod.entry_count,
   2.530 +               entry_count);
   2.531 +        BUG();
   2.532 +    }
   2.533 +        
   2.534      //P2M_PRINTK("p2m audit complete\n");
   2.535      //if ( orphans_i | orphans_d | mpbad | pmbad )
   2.536      //    P2M_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
   2.537 @@ -872,8 +1225,6 @@ static void audit_p2m(struct domain *d)
   2.538          P2M_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
   2.539                     pmbad, mpbad);
   2.540  }
   2.541 -#else
   2.542 -#define audit_p2m(_d) do { (void)(_d); } while(0)
   2.543  #endif /* P2M_AUDIT */
   2.544  
   2.545  
   2.546 @@ -911,6 +1262,77 @@ guest_physmap_remove_page(struct domain 
   2.547  }
   2.548  
   2.549  int
   2.550 +guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
   2.551 +                                      unsigned int order)
   2.552 +{
   2.553 +    struct p2m_domain *p2md = d->arch.p2m;
   2.554 +    unsigned long i;
   2.555 +    p2m_type_t ot;
   2.556 +    mfn_t omfn;
   2.557 +    int pod_count = 0;
   2.558 +    int rc = 0;
   2.559 +
   2.560 +    BUG_ON(!paging_mode_translate(d));
   2.561 +
   2.562 +#if CONFIG_PAGING_LEVELS == 3
   2.563 +    /*
   2.564 +     * 32bit PAE nested paging does not support over 4GB guest due to 
   2.565 +     * hardware translation limit. This limitation is checked by comparing
   2.566 +     * gfn with 0xfffffUL.
   2.567 +     */
   2.568 +    if ( paging_mode_hap(d) && (gfn > 0xfffffUL) )
   2.569 +    {
   2.570 +        if ( !test_and_set_bool(d->arch.hvm_domain.svm.npt_4gb_warning) )
   2.571 +            dprintk(XENLOG_WARNING, "Dom%d failed to populate memory beyond"
   2.572 +                    " 4GB: specify 'hap=0' domain config option.\n",
   2.573 +                    d->domain_id);
   2.574 +        return -EINVAL;
   2.575 +    }
   2.576 +#endif
   2.577 +
   2.578 +    p2m_lock(p2md);
   2.579 +    audit_p2m(d);
   2.580 +
    2.581 +    P2M_DEBUG("mark pod gfn=%#lx order=%u\n", gfn, order);
   2.582 +
   2.583 +    /* Make sure all gpfns are unused */
   2.584 +    for ( i = 0; i < (1UL << order); i++ )
   2.585 +    {
   2.586 +        omfn = gfn_to_mfn_query(d, gfn + i, &ot);
   2.587 +        if ( p2m_is_ram(ot) )
   2.588 +        {
   2.589 +            printk("%s: gfn_to_mfn returned type %d!\n",
   2.590 +                   __func__, ot);
   2.591 +            rc = -EBUSY;
   2.592 +            goto out;
   2.593 +        }
   2.594 +        else if ( ot == p2m_populate_on_demand )
   2.595 +        {
    2.596 +            /* Count how many PoD entries we'll be replacing if successful */
   2.597 +            pod_count++;
   2.598 +        }
   2.599 +    }
   2.600 +
   2.601 +    /* Now, actually do the two-way mapping */
   2.602 +    if ( !set_p2m_entry(d, gfn, _mfn(POPULATE_ON_DEMAND_MFN), order,
   2.603 +                        p2m_populate_on_demand) )
   2.604 +        rc = -EINVAL;
   2.605 +    else
   2.606 +    {
   2.607 +        p2md->pod.entry_count += 1 << order; /* Lock: p2m */
   2.608 +        p2md->pod.entry_count -= pod_count;
   2.609 +        BUG_ON(p2md->pod.entry_count < 0);
   2.610 +    }
   2.611 +
   2.612 +    audit_p2m(d);
   2.613 +    p2m_unlock(p2md);
   2.614 +
   2.615 +out:
   2.616 +    return rc;
   2.617 +
   2.618 +}
   2.619 +
   2.620 +int
   2.621  guest_physmap_add_entry(struct domain *d, unsigned long gfn,
   2.622                          unsigned long mfn, unsigned int page_order, 
   2.623                          p2m_type_t t)
   2.624 @@ -918,6 +1340,7 @@ guest_physmap_add_entry(struct domain *d
   2.625      unsigned long i, ogfn;
   2.626      p2m_type_t ot;
   2.627      mfn_t omfn;
   2.628 +    int pod_count = 0;
   2.629      int rc = 0;
   2.630  
   2.631      if ( !paging_mode_translate(d) )
   2.632 @@ -966,6 +1389,11 @@ guest_physmap_add_entry(struct domain *d
   2.633              ASSERT(mfn_valid(omfn));
   2.634              set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
   2.635          }
   2.636 +        else if ( ot == p2m_populate_on_demand )
   2.637 +        {
    2.638 +            /* Count how many PoD entries we'll be replacing if successful */
   2.639 +            pod_count++;
   2.640 +        }
   2.641      }
   2.642  
   2.643      /* Then, look for m->p mappings for this range and deal with them */
   2.644 @@ -1012,6 +1440,11 @@ guest_physmap_add_entry(struct domain *d
   2.645          if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), page_order, 
   2.646                              p2m_invalid) )
   2.647              rc = -EINVAL;
   2.648 +        else
   2.649 +        {
   2.650 +            d->arch.p2m->pod.entry_count -= pod_count; /* Lock: p2m */
   2.651 +            BUG_ON(d->arch.p2m->pod.entry_count < 0);
   2.652 +        }
   2.653      }
   2.654  
   2.655      audit_p2m(d);
     3.1 --- a/xen/arch/x86/mm/paging.c	Mon Jan 05 10:42:39 2009 +0000
     3.2 +++ b/xen/arch/x86/mm/paging.c	Mon Jan 05 10:43:19 2009 +0000
     3.3 @@ -585,6 +585,9 @@ void paging_teardown(struct domain *d)
     3.4  
     3.5      /* clean up log dirty resources. */
     3.6      paging_log_dirty_teardown(d);
     3.7 +
     3.8 +    /* Move populate-on-demand cache back to domain_list for destruction */
     3.9 +    p2m_pod_empty_cache(d);
    3.10  }
    3.11  
    3.12  /* Call once all of the references to the domain have gone away */
     4.1 --- a/xen/arch/x86/mm/shadow/multi.c	Mon Jan 05 10:42:39 2009 +0000
     4.2 +++ b/xen/arch/x86/mm/shadow/multi.c	Mon Jan 05 10:43:19 2009 +0000
     4.3 @@ -2173,7 +2173,7 @@ static int validate_gl4e(struct vcpu *v,
     4.4          mfn_t gl3mfn = gfn_to_mfn_query(d, gl3gfn, &p2mt);
     4.5          if ( p2m_is_ram(p2mt) )
     4.6              sl3mfn = get_shadow_status(v, gl3mfn, SH_type_l3_shadow);
     4.7 -        else
     4.8 +        else if ( p2mt != p2m_populate_on_demand )
     4.9              result |= SHADOW_SET_ERROR;
    4.10  
    4.11  #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
    4.12 @@ -2230,7 +2230,7 @@ static int validate_gl3e(struct vcpu *v,
    4.13          mfn_t gl2mfn = gfn_to_mfn_query(v->domain, gl2gfn, &p2mt);
    4.14          if ( p2m_is_ram(p2mt) )
    4.15              sl2mfn = get_shadow_status(v, gl2mfn, SH_type_l2_shadow);
    4.16 -        else
    4.17 +        else if ( p2mt != p2m_populate_on_demand )
    4.18              result |= SHADOW_SET_ERROR;
    4.19  
    4.20  #if (SHADOW_OPTIMIZATIONS && SHOPT_OUT_OF_SYNC )
    4.21 @@ -2278,8 +2278,8 @@ static int validate_gl2e(struct vcpu *v,
    4.22          {
    4.23              mfn_t gl1mfn = gfn_to_mfn_query(v->domain, gl1gfn, &p2mt);
    4.24              if ( p2m_is_ram(p2mt) )
    4.25 -                sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
    4.26 -            else
    4.27 +                sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow); 
    4.28 +            else if ( p2mt != p2m_populate_on_demand )
    4.29                  result |= SHADOW_SET_ERROR;
    4.30          }
    4.31      }
     5.1 --- a/xen/include/asm-x86/p2m.h	Mon Jan 05 10:42:39 2009 +0000
     5.2 +++ b/xen/include/asm-x86/p2m.h	Mon Jan 05 10:43:19 2009 +0000
     5.3 @@ -64,6 +64,7 @@ typedef enum {
     5.4      p2m_ram_ro = 3,             /* Read-only; writes are silently dropped */
     5.5      p2m_mmio_dm = 4,            /* Reads and write go to the device model */
     5.6      p2m_mmio_direct = 5,        /* Read/write mapping of genuine MMIO area */
     5.7 +    p2m_populate_on_demand = 6, /* Place-holder for empty memory */
     5.8  } p2m_type_t;
     5.9  
    5.10  typedef enum {
    5.11 @@ -88,12 +89,20 @@ typedef enum {
    5.12  #define P2M_RO_TYPES (p2m_to_mask(p2m_ram_logdirty)     \
    5.13                        | p2m_to_mask(p2m_ram_ro))
    5.14  
    5.15 +#define P2M_MAGIC_TYPES (p2m_to_mask(p2m_populate_on_demand))
    5.16 +
    5.17  /* Useful predicates */
    5.18  #define p2m_is_ram(_t) (p2m_to_mask(_t) & P2M_RAM_TYPES)
    5.19  #define p2m_is_mmio(_t) (p2m_to_mask(_t) & P2M_MMIO_TYPES)
    5.20  #define p2m_is_readonly(_t) (p2m_to_mask(_t) & P2M_RO_TYPES)
    5.21 +#define p2m_is_magic(_t) (p2m_to_mask(_t) & P2M_MAGIC_TYPES)
    5.22  #define p2m_is_valid(_t) (p2m_to_mask(_t) & (P2M_RAM_TYPES | P2M_MMIO_TYPES))
    5.23  
    5.24 +/* Populate-on-demand */
    5.25 +#define POPULATE_ON_DEMAND_MFN  (1<<9)
    5.26 +#define POD_PAGE_ORDER 9
    5.27 +
    5.28 +
    5.29  struct p2m_domain {
    5.30      /* Lock that protects updates to the p2m */
    5.31      spinlock_t         lock;
    5.32 @@ -122,6 +131,28 @@ struct p2m_domain {
    5.33  
    5.34      /* Highest guest frame that's ever been mapped in the p2m */
    5.35      unsigned long max_mapped_pfn;
    5.36 +
    5.37 +    /* Populate-on-demand variables
    5.38 +     * NB on locking.  {super,single,count} are
    5.39 +     * covered by d->page_alloc_lock, since they're almost always used in
    5.40 +     * conjunction with that functionality.  {entry_count} is covered by
    5.41 +     * the domain p2m lock, since it's almost always used in conjunction
    5.42 +     * with changing the p2m tables.
    5.43 +     *
    5.44 +     * At this point, both locks are held in two places.  In both,
    5.45 +     * the order is [p2m,page_alloc]:
    5.46 +     * + p2m_pod_decrease_reservation() calls p2m_pod_cache_add(),
    5.47 +     *   which grabs page_alloc
    5.48 +     * + p2m_pod_demand_populate() grabs both; the p2m lock to avoid
    5.49 +     *   double-demand-populating of pages, the page_alloc lock to
    5.50 +     *   protect moving stuff from the PoD cache to the domain page list.
    5.51 +     */
    5.52 +    struct {
    5.53 +        struct list_head super,        /* List of superpages                */
    5.54 +                         single;       /* Non-super lists                   */
    5.55 +        int              count,        /* # of pages in cache lists         */
    5.56 +                         entry_count;  /* # of pages in p2m marked pod      */
    5.57 +    } pod;
    5.58  };
    5.59  
    5.60  /* Extract the type from the PTE flags that store it */
    5.61 @@ -220,11 +251,22 @@ int p2m_alloc_table(struct domain *d,
    5.62  void p2m_teardown(struct domain *d);
    5.63  void p2m_final_teardown(struct domain *d);
    5.64  
    5.65 +/* Dump PoD information about the domain */
    5.66 +void p2m_pod_dump_data(struct domain *d);
    5.67 +
    5.68 +/* Move all pages from the populate-on-demand cache to the domain page_list
    5.69 + * (usually in preparation for domain destruction) */
    5.70 +void p2m_pod_empty_cache(struct domain *d);
    5.71 +
    5.72  /* Add a page to a domain's p2m table */
    5.73  int guest_physmap_add_entry(struct domain *d, unsigned long gfn,
    5.74                              unsigned long mfn, unsigned int page_order, 
    5.75                              p2m_type_t t);
    5.76  
    5.77 +/* Set a p2m range as populate-on-demand */
    5.78 +int guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
    5.79 +                                          unsigned int order);
    5.80 +
    5.81  /* Untyped version for RAM only, for compatibility 
    5.82   *
    5.83   * Return 0 for success