ia64/xen-unstable

changeset 18425:86b956d8cf04

x86: make {get,put}_page_type() preemptible

This is only a first step - more call sites need to be hooked up.

Most of this is really Keir's work; I just took what he handed me and
fixed a few remaining issues.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Sep 01 10:52:05 2008 +0100 (2008-09-01)
parents 7cb51e8484f6
children b6eea72ea9dc
files xen/arch/x86/domain.c xen/arch/x86/mm.c xen/include/asm-x86/mm.h
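
The calling convention this changeset establishes is errno-style: 0 on success, -EINVAL on a hard failure, -EINTR when a preemptible operation was interrupted before committing any state, and -EAGAIN when it was suspended with partial progress recorded, in which case the hypercall is restarted via a continuation. A stand-alone sketch of that convention (stub functions only, not Xen code):

    /* Stand-alone sketch of the continuation-style convention; all
     * names here are stubs, not Xen's. */
    #include <errno.h>
    #include <stdio.h>

    static int steps_done;

    /* Pretend validating a page table takes three re-entries. */
    static int get_page_type_preemptible_stub(void)
    {
        if ( steps_done < 3 )
        {
            ++steps_done;       /* partial progress is recorded */
            return -EAGAIN;     /* ask the caller to retry later */
        }
        return 0;               /* fully validated */
    }

    int main(void)
    {
        int rc;
        while ( (rc = get_page_type_preemptible_stub()) == -EAGAIN )
            /* Xen would create a hypercall continuation here and return
             * to the guest; this loop stands in for the re-issued call. */
            printf("suspended after step %d\n", steps_done);
        printf("final rc = %d\n", rc);
        return 0;
    }
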
line diff
     1.1 --- a/xen/arch/x86/domain.c	Mon Sep 01 10:49:00 2008 +0100
     1.2 +++ b/xen/arch/x86/domain.c	Mon Sep 01 10:52:05 2008 +0100
     1.3 @@ -1645,23 +1645,26 @@ static int relinquish_memory(
     1.4  
     1.5          /*
     1.6           * Forcibly invalidate top-most, still valid page tables at this point
     1.7 -         * to break circular 'linear page table' references. This is okay
     1.8 -         * because MMU structures are not shared across domains and this domain
     1.9 -         * is now dead. Thus top-most valid tables are not in use so a non-zero
    1.10 -         * count means circular reference.
    1.11 +         * to break circular 'linear page table' references as well as clean up
    1.12 +         * partially validated pages. This is okay because MMU structures are
    1.13 +         * not shared across domains and this domain is now dead. Thus top-most
    1.14 +         * valid tables are not in use so a non-zero count means circular
    1.15 +         * reference or partially validated.
    1.16           */
    1.17          y = page->u.inuse.type_info;
    1.18          for ( ; ; )
    1.19          {
    1.20              x = y;
    1.21 -            if ( likely((x & (PGT_type_mask|PGT_validated)) !=
    1.22 -                        (type|PGT_validated)) )
    1.23 +            if ( likely((x & PGT_type_mask) != type) ||
    1.24 +                 likely(!(x & (PGT_validated|PGT_partial))) )
    1.25                  break;
    1.26  
    1.27 -            y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
    1.28 +            y = cmpxchg(&page->u.inuse.type_info, x,
    1.29 +                        x & ~(PGT_validated|PGT_partial));
    1.30              if ( likely(y == x) )
    1.31              {
    1.32 -                free_page_type(page, type);
    1.33 +                if ( free_page_type(page, x, 0) != 0 )
    1.34 +                    BUG();
    1.35                  break;
    1.36              }
    1.37          }
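
The loop above is the usual lock-free snapshot/test/cmpxchg pattern: read type_info, decide whether the flags still need clearing, try to install the new value, and retry with the fresh value if another CPU raced. A minimal user-space rendering of the same pattern, with illustrative flag values and a GCC builtin in place of Xen's cmpxchg():

    /* Sketch of the cmpxchg retry pattern from relinquish_memory();
     * flag values are illustrative, not Xen's PGT_* encoding. */
    #include <stdio.h>

    #define F_VALIDATED (1ul << 31)
    #define F_PARTIAL   (1ul << 30)

    static unsigned long type_info = F_VALIDATED | 2ul;

    int main(void)
    {
        unsigned long x, y = type_info;
        for ( ; ; )
        {
            x = y;
            if ( !(x & (F_VALIDATED | F_PARTIAL)) )
                break;                          /* nothing to tear down */
            y = __sync_val_compare_and_swap(&type_info, x,
                                            x & ~(F_VALIDATED | F_PARTIAL));
            if ( y == x )
            {
                printf("cleared: old=%#lx new=%#lx\n", x, type_info);
                break;                          /* we own the cleanup */
            }
            /* raced with another updater; retry with the value in y */
        }
        return 0;
    }
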
     2.1 --- a/xen/arch/x86/mm.c	Mon Sep 01 10:49:00 2008 +0100
     2.2 +++ b/xen/arch/x86/mm.c	Mon Sep 01 10:52:05 2008 +0100
     2.3 @@ -507,11 +507,11 @@ static int alloc_segdesc_page(struct pag
     2.4              goto fail;
     2.5  
     2.6      unmap_domain_page(descs);
     2.7 -    return 1;
     2.8 +    return 0;
     2.9  
    2.10   fail:
    2.11      unmap_domain_page(descs);
    2.12 -    return 0;
    2.13 +    return -EINVAL;
    2.14  }
    2.15  
    2.16  
    2.17 @@ -565,20 +565,23 @@ static int get_page_from_pagenr(unsigned
    2.18  
    2.19  static int get_page_and_type_from_pagenr(unsigned long page_nr, 
    2.20                                           unsigned long type,
    2.21 -                                         struct domain *d)
    2.22 +                                         struct domain *d,
    2.23 +                                         int preemptible)
    2.24  {
    2.25      struct page_info *page = mfn_to_page(page_nr);
    2.26 +    int rc;
    2.27  
    2.28      if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
    2.29 -        return 0;
    2.30 -
    2.31 -    if ( unlikely(!get_page_type(page, type)) )
    2.32 -    {
    2.33 +        return -EINVAL;
    2.34 +
    2.35 +    rc = (preemptible ?
    2.36 +          get_page_type_preemptible(page, type) :
    2.37 +          (get_page_type(page, type) ? 0 : -EINVAL));
    2.38 +
    2.39 +    if ( rc )
    2.40          put_page(page);
    2.41 -        return 0;
    2.42 -    }
    2.43 -
    2.44 -    return 1;
    2.45 +
    2.46 +    return rc;
    2.47  }
    2.48  
    2.49  /*
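
From here on the internal helpers return 0 or -errno, while the pre-existing boolean entry points keep their 1/0 contract through thin wrappers (the changeset adds exactly such wrappers around __get_page_type() and __put_page_type() further down). A compact sketch of the two conventions side by side, with hypothetical names:

    /* Sketch of the 1/0 versus 0/-errno convention split; names are
     * hypothetical. */
    #include <assert.h>
    #include <errno.h>

    /* New style: 0 on success, -errno on failure or suspension. */
    static int validate_new(int ok, int preemptible)
    {
        if ( !ok )
            return -EINVAL;
        return preemptible ? -EAGAIN : 0;   /* pretend we got suspended */
    }

    /* Old style: boolean wrapper over the new-style helper. */
    static int validate_old(int ok)
    {
        int rc = validate_new(ok, 0);
        if ( rc == 0 )
            return 1;
        assert(rc == -EINVAL);  /* non-preemptible: no -EAGAIN/-EINTR */
        return 0;
    }

    int main(void)
    {
        assert(validate_old(1) == 1);
        assert(validate_old(0) == 0);
        assert(validate_new(1, 1) == -EAGAIN);
        return 0;
    }
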
    2.50 @@ -754,22 +757,23 @@ get_page_from_l2e(
    2.51      if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
    2.52      {
    2.53          MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
    2.54 -        return 0;
    2.55 +        return -EINVAL;
    2.56      }
    2.57  
    2.58 -    rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d);
    2.59 -    if ( unlikely(!rc) )
    2.60 -        rc = get_l2_linear_pagetable(l2e, pfn, d);
    2.61 +    rc = get_page_and_type_from_pagenr(
    2.62 +        l2e_get_pfn(l2e), PGT_l1_page_table, d, 0);
    2.63 +    if ( unlikely(rc) && rc != -EAGAIN &&
    2.64 +         get_l2_linear_pagetable(l2e, pfn, d) )
     2.65 +        rc = 0;
    2.66  
    2.67      return rc;
    2.68  }
    2.69  
    2.70  
    2.71 -#if CONFIG_PAGING_LEVELS >= 3
    2.72  define_get_linear_pagetable(l3);
    2.73  static int
    2.74  get_page_from_l3e(
    2.75 -    l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
    2.76 +    l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int preemptible)
    2.77  {
    2.78      int rc;
    2.79  
    2.80 @@ -779,22 +783,23 @@ get_page_from_l3e(
    2.81      if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
    2.82      {
    2.83          MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
    2.84 -        return 0;
    2.85 +        return -EINVAL;
    2.86      }
    2.87  
    2.88 -    rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d);
    2.89 -    if ( unlikely(!rc) )
    2.90 -        rc = get_l3_linear_pagetable(l3e, pfn, d);
    2.91 +    rc = get_page_and_type_from_pagenr(
    2.92 +        l3e_get_pfn(l3e), PGT_l2_page_table, d, preemptible);
    2.93 +    if ( unlikely(rc) && rc != -EAGAIN && rc != -EINTR &&
    2.94 +         get_l3_linear_pagetable(l3e, pfn, d) )
     2.95 +        rc = 0;
    2.96  
    2.97      return rc;
    2.98  }
    2.99 -#endif /* 3 level */
   2.100  
   2.101  #if CONFIG_PAGING_LEVELS >= 4
   2.102  define_get_linear_pagetable(l4);
   2.103  static int
   2.104  get_page_from_l4e(
   2.105 -    l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
   2.106 +    l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int preemptible)
   2.107  {
   2.108      int rc;
   2.109  
   2.110 @@ -804,12 +809,14 @@ get_page_from_l4e(
   2.111      if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
   2.112      {
   2.113          MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
   2.114 -        return 0;
   2.115 +        return -EINVAL;
   2.116      }
   2.117  
   2.118 -    rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d);
   2.119 -    if ( unlikely(!rc) )
   2.120 -        rc = get_l4_linear_pagetable(l4e, pfn, d);
   2.121 +    rc = get_page_and_type_from_pagenr(
   2.122 +        l4e_get_pfn(l4e), PGT_l3_page_table, d, preemptible);
   2.123 +    if ( unlikely(rc) && rc != -EAGAIN && rc != -EINTR &&
   2.124 +         get_l4_linear_pagetable(l4e, pfn, d) )
    2.125 +        rc = 0;
   2.126  
   2.127      return rc;
   2.128  }
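
All three levels now share the same fallback shape: first take a type reference on the pointed-to frame; only on a hard failure (not -EAGAIN or -EINTR, which mean "restart the hypercall") is the entry retried as a linear, i.e. recursive, page table, and the hard error is dropped if that succeeds. A stand-alone sketch with stub helpers:

    /* Sketch of the get_page_from_lNe() fallback logic; stub helpers,
     * not the real Xen functions. */
    #include <errno.h>
    #include <stdio.h>

    static int linear_ok;                   /* stub: 1 = linear mapping valid */
    static int try_linear(void) { return linear_ok; }

    static int get_from_entry(int ref_rc)   /* ref_rc: type-ref result */
    {
        int rc = ref_rc;
        if ( rc && rc != -EAGAIN && rc != -EINTR && try_linear() )
            rc = 0;                         /* rescued by the linear path */
        return rc;
    }

    int main(void)
    {
        linear_ok = 1;
        printf("%d\n", get_from_entry(-EINVAL));  /* 0: fallback worked */
        printf("%d\n", get_from_entry(-EAGAIN));  /* -EAGAIN: just retry */
        linear_ok = 0;
        printf("%d\n", get_from_entry(-EINVAL));  /* -EINVAL: hard failure */
        return 0;
    }
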
   2.129 @@ -946,29 +953,35 @@ void put_page_from_l1e(l1_pgentry_t l1e,
   2.130   * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
   2.131   * Note also that this automatically deals correctly with linear p.t.'s.
   2.132   */
   2.133 -static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
   2.134 +static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
   2.135  {
   2.136      if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) && 
   2.137           (l2e_get_pfn(l2e) != pfn) )
   2.138 +    {
   2.139          put_page_and_type(l2e_get_page(l2e));
   2.140 +        return 0;
   2.141 +    }
   2.142 +    return 1;
   2.143  }
   2.144  
   2.145  
   2.146 -#if CONFIG_PAGING_LEVELS >= 3
   2.147 -static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
   2.148 +static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
   2.149 +                             int preemptible)
   2.150  {
   2.151      if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) && 
   2.152           (l3e_get_pfn(l3e) != pfn) )
   2.153 -        put_page_and_type(l3e_get_page(l3e));
   2.154 +        return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
   2.155 +    return 1;
   2.156  }
   2.157 -#endif
   2.158  
   2.159  #if CONFIG_PAGING_LEVELS >= 4
   2.160 -static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
   2.161 +static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
   2.162 +                             int preemptible)
   2.163  {
   2.164      if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && 
   2.165           (l4e_get_pfn(l4e) != pfn) )
   2.166 -        put_page_and_type(l4e_get_page(l4e));
   2.167 +        return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
   2.168 +    return 1;
   2.169  }
   2.170  #endif
   2.171  
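
The put side gains a three-way contract: put_page_from_lNe() returns 1 when the entry required no action, 0 when the reference was dropped synchronously, and a negative errno when a preemptible put was suspended. A small sketch of the three cases (stub logic):

    /* Sketch of the three-way put_page_from_lNe() contract. */
    #include <errno.h>
    #include <stdio.h>

    static int put_entry(int present, int suspended)
    {
        if ( !present )
            return 1;               /* nothing to drop */
        if ( suspended )
            return -EAGAIN;         /* teardown interrupted part-way */
        return 0;                   /* reference dropped */
    }

    int main(void)
    {
        /* Prints "1 0 -11" on Linux (-EAGAIN is -11 there). */
        printf("%d %d %d\n",
               put_entry(0, 0), put_entry(1, 0), put_entry(1, 1));
        return 0;
    }
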
   2.172 @@ -977,7 +990,7 @@ static int alloc_l1_table(struct page_in
   2.173      struct domain *d = page_get_owner(page);
   2.174      unsigned long  pfn = page_to_mfn(page);
   2.175      l1_pgentry_t  *pl1e;
   2.176 -    int            i;
   2.177 +    unsigned int   i;
   2.178  
   2.179      pl1e = map_domain_page(pfn);
   2.180  
   2.181 @@ -991,7 +1004,7 @@ static int alloc_l1_table(struct page_in
   2.182      }
   2.183  
   2.184      unmap_domain_page(pl1e);
   2.185 -    return 1;
   2.186 +    return 0;
   2.187  
   2.188   fail:
   2.189      MEM_LOG("Failure in alloc_l1_table: entry %d", i);
   2.190 @@ -1000,7 +1013,7 @@ static int alloc_l1_table(struct page_in
   2.191              put_page_from_l1e(pl1e[i], d);
   2.192  
   2.193      unmap_domain_page(pl1e);
   2.194 -    return 0;
   2.195 +    return -EINVAL;
   2.196  }
   2.197  
   2.198  static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
   2.199 @@ -1128,47 +1141,53 @@ static void pae_flush_pgd(
   2.200  # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
   2.201  #endif
   2.202  
   2.203 -static int alloc_l2_table(struct page_info *page, unsigned long type)
   2.204 +static int alloc_l2_table(struct page_info *page, unsigned long type,
   2.205 +                          int preemptible)
   2.206  {
   2.207      struct domain *d = page_get_owner(page);
   2.208      unsigned long  pfn = page_to_mfn(page);
   2.209      l2_pgentry_t  *pl2e;
   2.210 -    int            i;
   2.211 +    unsigned int   i;
   2.212 +    int            rc = 0;
   2.213  
   2.214      pl2e = map_domain_page(pfn);
   2.215  
   2.216 -    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
   2.217 +    for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
   2.218      {
   2.219 -        if ( !is_guest_l2_slot(d, type, i) )
   2.220 +        if ( preemptible && i && hypercall_preempt_check() )
   2.221 +        {
   2.222 +            page->nr_validated_ptes = i;
   2.223 +            rc = -EAGAIN;
   2.224 +            break;
   2.225 +        }
   2.226 +
   2.227 +        if ( !is_guest_l2_slot(d, type, i) ||
   2.228 +             (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
   2.229              continue;
   2.230  
   2.231 -        if ( unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) )
   2.232 -            goto fail;
   2.233 -        
   2.234 +        if ( rc < 0 )
   2.235 +        {
   2.236 +            MEM_LOG("Failure in alloc_l2_table: entry %d", i);
   2.237 +            while ( i-- > 0 )
   2.238 +                if ( is_guest_l2_slot(d, type, i) )
   2.239 +                    put_page_from_l2e(pl2e[i], pfn);
   2.240 +            break;
   2.241 +        }
   2.242 +
   2.243          adjust_guest_l2e(pl2e[i], d);
   2.244      }
   2.245  
   2.246      unmap_domain_page(pl2e);
   2.247 -    return 1;
   2.248 -
   2.249 - fail:
   2.250 -    MEM_LOG("Failure in alloc_l2_table: entry %d", i);
   2.251 -    while ( i-- > 0 )
   2.252 -        if ( is_guest_l2_slot(d, type, i) )
   2.253 -            put_page_from_l2e(pl2e[i], pfn);
   2.254 -
   2.255 -    unmap_domain_page(pl2e);
   2.256 -    return 0;
   2.257 +    return rc > 0 ? 0 : rc;
   2.258  }
   2.259  
   2.260 -
   2.261 -#if CONFIG_PAGING_LEVELS >= 3
   2.262 -static int alloc_l3_table(struct page_info *page)
   2.263 +static int alloc_l3_table(struct page_info *page, int preemptible)
   2.264  {
   2.265      struct domain *d = page_get_owner(page);
   2.266      unsigned long  pfn = page_to_mfn(page);
   2.267      l3_pgentry_t  *pl3e;
   2.268 -    int            i;
   2.269 +    unsigned int   i;
   2.270 +    int            rc = 0;
   2.271  
   2.272  #if CONFIG_PAGING_LEVELS == 3
   2.273      /*
   2.274 @@ -1181,7 +1200,7 @@ static int alloc_l3_table(struct page_in
   2.275           d->vcpu[0] && d->vcpu[0]->is_initialised )
   2.276      {
   2.277          MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
   2.278 -        return 0;
   2.279 +        return -EINVAL;
   2.280      }
   2.281  #endif
   2.282  
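
alloc_l2_table() now starts from page->nr_validated_ptes instead of 0, so a suspended validation resumes where it left off; the "i &&" in the preempt check ensures each invocation validates at least one entry and therefore makes forward progress. A user-space sketch of that resumable cursor, with a counter standing in for hypercall_preempt_check():

    /* Sketch of a resumable validation loop with a saved cursor. */
    #include <errno.h>
    #include <stdio.h>

    #define NR_ENTRIES 8
    static unsigned int nr_validated;        /* persists across re-entries */

    static int validate_all(void)
    {
        unsigned int i, done_this_call = 0;
        for ( i = nr_validated; i < NR_ENTRIES; i++ )
        {
            /* Stand-in for "preemptible && i && hypercall_preempt_check()". */
            if ( i && done_this_call == 3 )
            {
                nr_validated = i;            /* save the cursor */
                return -EAGAIN;
            }
            printf("validating entry %u\n", i);
            ++done_this_call;
        }
        nr_validated = NR_ENTRIES;
        return 0;
    }

    int main(void)
    {
        int rc;
        while ( (rc = validate_all()) == -EAGAIN )
            printf("suspended at %u, re-entering\n", nr_validated);
        return rc;
    }
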
   2.283 @@ -1197,64 +1216,96 @@ static int alloc_l3_table(struct page_in
   2.284      if ( is_pv_32on64_domain(d) )
   2.285          memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
   2.286  
   2.287 -    for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
   2.288 +    for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; i++ )
   2.289      {
   2.290          if ( is_pv_32bit_domain(d) && (i == 3) )
   2.291          {
   2.292              if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
   2.293 -                 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ||
   2.294 -                 !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
   2.295 -                                                PGT_l2_page_table |
   2.296 -                                                PGT_pae_xen_l2,
   2.297 -                                                d) )
   2.298 -                goto fail;
   2.299 +                 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
   2.300 +                rc = -EINVAL;
   2.301 +            else
   2.302 +                rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
   2.303 +                                                   PGT_l2_page_table |
   2.304 +                                                   PGT_pae_xen_l2,
   2.305 +                                                   d, preemptible);
   2.306          }
   2.307 -        else if ( !is_guest_l3_slot(i) )
   2.308 +        else if ( !is_guest_l3_slot(i) ||
   2.309 +                  (rc = get_page_from_l3e(pl3e[i], pfn, d, preemptible)) > 0 )
   2.310              continue;
   2.311 -        else if ( unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
   2.312 -            goto fail;
   2.313 +
   2.314 +        if ( rc == -EAGAIN )
   2.315 +        {
   2.316 +            page->nr_validated_ptes = i;
   2.317 +            page->partial_pte = 1;
   2.318 +        }
   2.319 +        else if ( rc == -EINTR && i )
   2.320 +        {
   2.321 +            page->nr_validated_ptes = i;
   2.322 +            page->partial_pte = 0;
   2.323 +            rc = -EAGAIN;
   2.324 +        }
   2.325 +        if ( rc < 0 )
   2.326 +            break;
   2.327  
   2.328          adjust_guest_l3e(pl3e[i], d);
   2.329      }
   2.330  
   2.331 -    if ( !create_pae_xen_mappings(d, pl3e) )
   2.332 -        goto fail;
   2.333 -
   2.334 -    unmap_domain_page(pl3e);
   2.335 -    return 1;
   2.336 -
   2.337 - fail:
   2.338 -    MEM_LOG("Failure in alloc_l3_table: entry %d", i);
   2.339 -    while ( i-- > 0 )
   2.340 +    if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
   2.341 +        rc = -EINVAL;
   2.342 +    if ( rc < 0 && rc != -EAGAIN && rc != -EINTR )
   2.343      {
   2.344 -        if ( !is_guest_l3_slot(i) )
   2.345 -            continue;
   2.346 -        unadjust_guest_l3e(pl3e[i], d);
   2.347 -        put_page_from_l3e(pl3e[i], pfn);
   2.348 +        MEM_LOG("Failure in alloc_l3_table: entry %d", i);
   2.349 +        while ( i-- > 0 )
   2.350 +        {
   2.351 +            if ( !is_guest_l3_slot(i) )
   2.352 +                continue;
   2.353 +            unadjust_guest_l3e(pl3e[i], d);
   2.354 +            put_page_from_l3e(pl3e[i], pfn, 0);
   2.355 +        }
   2.356      }
   2.357  
   2.358      unmap_domain_page(pl3e);
   2.359 -    return 0;
   2.360 +    return rc > 0 ? 0 : rc;
   2.361  }
   2.362 -#else
   2.363 -#define alloc_l3_table(page) (0)
   2.364 -#endif
   2.365  
   2.366  #if CONFIG_PAGING_LEVELS >= 4
   2.367 -static int alloc_l4_table(struct page_info *page)
   2.368 +static int alloc_l4_table(struct page_info *page, int preemptible)
   2.369  {
   2.370      struct domain *d = page_get_owner(page);
   2.371      unsigned long  pfn = page_to_mfn(page);
   2.372      l4_pgentry_t  *pl4e = page_to_virt(page);
   2.373 -    int            i;
   2.374 -
   2.375 -    for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
   2.376 +    unsigned int   i;
   2.377 +    int            rc = 0;
   2.378 +
   2.379 +    for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; i++ )
   2.380      {
   2.381 -        if ( !is_guest_l4_slot(d, i) )
   2.382 +        if ( !is_guest_l4_slot(d, i) ||
   2.383 +             (rc = get_page_from_l4e(pl4e[i], pfn, d, preemptible)) > 0 )
   2.384              continue;
   2.385  
   2.386 -        if ( unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
   2.387 -            goto fail;
   2.388 +        if ( rc == -EAGAIN )
   2.389 +        {
   2.390 +            page->nr_validated_ptes = i;
   2.391 +            page->partial_pte = 1;
   2.392 +        }
   2.393 +        else if ( rc == -EINTR )
   2.394 +        {
   2.395 +            if ( i )
   2.396 +            {
   2.397 +                page->nr_validated_ptes = i;
   2.398 +                page->partial_pte = 0;
   2.399 +                rc = -EAGAIN;
   2.400 +            }
   2.401 +        }
   2.402 +        else if ( rc < 0 )
   2.403 +        {
   2.404 +            MEM_LOG("Failure in alloc_l4_table: entry %d", i);
   2.405 +            while ( i-- > 0 )
   2.406 +                if ( is_guest_l4_slot(d, i) )
   2.407 +                    put_page_from_l4e(pl4e[i], pfn, 0);
   2.408 +        }
   2.409 +        if ( rc < 0 )
   2.410 +            return rc;
   2.411  
   2.412          adjust_guest_l4e(pl4e[i], d);
   2.413      }
   2.414 @@ -1269,18 +1320,10 @@ static int alloc_l4_table(struct page_in
   2.415          l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
   2.416                        __PAGE_HYPERVISOR);
   2.417  
   2.418 -    return 1;
   2.419 -
   2.420 - fail:
   2.421 -    MEM_LOG("Failure in alloc_l4_table: entry %d", i);
   2.422 -    while ( i-- > 0 )
   2.423 -        if ( is_guest_l4_slot(d, i) )
   2.424 -            put_page_from_l4e(pl4e[i], pfn);
   2.425 -
   2.426 -    return 0;
   2.427 +    return rc > 0 ? 0 : rc;
   2.428  }
   2.429  #else
   2.430 -#define alloc_l4_table(page) (0)
   2.431 +#define alloc_l4_table(page, preemptible) (-EINVAL)
   2.432  #endif
   2.433  
   2.434  
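
For L3 and L4 tables the suspension bookkeeping distinguishes two cases: -EAGAIN propagating up from a lower level means entry i itself is part-way validated (partial_pte = 1), while -EINTR means entry i was not touched at all, so the cursor stops before it (partial_pte = 0) and, provided some progress was made, the error is promoted to -EAGAIN so the hypercall continues rather than restarts. A compact sketch of that mapping (hypothetical struct):

    /* Sketch of the -EAGAIN/-EINTR bookkeeping in alloc_l3/l4_table(). */
    #include <errno.h>
    #include <stdio.h>

    struct progress { unsigned int nr_validated_ptes; int partial_pte; };

    static int record_suspend(struct progress *p, int rc, unsigned int i)
    {
        if ( rc == -EAGAIN )
        {
            p->nr_validated_ptes = i;   /* entry i is partially done */
            p->partial_pte = 1;
        }
        else if ( rc == -EINTR && i )
        {
            p->nr_validated_ptes = i;   /* entry i not started yet */
            p->partial_pte = 0;
            rc = -EAGAIN;               /* progress made: continue later */
        }
        return rc;  /* -EINTR with i == 0 passes through unchanged */
    }

    int main(void)
    {
        struct progress p = { 0, 0 };
        printf("%d\n", record_suspend(&p, -EINTR, 5));          /* -EAGAIN */
        printf("%u %d\n", p.nr_validated_ptes, p.partial_pte);  /* 5 0 */
        return 0;
    }
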
   2.435 @@ -1289,7 +1332,7 @@ static void free_l1_table(struct page_in
   2.436      struct domain *d = page_get_owner(page);
   2.437      unsigned long pfn = page_to_mfn(page);
   2.438      l1_pgentry_t *pl1e;
   2.439 -    int i;
   2.440 +    unsigned int  i;
   2.441  
   2.442      pl1e = map_domain_page(pfn);
   2.443  
   2.444 @@ -1301,74 +1344,114 @@ static void free_l1_table(struct page_in
   2.445  }
   2.446  
   2.447  
   2.448 -static void free_l2_table(struct page_info *page)
   2.449 +static int free_l2_table(struct page_info *page, int preemptible)
   2.450  {
   2.451  #ifdef CONFIG_COMPAT
   2.452      struct domain *d = page_get_owner(page);
   2.453  #endif
   2.454      unsigned long pfn = page_to_mfn(page);
   2.455      l2_pgentry_t *pl2e;
   2.456 -    int i;
   2.457 +    unsigned int  i = page->nr_validated_ptes - 1;
   2.458 +    int err = 0;
   2.459  
   2.460      pl2e = map_domain_page(pfn);
   2.461  
   2.462 -    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
   2.463 -        if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
   2.464 -            put_page_from_l2e(pl2e[i], pfn);
   2.465 +    ASSERT(page->nr_validated_ptes);
   2.466 +    do {
   2.467 +        if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
   2.468 +             put_page_from_l2e(pl2e[i], pfn) == 0 &&
   2.469 +             preemptible && i && hypercall_preempt_check() )
   2.470 +        {
   2.471 +           page->nr_validated_ptes = i;
   2.472 +           err = -EAGAIN;
   2.473 +        }
   2.474 +    } while ( !err && i-- );
   2.475  
   2.476      unmap_domain_page(pl2e);
   2.477  
   2.478 -    page->u.inuse.type_info &= ~PGT_pae_xen_l2;
   2.479 +    if ( !err )
   2.480 +        page->u.inuse.type_info &= ~PGT_pae_xen_l2;
   2.481 +
   2.482 +    return err;
   2.483  }
   2.484  
   2.485 -
   2.486 -#if CONFIG_PAGING_LEVELS >= 3
   2.487 -
   2.488 -static void free_l3_table(struct page_info *page)
   2.489 +static int free_l3_table(struct page_info *page, int preemptible)
   2.490  {
   2.491      struct domain *d = page_get_owner(page);
   2.492      unsigned long pfn = page_to_mfn(page);
   2.493      l3_pgentry_t *pl3e;
   2.494 -    int           i;
   2.495 +    unsigned int  i = page->nr_validated_ptes - !page->partial_pte;
   2.496 +    int rc = 0;
   2.497  
   2.498  #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
   2.499      if ( d->arch.relmem == RELMEM_l3 )
   2.500 -        return;
   2.501 +        return 0;
   2.502  #endif
   2.503  
   2.504      pl3e = map_domain_page(pfn);
   2.505  
   2.506 -    for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
   2.507 +    do {
   2.508          if ( is_guest_l3_slot(i) )
   2.509          {
   2.510 -            put_page_from_l3e(pl3e[i], pfn);
   2.511 +            rc = put_page_from_l3e(pl3e[i], pfn, preemptible);
   2.512 +            if ( rc > 0 )
   2.513 +                continue;
   2.514 +            if ( rc )
   2.515 +                break;
   2.516              unadjust_guest_l3e(pl3e[i], d);
   2.517          }
   2.518 +    } while ( i-- );
   2.519  
   2.520      unmap_domain_page(pl3e);
   2.521 +
   2.522 +    if ( rc == -EAGAIN )
   2.523 +    {
   2.524 +        page->nr_validated_ptes = i;
   2.525 +        page->partial_pte = 1;
   2.526 +    }
   2.527 +    else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
   2.528 +    {
   2.529 +        page->nr_validated_ptes = i + 1;
   2.530 +        page->partial_pte = 0;
   2.531 +        rc = -EAGAIN;
   2.532 +    }
   2.533 +    return rc > 0 ? 0 : rc;
   2.534  }
   2.535  
   2.536 -#endif
   2.537 -
   2.538  #if CONFIG_PAGING_LEVELS >= 4
   2.539 -
   2.540 -static void free_l4_table(struct page_info *page)
   2.541 +static int free_l4_table(struct page_info *page, int preemptible)
   2.542  {
   2.543      struct domain *d = page_get_owner(page);
   2.544      unsigned long pfn = page_to_mfn(page);
   2.545      l4_pgentry_t *pl4e = page_to_virt(page);
   2.546 -    int           i;
   2.547 +    unsigned int  i = page->nr_validated_ptes - !page->partial_pte;
   2.548 +    int rc = 0;
   2.549  
   2.550  #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
   2.551      if ( d->arch.relmem == RELMEM_l4 )
   2.552 -        return;
   2.553 +        return 0;
   2.554  #endif
   2.555  
   2.556 -    for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
   2.557 +    do {
   2.558          if ( is_guest_l4_slot(d, i) )
   2.559 -            put_page_from_l4e(pl4e[i], pfn);
   2.560 +            rc = put_page_from_l4e(pl4e[i], pfn, preemptible);
   2.561 +    } while ( rc >= 0 && i-- );
   2.562 +
   2.563 +    if ( rc == -EAGAIN )
   2.564 +    {
   2.565 +        page->nr_validated_ptes = i;
   2.566 +        page->partial_pte = 1;
   2.567 +    }
   2.568 +    else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
   2.569 +    {
   2.570 +        page->nr_validated_ptes = i + 1;
   2.571 +        page->partial_pte = 0;
   2.572 +        rc = -EAGAIN;
   2.573 +    }
   2.574 +    return rc > 0 ? 0 : rc;
   2.575  }
   2.576 -
   2.577 +#else
   2.578 +#define free_l4_table(page, preemptible) (-EINVAL)
   2.579  #endif
   2.580  
   2.581  static void page_lock(struct page_info *page)
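
Teardown walks each table downward from the validation cursor, so a partially validated table, or a teardown that was itself suspended, is unwound exactly as far as needed and can be resumed later. A sketch of the descending resumable loop (stubs again):

    /* Sketch of the descending, resumable teardown loop. */
    #include <errno.h>
    #include <stdio.h>

    static unsigned int nr_validated = 8;    /* entries [0,8) hold refs */

    static int free_all(void)
    {
        unsigned int i = nr_validated - 1, done = 0;
        do {
            printf("dropping entry %u\n", i);
            if ( ++done == 3 && i )          /* stand-in preempt check */
            {
                nr_validated = i;            /* entries [0,i) still held */
                return -EAGAIN;
            }
        } while ( i-- );
        nr_validated = 0;
        return 0;
    }

    int main(void)
    {
        int rc;
        while ( (rc = free_all()) == -EAGAIN )
            printf("suspended, %u entries left\n", nr_validated);
        return rc;
    }
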
   2.582 @@ -1560,7 +1643,7 @@ static int mod_l2_entry(l2_pgentry_t *pl
   2.583              return rc;
   2.584          }
   2.585  
   2.586 -        if ( unlikely(!get_page_from_l2e(nl2e, pfn, d)) )
   2.587 +        if ( unlikely(get_page_from_l2e(nl2e, pfn, d) < 0) )
   2.588              return page_unlock(l2pg), 0;
   2.589  
   2.590          adjust_guest_l2e(nl2e, d);
   2.591 @@ -1583,24 +1666,23 @@ static int mod_l2_entry(l2_pgentry_t *pl
   2.592      return rc;
   2.593  }
   2.594  
   2.595 -#if CONFIG_PAGING_LEVELS >= 3
   2.596 -
   2.597  /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
   2.598  static int mod_l3_entry(l3_pgentry_t *pl3e, 
   2.599                          l3_pgentry_t nl3e, 
   2.600                          unsigned long pfn,
   2.601 -                        int preserve_ad)
   2.602 +                        int preserve_ad,
   2.603 +                        int preemptible)
   2.604  {
   2.605      l3_pgentry_t ol3e;
   2.606      struct vcpu *curr = current;
   2.607      struct domain *d = curr->domain;
   2.608      struct page_info *l3pg = mfn_to_page(pfn);
   2.609 -    int rc = 1;
   2.610 +    int rc = 0;
   2.611  
   2.612      if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
   2.613      {
   2.614          MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
   2.615 -        return 0;
   2.616 +        return -EINVAL;
   2.617      }
   2.618  
   2.619      /*
   2.620 @@ -1608,12 +1690,12 @@ static int mod_l3_entry(l3_pgentry_t *pl
   2.621       * would be a pain to ensure they remain continuously valid throughout.
   2.622       */
   2.623      if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
   2.624 -        return 0;
   2.625 +        return -EINVAL;
   2.626  
   2.627      page_lock(l3pg);
   2.628  
   2.629      if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
   2.630 -        return page_unlock(l3pg), 0;
   2.631 +        return page_unlock(l3pg), -EFAULT;
   2.632  
   2.633      if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
   2.634      {
   2.635 @@ -1622,7 +1704,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
   2.636              page_unlock(l3pg);
   2.637              MEM_LOG("Bad L3 flags %x",
   2.638                      l3e_get_flags(nl3e) & l3_disallow_mask(d));
   2.639 -            return 0;
   2.640 +            return -EINVAL;
   2.641          }
   2.642  
   2.643          /* Fast path for identical mapping and presence. */
   2.644 @@ -1631,28 +1713,30 @@ static int mod_l3_entry(l3_pgentry_t *pl
   2.645              adjust_guest_l3e(nl3e, d);
   2.646              rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, preserve_ad);
   2.647              page_unlock(l3pg);
   2.648 -            return rc;
   2.649 +            return rc ? 0 : -EFAULT;
   2.650          }
   2.651  
   2.652 -        if ( unlikely(!get_page_from_l3e(nl3e, pfn, d)) )
   2.653 -            return page_unlock(l3pg), 0;
   2.654 +        rc = get_page_from_l3e(nl3e, pfn, d, preemptible);
   2.655 +        if ( unlikely(rc < 0) )
   2.656 +            return page_unlock(l3pg), rc;
   2.657 +        rc = 0;
   2.658  
   2.659          adjust_guest_l3e(nl3e, d);
   2.660          if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
   2.661                                      preserve_ad)) )
   2.662          {
   2.663              ol3e = nl3e;
   2.664 -            rc = 0;
   2.665 +            rc = -EFAULT;
   2.666          }
   2.667      }
   2.668      else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
   2.669                                       preserve_ad)) )
   2.670      {
   2.671          page_unlock(l3pg);
   2.672 -        return 0;
   2.673 +        return -EFAULT;
   2.674      }
   2.675  
   2.676 -    if ( likely(rc) )
   2.677 +    if ( likely(rc == 0) )
   2.678      {
   2.679          if ( !create_pae_xen_mappings(d, pl3e) )
   2.680              BUG();
   2.681 @@ -1661,36 +1745,35 @@ static int mod_l3_entry(l3_pgentry_t *pl
   2.682      }
   2.683  
   2.684      page_unlock(l3pg);
   2.685 -    put_page_from_l3e(ol3e, pfn);
   2.686 +    put_page_from_l3e(ol3e, pfn, 0);
   2.687      return rc;
   2.688  }
   2.689  
   2.690 -#endif
   2.691 -
   2.692  #if CONFIG_PAGING_LEVELS >= 4
   2.693  
   2.694  /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
   2.695  static int mod_l4_entry(l4_pgentry_t *pl4e, 
   2.696                          l4_pgentry_t nl4e, 
   2.697                          unsigned long pfn,
   2.698 -                        int preserve_ad)
   2.699 +                        int preserve_ad,
   2.700 +                        int preemptible)
   2.701  {
   2.702      struct vcpu *curr = current;
   2.703      struct domain *d = curr->domain;
   2.704      l4_pgentry_t ol4e;
   2.705      struct page_info *l4pg = mfn_to_page(pfn);
   2.706 -    int rc = 1;
   2.707 +    int rc = 0;
   2.708  
   2.709      if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
   2.710      {
   2.711          MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
   2.712 -        return 0;
   2.713 +        return -EINVAL;
   2.714      }
   2.715  
   2.716      page_lock(l4pg);
   2.717  
   2.718      if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
   2.719 -        return page_unlock(l4pg), 0;
   2.720 +        return page_unlock(l4pg), -EFAULT;
   2.721  
   2.722      if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
   2.723      {
   2.724 @@ -1699,7 +1782,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
   2.725              page_unlock(l4pg);
   2.726              MEM_LOG("Bad L4 flags %x",
   2.727                      l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
   2.728 -            return 0;
   2.729 +            return -EINVAL;
   2.730          }
   2.731  
   2.732          /* Fast path for identical mapping and presence. */
   2.733 @@ -1708,29 +1791,31 @@ static int mod_l4_entry(l4_pgentry_t *pl
   2.734              adjust_guest_l4e(nl4e, d);
   2.735              rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, preserve_ad);
   2.736              page_unlock(l4pg);
   2.737 -            return rc;
   2.738 +            return rc ? 0 : -EFAULT;
   2.739          }
   2.740  
   2.741 -        if ( unlikely(!get_page_from_l4e(nl4e, pfn, d)) )
   2.742 -            return page_unlock(l4pg), 0;
   2.743 +        rc = get_page_from_l4e(nl4e, pfn, d, preemptible);
   2.744 +        if ( unlikely(rc < 0) )
   2.745 +            return page_unlock(l4pg), rc;
   2.746 +        rc = 0;
   2.747  
   2.748          adjust_guest_l4e(nl4e, d);
   2.749          if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
   2.750                                      preserve_ad)) )
   2.751          {
   2.752              ol4e = nl4e;
   2.753 -            rc = 0;
   2.754 +            rc = -EFAULT;
   2.755          }
   2.756      }
   2.757      else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
   2.758                                       preserve_ad)) )
   2.759      {
   2.760          page_unlock(l4pg);
   2.761 -        return 0;
   2.762 +        return -EFAULT;
   2.763      }
   2.764  
   2.765      page_unlock(l4pg);
   2.766 -    put_page_from_l4e(ol4e, pfn);
   2.767 +    put_page_from_l4e(ol4e, pfn, 0);
   2.768      return rc;
   2.769  }
   2.770  
   2.771 @@ -1788,9 +1873,11 @@ int get_page(struct page_info *page, str
   2.772  }
   2.773  
   2.774  
   2.775 -static int alloc_page_type(struct page_info *page, unsigned long type)
   2.776 +static int alloc_page_type(struct page_info *page, unsigned long type,
   2.777 +                           int preemptible)
   2.778  {
   2.779      struct domain *owner = page_get_owner(page);
   2.780 +    int rc;
   2.781  
   2.782      /* A page table is dirtied when its type count becomes non-zero. */
   2.783      if ( likely(owner != NULL) )
   2.784 @@ -1799,30 +1886,65 @@ static int alloc_page_type(struct page_i
   2.785      switch ( type & PGT_type_mask )
   2.786      {
   2.787      case PGT_l1_page_table:
   2.788 -        return alloc_l1_table(page);
    2.789 +        rc = alloc_l1_table(page);
    2.790 +        break;
   2.792      case PGT_l2_page_table:
   2.793 -        return alloc_l2_table(page, type);
   2.794 +        rc = alloc_l2_table(page, type, preemptible);
   2.795 +        break;
   2.796      case PGT_l3_page_table:
   2.797 -        return alloc_l3_table(page);
   2.798 +        rc = alloc_l3_table(page, preemptible);
   2.799 +        break;
   2.800      case PGT_l4_page_table:
   2.801 -        return alloc_l4_table(page);
   2.802 +        rc = alloc_l4_table(page, preemptible);
   2.803 +        break;
   2.804      case PGT_seg_desc_page:
   2.805 -        return alloc_segdesc_page(page);
   2.806 +        rc = alloc_segdesc_page(page);
   2.807 +        break;
   2.808      default:
   2.809          printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n", 
   2.810                 type, page->u.inuse.type_info,
   2.811                 page->count_info);
   2.812 +        rc = -EINVAL;
   2.813          BUG();
   2.814      }
   2.815  
   2.816 -    return 0;
   2.817 +    /* No need for atomic update of type_info here: noone else updates it. */
   2.818 +    wmb();
   2.819 +    if ( rc == -EAGAIN )
   2.820 +    {
   2.821 +        page->u.inuse.type_info |= PGT_partial;
   2.822 +    }
   2.823 +    else if ( rc == -EINTR )
   2.824 +    {
   2.825 +        ASSERT((page->u.inuse.type_info &
   2.826 +                (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
   2.827 +        page->u.inuse.type_info &= ~PGT_count_mask;
   2.828 +    }
   2.829 +    else if ( rc )
   2.830 +    {
   2.831 +        ASSERT(rc < 0);
   2.832 +        MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
   2.833 +                PRtype_info ": caf=%08x taf=%" PRtype_info,
   2.834 +                page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
   2.835 +                type, page->count_info, page->u.inuse.type_info);
   2.836 +        page->u.inuse.type_info = 0;
   2.837 +    }
   2.838 +    else
   2.839 +    {
   2.840 +        page->u.inuse.type_info |= PGT_validated;
   2.841 +    }
   2.842 +
   2.843 +    return rc;
   2.844  }
   2.845  
   2.846  
   2.847 -void free_page_type(struct page_info *page, unsigned long type)
   2.848 +int free_page_type(struct page_info *page, unsigned long type,
   2.849 +                   int preemptible)
   2.850  {
   2.851      struct domain *owner = page_get_owner(page);
   2.852      unsigned long gmfn;
   2.853 +    int rc;
   2.854  
   2.855      if ( likely(owner != NULL) )
   2.856      {
   2.857 @@ -1842,7 +1964,7 @@ void free_page_type(struct page_info *pa
   2.858              paging_mark_dirty(owner, page_to_mfn(page));
   2.859  
   2.860              if ( shadow_mode_refcounts(owner) )
   2.861 -                return;
   2.862 +                return 0;
   2.863  
   2.864              gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
   2.865              ASSERT(VALID_M2P(gmfn));
   2.866 @@ -1850,42 +1972,80 @@ void free_page_type(struct page_info *pa
   2.867          }
   2.868      }
   2.869  
   2.870 +    if ( !(type & PGT_partial) )
   2.871 +    {
   2.872 +        page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
   2.873 +        page->partial_pte = 0;
   2.874 +    }
   2.875      switch ( type & PGT_type_mask )
   2.876      {
   2.877      case PGT_l1_page_table:
   2.878          free_l1_table(page);
   2.879 -        break;
   2.880 -
   2.881 -    case PGT_l2_page_table:
   2.882 -        free_l2_table(page);
   2.883 -        break;
   2.884 -
   2.885 -#if CONFIG_PAGING_LEVELS >= 3
   2.886 -    case PGT_l3_page_table:
   2.887 -        free_l3_table(page);
   2.888 +        rc = 0;
   2.889          break;
   2.890 +    case PGT_l2_page_table:
   2.891 +        rc = free_l2_table(page, preemptible);
   2.892 +        break;
   2.893 +    case PGT_l3_page_table:
   2.894 +#if CONFIG_PAGING_LEVELS == 3
   2.895 +        if ( !(type & PGT_partial) )
   2.896 +            page->nr_validated_ptes = L3_PAGETABLE_ENTRIES;
   2.897  #endif
   2.898 -
   2.899 -#if CONFIG_PAGING_LEVELS >= 4
   2.900 +        rc = free_l3_table(page, preemptible);
   2.901 +        break;
   2.902      case PGT_l4_page_table:
   2.903 -        free_l4_table(page);
   2.904 +        rc = free_l4_table(page, preemptible);
   2.905          break;
   2.906 -#endif
   2.907 -
   2.908      default:
   2.909 -        printk("%s: type %lx pfn %lx\n",__FUNCTION__,
   2.910 -               type, page_to_mfn(page));
   2.911 +        MEM_LOG("type %lx pfn %lx\n", type, page_to_mfn(page));
   2.912 +        rc = -EINVAL;
   2.913          BUG();
   2.914      }
   2.915 +
   2.916 +    /* No need for atomic update of type_info here: noone else updates it. */
   2.917 +    if ( rc == 0 )
   2.918 +    {
   2.919 +        /*
   2.920 +         * Record TLB information for flush later. We do not stamp page tables
   2.921 +         * when running in shadow mode:
   2.922 +         *  1. Pointless, since it's the shadow pt's which must be tracked.
   2.923 +         *  2. Shadow mode reuses this field for shadowed page tables to
   2.924 +         *     store flags info -- we don't want to conflict with that.
   2.925 +         */
   2.926 +        if ( !(shadow_mode_enabled(page_get_owner(page)) &&
   2.927 +               (page->count_info & PGC_page_table)) )
   2.928 +            page->tlbflush_timestamp = tlbflush_current_time();
   2.929 +        wmb();
   2.930 +        page->u.inuse.type_info--;
   2.931 +    }
   2.932 +    else if ( rc == -EINTR )
   2.933 +    {
   2.934 +        ASSERT(!(page->u.inuse.type_info &
   2.935 +                 (PGT_count_mask|PGT_validated|PGT_partial)));
   2.936 +        if ( !(shadow_mode_enabled(page_get_owner(page)) &&
   2.937 +               (page->count_info & PGC_page_table)) )
   2.938 +            page->tlbflush_timestamp = tlbflush_current_time();
   2.939 +        wmb();
   2.940 +        page->u.inuse.type_info |= PGT_validated;
   2.941 +    }
   2.942 +    else
   2.943 +    {
   2.944 +        BUG_ON(rc != -EAGAIN);
   2.945 +        wmb();
   2.946 +        page->u.inuse.type_info |= PGT_partial;
   2.947 +    }
   2.948 +
   2.949 +    return rc;
   2.950  }
   2.951  
   2.952  
   2.953 -void put_page_type(struct page_info *page)
   2.954 +static int __put_page_type(struct page_info *page,
   2.955 +                           int preemptible)
   2.956  {
   2.957      unsigned long nx, x, y = page->u.inuse.type_info;
   2.958  
   2.959 - again:
   2.960 -    do {
   2.961 +    for ( ; ; )
   2.962 +    {
   2.963          x  = y;
   2.964          nx = x - 1;
   2.965  
   2.966 @@ -1894,21 +2054,19 @@ void put_page_type(struct page_info *pag
   2.967          if ( unlikely((nx & PGT_count_mask) == 0) )
   2.968          {
   2.969              if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
   2.970 -                 likely(nx & PGT_validated) )
   2.971 +                 likely(nx & (PGT_validated|PGT_partial)) )
   2.972              {
   2.973                  /*
   2.974                   * Page-table pages must be unvalidated when count is zero. The
   2.975                   * 'free' is safe because the refcnt is non-zero and validated
   2.976                   * bit is clear => other ops will spin or fail.
   2.977                   */
   2.978 -                if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 
   2.979 -                                           x & ~PGT_validated)) != x) )
   2.980 -                    goto again;
   2.981 +                nx = x & ~(PGT_validated|PGT_partial);
   2.982 +                if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
   2.983 +                                           x, nx)) != x) )
   2.984 +                    continue;
   2.985                  /* We cleared the 'valid bit' so we do the clean up. */
   2.986 -                free_page_type(page, x);
   2.987 -                /* Carry on, but with the 'valid bit' now clear. */
   2.988 -                x  &= ~PGT_validated;
   2.989 -                nx &= ~PGT_validated;
   2.990 +                return free_page_type(page, x, preemptible);
   2.991              }
   2.992  
   2.993              /*
   2.994 @@ -1922,25 +2080,33 @@ void put_page_type(struct page_info *pag
   2.995                     (page->count_info & PGC_page_table)) )
   2.996                  page->tlbflush_timestamp = tlbflush_current_time();
   2.997          }
   2.998 +
   2.999 +        if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
  2.1000 +            break;
  2.1001 +
  2.1002 +        if ( preemptible && hypercall_preempt_check() )
  2.1003 +            return -EINTR;
  2.1004      }
  2.1005 -    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
  2.1006 +
  2.1007 +    return 0;
  2.1008  }
  2.1009  
  2.1010  
  2.1011 -int get_page_type(struct page_info *page, unsigned long type)
  2.1012 +static int __get_page_type(struct page_info *page, unsigned long type,
  2.1013 +                           int preemptible)
  2.1014  {
  2.1015      unsigned long nx, x, y = page->u.inuse.type_info;
  2.1016  
  2.1017      ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
  2.1018  
  2.1019 - again:
  2.1020 -    do {
  2.1021 +    for ( ; ; )
  2.1022 +    {
  2.1023          x  = y;
  2.1024          nx = x + 1;
  2.1025          if ( unlikely((nx & PGT_count_mask) == 0) )
  2.1026          {
  2.1027              MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
  2.1028 -            return 0;
  2.1029 +            return -EINVAL;
  2.1030          }
  2.1031          else if ( unlikely((x & PGT_count_mask) == 0) )
  2.1032          {
  2.1033 @@ -1993,28 +2159,43 @@ int get_page_type(struct page_info *page
  2.1034              /* Don't log failure if it could be a recursive-mapping attempt. */
  2.1035              if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
  2.1036                   (type == PGT_l1_page_table) )
  2.1037 -                return 0;
  2.1038 +                return -EINVAL;
  2.1039              if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
  2.1040                   (type == PGT_l2_page_table) )
  2.1041 -                return 0;
  2.1042 +                return -EINVAL;
  2.1043              if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
  2.1044                   (type == PGT_l3_page_table) )
  2.1045 -                return 0;
  2.1046 +                return -EINVAL;
  2.1047              MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
  2.1048                      "for mfn %lx (pfn %lx)",
  2.1049                      x, type, page_to_mfn(page),
  2.1050                      get_gpfn_from_mfn(page_to_mfn(page)));
  2.1051 -            return 0;
  2.1052 +            return -EINVAL;
  2.1053          }
  2.1054          else if ( unlikely(!(x & PGT_validated)) )
  2.1055          {
  2.1056 -            /* Someone else is updating validation of this page. Wait... */
  2.1057 -            while ( (y = page->u.inuse.type_info) == x )
  2.1058 -                cpu_relax();
  2.1059 -            goto again;
  2.1060 +            if ( !(x & PGT_partial) )
  2.1061 +            {
  2.1062 +                /* Someone else is updating validation of this page. Wait... */
  2.1063 +                while ( (y = page->u.inuse.type_info) == x )
  2.1064 +                {
  2.1065 +                    if ( preemptible && hypercall_preempt_check() )
  2.1066 +                        return -EINTR;
  2.1067 +                    cpu_relax();
  2.1068 +                }
  2.1069 +                continue;
  2.1070 +            }
  2.1071 +            /* Type ref count was left at 1 when PGT_partial got set. */
  2.1072 +            ASSERT((x & PGT_count_mask) == 1);
  2.1073 +            nx = x & ~PGT_partial;
  2.1074          }
  2.1075 +
  2.1076 +        if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
  2.1077 +            break;
  2.1078 +
  2.1079 +        if ( preemptible && hypercall_preempt_check() )
  2.1080 +            return -EINTR;
  2.1081      }
  2.1082 -    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
  2.1083  
  2.1084      if ( unlikely((x & PGT_type_mask) != type) )
  2.1085      {
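
Two contended paths change here: waiting for a concurrent validator can now bail out with -EINTR instead of spinning unboundedly, and a page left with PGT_partial set (its type count parked at 1) is taken over directly by clearing the flag. A sketch of a preemptible spin-wait, using C11 atomics as a stand-in for Xen's primitives:

    /* Sketch of a preemptible wait on a word another CPU may update. */
    #include <errno.h>
    #include <stdatomic.h>

    static _Atomic unsigned long type_info;

    static int preempt_pending(void) { return 0; }   /* stub */

    static int wait_for_change(unsigned long x)
    {
        while ( atomic_load(&type_info) == x )
        {
            if ( preempt_pending() )
                return -EINTR;   /* let the hypercall be restarted */
            /* a cpu_relax() equivalent would go here */
        }
        return 0;
    }

    int main(void)
    {
        /* Flip the word up front so the wait returns immediately; a
         * real waiter would see another CPU change it. */
        atomic_store(&type_info, 2);
        return wait_for_change(1);
    }
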
  2.1086 @@ -2032,25 +2213,42 @@ int get_page_type(struct page_info *page
  2.1087  
  2.1088      if ( unlikely(!(nx & PGT_validated)) )
  2.1089      {
  2.1090 -        /* Try to validate page type; drop the new reference on failure. */
  2.1091 -        if ( unlikely(!alloc_page_type(page, type)) )
  2.1092 +        if ( !(x & PGT_partial) )
  2.1093          {
  2.1094 -            MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
  2.1095 -                    PRtype_info ": caf=%08x taf=%" PRtype_info,
  2.1096 -                    page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
  2.1097 -                    type, page->count_info, page->u.inuse.type_info);
  2.1098 -            /* Noone else can get a reference. We hold the only ref. */
  2.1099 -            page->u.inuse.type_info = 0;
  2.1100 -            return 0;
  2.1101 +            page->nr_validated_ptes = 0;
  2.1102 +            page->partial_pte = 0;
  2.1103          }
  2.1104 -
  2.1105 -        /* Noone else is updating simultaneously. */
  2.1106 -        __set_bit(_PGT_validated, &page->u.inuse.type_info);
  2.1107 +        return alloc_page_type(page, type, preemptible);
  2.1108      }
  2.1109  
  2.1110 -    return 1;
  2.1111 +    return 0;
  2.1112  }
  2.1113  
  2.1114 +void put_page_type(struct page_info *page)
  2.1115 +{
  2.1116 +    int rc = __put_page_type(page, 0);
  2.1117 +    ASSERT(rc == 0);
  2.1118 +    (void)rc;
  2.1119 +}
  2.1120 +
  2.1121 +int get_page_type(struct page_info *page, unsigned long type)
  2.1122 +{
  2.1123 +    int rc = __get_page_type(page, type, 0);
  2.1124 +    if ( likely(rc == 0) )
  2.1125 +        return 1;
  2.1126 +    ASSERT(rc == -EINVAL);
  2.1127 +    return 0;
  2.1128 +}
  2.1129 +
  2.1130 +int put_page_type_preemptible(struct page_info *page)
  2.1131 +{
  2.1132 +    return __put_page_type(page, 1);
  2.1133 +}
  2.1134 +
  2.1135 +int get_page_type_preemptible(struct page_info *page, unsigned long type)
  2.1136 +{
  2.1137 +    return __get_page_type(page, type, 1);
  2.1138 +}
  2.1139  
  2.1140  void cleanup_page_cacheattr(struct page_info *page)
  2.1141  {
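
The four public entry points now split cleanly: get_page_type() and put_page_type() keep their historical contracts and assert that nothing preemption-specific leaks through, while the *_preemptible variants surface -EINTR and -EAGAIN to callers prepared to continue. A hypothetical caller-side sketch of the -EINTR to -EAGAIN promotion that do_mmuext_op performs:

    /* Caller-side sketch; the stub "succeeds" on its second invocation. */
    #include <errno.h>
    #include <stdio.h>

    static int calls;
    static int get_page_type_preemptible_stub(void)
    {
        return (++calls < 2) ? -EINTR : 0;
    }

    static int pin_page(void)
    {
        int rc = get_page_type_preemptible_stub();
        if ( rc == -EINTR )
            rc = -EAGAIN;    /* request a hypercall continuation */
        return rc;
    }

    int main(void)
    {
        int rc;
        while ( (rc = pin_page()) == -EAGAIN )
            printf("continuation\n");
        printf("pinned, rc=%d\n", rc);
        return 0;
    }
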
  2.1142 @@ -2087,7 +2285,7 @@ int new_guest_cr3(unsigned long mfn)
  2.1143                      l4e_from_pfn(
  2.1144                          mfn,
  2.1145                          (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
  2.1146 -                    pagetable_get_pfn(v->arch.guest_table), 0);
  2.1147 +                    pagetable_get_pfn(v->arch.guest_table), 0, 0) == 0;
  2.1148          if ( unlikely(!okay) )
  2.1149          {
  2.1150              MEM_LOG("Error while installing new compat baseptr %lx", mfn);
  2.1151 @@ -2102,7 +2300,7 @@ int new_guest_cr3(unsigned long mfn)
  2.1152  #endif
  2.1153      okay = paging_mode_refcounts(d)
  2.1154          ? get_page_from_pagenr(mfn, d)
  2.1155 -        : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
  2.1156 +        : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0);
  2.1157      if ( unlikely(!okay) )
  2.1158      {
  2.1159          MEM_LOG("Error while installing new baseptr %lx", mfn);
  2.1160 @@ -2276,9 +2474,7 @@ int do_mmuext_op(
  2.1161      {
  2.1162          if ( hypercall_preempt_check() )
  2.1163          {
  2.1164 -            rc = hypercall_create_continuation(
  2.1165 -                __HYPERVISOR_mmuext_op, "hihi",
  2.1166 -                uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
  2.1167 +            rc = -EAGAIN;
  2.1168              break;
  2.1169          }
  2.1170  
  2.1171 @@ -2325,10 +2521,14 @@ int do_mmuext_op(
  2.1172              if ( paging_mode_refcounts(FOREIGNDOM) )
  2.1173                  break;
  2.1174  
  2.1175 -            okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
  2.1176 +            rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 1);
  2.1177 +            okay = !rc;
  2.1178              if ( unlikely(!okay) )
  2.1179              {
  2.1180 -                MEM_LOG("Error while pinning mfn %lx", mfn);
  2.1181 +                if ( rc == -EINTR )
  2.1182 +                    rc = -EAGAIN;
  2.1183 +                else if ( rc != -EAGAIN )
  2.1184 +                    MEM_LOG("Error while pinning mfn %lx", mfn);
  2.1185                  break;
  2.1186              }
  2.1187  
  2.1188 @@ -2373,8 +2573,11 @@ int do_mmuext_op(
  2.1189              {
  2.1190                  put_page_and_type(page);
  2.1191                  put_page(page);
  2.1192 -                /* A page is dirtied when its pin status is cleared. */
  2.1193 -                paging_mark_dirty(d, mfn);
  2.1194 +                if ( !rc )
  2.1195 +                {
  2.1196 +                    /* A page is dirtied when its pin status is cleared. */
  2.1197 +                    paging_mark_dirty(d, mfn);
  2.1198 +                }
  2.1199              }
  2.1200              else
  2.1201              {
  2.1202 @@ -2398,8 +2601,8 @@ int do_mmuext_op(
  2.1203                  if ( paging_mode_refcounts(d) )
  2.1204                      okay = get_page_from_pagenr(mfn, d);
  2.1205                  else
  2.1206 -                    okay = get_page_and_type_from_pagenr(
  2.1207 -                        mfn, PGT_root_page_table, d);
  2.1208 +                    okay = !get_page_and_type_from_pagenr(
  2.1209 +                        mfn, PGT_root_page_table, d, 0);
  2.1210                  if ( unlikely(!okay) )
  2.1211                  {
  2.1212                      MEM_LOG("Error while installing new mfn %lx", mfn);
  2.1213 @@ -2517,6 +2720,11 @@ int do_mmuext_op(
  2.1214          guest_handle_add_offset(uops, 1);
  2.1215      }
  2.1216  
  2.1217 +    if ( rc == -EAGAIN )
  2.1218 +        rc = hypercall_create_continuation(
  2.1219 +            __HYPERVISOR_mmuext_op, "hihi",
  2.1220 +            uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
  2.1221 +
  2.1222      process_deferred_ops();
  2.1223  
  2.1224      perfc_add(num_mmuext_ops, i);
  2.1225 @@ -2576,9 +2784,7 @@ int do_mmu_update(
  2.1226      {
  2.1227          if ( hypercall_preempt_check() )
  2.1228          {
  2.1229 -            rc = hypercall_create_continuation(
  2.1230 -                __HYPERVISOR_mmu_update, "hihi",
  2.1231 -                ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
  2.1232 +            rc = -EAGAIN;
  2.1233              break;
  2.1234          }
  2.1235  
  2.1236 @@ -2653,27 +2859,29 @@ int do_mmu_update(
  2.1237                                          cmd == MMU_PT_UPDATE_PRESERVE_AD);
  2.1238                  }
  2.1239                  break;
  2.1240 -#if CONFIG_PAGING_LEVELS >= 3
  2.1241                  case PGT_l3_page_table:
  2.1242                  {
  2.1243                      l3_pgentry_t l3e = l3e_from_intpte(req.val);
  2.1244 -                    okay = mod_l3_entry(va, l3e, mfn,
  2.1245 -                                        cmd == MMU_PT_UPDATE_PRESERVE_AD);
  2.1246 +                    rc = mod_l3_entry(va, l3e, mfn,
  2.1247 +                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, 1);
  2.1248 +                    okay = !rc;
  2.1249                  }
  2.1250                  break;
  2.1251 -#endif
  2.1252  #if CONFIG_PAGING_LEVELS >= 4
  2.1253                  case PGT_l4_page_table:
  2.1254                  {
  2.1255                      l4_pgentry_t l4e = l4e_from_intpte(req.val);
  2.1256 -                    okay = mod_l4_entry(va, l4e, mfn,
  2.1257 -                                        cmd == MMU_PT_UPDATE_PRESERVE_AD);
  2.1258 +                    rc = mod_l4_entry(va, l4e, mfn,
  2.1259 +                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, 1);
  2.1260 +                    okay = !rc;
  2.1261                  }
  2.1262                  break;
  2.1263  #endif
  2.1264                  }
  2.1265  
  2.1266                  put_page_type(page);
  2.1267 +                if ( rc == -EINTR )
  2.1268 +                    rc = -EAGAIN;
  2.1269              }
  2.1270              break;
  2.1271  
  2.1272 @@ -2742,6 +2950,11 @@ int do_mmu_update(
  2.1273          guest_handle_add_offset(ureqs, 1);
  2.1274      }
  2.1275  
  2.1276 +    if ( rc == -EAGAIN )
  2.1277 +        rc = hypercall_create_continuation(
  2.1278 +            __HYPERVISOR_mmu_update, "hihi",
  2.1279 +            ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
  2.1280 +
  2.1281      process_deferred_ops();
  2.1282  
  2.1283      domain_mmap_cache_destroy(&mapcache);
  2.1284 @@ -3695,9 +3908,8 @@ static int ptwr_emulated_update(
  2.1285      nl1e = l1e_from_intpte(val);
  2.1286      if ( unlikely(!get_page_from_l1e(nl1e, d)) )
  2.1287      {
  2.1288 -        if ( (CONFIG_PAGING_LEVELS >= 3) && is_pv_32bit_domain(d) &&
  2.1289 -             (bytes == 4) && (unaligned_addr & 4) && !do_cmpxchg &&
  2.1290 -             (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
  2.1291 +        if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) &&
  2.1292 +             !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
  2.1293          {
  2.1294              /*
  2.1295               * If this is an upper-half write to a PAE PTE then we assume that
     3.1 --- a/xen/include/asm-x86/mm.h	Mon Sep 01 10:49:00 2008 +0100
     3.2 +++ b/xen/include/asm-x86/mm.h	Mon Sep 01 10:52:05 2008 +0100
     3.3 @@ -59,6 +59,17 @@ struct page_info
     3.4          u32 tlbflush_timestamp;
     3.5  
     3.6          /*
     3.7 +         * When PGT_partial is true then this field is valid and indicates
     3.8 +         * that PTEs in the range [0, @nr_validated_ptes) have been validated.
     3.9 +         * If @partial_pte is true then PTE at @nr_validated_ptes+1 has been
    3.10 +         * partially validated.
    3.11 +         */
    3.12 +        struct {
    3.13 +            u16 nr_validated_ptes;
    3.14 +            bool_t partial_pte;
    3.15 +        };
    3.16 +
    3.17 +        /*
    3.18           * Guest pages with a shadow.  This does not conflict with
    3.19           * tlbflush_timestamp since page table pages are explicitly not
    3.20           * tracked for TLB-flush avoidance when a guest runs in shadow mode.
    3.21 @@ -86,9 +97,12 @@ struct page_info
    3.22   /* PAE only: is this an L2 page directory containing Xen-private mappings? */
    3.23  #define _PGT_pae_xen_l2     26
    3.24  #define PGT_pae_xen_l2      (1U<<_PGT_pae_xen_l2)
    3.25 +/* Has this page been *partially* validated for use as its current type? */
    3.26 +#define _PGT_partial        25
    3.27 +#define PGT_partial         (1U<<_PGT_partial)
    3.28  
    3.29 - /* 26-bit count of uses of this frame as its current type. */
    3.30 -#define PGT_count_mask      ((1U<<26)-1)
    3.31 + /* 25-bit count of uses of this frame as its current type. */
    3.32 +#define PGT_count_mask      ((1U<<25)-1)
    3.33  
    3.34   /* Cleared when the owning guest 'frees' this page. */
    3.35  #define _PGC_allocated      31
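
PGT_partial is carved out of the type-use count, shrinking it from 26 to 25 bits; the flag must therefore never alias the count field. A tiny sketch checking that invariant with the values from the hunk above:

    /* Bit-layout check for the narrowed count field; values copied
     * from the diff, assertions are illustrative. */
    #include <assert.h>

    #define PGT_pae_xen_l2 (1u << 26)
    #define PGT_partial    (1u << 25)
    #define PGT_count_mask ((1u << 25) - 1)

    int main(void)
    {
        assert((PGT_partial & PGT_count_mask) == 0);
        assert((PGT_pae_xen_l2 & (PGT_partial | PGT_count_mask)) == 0);
        return 0;
    }
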
    3.36 @@ -154,7 +168,8 @@ extern unsigned long max_page;
    3.37  extern unsigned long total_pages;
    3.38  void init_frametable(void);
    3.39  
    3.40 -void free_page_type(struct page_info *page, unsigned long type);
    3.41 +int free_page_type(struct page_info *page, unsigned long type,
    3.42 +                   int preemptible);
    3.43  int _shadow_mode_refcounts(struct domain *d);
    3.44  
    3.45  void cleanup_page_cacheattr(struct page_info *page);
    3.46 @@ -165,6 +180,8 @@ void put_page(struct page_info *page);
    3.47  int  get_page(struct page_info *page, struct domain *domain);
    3.48  void put_page_type(struct page_info *page);
    3.49  int  get_page_type(struct page_info *page, unsigned long type);
    3.50 +int  put_page_type_preemptible(struct page_info *page);
    3.51 +int  get_page_type_preemptible(struct page_info *page, unsigned long type);
    3.52  int  get_page_from_l1e(l1_pgentry_t l1e, struct domain *d);
    3.53  void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d);
    3.54  
    3.55 @@ -174,6 +191,19 @@ static inline void put_page_and_type(str
    3.56      put_page(page);
    3.57  }
    3.58  
    3.59 +static inline int put_page_and_type_preemptible(struct page_info *page,
    3.60 +                                                int preemptible)
    3.61 +{
    3.62 +    int rc = 0;
    3.63 +
    3.64 +    if ( preemptible )
    3.65 +        rc = put_page_type_preemptible(page);
    3.66 +    else
    3.67 +        put_page_type(page);
    3.68 +    if ( likely(rc == 0) )
    3.69 +        put_page(page);
    3.70 +    return rc;
    3.71 +}
    3.72  
    3.73  static inline int get_page_and_type(struct page_info *page,
    3.74                                      struct domain *domain,
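
Note the ordering in put_page_and_type_preemptible(): the general reference is dropped only once the type reference is fully gone, so a suspended put keeps the page alive until the continuation finishes. A final sketch of that guarantee (stub counters):

    /* Sketch of put_page_and_type_preemptible()'s ordering guarantee. */
    #include <errno.h>
    #include <stdio.h>

    static int type_refs = 1, refs = 2, put_calls;

    static int put_page_type_preemptible_stub(void)
    {
        if ( ++put_calls < 2 )
            return -EAGAIN;          /* type teardown suspended */
        --type_refs;
        return 0;
    }

    static int put_page_and_type_preemptible_stub(void)
    {
        int rc = put_page_type_preemptible_stub();
        if ( rc == 0 )
            --refs;                  /* drop general ref only on success */
        return rc;
    }

    int main(void)
    {
        int rc;
        while ( (rc = put_page_and_type_preemptible_stub()) == -EAGAIN )
            printf("still pinned: refs=%d type_refs=%d\n", refs, type_refs);
        printf("done: refs=%d type_refs=%d rc=%d\n", refs, type_refs, rc);
        return 0;
    }
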