ia64/xen-unstable
changeset 18425:86b956d8cf04
x86: make {get,put}_page_type() preemptible
This is only a first step - more call sites need to be hooked up.
Most of this is really Keir's work, I just took what he handed me and
fixed a few remaining issues.
Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
author    Keir Fraser <keir.fraser@citrix.com>
date      Mon Sep 01 10:52:05 2008 +0100 (2008-09-01)
parents   7cb51e8484f6
children  b6eea72ea9dc
files     xen/arch/x86/domain.c xen/arch/x86/mm.c xen/include/asm-x86/mm.h
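
The mechanism the patch introduces can be summarised roughly as: page-table type validation (and teardown) now runs in resumable chunks, records its progress in the page itself (nr_validated_ptes/partial_pte, flagged by PGT_partial), and returns -EAGAIN or -EINTR so the hypercall layer can create a continuation instead of holding the CPU. The stand-alone C sketch below only models that pattern; the names (pt_page, preempt_check, validate) are invented for illustration, and the real logic lives in alloc_l2/l3/l4_table, free_*_table and __{get,put}_page_type in the mm.c hunks.

/*
 * Stand-alone model of the preemption pattern (illustration only, not Xen
 * code): do a bounded amount of validation work, record how far we got,
 * and return -EAGAIN so the caller can yield and resume later.
 */
#include <errno.h>
#include <stdio.h>

#define NR_ENTRIES 512

struct pt_page {
    unsigned int nr_validated;      /* models page->nr_validated_ptes */
};

/* Stand-in for hypercall_preempt_check(): pretend other work is pending
 * every 128 entries so the validation loop yields periodically. */
static int preempt_check(unsigned int i)
{
    return i != 0 && (i % 128) == 0;
}

/* Models alloc_lN_table(): resume at nr_validated and bail out with
 * -EAGAIN when preemption is requested, remembering the progress made. */
static int validate(struct pt_page *pg)
{
    unsigned int i;

    for ( i = pg->nr_validated; i < NR_ENTRIES; i++ )
    {
        if ( i != pg->nr_validated && preempt_check(i) )
        {
            pg->nr_validated = i;
            return -EAGAIN;
        }
        /* ... validate entry i here ... */
    }

    pg->nr_validated = NR_ENTRIES;
    return 0;
}

int main(void)
{
    struct pt_page pg = { .nr_validated = 0 };
    int rc;

    /* Models the hypercall continuation: simply re-invoke until done. */
    while ( (rc = validate(&pg)) == -EAGAIN )
        printf("preempted after %u entries\n", pg.nr_validated);

    printf("validation complete, rc=%d\n", rc);
    return 0;
}

In the actual patch the re-invocation is done by creating a hypercall continuation (see the -EAGAIN handling at the end of do_mmuext_op and do_mmu_update in the mm.c hunk), and a partially validated page is flagged with PGT_partial so a later get/put of the type can pick up where validation stopped.
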
line diff
--- a/xen/arch/x86/domain.c	Mon Sep 01 10:49:00 2008 +0100
+++ b/xen/arch/x86/domain.c	Mon Sep 01 10:52:05 2008 +0100
@@ -1645,23 +1645,26 @@ static int relinquish_memory(
 
         /*
          * Forcibly invalidate top-most, still valid page tables at this point
-         * to break circular 'linear page table' references. This is okay
-         * because MMU structures are not shared across domains and this domain
-         * is now dead. Thus top-most valid tables are not in use so a non-zero
-         * count means circular reference.
+         * to break circular 'linear page table' references as well as clean up
+         * partially validated pages. This is okay because MMU structures are
+         * not shared across domains and this domain is now dead. Thus top-most
+         * valid tables are not in use so a non-zero count means circular
+         * reference or partially validated.
          */
         y = page->u.inuse.type_info;
         for ( ; ; )
         {
             x = y;
-            if ( likely((x & (PGT_type_mask|PGT_validated)) !=
-                        (type|PGT_validated)) )
+            if ( likely((x & PGT_type_mask) != type) ||
+                 likely(!(x & (PGT_validated|PGT_partial))) )
                 break;
 
-            y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
+            y = cmpxchg(&page->u.inuse.type_info, x,
+                        x & ~(PGT_validated|PGT_partial));
             if ( likely(y == x) )
             {
-                free_page_type(page, type);
+                if ( free_page_type(page, x, 0) != 0 )
+                    BUG();
                 break;
             }
         }
2.1 --- a/xen/arch/x86/mm.c Mon Sep 01 10:49:00 2008 +0100 2.2 +++ b/xen/arch/x86/mm.c Mon Sep 01 10:52:05 2008 +0100 2.3 @@ -507,11 +507,11 @@ static int alloc_segdesc_page(struct pag 2.4 goto fail; 2.5 2.6 unmap_domain_page(descs); 2.7 - return 1; 2.8 + return 0; 2.9 2.10 fail: 2.11 unmap_domain_page(descs); 2.12 - return 0; 2.13 + return -EINVAL; 2.14 } 2.15 2.16 2.17 @@ -565,20 +565,23 @@ static int get_page_from_pagenr(unsigned 2.18 2.19 static int get_page_and_type_from_pagenr(unsigned long page_nr, 2.20 unsigned long type, 2.21 - struct domain *d) 2.22 + struct domain *d, 2.23 + int preemptible) 2.24 { 2.25 struct page_info *page = mfn_to_page(page_nr); 2.26 + int rc; 2.27 2.28 if ( unlikely(!get_page_from_pagenr(page_nr, d)) ) 2.29 - return 0; 2.30 - 2.31 - if ( unlikely(!get_page_type(page, type)) ) 2.32 - { 2.33 + return -EINVAL; 2.34 + 2.35 + rc = (preemptible ? 2.36 + get_page_type_preemptible(page, type) : 2.37 + (get_page_type(page, type) ? 0 : -EINVAL)); 2.38 + 2.39 + if ( rc ) 2.40 put_page(page); 2.41 - return 0; 2.42 - } 2.43 - 2.44 - return 1; 2.45 + 2.46 + return rc; 2.47 } 2.48 2.49 /* 2.50 @@ -754,22 +757,23 @@ get_page_from_l2e( 2.51 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) ) 2.52 { 2.53 MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK); 2.54 - return 0; 2.55 + return -EINVAL; 2.56 } 2.57 2.58 - rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d); 2.59 - if ( unlikely(!rc) ) 2.60 - rc = get_l2_linear_pagetable(l2e, pfn, d); 2.61 + rc = get_page_and_type_from_pagenr( 2.62 + l2e_get_pfn(l2e), PGT_l1_page_table, d, 0); 2.63 + if ( unlikely(rc) && rc != -EAGAIN && 2.64 + get_l2_linear_pagetable(l2e, pfn, d) ) 2.65 + rc = -EINVAL; 2.66 2.67 return rc; 2.68 } 2.69 2.70 2.71 -#if CONFIG_PAGING_LEVELS >= 3 2.72 define_get_linear_pagetable(l3); 2.73 static int 2.74 get_page_from_l3e( 2.75 - l3_pgentry_t l3e, unsigned long pfn, struct domain *d) 2.76 + l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int preemptible) 2.77 { 2.78 int rc; 2.79 2.80 @@ -779,22 +783,23 @@ get_page_from_l3e( 2.81 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) ) 2.82 { 2.83 MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d)); 2.84 - return 0; 2.85 + return -EINVAL; 2.86 } 2.87 2.88 - rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d); 2.89 - if ( unlikely(!rc) ) 2.90 - rc = get_l3_linear_pagetable(l3e, pfn, d); 2.91 + rc = get_page_and_type_from_pagenr( 2.92 + l3e_get_pfn(l3e), PGT_l2_page_table, d, preemptible); 2.93 + if ( unlikely(rc) && rc != -EAGAIN && rc != -EINTR && 2.94 + get_l3_linear_pagetable(l3e, pfn, d) ) 2.95 + rc = -EINVAL; 2.96 2.97 return rc; 2.98 } 2.99 -#endif /* 3 level */ 2.100 2.101 #if CONFIG_PAGING_LEVELS >= 4 2.102 define_get_linear_pagetable(l4); 2.103 static int 2.104 get_page_from_l4e( 2.105 - l4_pgentry_t l4e, unsigned long pfn, struct domain *d) 2.106 + l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int preemptible) 2.107 { 2.108 int rc; 2.109 2.110 @@ -804,12 +809,14 @@ get_page_from_l4e( 2.111 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) ) 2.112 { 2.113 MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK); 2.114 - return 0; 2.115 + return -EINVAL; 2.116 } 2.117 2.118 - rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d); 2.119 - if ( unlikely(!rc) ) 2.120 - rc = get_l4_linear_pagetable(l4e, pfn, d); 2.121 + rc = get_page_and_type_from_pagenr( 2.122 + l4e_get_pfn(l4e), PGT_l3_page_table, d, preemptible); 
2.123 + if ( unlikely(rc) && rc != -EAGAIN && rc != -EINTR && 2.124 + get_l4_linear_pagetable(l4e, pfn, d) ) 2.125 + rc = -EINVAL; 2.126 2.127 return rc; 2.128 } 2.129 @@ -946,29 +953,35 @@ void put_page_from_l1e(l1_pgentry_t l1e, 2.130 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. 2.131 * Note also that this automatically deals correctly with linear p.t.'s. 2.132 */ 2.133 -static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) 2.134 +static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) 2.135 { 2.136 if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) && 2.137 (l2e_get_pfn(l2e) != pfn) ) 2.138 + { 2.139 put_page_and_type(l2e_get_page(l2e)); 2.140 + return 0; 2.141 + } 2.142 + return 1; 2.143 } 2.144 2.145 2.146 -#if CONFIG_PAGING_LEVELS >= 3 2.147 -static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn) 2.148 +static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, 2.149 + int preemptible) 2.150 { 2.151 if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) && 2.152 (l3e_get_pfn(l3e) != pfn) ) 2.153 - put_page_and_type(l3e_get_page(l3e)); 2.154 + return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible); 2.155 + return 1; 2.156 } 2.157 -#endif 2.158 2.159 #if CONFIG_PAGING_LEVELS >= 4 2.160 -static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn) 2.161 +static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, 2.162 + int preemptible) 2.163 { 2.164 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && 2.165 (l4e_get_pfn(l4e) != pfn) ) 2.166 - put_page_and_type(l4e_get_page(l4e)); 2.167 + return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible); 2.168 + return 1; 2.169 } 2.170 #endif 2.171 2.172 @@ -977,7 +990,7 @@ static int alloc_l1_table(struct page_in 2.173 struct domain *d = page_get_owner(page); 2.174 unsigned long pfn = page_to_mfn(page); 2.175 l1_pgentry_t *pl1e; 2.176 - int i; 2.177 + unsigned int i; 2.178 2.179 pl1e = map_domain_page(pfn); 2.180 2.181 @@ -991,7 +1004,7 @@ static int alloc_l1_table(struct page_in 2.182 } 2.183 2.184 unmap_domain_page(pl1e); 2.185 - return 1; 2.186 + return 0; 2.187 2.188 fail: 2.189 MEM_LOG("Failure in alloc_l1_table: entry %d", i); 2.190 @@ -1000,7 +1013,7 @@ static int alloc_l1_table(struct page_in 2.191 put_page_from_l1e(pl1e[i], d); 2.192 2.193 unmap_domain_page(pl1e); 2.194 - return 0; 2.195 + return -EINVAL; 2.196 } 2.197 2.198 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e) 2.199 @@ -1128,47 +1141,53 @@ static void pae_flush_pgd( 2.200 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0) 2.201 #endif 2.202 2.203 -static int alloc_l2_table(struct page_info *page, unsigned long type) 2.204 +static int alloc_l2_table(struct page_info *page, unsigned long type, 2.205 + int preemptible) 2.206 { 2.207 struct domain *d = page_get_owner(page); 2.208 unsigned long pfn = page_to_mfn(page); 2.209 l2_pgentry_t *pl2e; 2.210 - int i; 2.211 + unsigned int i; 2.212 + int rc = 0; 2.213 2.214 pl2e = map_domain_page(pfn); 2.215 2.216 - for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) 2.217 + for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ ) 2.218 { 2.219 - if ( !is_guest_l2_slot(d, type, i) ) 2.220 + if ( preemptible && i && hypercall_preempt_check() ) 2.221 + { 2.222 + page->nr_validated_ptes = i; 2.223 + rc = -EAGAIN; 2.224 + break; 2.225 + } 2.226 + 2.227 + if ( !is_guest_l2_slot(d, type, i) || 2.228 + (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 ) 2.229 continue; 2.230 2.231 - if ( unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) ) 2.232 - 
goto fail; 2.233 - 2.234 + if ( rc < 0 ) 2.235 + { 2.236 + MEM_LOG("Failure in alloc_l2_table: entry %d", i); 2.237 + while ( i-- > 0 ) 2.238 + if ( is_guest_l2_slot(d, type, i) ) 2.239 + put_page_from_l2e(pl2e[i], pfn); 2.240 + break; 2.241 + } 2.242 + 2.243 adjust_guest_l2e(pl2e[i], d); 2.244 } 2.245 2.246 unmap_domain_page(pl2e); 2.247 - return 1; 2.248 - 2.249 - fail: 2.250 - MEM_LOG("Failure in alloc_l2_table: entry %d", i); 2.251 - while ( i-- > 0 ) 2.252 - if ( is_guest_l2_slot(d, type, i) ) 2.253 - put_page_from_l2e(pl2e[i], pfn); 2.254 - 2.255 - unmap_domain_page(pl2e); 2.256 - return 0; 2.257 + return rc > 0 ? 0 : rc; 2.258 } 2.259 2.260 - 2.261 -#if CONFIG_PAGING_LEVELS >= 3 2.262 -static int alloc_l3_table(struct page_info *page) 2.263 +static int alloc_l3_table(struct page_info *page, int preemptible) 2.264 { 2.265 struct domain *d = page_get_owner(page); 2.266 unsigned long pfn = page_to_mfn(page); 2.267 l3_pgentry_t *pl3e; 2.268 - int i; 2.269 + unsigned int i; 2.270 + int rc = 0; 2.271 2.272 #if CONFIG_PAGING_LEVELS == 3 2.273 /* 2.274 @@ -1181,7 +1200,7 @@ static int alloc_l3_table(struct page_in 2.275 d->vcpu[0] && d->vcpu[0]->is_initialised ) 2.276 { 2.277 MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn); 2.278 - return 0; 2.279 + return -EINVAL; 2.280 } 2.281 #endif 2.282 2.283 @@ -1197,64 +1216,96 @@ static int alloc_l3_table(struct page_in 2.284 if ( is_pv_32on64_domain(d) ) 2.285 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e)); 2.286 2.287 - for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ ) 2.288 + for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; i++ ) 2.289 { 2.290 if ( is_pv_32bit_domain(d) && (i == 3) ) 2.291 { 2.292 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) || 2.293 - (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) || 2.294 - !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]), 2.295 - PGT_l2_page_table | 2.296 - PGT_pae_xen_l2, 2.297 - d) ) 2.298 - goto fail; 2.299 + (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ) 2.300 + rc = -EINVAL; 2.301 + else 2.302 + rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]), 2.303 + PGT_l2_page_table | 2.304 + PGT_pae_xen_l2, 2.305 + d, preemptible); 2.306 } 2.307 - else if ( !is_guest_l3_slot(i) ) 2.308 + else if ( !is_guest_l3_slot(i) || 2.309 + (rc = get_page_from_l3e(pl3e[i], pfn, d, preemptible)) > 0 ) 2.310 continue; 2.311 - else if ( unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) ) 2.312 - goto fail; 2.313 + 2.314 + if ( rc == -EAGAIN ) 2.315 + { 2.316 + page->nr_validated_ptes = i; 2.317 + page->partial_pte = 1; 2.318 + } 2.319 + else if ( rc == -EINTR && i ) 2.320 + { 2.321 + page->nr_validated_ptes = i; 2.322 + page->partial_pte = 0; 2.323 + rc = -EAGAIN; 2.324 + } 2.325 + if ( rc < 0 ) 2.326 + break; 2.327 2.328 adjust_guest_l3e(pl3e[i], d); 2.329 } 2.330 2.331 - if ( !create_pae_xen_mappings(d, pl3e) ) 2.332 - goto fail; 2.333 - 2.334 - unmap_domain_page(pl3e); 2.335 - return 1; 2.336 - 2.337 - fail: 2.338 - MEM_LOG("Failure in alloc_l3_table: entry %d", i); 2.339 - while ( i-- > 0 ) 2.340 + if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) ) 2.341 + rc = -EINVAL; 2.342 + if ( rc < 0 && rc != -EAGAIN && rc != -EINTR ) 2.343 { 2.344 - if ( !is_guest_l3_slot(i) ) 2.345 - continue; 2.346 - unadjust_guest_l3e(pl3e[i], d); 2.347 - put_page_from_l3e(pl3e[i], pfn); 2.348 + MEM_LOG("Failure in alloc_l3_table: entry %d", i); 2.349 + while ( i-- > 0 ) 2.350 + { 2.351 + if ( !is_guest_l3_slot(i) ) 2.352 + continue; 2.353 + unadjust_guest_l3e(pl3e[i], d); 2.354 + 
put_page_from_l3e(pl3e[i], pfn, 0); 2.355 + } 2.356 } 2.357 2.358 unmap_domain_page(pl3e); 2.359 - return 0; 2.360 + return rc > 0 ? 0 : rc; 2.361 } 2.362 -#else 2.363 -#define alloc_l3_table(page) (0) 2.364 -#endif 2.365 2.366 #if CONFIG_PAGING_LEVELS >= 4 2.367 -static int alloc_l4_table(struct page_info *page) 2.368 +static int alloc_l4_table(struct page_info *page, int preemptible) 2.369 { 2.370 struct domain *d = page_get_owner(page); 2.371 unsigned long pfn = page_to_mfn(page); 2.372 l4_pgentry_t *pl4e = page_to_virt(page); 2.373 - int i; 2.374 - 2.375 - for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ ) 2.376 + unsigned int i; 2.377 + int rc = 0; 2.378 + 2.379 + for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; i++ ) 2.380 { 2.381 - if ( !is_guest_l4_slot(d, i) ) 2.382 + if ( !is_guest_l4_slot(d, i) || 2.383 + (rc = get_page_from_l4e(pl4e[i], pfn, d, preemptible)) > 0 ) 2.384 continue; 2.385 2.386 - if ( unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) ) 2.387 - goto fail; 2.388 + if ( rc == -EAGAIN ) 2.389 + { 2.390 + page->nr_validated_ptes = i; 2.391 + page->partial_pte = 1; 2.392 + } 2.393 + else if ( rc == -EINTR ) 2.394 + { 2.395 + if ( i ) 2.396 + { 2.397 + page->nr_validated_ptes = i; 2.398 + page->partial_pte = 0; 2.399 + rc = -EAGAIN; 2.400 + } 2.401 + } 2.402 + else if ( rc < 0 ) 2.403 + { 2.404 + MEM_LOG("Failure in alloc_l4_table: entry %d", i); 2.405 + while ( i-- > 0 ) 2.406 + if ( is_guest_l4_slot(d, i) ) 2.407 + put_page_from_l4e(pl4e[i], pfn, 0); 2.408 + } 2.409 + if ( rc < 0 ) 2.410 + return rc; 2.411 2.412 adjust_guest_l4e(pl4e[i], d); 2.413 } 2.414 @@ -1269,18 +1320,10 @@ static int alloc_l4_table(struct page_in 2.415 l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3), 2.416 __PAGE_HYPERVISOR); 2.417 2.418 - return 1; 2.419 - 2.420 - fail: 2.421 - MEM_LOG("Failure in alloc_l4_table: entry %d", i); 2.422 - while ( i-- > 0 ) 2.423 - if ( is_guest_l4_slot(d, i) ) 2.424 - put_page_from_l4e(pl4e[i], pfn); 2.425 - 2.426 - return 0; 2.427 + return rc > 0 ? 
0 : rc; 2.428 } 2.429 #else 2.430 -#define alloc_l4_table(page) (0) 2.431 +#define alloc_l4_table(page, preemptible) (-EINVAL) 2.432 #endif 2.433 2.434 2.435 @@ -1289,7 +1332,7 @@ static void free_l1_table(struct page_in 2.436 struct domain *d = page_get_owner(page); 2.437 unsigned long pfn = page_to_mfn(page); 2.438 l1_pgentry_t *pl1e; 2.439 - int i; 2.440 + unsigned int i; 2.441 2.442 pl1e = map_domain_page(pfn); 2.443 2.444 @@ -1301,74 +1344,114 @@ static void free_l1_table(struct page_in 2.445 } 2.446 2.447 2.448 -static void free_l2_table(struct page_info *page) 2.449 +static int free_l2_table(struct page_info *page, int preemptible) 2.450 { 2.451 #ifdef CONFIG_COMPAT 2.452 struct domain *d = page_get_owner(page); 2.453 #endif 2.454 unsigned long pfn = page_to_mfn(page); 2.455 l2_pgentry_t *pl2e; 2.456 - int i; 2.457 + unsigned int i = page->nr_validated_ptes - 1; 2.458 + int err = 0; 2.459 2.460 pl2e = map_domain_page(pfn); 2.461 2.462 - for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) 2.463 - if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) ) 2.464 - put_page_from_l2e(pl2e[i], pfn); 2.465 + ASSERT(page->nr_validated_ptes); 2.466 + do { 2.467 + if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) && 2.468 + put_page_from_l2e(pl2e[i], pfn) == 0 && 2.469 + preemptible && i && hypercall_preempt_check() ) 2.470 + { 2.471 + page->nr_validated_ptes = i; 2.472 + err = -EAGAIN; 2.473 + } 2.474 + } while ( !err && i-- ); 2.475 2.476 unmap_domain_page(pl2e); 2.477 2.478 - page->u.inuse.type_info &= ~PGT_pae_xen_l2; 2.479 + if ( !err ) 2.480 + page->u.inuse.type_info &= ~PGT_pae_xen_l2; 2.481 + 2.482 + return err; 2.483 } 2.484 2.485 - 2.486 -#if CONFIG_PAGING_LEVELS >= 3 2.487 - 2.488 -static void free_l3_table(struct page_info *page) 2.489 +static int free_l3_table(struct page_info *page, int preemptible) 2.490 { 2.491 struct domain *d = page_get_owner(page); 2.492 unsigned long pfn = page_to_mfn(page); 2.493 l3_pgentry_t *pl3e; 2.494 - int i; 2.495 + unsigned int i = page->nr_validated_ptes - !page->partial_pte; 2.496 + int rc = 0; 2.497 2.498 #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION 2.499 if ( d->arch.relmem == RELMEM_l3 ) 2.500 - return; 2.501 + return 0; 2.502 #endif 2.503 2.504 pl3e = map_domain_page(pfn); 2.505 2.506 - for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ ) 2.507 + do { 2.508 if ( is_guest_l3_slot(i) ) 2.509 { 2.510 - put_page_from_l3e(pl3e[i], pfn); 2.511 + rc = put_page_from_l3e(pl3e[i], pfn, preemptible); 2.512 + if ( rc > 0 ) 2.513 + continue; 2.514 + if ( rc ) 2.515 + break; 2.516 unadjust_guest_l3e(pl3e[i], d); 2.517 } 2.518 + } while ( i-- ); 2.519 2.520 unmap_domain_page(pl3e); 2.521 + 2.522 + if ( rc == -EAGAIN ) 2.523 + { 2.524 + page->nr_validated_ptes = i; 2.525 + page->partial_pte = 1; 2.526 + } 2.527 + else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 ) 2.528 + { 2.529 + page->nr_validated_ptes = i + 1; 2.530 + page->partial_pte = 0; 2.531 + rc = -EAGAIN; 2.532 + } 2.533 + return rc > 0 ? 
0 : rc; 2.534 } 2.535 2.536 -#endif 2.537 - 2.538 #if CONFIG_PAGING_LEVELS >= 4 2.539 - 2.540 -static void free_l4_table(struct page_info *page) 2.541 +static int free_l4_table(struct page_info *page, int preemptible) 2.542 { 2.543 struct domain *d = page_get_owner(page); 2.544 unsigned long pfn = page_to_mfn(page); 2.545 l4_pgentry_t *pl4e = page_to_virt(page); 2.546 - int i; 2.547 + unsigned int i = page->nr_validated_ptes - !page->partial_pte; 2.548 + int rc = 0; 2.549 2.550 #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION 2.551 if ( d->arch.relmem == RELMEM_l4 ) 2.552 - return; 2.553 + return 0; 2.554 #endif 2.555 2.556 - for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ ) 2.557 + do { 2.558 if ( is_guest_l4_slot(d, i) ) 2.559 - put_page_from_l4e(pl4e[i], pfn); 2.560 + rc = put_page_from_l4e(pl4e[i], pfn, preemptible); 2.561 + } while ( rc >= 0 && i-- ); 2.562 + 2.563 + if ( rc == -EAGAIN ) 2.564 + { 2.565 + page->nr_validated_ptes = i; 2.566 + page->partial_pte = 1; 2.567 + } 2.568 + else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 ) 2.569 + { 2.570 + page->nr_validated_ptes = i + 1; 2.571 + page->partial_pte = 0; 2.572 + rc = -EAGAIN; 2.573 + } 2.574 + return rc > 0 ? 0 : rc; 2.575 } 2.576 - 2.577 +#else 2.578 +#define free_l4_table(page, preemptible) (-EINVAL) 2.579 #endif 2.580 2.581 static void page_lock(struct page_info *page) 2.582 @@ -1560,7 +1643,7 @@ static int mod_l2_entry(l2_pgentry_t *pl 2.583 return rc; 2.584 } 2.585 2.586 - if ( unlikely(!get_page_from_l2e(nl2e, pfn, d)) ) 2.587 + if ( unlikely(get_page_from_l2e(nl2e, pfn, d) < 0) ) 2.588 return page_unlock(l2pg), 0; 2.589 2.590 adjust_guest_l2e(nl2e, d); 2.591 @@ -1583,24 +1666,23 @@ static int mod_l2_entry(l2_pgentry_t *pl 2.592 return rc; 2.593 } 2.594 2.595 -#if CONFIG_PAGING_LEVELS >= 3 2.596 - 2.597 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */ 2.598 static int mod_l3_entry(l3_pgentry_t *pl3e, 2.599 l3_pgentry_t nl3e, 2.600 unsigned long pfn, 2.601 - int preserve_ad) 2.602 + int preserve_ad, 2.603 + int preemptible) 2.604 { 2.605 l3_pgentry_t ol3e; 2.606 struct vcpu *curr = current; 2.607 struct domain *d = curr->domain; 2.608 struct page_info *l3pg = mfn_to_page(pfn); 2.609 - int rc = 1; 2.610 + int rc = 0; 2.611 2.612 if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) ) 2.613 { 2.614 MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e); 2.615 - return 0; 2.616 + return -EINVAL; 2.617 } 2.618 2.619 /* 2.620 @@ -1608,12 +1690,12 @@ static int mod_l3_entry(l3_pgentry_t *pl 2.621 * would be a pain to ensure they remain continuously valid throughout. 2.622 */ 2.623 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) ) 2.624 - return 0; 2.625 + return -EINVAL; 2.626 2.627 page_lock(l3pg); 2.628 2.629 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) ) 2.630 - return page_unlock(l3pg), 0; 2.631 + return page_unlock(l3pg), -EFAULT; 2.632 2.633 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT ) 2.634 { 2.635 @@ -1622,7 +1704,7 @@ static int mod_l3_entry(l3_pgentry_t *pl 2.636 page_unlock(l3pg); 2.637 MEM_LOG("Bad L3 flags %x", 2.638 l3e_get_flags(nl3e) & l3_disallow_mask(d)); 2.639 - return 0; 2.640 + return -EINVAL; 2.641 } 2.642 2.643 /* Fast path for identical mapping and presence. */ 2.644 @@ -1631,28 +1713,30 @@ static int mod_l3_entry(l3_pgentry_t *pl 2.645 adjust_guest_l3e(nl3e, d); 2.646 rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, preserve_ad); 2.647 page_unlock(l3pg); 2.648 - return rc; 2.649 + return rc ? 
0 : -EFAULT; 2.650 } 2.651 2.652 - if ( unlikely(!get_page_from_l3e(nl3e, pfn, d)) ) 2.653 - return page_unlock(l3pg), 0; 2.654 + rc = get_page_from_l3e(nl3e, pfn, d, preemptible); 2.655 + if ( unlikely(rc < 0) ) 2.656 + return page_unlock(l3pg), rc; 2.657 + rc = 0; 2.658 2.659 adjust_guest_l3e(nl3e, d); 2.660 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, 2.661 preserve_ad)) ) 2.662 { 2.663 ol3e = nl3e; 2.664 - rc = 0; 2.665 + rc = -EFAULT; 2.666 } 2.667 } 2.668 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, 2.669 preserve_ad)) ) 2.670 { 2.671 page_unlock(l3pg); 2.672 - return 0; 2.673 + return -EFAULT; 2.674 } 2.675 2.676 - if ( likely(rc) ) 2.677 + if ( likely(rc == 0) ) 2.678 { 2.679 if ( !create_pae_xen_mappings(d, pl3e) ) 2.680 BUG(); 2.681 @@ -1661,36 +1745,35 @@ static int mod_l3_entry(l3_pgentry_t *pl 2.682 } 2.683 2.684 page_unlock(l3pg); 2.685 - put_page_from_l3e(ol3e, pfn); 2.686 + put_page_from_l3e(ol3e, pfn, 0); 2.687 return rc; 2.688 } 2.689 2.690 -#endif 2.691 - 2.692 #if CONFIG_PAGING_LEVELS >= 4 2.693 2.694 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */ 2.695 static int mod_l4_entry(l4_pgentry_t *pl4e, 2.696 l4_pgentry_t nl4e, 2.697 unsigned long pfn, 2.698 - int preserve_ad) 2.699 + int preserve_ad, 2.700 + int preemptible) 2.701 { 2.702 struct vcpu *curr = current; 2.703 struct domain *d = curr->domain; 2.704 l4_pgentry_t ol4e; 2.705 struct page_info *l4pg = mfn_to_page(pfn); 2.706 - int rc = 1; 2.707 + int rc = 0; 2.708 2.709 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) ) 2.710 { 2.711 MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e); 2.712 - return 0; 2.713 + return -EINVAL; 2.714 } 2.715 2.716 page_lock(l4pg); 2.717 2.718 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) ) 2.719 - return page_unlock(l4pg), 0; 2.720 + return page_unlock(l4pg), -EFAULT; 2.721 2.722 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT ) 2.723 { 2.724 @@ -1699,7 +1782,7 @@ static int mod_l4_entry(l4_pgentry_t *pl 2.725 page_unlock(l4pg); 2.726 MEM_LOG("Bad L4 flags %x", 2.727 l4e_get_flags(nl4e) & L4_DISALLOW_MASK); 2.728 - return 0; 2.729 + return -EINVAL; 2.730 } 2.731 2.732 /* Fast path for identical mapping and presence. */ 2.733 @@ -1708,29 +1791,31 @@ static int mod_l4_entry(l4_pgentry_t *pl 2.734 adjust_guest_l4e(nl4e, d); 2.735 rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, preserve_ad); 2.736 page_unlock(l4pg); 2.737 - return rc; 2.738 + return rc ? 
0 : -EFAULT; 2.739 } 2.740 2.741 - if ( unlikely(!get_page_from_l4e(nl4e, pfn, d)) ) 2.742 - return page_unlock(l4pg), 0; 2.743 + rc = get_page_from_l4e(nl4e, pfn, d, preemptible); 2.744 + if ( unlikely(rc < 0) ) 2.745 + return page_unlock(l4pg), rc; 2.746 + rc = 0; 2.747 2.748 adjust_guest_l4e(nl4e, d); 2.749 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, 2.750 preserve_ad)) ) 2.751 { 2.752 ol4e = nl4e; 2.753 - rc = 0; 2.754 + rc = -EFAULT; 2.755 } 2.756 } 2.757 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, 2.758 preserve_ad)) ) 2.759 { 2.760 page_unlock(l4pg); 2.761 - return 0; 2.762 + return -EFAULT; 2.763 } 2.764 2.765 page_unlock(l4pg); 2.766 - put_page_from_l4e(ol4e, pfn); 2.767 + put_page_from_l4e(ol4e, pfn, 0); 2.768 return rc; 2.769 } 2.770 2.771 @@ -1788,9 +1873,11 @@ int get_page(struct page_info *page, str 2.772 } 2.773 2.774 2.775 -static int alloc_page_type(struct page_info *page, unsigned long type) 2.776 +static int alloc_page_type(struct page_info *page, unsigned long type, 2.777 + int preemptible) 2.778 { 2.779 struct domain *owner = page_get_owner(page); 2.780 + int rc; 2.781 2.782 /* A page table is dirtied when its type count becomes non-zero. */ 2.783 if ( likely(owner != NULL) ) 2.784 @@ -1799,30 +1886,65 @@ static int alloc_page_type(struct page_i 2.785 switch ( type & PGT_type_mask ) 2.786 { 2.787 case PGT_l1_page_table: 2.788 - return alloc_l1_table(page); 2.789 + alloc_l1_table(page); 2.790 + rc = 0; 2.791 + break; 2.792 case PGT_l2_page_table: 2.793 - return alloc_l2_table(page, type); 2.794 + rc = alloc_l2_table(page, type, preemptible); 2.795 + break; 2.796 case PGT_l3_page_table: 2.797 - return alloc_l3_table(page); 2.798 + rc = alloc_l3_table(page, preemptible); 2.799 + break; 2.800 case PGT_l4_page_table: 2.801 - return alloc_l4_table(page); 2.802 + rc = alloc_l4_table(page, preemptible); 2.803 + break; 2.804 case PGT_seg_desc_page: 2.805 - return alloc_segdesc_page(page); 2.806 + rc = alloc_segdesc_page(page); 2.807 + break; 2.808 default: 2.809 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n", 2.810 type, page->u.inuse.type_info, 2.811 page->count_info); 2.812 + rc = -EINVAL; 2.813 BUG(); 2.814 } 2.815 2.816 - return 0; 2.817 + /* No need for atomic update of type_info here: noone else updates it. 
*/ 2.818 + wmb(); 2.819 + if ( rc == -EAGAIN ) 2.820 + { 2.821 + page->u.inuse.type_info |= PGT_partial; 2.822 + } 2.823 + else if ( rc == -EINTR ) 2.824 + { 2.825 + ASSERT((page->u.inuse.type_info & 2.826 + (PGT_count_mask|PGT_validated|PGT_partial)) == 1); 2.827 + page->u.inuse.type_info &= ~PGT_count_mask; 2.828 + } 2.829 + else if ( rc ) 2.830 + { 2.831 + ASSERT(rc < 0); 2.832 + MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %" 2.833 + PRtype_info ": caf=%08x taf=%" PRtype_info, 2.834 + page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)), 2.835 + type, page->count_info, page->u.inuse.type_info); 2.836 + page->u.inuse.type_info = 0; 2.837 + } 2.838 + else 2.839 + { 2.840 + page->u.inuse.type_info |= PGT_validated; 2.841 + } 2.842 + 2.843 + return rc; 2.844 } 2.845 2.846 2.847 -void free_page_type(struct page_info *page, unsigned long type) 2.848 +int free_page_type(struct page_info *page, unsigned long type, 2.849 + int preemptible) 2.850 { 2.851 struct domain *owner = page_get_owner(page); 2.852 unsigned long gmfn; 2.853 + int rc; 2.854 2.855 if ( likely(owner != NULL) ) 2.856 { 2.857 @@ -1842,7 +1964,7 @@ void free_page_type(struct page_info *pa 2.858 paging_mark_dirty(owner, page_to_mfn(page)); 2.859 2.860 if ( shadow_mode_refcounts(owner) ) 2.861 - return; 2.862 + return 0; 2.863 2.864 gmfn = mfn_to_gmfn(owner, page_to_mfn(page)); 2.865 ASSERT(VALID_M2P(gmfn)); 2.866 @@ -1850,42 +1972,80 @@ void free_page_type(struct page_info *pa 2.867 } 2.868 } 2.869 2.870 + if ( !(type & PGT_partial) ) 2.871 + { 2.872 + page->nr_validated_ptes = 1U << PAGETABLE_ORDER; 2.873 + page->partial_pte = 0; 2.874 + } 2.875 switch ( type & PGT_type_mask ) 2.876 { 2.877 case PGT_l1_page_table: 2.878 free_l1_table(page); 2.879 + rc = 0; 2.880 break; 2.881 - 2.882 case PGT_l2_page_table: 2.883 - free_l2_table(page); 2.884 + rc = free_l2_table(page, preemptible); 2.885 break; 2.886 - 2.887 -#if CONFIG_PAGING_LEVELS >= 3 2.888 case PGT_l3_page_table: 2.889 - free_l3_table(page); 2.890 - break; 2.891 +#if CONFIG_PAGING_LEVELS == 3 2.892 + if ( !(type & PGT_partial) ) 2.893 + page->nr_validated_ptes = L3_PAGETABLE_ENTRIES; 2.894 #endif 2.895 - 2.896 -#if CONFIG_PAGING_LEVELS >= 4 2.897 - case PGT_l4_page_table: 2.898 - free_l4_table(page); 2.899 + rc = free_l3_table(page, preemptible); 2.900 break; 2.901 -#endif 2.902 - 2.903 + case PGT_l4_page_table: 2.904 + rc = free_l4_table(page, preemptible); 2.905 + break; 2.906 default: 2.907 - printk("%s: type %lx pfn %lx\n",__FUNCTION__, 2.908 - type, page_to_mfn(page)); 2.909 + MEM_LOG("type %lx pfn %lx\n", type, page_to_mfn(page)); 2.910 + rc = -EINVAL; 2.911 BUG(); 2.912 } 2.913 + 2.914 + /* No need for atomic update of type_info here: noone else updates it. */ 2.915 + if ( rc == 0 ) 2.916 + { 2.917 + /* 2.918 + * Record TLB information for flush later. We do not stamp page tables 2.919 + * when running in shadow mode: 2.920 + * 1. Pointless, since it's the shadow pt's which must be tracked. 2.921 + * 2. Shadow mode reuses this field for shadowed page tables to 2.922 + * store flags info -- we don't want to conflict with that. 
2.923 + */ 2.924 + if ( !(shadow_mode_enabled(page_get_owner(page)) && 2.925 + (page->count_info & PGC_page_table)) ) 2.926 + page->tlbflush_timestamp = tlbflush_current_time(); 2.927 + wmb(); 2.928 + page->u.inuse.type_info--; 2.929 + } 2.930 + else if ( rc == -EINTR ) 2.931 + { 2.932 + ASSERT(!(page->u.inuse.type_info & 2.933 + (PGT_count_mask|PGT_validated|PGT_partial))); 2.934 + if ( !(shadow_mode_enabled(page_get_owner(page)) && 2.935 + (page->count_info & PGC_page_table)) ) 2.936 + page->tlbflush_timestamp = tlbflush_current_time(); 2.937 + wmb(); 2.938 + page->u.inuse.type_info |= PGT_validated; 2.939 + } 2.940 + else 2.941 + { 2.942 + BUG_ON(rc != -EAGAIN); 2.943 + wmb(); 2.944 + page->u.inuse.type_info |= PGT_partial; 2.945 + } 2.946 + 2.947 + return rc; 2.948 } 2.949 2.950 2.951 -void put_page_type(struct page_info *page) 2.952 +static int __put_page_type(struct page_info *page, 2.953 + int preemptible) 2.954 { 2.955 unsigned long nx, x, y = page->u.inuse.type_info; 2.956 2.957 - again: 2.958 - do { 2.959 + for ( ; ; ) 2.960 + { 2.961 x = y; 2.962 nx = x - 1; 2.963 2.964 @@ -1894,21 +2054,19 @@ void put_page_type(struct page_info *pag 2.965 if ( unlikely((nx & PGT_count_mask) == 0) ) 2.966 { 2.967 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) && 2.968 - likely(nx & PGT_validated) ) 2.969 + likely(nx & (PGT_validated|PGT_partial)) ) 2.970 { 2.971 /* 2.972 * Page-table pages must be unvalidated when count is zero. The 2.973 * 'free' is safe because the refcnt is non-zero and validated 2.974 * bit is clear => other ops will spin or fail. 2.975 */ 2.976 - if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 2.977 - x & ~PGT_validated)) != x) ) 2.978 - goto again; 2.979 + nx = x & ~(PGT_validated|PGT_partial); 2.980 + if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, 2.981 + x, nx)) != x) ) 2.982 + continue; 2.983 /* We cleared the 'valid bit' so we do the clean up. */ 2.984 - free_page_type(page, x); 2.985 - /* Carry on, but with the 'valid bit' now clear. */ 2.986 - x &= ~PGT_validated; 2.987 - nx &= ~PGT_validated; 2.988 + return free_page_type(page, x, preemptible); 2.989 } 2.990 2.991 /* 2.992 @@ -1922,25 +2080,33 @@ void put_page_type(struct page_info *pag 2.993 (page->count_info & PGC_page_table)) ) 2.994 page->tlbflush_timestamp = tlbflush_current_time(); 2.995 } 2.996 + 2.997 + if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) ) 2.998 + break; 2.999 + 2.1000 + if ( preemptible && hypercall_preempt_check() ) 2.1001 + return -EINTR; 2.1002 } 2.1003 - while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); 2.1004 + 2.1005 + return 0; 2.1006 } 2.1007 2.1008 2.1009 -int get_page_type(struct page_info *page, unsigned long type) 2.1010 +static int __get_page_type(struct page_info *page, unsigned long type, 2.1011 + int preemptible) 2.1012 { 2.1013 unsigned long nx, x, y = page->u.inuse.type_info; 2.1014 2.1015 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2))); 2.1016 2.1017 - again: 2.1018 - do { 2.1019 + for ( ; ; ) 2.1020 + { 2.1021 x = y; 2.1022 nx = x + 1; 2.1023 if ( unlikely((nx & PGT_count_mask) == 0) ) 2.1024 { 2.1025 MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page)); 2.1026 - return 0; 2.1027 + return -EINVAL; 2.1028 } 2.1029 else if ( unlikely((x & PGT_count_mask) == 0) ) 2.1030 { 2.1031 @@ -1993,28 +2159,43 @@ int get_page_type(struct page_info *page 2.1032 /* Don't log failure if it could be a recursive-mapping attempt. 
*/ 2.1033 if ( ((x & PGT_type_mask) == PGT_l2_page_table) && 2.1034 (type == PGT_l1_page_table) ) 2.1035 - return 0; 2.1036 + return -EINVAL; 2.1037 if ( ((x & PGT_type_mask) == PGT_l3_page_table) && 2.1038 (type == PGT_l2_page_table) ) 2.1039 - return 0; 2.1040 + return -EINVAL; 2.1041 if ( ((x & PGT_type_mask) == PGT_l4_page_table) && 2.1042 (type == PGT_l3_page_table) ) 2.1043 - return 0; 2.1044 + return -EINVAL; 2.1045 MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") " 2.1046 "for mfn %lx (pfn %lx)", 2.1047 x, type, page_to_mfn(page), 2.1048 get_gpfn_from_mfn(page_to_mfn(page))); 2.1049 - return 0; 2.1050 + return -EINVAL; 2.1051 } 2.1052 else if ( unlikely(!(x & PGT_validated)) ) 2.1053 { 2.1054 - /* Someone else is updating validation of this page. Wait... */ 2.1055 - while ( (y = page->u.inuse.type_info) == x ) 2.1056 - cpu_relax(); 2.1057 - goto again; 2.1058 + if ( !(x & PGT_partial) ) 2.1059 + { 2.1060 + /* Someone else is updating validation of this page. Wait... */ 2.1061 + while ( (y = page->u.inuse.type_info) == x ) 2.1062 + { 2.1063 + if ( preemptible && hypercall_preempt_check() ) 2.1064 + return -EINTR; 2.1065 + cpu_relax(); 2.1066 + } 2.1067 + continue; 2.1068 + } 2.1069 + /* Type ref count was left at 1 when PGT_partial got set. */ 2.1070 + ASSERT((x & PGT_count_mask) == 1); 2.1071 + nx = x & ~PGT_partial; 2.1072 } 2.1073 + 2.1074 + if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) ) 2.1075 + break; 2.1076 + 2.1077 + if ( preemptible && hypercall_preempt_check() ) 2.1078 + return -EINTR; 2.1079 } 2.1080 - while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); 2.1081 2.1082 if ( unlikely((x & PGT_type_mask) != type) ) 2.1083 { 2.1084 @@ -2032,25 +2213,42 @@ int get_page_type(struct page_info *page 2.1085 2.1086 if ( unlikely(!(nx & PGT_validated)) ) 2.1087 { 2.1088 - /* Try to validate page type; drop the new reference on failure. */ 2.1089 - if ( unlikely(!alloc_page_type(page, type)) ) 2.1090 + if ( !(x & PGT_partial) ) 2.1091 { 2.1092 - MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %" 2.1093 - PRtype_info ": caf=%08x taf=%" PRtype_info, 2.1094 - page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)), 2.1095 - type, page->count_info, page->u.inuse.type_info); 2.1096 - /* Noone else can get a reference. We hold the only ref. */ 2.1097 - page->u.inuse.type_info = 0; 2.1098 - return 0; 2.1099 + page->nr_validated_ptes = 0; 2.1100 + page->partial_pte = 0; 2.1101 } 2.1102 - 2.1103 - /* Noone else is updating simultaneously. 
*/ 2.1104 - __set_bit(_PGT_validated, &page->u.inuse.type_info); 2.1105 + return alloc_page_type(page, type, preemptible); 2.1106 } 2.1107 2.1108 - return 1; 2.1109 + return 0; 2.1110 +} 2.1111 + 2.1112 +void put_page_type(struct page_info *page) 2.1113 +{ 2.1114 + int rc = __put_page_type(page, 0); 2.1115 + ASSERT(rc == 0); 2.1116 + (void)rc; 2.1117 } 2.1118 2.1119 +int get_page_type(struct page_info *page, unsigned long type) 2.1120 +{ 2.1121 + int rc = __get_page_type(page, type, 0); 2.1122 + if ( likely(rc == 0) ) 2.1123 + return 1; 2.1124 + ASSERT(rc == -EINVAL); 2.1125 + return 0; 2.1126 +} 2.1127 + 2.1128 +int put_page_type_preemptible(struct page_info *page) 2.1129 +{ 2.1130 + return __put_page_type(page, 1); 2.1131 +} 2.1132 + 2.1133 +int get_page_type_preemptible(struct page_info *page, unsigned long type) 2.1134 +{ 2.1135 + return __get_page_type(page, type, 1); 2.1136 +} 2.1137 2.1138 void cleanup_page_cacheattr(struct page_info *page) 2.1139 { 2.1140 @@ -2087,7 +2285,7 @@ int new_guest_cr3(unsigned long mfn) 2.1141 l4e_from_pfn( 2.1142 mfn, 2.1143 (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)), 2.1144 - pagetable_get_pfn(v->arch.guest_table), 0); 2.1145 + pagetable_get_pfn(v->arch.guest_table), 0, 0) == 0; 2.1146 if ( unlikely(!okay) ) 2.1147 { 2.1148 MEM_LOG("Error while installing new compat baseptr %lx", mfn); 2.1149 @@ -2102,7 +2300,7 @@ int new_guest_cr3(unsigned long mfn) 2.1150 #endif 2.1151 okay = paging_mode_refcounts(d) 2.1152 ? get_page_from_pagenr(mfn, d) 2.1153 - : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d); 2.1154 + : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0); 2.1155 if ( unlikely(!okay) ) 2.1156 { 2.1157 MEM_LOG("Error while installing new baseptr %lx", mfn); 2.1158 @@ -2276,9 +2474,7 @@ int do_mmuext_op( 2.1159 { 2.1160 if ( hypercall_preempt_check() ) 2.1161 { 2.1162 - rc = hypercall_create_continuation( 2.1163 - __HYPERVISOR_mmuext_op, "hihi", 2.1164 - uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); 2.1165 + rc = -EAGAIN; 2.1166 break; 2.1167 } 2.1168 2.1169 @@ -2325,10 +2521,14 @@ int do_mmuext_op( 2.1170 if ( paging_mode_refcounts(FOREIGNDOM) ) 2.1171 break; 2.1172 2.1173 - okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM); 2.1174 + rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 1); 2.1175 + okay = !rc; 2.1176 if ( unlikely(!okay) ) 2.1177 { 2.1178 - MEM_LOG("Error while pinning mfn %lx", mfn); 2.1179 + if ( rc == -EINTR ) 2.1180 + rc = -EAGAIN; 2.1181 + else if ( rc != -EAGAIN ) 2.1182 + MEM_LOG("Error while pinning mfn %lx", mfn); 2.1183 break; 2.1184 } 2.1185 2.1186 @@ -2373,8 +2573,11 @@ int do_mmuext_op( 2.1187 { 2.1188 put_page_and_type(page); 2.1189 put_page(page); 2.1190 - /* A page is dirtied when its pin status is cleared. */ 2.1191 - paging_mark_dirty(d, mfn); 2.1192 + if ( !rc ) 2.1193 + { 2.1194 + /* A page is dirtied when its pin status is cleared. 
*/ 2.1195 + paging_mark_dirty(d, mfn); 2.1196 + } 2.1197 } 2.1198 else 2.1199 { 2.1200 @@ -2398,8 +2601,8 @@ int do_mmuext_op( 2.1201 if ( paging_mode_refcounts(d) ) 2.1202 okay = get_page_from_pagenr(mfn, d); 2.1203 else 2.1204 - okay = get_page_and_type_from_pagenr( 2.1205 - mfn, PGT_root_page_table, d); 2.1206 + okay = !get_page_and_type_from_pagenr( 2.1207 + mfn, PGT_root_page_table, d, 0); 2.1208 if ( unlikely(!okay) ) 2.1209 { 2.1210 MEM_LOG("Error while installing new mfn %lx", mfn); 2.1211 @@ -2517,6 +2720,11 @@ int do_mmuext_op( 2.1212 guest_handle_add_offset(uops, 1); 2.1213 } 2.1214 2.1215 + if ( rc == -EAGAIN ) 2.1216 + rc = hypercall_create_continuation( 2.1217 + __HYPERVISOR_mmuext_op, "hihi", 2.1218 + uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); 2.1219 + 2.1220 process_deferred_ops(); 2.1221 2.1222 perfc_add(num_mmuext_ops, i); 2.1223 @@ -2576,9 +2784,7 @@ int do_mmu_update( 2.1224 { 2.1225 if ( hypercall_preempt_check() ) 2.1226 { 2.1227 - rc = hypercall_create_continuation( 2.1228 - __HYPERVISOR_mmu_update, "hihi", 2.1229 - ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); 2.1230 + rc = -EAGAIN; 2.1231 break; 2.1232 } 2.1233 2.1234 @@ -2653,27 +2859,29 @@ int do_mmu_update( 2.1235 cmd == MMU_PT_UPDATE_PRESERVE_AD); 2.1236 } 2.1237 break; 2.1238 -#if CONFIG_PAGING_LEVELS >= 3 2.1239 case PGT_l3_page_table: 2.1240 { 2.1241 l3_pgentry_t l3e = l3e_from_intpte(req.val); 2.1242 - okay = mod_l3_entry(va, l3e, mfn, 2.1243 - cmd == MMU_PT_UPDATE_PRESERVE_AD); 2.1244 + rc = mod_l3_entry(va, l3e, mfn, 2.1245 + cmd == MMU_PT_UPDATE_PRESERVE_AD, 1); 2.1246 + okay = !rc; 2.1247 } 2.1248 break; 2.1249 -#endif 2.1250 #if CONFIG_PAGING_LEVELS >= 4 2.1251 case PGT_l4_page_table: 2.1252 { 2.1253 l4_pgentry_t l4e = l4e_from_intpte(req.val); 2.1254 - okay = mod_l4_entry(va, l4e, mfn, 2.1255 - cmd == MMU_PT_UPDATE_PRESERVE_AD); 2.1256 + rc = mod_l4_entry(va, l4e, mfn, 2.1257 + cmd == MMU_PT_UPDATE_PRESERVE_AD, 1); 2.1258 + okay = !rc; 2.1259 } 2.1260 break; 2.1261 #endif 2.1262 } 2.1263 2.1264 put_page_type(page); 2.1265 + if ( rc == -EINTR ) 2.1266 + rc = -EAGAIN; 2.1267 } 2.1268 break; 2.1269 2.1270 @@ -2742,6 +2950,11 @@ int do_mmu_update( 2.1271 guest_handle_add_offset(ureqs, 1); 2.1272 } 2.1273 2.1274 + if ( rc == -EAGAIN ) 2.1275 + rc = hypercall_create_continuation( 2.1276 + __HYPERVISOR_mmu_update, "hihi", 2.1277 + ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); 2.1278 + 2.1279 process_deferred_ops(); 2.1280 2.1281 domain_mmap_cache_destroy(&mapcache); 2.1282 @@ -3695,9 +3908,8 @@ static int ptwr_emulated_update( 2.1283 nl1e = l1e_from_intpte(val); 2.1284 if ( unlikely(!get_page_from_l1e(nl1e, d)) ) 2.1285 { 2.1286 - if ( (CONFIG_PAGING_LEVELS >= 3) && is_pv_32bit_domain(d) && 2.1287 - (bytes == 4) && (unaligned_addr & 4) && !do_cmpxchg && 2.1288 - (l1e_get_flags(nl1e) & _PAGE_PRESENT) ) 2.1289 + if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) && 2.1290 + !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) ) 2.1291 { 2.1292 /* 2.1293 * If this is an upper-half write to a PAE PTE then we assume that
--- a/xen/include/asm-x86/mm.h	Mon Sep 01 10:49:00 2008 +0100
+++ b/xen/include/asm-x86/mm.h	Mon Sep 01 10:52:05 2008 +0100
@@ -59,6 +59,17 @@ struct page_info
         u32 tlbflush_timestamp;
 
         /*
+         * When PGT_partial is true then this field is valid and indicates
+         * that PTEs in the range [0, @nr_validated_ptes) have been validated.
+         * If @partial_pte is true then PTE at @nr_validated_ptes+1 has been
+         * partially validated.
+         */
+        struct {
+            u16 nr_validated_ptes;
+            bool_t partial_pte;
+        };
+
+        /*
          * Guest pages with a shadow. This does not conflict with
          * tlbflush_timestamp since page table pages are explicitly not
          * tracked for TLB-flush avoidance when a guest runs in shadow mode.
@@ -86,9 +97,12 @@ struct page_info
 /* PAE only: is this an L2 page directory containing Xen-private mappings? */
 #define _PGT_pae_xen_l2 26
 #define PGT_pae_xen_l2 (1U<<_PGT_pae_xen_l2)
+/* Has this page been *partially* validated for use as its current type? */
+#define _PGT_partial 25
+#define PGT_partial (1U<<_PGT_partial)
 
- /* 26-bit count of uses of this frame as its current type. */
-#define PGT_count_mask ((1U<<26)-1)
+ /* 25-bit count of uses of this frame as its current type. */
+#define PGT_count_mask ((1U<<25)-1)
 
 /* Cleared when the owning guest 'frees' this page. */
 #define _PGC_allocated 31
@@ -154,7 +168,8 @@ extern unsigned long max_page;
 extern unsigned long total_pages;
 void init_frametable(void);
 
-void free_page_type(struct page_info *page, unsigned long type);
+int free_page_type(struct page_info *page, unsigned long type,
+                   int preemptible);
 int _shadow_mode_refcounts(struct domain *d);
 
 void cleanup_page_cacheattr(struct page_info *page);
@@ -165,6 +180,8 @@ void put_page(struct page_info *page);
 int get_page(struct page_info *page, struct domain *domain);
 void put_page_type(struct page_info *page);
 int get_page_type(struct page_info *page, unsigned long type);
+int put_page_type_preemptible(struct page_info *page);
+int get_page_type_preemptible(struct page_info *page, unsigned long type);
 int get_page_from_l1e(l1_pgentry_t l1e, struct domain *d);
 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d);
 
@@ -174,6 +191,19 @@ static inline void put_page_and_type(str
     put_page(page);
 }
 
+static inline int put_page_and_type_preemptible(struct page_info *page,
+                                                int preemptible)
+{
+    int rc = 0;
+
+    if ( preemptible )
+        rc = put_page_type_preemptible(page);
+    else
+        put_page_type(page);
+    if ( likely(rc == 0) )
+        put_page(page);
+    return rc;
+}
 
 static inline int get_page_and_type(struct page_info *page,
                                     struct domain *domain,