ia64/xen-unstable
changeset 10193:954f4dea9da6
[PAE] Allow pgdirs above 4GB for paravirt guests.
**NOTE**: This obviates the need for lowmem_emergency_pool.
Unpriv guests no longer need to be able to allocate memory
below 4GB for PAE PDPTs.
Signed-off-by: Keir Fraser <keir@xensource.com>
author | kaf24@firebug.cl.cam.ac.uk |
---|---|
date | Fri May 26 17:22:30 2006 +0100 (2006-05-26) |
parents | 3ed325fa395b |
children | c073ebdbde8c 71fa0e0d520c |
files | tools/libxc/xc_linux_build.c tools/libxc/xc_linux_restore.c tools/libxc/xc_private.c tools/libxc/xenctrl.h xen/arch/x86/domain_build.c xen/arch/x86/mm.c xen/common/kernel.c xen/include/asm-x86/domain.h |
line diff
1.1 --- a/tools/libxc/xc_linux_build.c Fri May 26 13:53:49 2006 +0100 1.2 +++ b/tools/libxc/xc_linux_build.c Fri May 26 17:22:30 2006 +0100 1.3 @@ -268,22 +268,11 @@ static int setup_pg_tables_pae(int xc_ha 1.4 l2_pgentry_64_t *vl2tab = NULL, *vl2e = NULL; 1.5 l3_pgentry_64_t *vl3tab = NULL, *vl3e = NULL; 1.6 uint64_t l1tab, l2tab, l3tab, pl1tab, pl2tab, pl3tab; 1.7 - unsigned long ppt_alloc, count, nmfn; 1.8 + unsigned long ppt_alloc, count; 1.9 1.10 /* First allocate page for page dir. */ 1.11 ppt_alloc = (vpt_start - dsi_v_start) >> PAGE_SHIFT; 1.12 1.13 - if ( page_array[ppt_alloc] > 0xfffff ) 1.14 - { 1.15 - nmfn = xc_make_page_below_4G(xc_handle, dom, page_array[ppt_alloc]); 1.16 - if ( nmfn == 0 ) 1.17 - { 1.18 - fprintf(stderr, "Couldn't get a page below 4GB :-(\n"); 1.19 - goto error_out; 1.20 - } 1.21 - page_array[ppt_alloc] = nmfn; 1.22 - } 1.23 - 1.24 alloc_pt(l3tab, vl3tab, pl3tab); 1.25 vl3e = &vl3tab[l3_table_offset_pae(dsi_v_start)]; 1.26 if (shadow_mode_enabled)
2.1 --- a/tools/libxc/xc_linux_restore.c Fri May 26 13:53:49 2006 +0100 2.2 +++ b/tools/libxc/xc_linux_restore.c Fri May 26 17:22:30 2006 +0100 2.3 @@ -331,25 +331,17 @@ int xc_linux_restore(int xc_handle, int 2.4 ** A page table page - need to 'uncanonicalize' it, i.e. 2.5 ** replace all the references to pfns with the corresponding 2.6 ** mfns for the new domain. 2.7 - ** 2.8 - ** On PAE we need to ensure that PGDs are in MFNs < 4G, and 2.9 - ** so we may need to update the p2m after the main loop. 2.10 - ** Hence we defer canonicalization of L1s until then. 2.11 */ 2.12 - if(pt_levels != 3 || pagetype != L1TAB) { 2.13 - 2.14 - if(!uncanonicalize_pagetable(pagetype, page)) { 2.15 - /* 2.16 - ** Failing to uncanonicalize a page table can be ok 2.17 - ** under live migration since the pages type may have 2.18 - ** changed by now (and we'll get an update later). 2.19 - */ 2.20 - DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n", 2.21 - pagetype >> 28, pfn, mfn); 2.22 - nraces++; 2.23 - continue; 2.24 - } 2.25 - 2.26 + if(!uncanonicalize_pagetable(pagetype, page)) { 2.27 + /* 2.28 + ** Failing to uncanonicalize a page table can be ok 2.29 + ** under live migration since the pages type may have 2.30 + ** changed by now (and we'll get an update later). 2.31 + */ 2.32 + DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n", 2.33 + pagetype >> 28, pfn, mfn); 2.34 + nraces++; 2.35 + continue; 2.36 } 2.37 2.38 } else if(pagetype != NOTAB) { 2.39 @@ -398,100 +390,6 @@ int xc_linux_restore(int xc_handle, int 2.40 2.41 DPRINTF("Received all pages (%d races)\n", nraces); 2.42 2.43 - if(pt_levels == 3) { 2.44 - 2.45 - /* 2.46 - ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This 2.47 - ** is a little awkward and involves (a) finding all such PGDs and 2.48 - ** replacing them with 'lowmem' versions; (b) upating the p2m[] 2.49 - ** with the new info; and (c) canonicalizing all the L1s using the 2.50 - ** (potentially updated) p2m[]. 
2.51 - ** 2.52 - ** This is relatively slow (and currently involves two passes through 2.53 - ** the pfn_type[] array), but at least seems to be correct. May wish 2.54 - ** to consider more complex approaches to optimize this later. 2.55 - */ 2.56 - 2.57 - int j, k; 2.58 - 2.59 - /* First pass: find all L3TABs current in > 4G mfns and get new mfns */ 2.60 - for (i = 0; i < max_pfn; i++) { 2.61 - 2.62 - if (((pfn_type[i] & LTABTYPE_MASK)==L3TAB) && (p2m[i]>0xfffffUL)) { 2.63 - 2.64 - unsigned long new_mfn; 2.65 - uint64_t l3ptes[4]; 2.66 - uint64_t *l3tab; 2.67 - 2.68 - l3tab = (uint64_t *) 2.69 - xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, 2.70 - PROT_READ, p2m[i]); 2.71 - 2.72 - for(j = 0; j < 4; j++) 2.73 - l3ptes[j] = l3tab[j]; 2.74 - 2.75 - munmap(l3tab, PAGE_SIZE); 2.76 - 2.77 - if (!(new_mfn=xc_make_page_below_4G(xc_handle, dom, p2m[i]))) { 2.78 - ERR("Couldn't get a page below 4GB :-("); 2.79 - goto out; 2.80 - } 2.81 - 2.82 - p2m[i] = new_mfn; 2.83 - if (xc_add_mmu_update(xc_handle, mmu, 2.84 - (((unsigned long long)new_mfn) 2.85 - << PAGE_SHIFT) | 2.86 - MMU_MACHPHYS_UPDATE, i)) { 2.87 - ERR("Couldn't m2p on PAE root pgdir"); 2.88 - goto out; 2.89 - } 2.90 - 2.91 - l3tab = (uint64_t *) 2.92 - xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, 2.93 - PROT_READ | PROT_WRITE, p2m[i]); 2.94 - 2.95 - for(j = 0; j < 4; j++) 2.96 - l3tab[j] = l3ptes[j]; 2.97 - 2.98 - munmap(l3tab, PAGE_SIZE); 2.99 - 2.100 - } 2.101 - } 2.102 - 2.103 - /* Second pass: find all L1TABs and uncanonicalize them */ 2.104 - j = 0; 2.105 - 2.106 - for(i = 0; i < max_pfn; i++) { 2.107 - 2.108 - if (((pfn_type[i] & LTABTYPE_MASK)==L1TAB)) { 2.109 - region_mfn[j] = p2m[i]; 2.110 - j++; 2.111 - } 2.112 - 2.113 - if(i == (max_pfn-1) || j == MAX_BATCH_SIZE) { 2.114 - 2.115 - if (!(region_base = xc_map_foreign_batch( 2.116 - xc_handle, dom, PROT_READ | PROT_WRITE, 2.117 - region_mfn, j))) { 2.118 - ERR("map batch failed"); 2.119 - goto out; 2.120 - } 2.121 - 2.122 - for(k = 0; k < j; k++) { 
2.123 - if(!uncanonicalize_pagetable(L1TAB, 2.124 - region_base + k*PAGE_SIZE)) { 2.125 - ERR("failed uncanonicalize pt!"); 2.126 - goto out; 2.127 - } 2.128 - } 2.129 - 2.130 - munmap(region_base, j*PAGE_SIZE); 2.131 - j = 0; 2.132 - } 2.133 - } 2.134 - 2.135 - } 2.136 - 2.137 2.138 if (xc_finish_mmu_updates(xc_handle, mmu)) { 2.139 ERR("Error doing finish_mmu_updates()");
3.1 --- a/tools/libxc/xc_private.c Fri May 26 13:53:49 2006 +0100 3.2 +++ b/tools/libxc/xc_private.c Fri May 26 17:22:30 2006 +0100 3.3 @@ -430,28 +430,6 @@ int xc_version(int xc_handle, int cmd, v 3.4 return rc; 3.5 } 3.6 3.7 -unsigned long xc_make_page_below_4G( 3.8 - int xc_handle, uint32_t domid, unsigned long mfn) 3.9 -{ 3.10 - unsigned long new_mfn; 3.11 - 3.12 - if ( xc_domain_memory_decrease_reservation( 3.13 - xc_handle, domid, 1, 0, &mfn) != 0 ) 3.14 - { 3.15 - fprintf(stderr,"xc_make_page_below_4G decrease failed. mfn=%lx\n",mfn); 3.16 - return 0; 3.17 - } 3.18 - 3.19 - if ( xc_domain_memory_increase_reservation( 3.20 - xc_handle, domid, 1, 0, 32, &new_mfn) != 0 ) 3.21 - { 3.22 - fprintf(stderr,"xc_make_page_below_4G increase failed. mfn=%lx\n",mfn); 3.23 - return 0; 3.24 - } 3.25 - 3.26 - return new_mfn; 3.27 -} 3.28 - 3.29 /* 3.30 * Local variables: 3.31 * mode: C
4.1 --- a/tools/libxc/xenctrl.h Fri May 26 13:53:49 2006 +0100 4.2 +++ b/tools/libxc/xenctrl.h Fri May 26 17:22:30 2006 +0100 4.3 @@ -453,9 +453,6 @@ int xc_domain_iomem_permission(int xc_ha 4.4 unsigned long nr_mfns, 4.5 uint8_t allow_access); 4.6 4.7 -unsigned long xc_make_page_below_4G(int xc_handle, uint32_t domid, 4.8 - unsigned long mfn); 4.9 - 4.10 typedef dom0_perfc_desc_t xc_perfc_desc_t; 4.11 /* IMPORTANT: The caller is responsible for mlock()'ing the @desc array. */ 4.12 int xc_perfc_control(int xc_handle,
5.1 --- a/xen/arch/x86/domain_build.c Fri May 26 13:53:49 2006 +0100 5.2 +++ b/xen/arch/x86/domain_build.c Fri May 26 17:22:30 2006 +0100 5.3 @@ -367,7 +367,10 @@ int construct_dom0(struct domain *d, 5.4 if ( (1UL << order) > nr_pages ) 5.5 panic("Domain 0 allocation is too small for kernel image.\n"); 5.6 5.7 - /* Allocate from DMA pool: PAE L3 table must be below 4GB boundary. */ 5.8 + /* 5.9 + * Allocate from DMA pool: on i386 this ensures that our low-memory 1:1 5.10 + * mapping covers the allocation. 5.11 + */ 5.12 if ( (page = alloc_domheap_pages(d, order, ALLOC_DOM_DMA)) == NULL ) 5.13 panic("Not enough RAM for domain 0 allocation.\n"); 5.14 alloc_spfn = page_to_mfn(page);
6.1 --- a/xen/arch/x86/mm.c Fri May 26 13:53:49 2006 +0100 6.2 +++ b/xen/arch/x86/mm.c Fri May 26 17:22:30 2006 +0100 6.3 @@ -260,9 +260,42 @@ void share_xen_page_with_privileged_gues 6.4 share_xen_page_with_guest(page, dom_xen, readonly); 6.5 } 6.6 6.7 +static void __write_ptbase(unsigned long mfn) 6.8 +{ 6.9 +#ifdef CONFIG_X86_PAE 6.10 + if ( mfn >= 0x100000 ) 6.11 + { 6.12 + l3_pgentry_t *highmem_l3tab, *lowmem_l3tab; 6.13 + struct vcpu *v = current; 6.14 + unsigned long flags; 6.15 + 6.16 + /* Protects against re-entry and against __pae_flush_pgd(). */ 6.17 + local_irq_save(flags); 6.18 + 6.19 + /* Pick an unused low-memory L3 cache slot. */ 6.20 + v->arch.lowmem_l3tab_inuse ^= 1; 6.21 + lowmem_l3tab = v->arch.lowmem_l3tab[v->arch.lowmem_l3tab_inuse]; 6.22 + v->arch.lowmem_l3tab_high_mfn[v->arch.lowmem_l3tab_inuse] = mfn; 6.23 + 6.24 + /* Map the guest L3 table and copy to the chosen low-memory cache. */ 6.25 + highmem_l3tab = map_domain_page(mfn); 6.26 + memcpy(lowmem_l3tab, highmem_l3tab, sizeof(v->arch.lowmem_l3tab)); 6.27 + unmap_domain_page(highmem_l3tab); 6.28 + 6.29 + /* Install the low-memory L3 table in CR3. */ 6.30 + write_cr3(__pa(lowmem_l3tab)); 6.31 + 6.32 + local_irq_restore(flags); 6.33 + return; 6.34 + } 6.35 +#endif 6.36 + 6.37 + write_cr3(mfn << PAGE_SHIFT); 6.38 +} 6.39 + 6.40 void write_ptbase(struct vcpu *v) 6.41 { 6.42 - write_cr3(pagetable_get_paddr(v->arch.monitor_table)); 6.43 + __write_ptbase(pagetable_get_pfn(v->arch.monitor_table)); 6.44 } 6.45 6.46 void invalidate_shadow_ldt(struct vcpu *v) 6.47 @@ -401,6 +434,7 @@ static int get_page_and_type_from_pagenr 6.48 return 1; 6.49 } 6.50 6.51 +#ifndef CONFIG_X86_PAE /* We do not support guest linear mappings on PAE. */ 6.52 /* 6.53 * We allow root tables to map each other (a.k.a. linear page tables). 
It 6.54 * needs some special care with reference counts and access permissions: 6.55 @@ -456,6 +490,7 @@ get_linear_pagetable( 6.56 6.57 return 1; 6.58 } 6.59 +#endif /* !CONFIG_X86_PAE */ 6.60 6.61 int 6.62 get_page_from_l1e( 6.63 @@ -564,10 +599,6 @@ get_page_from_l3e( 6.64 rc = get_page_and_type_from_pagenr( 6.65 l3e_get_pfn(l3e), 6.66 PGT_l2_page_table | vaddr, d); 6.67 -#if CONFIG_PAGING_LEVELS == 3 6.68 - if ( unlikely(!rc) ) 6.69 - rc = get_linear_pagetable(l3e, pfn, d); 6.70 -#endif 6.71 return rc; 6.72 } 6.73 #endif /* 3 level */ 6.74 @@ -773,6 +804,50 @@ static int create_pae_xen_mappings(l3_pg 6.75 return 1; 6.76 } 6.77 6.78 +struct pae_flush_pgd { 6.79 + unsigned long l3tab_mfn; 6.80 + unsigned int l3tab_idx; 6.81 + l3_pgentry_t nl3e; 6.82 +}; 6.83 + 6.84 +static void __pae_flush_pgd(void *data) 6.85 +{ 6.86 + struct pae_flush_pgd *args = data; 6.87 + struct vcpu *v = this_cpu(curr_vcpu); 6.88 + int i = v->arch.lowmem_l3tab_inuse; 6.89 + intpte_t _ol3e, _nl3e, _pl3e; 6.90 + l3_pgentry_t *l3tab_ptr; 6.91 + 6.92 + ASSERT(!local_irq_is_enabled()); 6.93 + 6.94 + if ( v->arch.lowmem_l3tab_high_mfn[i] != args->l3tab_mfn ) 6.95 + return; 6.96 + 6.97 + l3tab_ptr = &v->arch.lowmem_l3tab[i][args->l3tab_idx]; 6.98 + 6.99 + _ol3e = l3e_get_intpte(*l3tab_ptr); 6.100 + _nl3e = l3e_get_intpte(args->nl3e); 6.101 + _pl3e = cmpxchg((intpte_t *)l3tab_ptr, _ol3e, _nl3e); 6.102 + BUG_ON(_pl3e != _ol3e); 6.103 +} 6.104 + 6.105 +/* Flush a pgdir update into low-memory caches. */ 6.106 +static void pae_flush_pgd( 6.107 + unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e) 6.108 +{ 6.109 + struct domain *d = page_get_owner(mfn_to_page(mfn)); 6.110 + struct pae_flush_pgd args = { 6.111 + .l3tab_mfn = mfn, 6.112 + .l3tab_idx = idx, 6.113 + .nl3e = nl3e }; 6.114 + 6.115 + /* If below 4GB then the pgdir is not shadowed in low memory. 
*/ 6.116 + if ( mfn < 0x100000 ) 6.117 + return; 6.118 + 6.119 + on_selected_cpus(d->domain_dirty_cpumask, __pae_flush_pgd, &args, 1, 1); 6.120 +} 6.121 + 6.122 static inline int l1_backptr( 6.123 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type) 6.124 { 6.125 @@ -787,6 +862,7 @@ static inline int l1_backptr( 6.126 6.127 #elif CONFIG_X86_64 6.128 # define create_pae_xen_mappings(pl3e) (1) 6.129 +# define pae_flush_pgd(mfn, idx, nl3e) ((void)0) 6.130 6.131 static inline int l1_backptr( 6.132 unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type) 6.133 @@ -886,14 +962,6 @@ static int alloc_l3_table(struct page_in 6.134 6.135 ASSERT(!shadow_mode_refcounts(d)); 6.136 6.137 -#ifdef CONFIG_X86_PAE 6.138 - if ( pfn >= 0x100000 ) 6.139 - { 6.140 - MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn); 6.141 - return 0; 6.142 - } 6.143 -#endif 6.144 - 6.145 pl3e = map_domain_page(pfn); 6.146 for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ ) 6.147 { 6.148 @@ -1241,6 +1309,8 @@ static int mod_l3_entry(l3_pgentry_t *pl 6.149 okay = create_pae_xen_mappings(pl3e); 6.150 BUG_ON(!okay); 6.151 6.152 + pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e); 6.153 + 6.154 put_page_from_l3e(ol3e, pfn); 6.155 return 1; 6.156 } 6.157 @@ -3109,7 +3179,7 @@ void ptwr_flush(struct domain *d, const 6.158 6.159 if ( unlikely(d->arch.ptwr[which].vcpu != current) ) 6.160 /* Don't use write_ptbase: it may switch to guest_user on x86/64! */ 6.161 - write_cr3(pagetable_get_paddr( 6.162 + __write_ptbase(pagetable_get_pfn( 6.163 d->arch.ptwr[which].vcpu->arch.guest_table)); 6.164 else 6.165 TOGGLE_MODE();
7.1 --- a/xen/common/kernel.c Fri May 26 13:53:49 2006 +0100 7.2 +++ b/xen/common/kernel.c Fri May 26 17:22:30 2006 +0100 7.3 @@ -191,12 +191,11 @@ long do_xen_version(int cmd, XEN_GUEST_H 7.4 switch ( fi.submap_idx ) 7.5 { 7.6 case 0: 7.7 - fi.submap = 0; 7.8 + fi.submap = (1U << XENFEAT_pae_pgdir_above_4gb); 7.9 if ( shadow_mode_translate(current->domain) ) 7.10 fi.submap |= 7.11 (1U << XENFEAT_writable_page_tables) | 7.12 - (1U << XENFEAT_auto_translated_physmap) | 7.13 - (1U << XENFEAT_pae_pgdir_above_4gb); 7.14 + (1U << XENFEAT_auto_translated_physmap); 7.15 if ( supervisor_mode_kernel ) 7.16 fi.submap |= 1U << XENFEAT_supervisor_mode_kernel; 7.17 break;
8.1 --- a/xen/include/asm-x86/domain.h Fri May 26 13:53:49 2006 +0100 8.2 +++ b/xen/include/asm-x86/domain.h Fri May 26 17:22:30 2006 +0100 8.3 @@ -120,6 +120,18 @@ struct arch_vcpu 8.4 struct vcpu_guest_context guest_context 8.5 __attribute__((__aligned__(16))); 8.6 8.7 +#ifdef CONFIG_X86_PAE 8.8 + /* 8.9 + * Two low-memory (<4GB) PAE L3 tables, used as fallback when the guest 8.10 + * supplies a >=4GB PAE L3 table. We need two because we cannot set up 8.11 + * an L3 table while we are currently running on it (without using 8.12 + * expensive atomic 64-bit operations). 8.13 + */ 8.14 + l3_pgentry_t lowmem_l3tab[2][4] __attribute__((__aligned__(32))); 8.15 + unsigned long lowmem_l3tab_high_mfn[2]; /* The >=4GB MFN being shadowed. */ 8.16 + unsigned int lowmem_l3tab_inuse; /* Which lowmem_l3tab is in use? */ 8.17 +#endif 8.18 + 8.19 unsigned long flags; /* TF_ */ 8.20 8.21 void (*schedule_tail) (struct vcpu *);