ia64/xen-unstable

changeset 10193:954f4dea9da6

[PAE] Allow pgdirs above 4GB for paravirt guests.
**NOTE**: This obviates the need for lowmem_emergency_pool.
Unpriv guests no longer need to be able to allocate memory
below 4GB for PAE PDPTs.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Fri May 26 17:22:30 2006 +0100 (2006-05-26)
parents 3ed325fa395b
children c073ebdbde8c 71fa0e0d520c
files tools/libxc/xc_linux_build.c tools/libxc/xc_linux_restore.c tools/libxc/xc_private.c tools/libxc/xenctrl.h xen/arch/x86/domain_build.c xen/arch/x86/mm.c xen/common/kernel.c xen/include/asm-x86/domain.h
line diff
     1.1 --- a/tools/libxc/xc_linux_build.c	Fri May 26 13:53:49 2006 +0100
     1.2 +++ b/tools/libxc/xc_linux_build.c	Fri May 26 17:22:30 2006 +0100
     1.3 @@ -268,22 +268,11 @@ static int setup_pg_tables_pae(int xc_ha
     1.4      l2_pgentry_64_t *vl2tab = NULL, *vl2e = NULL;
     1.5      l3_pgentry_64_t *vl3tab = NULL, *vl3e = NULL;
     1.6      uint64_t l1tab, l2tab, l3tab, pl1tab, pl2tab, pl3tab;
     1.7 -    unsigned long ppt_alloc, count, nmfn;
     1.8 +    unsigned long ppt_alloc, count;
     1.9  
    1.10      /* First allocate page for page dir. */
    1.11      ppt_alloc = (vpt_start - dsi_v_start) >> PAGE_SHIFT;
    1.12  
    1.13 -    if ( page_array[ppt_alloc] > 0xfffff )
    1.14 -    {
    1.15 -        nmfn = xc_make_page_below_4G(xc_handle, dom, page_array[ppt_alloc]);
    1.16 -        if ( nmfn == 0 )
    1.17 -        {
    1.18 -            fprintf(stderr, "Couldn't get a page below 4GB :-(\n");
    1.19 -            goto error_out;
    1.20 -        }
    1.21 -        page_array[ppt_alloc] = nmfn;
    1.22 -    }
    1.23 -
    1.24      alloc_pt(l3tab, vl3tab, pl3tab);
    1.25      vl3e = &vl3tab[l3_table_offset_pae(dsi_v_start)];
    1.26      if (shadow_mode_enabled)
     2.1 --- a/tools/libxc/xc_linux_restore.c	Fri May 26 13:53:49 2006 +0100
     2.2 +++ b/tools/libxc/xc_linux_restore.c	Fri May 26 17:22:30 2006 +0100
     2.3 @@ -331,25 +331,17 @@ int xc_linux_restore(int xc_handle, int 
     2.4                  ** A page table page - need to 'uncanonicalize' it, i.e.
     2.5                  ** replace all the references to pfns with the corresponding
     2.6                  ** mfns for the new domain.
     2.7 -                **
     2.8 -                ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
     2.9 -                ** so we may need to update the p2m after the main loop.
    2.10 -                ** Hence we defer canonicalization of L1s until then.
    2.11                  */
    2.12 -                if(pt_levels != 3 || pagetype != L1TAB) {
    2.13 -
    2.14 -                    if(!uncanonicalize_pagetable(pagetype, page)) {
    2.15 -                        /*
    2.16 -                        ** Failing to uncanonicalize a page table can be ok
    2.17 -                        ** under live migration since the pages type may have
    2.18 -                        ** changed by now (and we'll get an update later).
    2.19 -                        */
    2.20 -                        DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
    2.21 -                                pagetype >> 28, pfn, mfn);
    2.22 -                        nraces++;
    2.23 -                        continue;
    2.24 -                    }
    2.25 -
    2.26 +                if(!uncanonicalize_pagetable(pagetype, page)) {
    2.27 +                    /*
    2.28 +                    ** Failing to uncanonicalize a page table can be ok
     2.29 +                    ** under live migration since the page's type may have
    2.30 +                    ** changed by now (and we'll get an update later).
    2.31 +                    */
    2.32 +                    DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
    2.33 +                            pagetype >> 28, pfn, mfn);
    2.34 +                    nraces++;
    2.35 +                    continue;
    2.36                  }
    2.37  
    2.38              } else if(pagetype != NOTAB) {
    2.39 @@ -398,100 +390,6 @@ int xc_linux_restore(int xc_handle, int 
    2.40  
    2.41      DPRINTF("Received all pages (%d races)\n", nraces);
    2.42  
    2.43 -    if(pt_levels == 3) {
    2.44 -
    2.45 -        /*
    2.46 -        ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This
    2.47 -        ** is a little awkward and involves (a) finding all such PGDs and
    2.48 -        ** replacing them with 'lowmem' versions; (b) upating the p2m[]
    2.49 -        ** with the new info; and (c) canonicalizing all the L1s using the
    2.50 -        ** (potentially updated) p2m[].
    2.51 -        **
    2.52 -        ** This is relatively slow (and currently involves two passes through
    2.53 -        ** the pfn_type[] array), but at least seems to be correct. May wish
    2.54 -        ** to consider more complex approaches to optimize this later.
    2.55 -        */
    2.56 -
    2.57 -        int j, k;
    2.58 -
    2.59 -        /* First pass: find all L3TABs current in > 4G mfns and get new mfns */
    2.60 -        for (i = 0; i < max_pfn; i++) {
    2.61 -
    2.62 -            if (((pfn_type[i] & LTABTYPE_MASK)==L3TAB) && (p2m[i]>0xfffffUL)) {
    2.63 -
    2.64 -                unsigned long new_mfn;
    2.65 -                uint64_t l3ptes[4];
    2.66 -                uint64_t *l3tab;
    2.67 -
    2.68 -                l3tab = (uint64_t *)
    2.69 -                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
    2.70 -                                         PROT_READ, p2m[i]);
    2.71 -
    2.72 -                for(j = 0; j < 4; j++)
    2.73 -                    l3ptes[j] = l3tab[j];
    2.74 -
    2.75 -                munmap(l3tab, PAGE_SIZE);
    2.76 -
    2.77 -                if (!(new_mfn=xc_make_page_below_4G(xc_handle, dom, p2m[i]))) {
    2.78 -                    ERR("Couldn't get a page below 4GB :-(");
    2.79 -                    goto out;
    2.80 -                }
    2.81 -
    2.82 -                p2m[i] = new_mfn;
    2.83 -                if (xc_add_mmu_update(xc_handle, mmu,
    2.84 -                                      (((unsigned long long)new_mfn)
    2.85 -                                       << PAGE_SHIFT) |
    2.86 -                                      MMU_MACHPHYS_UPDATE, i)) {
    2.87 -                    ERR("Couldn't m2p on PAE root pgdir");
    2.88 -                    goto out;
    2.89 -                }
    2.90 -
    2.91 -                l3tab = (uint64_t *)
    2.92 -                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
    2.93 -                                         PROT_READ | PROT_WRITE, p2m[i]);
    2.94 -
    2.95 -                for(j = 0; j < 4; j++)
    2.96 -                    l3tab[j] = l3ptes[j];
    2.97 -
    2.98 -                munmap(l3tab, PAGE_SIZE);
    2.99 -
   2.100 -            }
   2.101 -        }
   2.102 -
   2.103 -        /* Second pass: find all L1TABs and uncanonicalize them */
   2.104 -        j = 0;
   2.105 -
   2.106 -        for(i = 0; i < max_pfn; i++) {
   2.107 -
   2.108 -            if (((pfn_type[i] & LTABTYPE_MASK)==L1TAB)) {
   2.109 -                region_mfn[j] = p2m[i];
   2.110 -                j++;
   2.111 -            }
   2.112 -
   2.113 -            if(i == (max_pfn-1) || j == MAX_BATCH_SIZE) {
   2.114 -
   2.115 -                if (!(region_base = xc_map_foreign_batch(
   2.116 -                          xc_handle, dom, PROT_READ | PROT_WRITE,
   2.117 -                          region_mfn, j))) {
   2.118 -                    ERR("map batch failed");
   2.119 -                    goto out;
   2.120 -                }
   2.121 -
   2.122 -                for(k = 0; k < j; k++) {
   2.123 -                    if(!uncanonicalize_pagetable(L1TAB,
   2.124 -                                                 region_base + k*PAGE_SIZE)) {
   2.125 -                        ERR("failed uncanonicalize pt!");
   2.126 -                        goto out;
   2.127 -                    }
   2.128 -                }
   2.129 -
   2.130 -                munmap(region_base, j*PAGE_SIZE);
   2.131 -                j = 0;
   2.132 -            }
   2.133 -        }
   2.134 -
   2.135 -    }
   2.136 -
   2.137  
   2.138      if (xc_finish_mmu_updates(xc_handle, mmu)) {
   2.139          ERR("Error doing finish_mmu_updates()");
     3.1 --- a/tools/libxc/xc_private.c	Fri May 26 13:53:49 2006 +0100
     3.2 +++ b/tools/libxc/xc_private.c	Fri May 26 17:22:30 2006 +0100
     3.3 @@ -430,28 +430,6 @@ int xc_version(int xc_handle, int cmd, v
     3.4      return rc;
     3.5  }
     3.6  
     3.7 -unsigned long xc_make_page_below_4G(
     3.8 -    int xc_handle, uint32_t domid, unsigned long mfn)
     3.9 -{
    3.10 -    unsigned long new_mfn;
    3.11 -
    3.12 -    if ( xc_domain_memory_decrease_reservation(
    3.13 -        xc_handle, domid, 1, 0, &mfn) != 0 )
    3.14 -    {
    3.15 -        fprintf(stderr,"xc_make_page_below_4G decrease failed. mfn=%lx\n",mfn);
    3.16 -        return 0;
    3.17 -    }
    3.18 -
    3.19 -    if ( xc_domain_memory_increase_reservation(
    3.20 -        xc_handle, domid, 1, 0, 32, &new_mfn) != 0 )
    3.21 -    {
    3.22 -        fprintf(stderr,"xc_make_page_below_4G increase failed. mfn=%lx\n",mfn);
    3.23 -        return 0;
    3.24 -    }
    3.25 -
    3.26 -    return new_mfn;
    3.27 -}
    3.28 -
    3.29  /*
    3.30   * Local variables:
    3.31   * mode: C
     4.1 --- a/tools/libxc/xenctrl.h	Fri May 26 13:53:49 2006 +0100
     4.2 +++ b/tools/libxc/xenctrl.h	Fri May 26 17:22:30 2006 +0100
     4.3 @@ -453,9 +453,6 @@ int xc_domain_iomem_permission(int xc_ha
     4.4                                 unsigned long nr_mfns,
     4.5                                 uint8_t allow_access);
     4.6  
     4.7 -unsigned long xc_make_page_below_4G(int xc_handle, uint32_t domid,
     4.8 -                                    unsigned long mfn);
     4.9 -
    4.10  typedef dom0_perfc_desc_t xc_perfc_desc_t;
    4.11  /* IMPORTANT: The caller is responsible for mlock()'ing the @desc array. */
    4.12  int xc_perfc_control(int xc_handle,
     5.1 --- a/xen/arch/x86/domain_build.c	Fri May 26 13:53:49 2006 +0100
     5.2 +++ b/xen/arch/x86/domain_build.c	Fri May 26 17:22:30 2006 +0100
     5.3 @@ -367,7 +367,10 @@ int construct_dom0(struct domain *d,
     5.4      if ( (1UL << order) > nr_pages )
     5.5          panic("Domain 0 allocation is too small for kernel image.\n");
     5.6  
     5.7 -    /* Allocate from DMA pool: PAE L3 table must be below 4GB boundary. */
     5.8 +    /*
     5.9 +     * Allocate from DMA pool: on i386 this ensures that our low-memory 1:1
    5.10 +     * mapping covers the allocation.
    5.11 +     */
    5.12      if ( (page = alloc_domheap_pages(d, order, ALLOC_DOM_DMA)) == NULL )
    5.13          panic("Not enough RAM for domain 0 allocation.\n");
    5.14      alloc_spfn = page_to_mfn(page);
     6.1 --- a/xen/arch/x86/mm.c	Fri May 26 13:53:49 2006 +0100
     6.2 +++ b/xen/arch/x86/mm.c	Fri May 26 17:22:30 2006 +0100
     6.3 @@ -260,9 +260,42 @@ void share_xen_page_with_privileged_gues
     6.4      share_xen_page_with_guest(page, dom_xen, readonly);
     6.5  }
     6.6  
     6.7 +static void __write_ptbase(unsigned long mfn)
     6.8 +{
     6.9 +#ifdef CONFIG_X86_PAE
    6.10 +    if ( mfn >= 0x100000 )
    6.11 +    {
    6.12 +        l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
    6.13 +        struct vcpu *v = current;
    6.14 +        unsigned long flags;
    6.15 +
    6.16 +        /* Protects against re-entry and against __pae_flush_pgd(). */
    6.17 +        local_irq_save(flags);
    6.18 +
    6.19 +        /* Pick an unused low-memory L3 cache slot. */
    6.20 +        v->arch.lowmem_l3tab_inuse ^= 1;
    6.21 +        lowmem_l3tab = v->arch.lowmem_l3tab[v->arch.lowmem_l3tab_inuse];
    6.22 +        v->arch.lowmem_l3tab_high_mfn[v->arch.lowmem_l3tab_inuse] = mfn;
    6.23 +
    6.24 +        /* Map the guest L3 table and copy to the chosen low-memory cache. */
    6.25 +        highmem_l3tab = map_domain_page(mfn);
    6.26 +        memcpy(lowmem_l3tab, highmem_l3tab, sizeof(v->arch.lowmem_l3tab));
    6.27 +        unmap_domain_page(highmem_l3tab);
    6.28 +
    6.29 +        /* Install the low-memory L3 table in CR3. */
    6.30 +        write_cr3(__pa(lowmem_l3tab));
    6.31 +
    6.32 +        local_irq_restore(flags);
    6.33 +        return;
    6.34 +    }
    6.35 +#endif
    6.36 +
    6.37 +    write_cr3(mfn << PAGE_SHIFT);
    6.38 +}
    6.39 +
    6.40  void write_ptbase(struct vcpu *v)
    6.41  {
    6.42 -    write_cr3(pagetable_get_paddr(v->arch.monitor_table));
    6.43 +    __write_ptbase(pagetable_get_pfn(v->arch.monitor_table));
    6.44  }
    6.45  
    6.46  void invalidate_shadow_ldt(struct vcpu *v)
    6.47 @@ -401,6 +434,7 @@ static int get_page_and_type_from_pagenr
    6.48      return 1;
    6.49  }
    6.50  
    6.51 +#ifndef CONFIG_X86_PAE /* We do not support guest linear mappings on PAE. */
    6.52  /*
    6.53   * We allow root tables to map each other (a.k.a. linear page tables). It
    6.54   * needs some special care with reference counts and access permissions:
    6.55 @@ -456,6 +490,7 @@ get_linear_pagetable(
    6.56  
    6.57      return 1;
    6.58  }
    6.59 +#endif /* !CONFIG_X86_PAE */
    6.60  
    6.61  int
    6.62  get_page_from_l1e(
    6.63 @@ -564,10 +599,6 @@ get_page_from_l3e(
    6.64      rc = get_page_and_type_from_pagenr(
    6.65          l3e_get_pfn(l3e),
    6.66          PGT_l2_page_table | vaddr, d);
    6.67 -#if CONFIG_PAGING_LEVELS == 3
    6.68 -    if ( unlikely(!rc) )
    6.69 -        rc = get_linear_pagetable(l3e, pfn, d);
    6.70 -#endif
    6.71      return rc;
    6.72  }
    6.73  #endif /* 3 level */
    6.74 @@ -773,6 +804,50 @@ static int create_pae_xen_mappings(l3_pg
    6.75      return 1;
    6.76  }
    6.77  
    6.78 +struct pae_flush_pgd {
    6.79 +    unsigned long l3tab_mfn;
    6.80 +    unsigned int  l3tab_idx;
    6.81 +    l3_pgentry_t  nl3e;
    6.82 +};
    6.83 +
    6.84 +static void __pae_flush_pgd(void *data)
    6.85 +{
    6.86 +    struct pae_flush_pgd *args = data;
    6.87 +    struct vcpu *v = this_cpu(curr_vcpu);
    6.88 +    int i = v->arch.lowmem_l3tab_inuse;
    6.89 +    intpte_t _ol3e, _nl3e, _pl3e;
    6.90 +    l3_pgentry_t *l3tab_ptr;
    6.91 +
    6.92 +    ASSERT(!local_irq_is_enabled());
    6.93 +
    6.94 +    if ( v->arch.lowmem_l3tab_high_mfn[i] != args->l3tab_mfn )
    6.95 +        return;
    6.96 +
    6.97 +    l3tab_ptr = &v->arch.lowmem_l3tab[i][args->l3tab_idx];
    6.98 +
    6.99 +    _ol3e = l3e_get_intpte(*l3tab_ptr);
   6.100 +    _nl3e = l3e_get_intpte(args->nl3e);
   6.101 +    _pl3e = cmpxchg((intpte_t *)l3tab_ptr, _ol3e, _nl3e);
   6.102 +    BUG_ON(_pl3e != _ol3e);
   6.103 +}
   6.104 +
   6.105 +/* Flush a pgdir update into low-memory caches. */
   6.106 +static void pae_flush_pgd(
   6.107 +    unsigned long mfn, unsigned int idx, l3_pgentry_t nl3e)
   6.108 +{
   6.109 +    struct domain *d = page_get_owner(mfn_to_page(mfn));
   6.110 +    struct pae_flush_pgd args = {
   6.111 +        .l3tab_mfn = mfn,
   6.112 +        .l3tab_idx = idx,
   6.113 +        .nl3e      = nl3e };
   6.114 +
   6.115 +    /* If below 4GB then the pgdir is not shadowed in low memory. */
   6.116 +    if ( mfn < 0x100000 )
   6.117 +        return;
   6.118 +
   6.119 +    on_selected_cpus(d->domain_dirty_cpumask, __pae_flush_pgd, &args, 1, 1);
   6.120 +}
   6.121 +
   6.122  static inline int l1_backptr(
   6.123      unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
   6.124  {
   6.125 @@ -787,6 +862,7 @@ static inline int l1_backptr(
   6.126  
   6.127  #elif CONFIG_X86_64
   6.128  # define create_pae_xen_mappings(pl3e) (1)
   6.129 +# define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
   6.130  
   6.131  static inline int l1_backptr(
   6.132      unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type)
   6.133 @@ -886,14 +962,6 @@ static int alloc_l3_table(struct page_in
   6.134  
   6.135      ASSERT(!shadow_mode_refcounts(d));
   6.136  
   6.137 -#ifdef CONFIG_X86_PAE
   6.138 -    if ( pfn >= 0x100000 )
   6.139 -    {
   6.140 -        MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
   6.141 -        return 0;
   6.142 -    }
   6.143 -#endif
   6.144 -
   6.145      pl3e = map_domain_page(pfn);
   6.146      for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
   6.147      {
   6.148 @@ -1241,6 +1309,8 @@ static int mod_l3_entry(l3_pgentry_t *pl
   6.149      okay = create_pae_xen_mappings(pl3e);
   6.150      BUG_ON(!okay);
   6.151  
   6.152 +    pae_flush_pgd(pfn, pgentry_ptr_to_slot(pl3e), nl3e);
   6.153 +
   6.154      put_page_from_l3e(ol3e, pfn);
   6.155      return 1;
   6.156  }
   6.157 @@ -3109,7 +3179,7 @@ void ptwr_flush(struct domain *d, const 
   6.158  
   6.159      if ( unlikely(d->arch.ptwr[which].vcpu != current) )
   6.160          /* Don't use write_ptbase: it may switch to guest_user on x86/64! */
   6.161 -        write_cr3(pagetable_get_paddr(
   6.162 +        __write_ptbase(pagetable_get_pfn(
   6.163              d->arch.ptwr[which].vcpu->arch.guest_table));
   6.164      else
   6.165          TOGGLE_MODE();
     7.1 --- a/xen/common/kernel.c	Fri May 26 13:53:49 2006 +0100
     7.2 +++ b/xen/common/kernel.c	Fri May 26 17:22:30 2006 +0100
     7.3 @@ -191,12 +191,11 @@ long do_xen_version(int cmd, XEN_GUEST_H
     7.4          switch ( fi.submap_idx )
     7.5          {
     7.6          case 0:
     7.7 -            fi.submap = 0;
     7.8 +            fi.submap = (1U << XENFEAT_pae_pgdir_above_4gb);
     7.9              if ( shadow_mode_translate(current->domain) )
    7.10                  fi.submap |= 
    7.11                      (1U << XENFEAT_writable_page_tables) |
    7.12 -                    (1U << XENFEAT_auto_translated_physmap) |
    7.13 -                    (1U << XENFEAT_pae_pgdir_above_4gb);
    7.14 +                    (1U << XENFEAT_auto_translated_physmap);
    7.15              if ( supervisor_mode_kernel )
    7.16                  fi.submap |= 1U << XENFEAT_supervisor_mode_kernel;
    7.17              break;
     8.1 --- a/xen/include/asm-x86/domain.h	Fri May 26 13:53:49 2006 +0100
     8.2 +++ b/xen/include/asm-x86/domain.h	Fri May 26 17:22:30 2006 +0100
     8.3 @@ -120,6 +120,18 @@ struct arch_vcpu
     8.4      struct vcpu_guest_context guest_context
     8.5      __attribute__((__aligned__(16)));
     8.6  
     8.7 +#ifdef CONFIG_X86_PAE
     8.8 +    /*
     8.9 +     * Two low-memory (<4GB) PAE L3 tables, used as fallback when the guest
    8.10 +     * supplies a >=4GB PAE L3 table. We need two because we cannot set up
    8.11 +     * an L3 table while we are currently running on it (without using
    8.12 +     * expensive atomic 64-bit operations).
    8.13 +     */
    8.14 +    l3_pgentry_t  lowmem_l3tab[2][4] __attribute__((__aligned__(32)));
    8.15 +    unsigned long lowmem_l3tab_high_mfn[2]; /* The >=4GB MFN being shadowed. */
    8.16 +    unsigned int  lowmem_l3tab_inuse;       /* Which lowmem_l3tab is in use? */
    8.17 +#endif
    8.18 +
    8.19      unsigned long      flags; /* TF_ */
    8.20  
    8.21      void (*schedule_tail) (struct vcpu *);