ia64/xen-unstable

changeset 10304:8aca850f66ad

Create new vmassist type 'pae_extended_cr3'. Only advertise
pae_pgdir_above_4gb to guests that have enabled this vmassist.
Control tools ensure all PAE page directories are below 4GB
unless the vmassist is enabled (triggered via an extended-cr3
option in guest Elf header).
Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Mon Jun 05 10:42:40 2006 +0100 (2006-06-05)
parents c9e6255cb44a
children 47412b44e35e
files tools/libxc/xc_linux_build.c tools/libxc/xc_linux_restore.c tools/libxc/xc_linux_save.c tools/libxc/xc_load_elf.c tools/libxc/xc_private.c tools/libxc/xenctrl.h tools/libxc/xg_private.h xen/arch/x86/domain_build.c xen/arch/x86/mm.c xen/common/kernel.c xen/common/keyhandler.c xen/include/public/xen.h
line diff
     1.1 --- a/tools/libxc/xc_linux_build.c	Fri Jun 02 19:14:44 2006 +0100
     1.2 +++ b/tools/libxc/xc_linux_build.c	Mon Jun 05 10:42:40 2006 +0100
     1.3 @@ -254,17 +254,33 @@ static int setup_pg_tables_pae(int xc_ha
     1.4                                 unsigned long *page_array,
     1.5                                 unsigned long vpt_start,
     1.6                                 unsigned long vpt_end,
     1.7 -                               unsigned shadow_mode_enabled)
     1.8 +                               unsigned shadow_mode_enabled,
     1.9 +                               unsigned pae_mode)
    1.10  {
    1.11      l1_pgentry_64_t *vl1tab = NULL, *vl1e = NULL;
    1.12      l2_pgentry_64_t *vl2tab = NULL, *vl2e = NULL;
    1.13      l3_pgentry_64_t *vl3tab = NULL, *vl3e = NULL;
    1.14      uint64_t l1tab, l2tab, l3tab, pl1tab, pl2tab, pl3tab;
    1.15 -    unsigned long ppt_alloc, count;
    1.16 +    unsigned long ppt_alloc, count, nmfn;
    1.17  
    1.18      /* First allocate page for page dir. */
    1.19      ppt_alloc = (vpt_start - dsi_v_start) >> PAGE_SHIFT;
    1.20  
    1.21 +    if ( pae_mode == PAEKERN_extended_cr3 )
    1.22 +    {
    1.23 +        ctxt->vm_assist |= (1UL << VMASST_TYPE_pae_extended_cr3);
    1.24 +    }
    1.25 +    else if ( page_array[ppt_alloc] > 0xfffff )
    1.26 +    {
    1.27 +        nmfn = xc_make_page_below_4G(xc_handle, dom, page_array[ppt_alloc]);
    1.28 +        if ( nmfn == 0 )
    1.29 +        {
    1.30 +            fprintf(stderr, "Couldn't get a page below 4GB :-(\n");
    1.31 +            goto error_out;
    1.32 +        }
    1.33 +        page_array[ppt_alloc] = nmfn;
    1.34 +    }
    1.35 +
    1.36      alloc_pt(l3tab, vl3tab, pl3tab);
    1.37      vl3e = &vl3tab[l3_table_offset_pae(dsi_v_start)];
    1.38      if (shadow_mode_enabled)
    1.39 @@ -579,11 +595,11 @@ static int compat_check(int xc_handle, s
    1.40      }
    1.41  
    1.42      if (strstr(xen_caps, "xen-3.0-x86_32p")) {
    1.43 -        if (!dsi->pae_kernel) {
    1.44 +        if (dsi->pae_kernel == PAEKERN_no) {
    1.45              ERROR("Non PAE-kernel on PAE host.");
    1.46              return 0;
    1.47          }
    1.48 -    } else if (dsi->pae_kernel) {
    1.49 +    } else if (dsi->pae_kernel != PAEKERN_no) {
    1.50          ERROR("PAE-kernel on non-PAE host.");
    1.51          return 0;
    1.52      }
    1.53 @@ -673,7 +689,8 @@ static int setup_guest(int xc_handle,
    1.54  
    1.55      for ( i = 0; i < XENFEAT_NR_SUBMAPS; i++ )
    1.56      {
    1.57 -        if ( (supported_features[i]&required_features[i]) != required_features[i] )
    1.58 +        if ( (supported_features[i] & required_features[i]) !=
    1.59 +             required_features[i] )
    1.60          {
    1.61              ERROR("Guest kernel does not support a required feature.");
    1.62              goto error_out;
    1.63 @@ -719,7 +736,7 @@ static int setup_guest(int xc_handle,
    1.64      (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
    1.65      ((_l) & ~((1UL<<(_s))-1))) >> (_s))
    1.66  #if defined(__i386__)
    1.67 -        if ( dsi.pae_kernel )
    1.68 +        if ( dsi.pae_kernel != PAEKERN_no )
    1.69          {
    1.70              if ( (1 + /* # L3 */
    1.71                    NR(dsi.v_start, v_end, L3_PAGETABLE_SHIFT_PAE) + /* # L2 */
    1.72 @@ -797,11 +814,11 @@ static int setup_guest(int xc_handle,
    1.73  
    1.74      /* setup page tables */
    1.75  #if defined(__i386__)
    1.76 -    if (dsi.pae_kernel)
    1.77 +    if (dsi.pae_kernel != PAEKERN_no)
    1.78          rc = setup_pg_tables_pae(xc_handle, dom, ctxt,
    1.79                                   dsi.v_start, v_end,
    1.80                                   page_array, vpt_start, vpt_end,
    1.81 -                                 shadow_mode_enabled);
    1.82 +                                 shadow_mode_enabled, dsi.pae_kernel);
    1.83      else
    1.84          rc = setup_pg_tables(xc_handle, dom, ctxt,
    1.85                               dsi.v_start, v_end,
    1.86 @@ -824,7 +841,7 @@ static int setup_guest(int xc_handle,
    1.87       */
    1.88      if ( !shadow_mode_enabled )
    1.89      {
    1.90 -        if ( dsi.pae_kernel )
    1.91 +        if ( dsi.pae_kernel != PAEKERN_no )
    1.92          {
    1.93              if ( pin_table(xc_handle, MMUEXT_PIN_L3_TABLE,
    1.94                             xen_cr3_to_pfn(ctxt->ctrlreg[3]), dom) )
    1.95 @@ -958,7 +975,7 @@ static int setup_guest(int xc_handle,
    1.96      rc = xc_version(xc_handle, XENVER_version, NULL);
    1.97      sprintf(start_info->magic, "xen-%i.%i-x86_%d%s",
    1.98              rc >> 16, rc & (0xFFFF), (unsigned int)sizeof(long)*8,
    1.99 -            dsi.pae_kernel ? "p" : "");
   1.100 +            (dsi.pae_kernel != PAEKERN_no) ? "p" : "");
   1.101      start_info->nr_pages     = nr_pages;
   1.102      start_info->shared_info  = guest_shared_info_mfn << PAGE_SHIFT;
   1.103      start_info->flags        = flags;
     2.1 --- a/tools/libxc/xc_linux_restore.c	Fri Jun 02 19:14:44 2006 +0100
     2.2 +++ b/tools/libxc/xc_linux_restore.c	Mon Jun 05 10:42:40 2006 +0100
     2.3 @@ -108,7 +108,7 @@ int xc_linux_restore(int xc_handle, int 
     2.4                       unsigned int console_evtchn, unsigned long *console_mfn)
     2.5  {
     2.6      DECLARE_DOM0_OP;
     2.7 -    int rc = 1, i, n;
     2.8 +    int rc = 1, i, n, pae_extended_cr3 = 0;
     2.9      unsigned long mfn, pfn;
    2.10      unsigned int prev_pc, this_pc;
    2.11      int verify = 0;
    2.12 @@ -162,26 +162,84 @@ int xc_linux_restore(int xc_handle, int 
    2.13          return 1;
    2.14      }
    2.15  
    2.16 -
    2.17      if (mlock(&ctxt, sizeof(ctxt))) {
    2.18          /* needed for build dom0 op, but might as well do early */
    2.19          ERR("Unable to mlock ctxt");
    2.20          return 1;
    2.21      }
    2.22  
    2.23 -
    2.24 -    /* Read the saved P2M frame list */
    2.25 -    if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
    2.26 +    if (!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
    2.27          ERR("Couldn't allocate p2m_frame_list array");
    2.28          goto out;
    2.29      }
    2.30  
    2.31 -    if (!read_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) {
    2.32 +    /* Read first entry of P2M list, or extended-info signature (~0UL). */
    2.33 +    if (!read_exact(io_fd, p2m_frame_list, sizeof(long))) {
    2.34 +        ERR("read extended-info signature failed");
    2.35 +        goto out;
    2.36 +    }
    2.37 +
    2.38 +    if (p2m_frame_list[0] == ~0UL) {
    2.39 +        uint32_t tot_bytes;
    2.40 +
    2.41 +        /* Next 4 bytes: total size of following extended info. */
    2.42 +        if (!read_exact(io_fd, &tot_bytes, sizeof(tot_bytes))) {
    2.43 +            ERR("read extended-info size failed");
    2.44 +            goto out;
    2.45 +        }
    2.46 +
    2.47 +        while (tot_bytes) {
    2.48 +            uint32_t chunk_bytes;
    2.49 +            char     chunk_sig[4];
    2.50 +
    2.51 +            /* 4-character chunk signature + 4-byte remaining chunk size. */
    2.52 +            if (!read_exact(io_fd, chunk_sig, sizeof(chunk_sig)) ||
    2.53 +                !read_exact(io_fd, &chunk_bytes, sizeof(chunk_bytes))) {
    2.54 +                ERR("read extended-info chunk signature failed");
    2.55 +                goto out;
    2.56 +            }
    2.57 +            tot_bytes -= 8;
    2.58 +
    2.59 +            /* VCPU context structure? */
    2.60 +            if (!strncmp(chunk_sig, "vcpu", 4)) {
    2.61 +                if (!read_exact(io_fd, &ctxt, sizeof(ctxt))) {
    2.62 +                    ERR("read extended-info vcpu context failed");
    2.63 +                    goto out;
    2.64 +                }
    2.65 +                tot_bytes   -= sizeof(struct vcpu_guest_context);
    2.66 +                chunk_bytes -= sizeof(struct vcpu_guest_context);
    2.67 +
    2.68 +                if (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3))
    2.69 +                    pae_extended_cr3 = 1;
    2.70 +            }
    2.71 +
    2.72 +            /* Any remaining bytes of this chunk: read and discard. */
    2.73 +            while (chunk_bytes) {
    2.74 +                unsigned long sz = chunk_bytes;
    2.75 +                if ( sz > P2M_FL_SIZE )
    2.76 +                    sz = P2M_FL_SIZE;
    2.77 +                if (!read_exact(io_fd, p2m_frame_list, sz)) {
    2.78 +                    ERR("read-and-discard extended-info chunk bytes failed");
    2.79 +                    goto out;
    2.80 +                }
    2.81 +                chunk_bytes -= sz;
    2.82 +                tot_bytes   -= sz;
    2.83 +            }
    2.84 +        }
    2.85 +
    2.86 +        /* Now read the real first entry of P2M list. */
    2.87 +        if (!read_exact(io_fd, p2m_frame_list, sizeof(long))) {
    2.88 +            ERR("read first entry of p2m_frame_list failed");
    2.89 +            goto out;
    2.90 +        }
    2.91 +    }
    2.92 +
    2.93 +    /* First entry is already read into the p2m array. */
    2.94 +    if (!read_exact(io_fd, &p2m_frame_list[1], P2M_FL_SIZE - sizeof(long))) {
    2.95          ERR("read p2m_frame_list failed");
    2.96          goto out;
    2.97      }
    2.98  
    2.99 -
   2.100      /* We want zeroed memory so use calloc rather than malloc. */
   2.101      p2m        = calloc(max_pfn, sizeof(unsigned long));
   2.102      pfn_type   = calloc(max_pfn, sizeof(unsigned long));
   2.103 @@ -331,17 +389,27 @@ int xc_linux_restore(int xc_handle, int 
   2.104                  ** A page table page - need to 'uncanonicalize' it, i.e.
   2.105                  ** replace all the references to pfns with the corresponding
   2.106                  ** mfns for the new domain.
   2.107 +                **
   2.108 +                ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
   2.109 +                ** so we may need to update the p2m after the main loop.
   2.110 +                ** Hence we defer canonicalization of L1s until then.
   2.111                  */
   2.112 -                if(!uncanonicalize_pagetable(pagetype, page)) {
   2.113 -                    /*
   2.114 -                    ** Failing to uncanonicalize a page table can be ok
   2.115 -                    ** under live migration since the pages type may have
   2.116 -                    ** changed by now (and we'll get an update later).
   2.117 -                    */
   2.118 -                    DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
   2.119 -                            pagetype >> 28, pfn, mfn);
   2.120 -                    nraces++;
   2.121 -                    continue;
   2.122 +                if ((pt_levels != 3) ||
   2.123 +                    pae_extended_cr3 ||
   2.124 +                    (pagetype != L1TAB)) {
   2.125 +
   2.126 +                    if (!uncanonicalize_pagetable(pagetype, page)) {
   2.127 +                        /*
   2.128 +                        ** Failing to uncanonicalize a page table can be ok
   2.129 +                        ** under live migration since the pages type may have
   2.130 +                        ** changed by now (and we'll get an update later).
   2.131 +                        */
   2.132 +                        DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
   2.133 +                                pagetype >> 28, pfn, mfn);
   2.134 +                        nraces++;
   2.135 +                        continue;
   2.136 +                    }
   2.137 +
   2.138                  }
   2.139  
   2.140              } else if(pagetype != NOTAB) {
   2.141 @@ -390,6 +458,100 @@ int xc_linux_restore(int xc_handle, int 
   2.142  
   2.143      DPRINTF("Received all pages (%d races)\n", nraces);
   2.144  
   2.145 +    if ((pt_levels == 3) && !pae_extended_cr3) {
   2.146 +
   2.147 +        /*
   2.148 +        ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This
   2.149 +        ** is a little awkward and involves (a) finding all such PGDs and
   2.150 +        ** replacing them with 'lowmem' versions; (b) updating the p2m[]
   2.151 +        ** with the new info; and (c) canonicalizing all the L1s using the
   2.152 +        ** (potentially updated) p2m[].
   2.153 +        **
   2.154 +        ** This is relatively slow (and currently involves two passes through
   2.155 +        ** the pfn_type[] array), but at least seems to be correct. May wish
   2.156 +        ** to consider more complex approaches to optimize this later.
   2.157 +        */
   2.158 +
   2.159 +        int j, k;
   2.160 +
   2.161 +        /* First pass: find all L3TABs current in > 4G mfns and get new mfns */
   2.162 +        for (i = 0; i < max_pfn; i++) {
   2.163 +
   2.164 +            if (((pfn_type[i] & LTABTYPE_MASK)==L3TAB) && (p2m[i]>0xfffffUL)) {
   2.165 +
   2.166 +                unsigned long new_mfn;
   2.167 +                uint64_t l3ptes[4];
   2.168 +                uint64_t *l3tab;
   2.169 +
   2.170 +                l3tab = (uint64_t *)
   2.171 +                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
   2.172 +                                         PROT_READ, p2m[i]);
   2.173 +
   2.174 +                for(j = 0; j < 4; j++)
   2.175 +                    l3ptes[j] = l3tab[j];
   2.176 +
   2.177 +                munmap(l3tab, PAGE_SIZE);
   2.178 +
   2.179 +                if (!(new_mfn=xc_make_page_below_4G(xc_handle, dom, p2m[i]))) {
   2.180 +                    ERR("Couldn't get a page below 4GB :-(");
   2.181 +                    goto out;
   2.182 +                }
   2.183 +
   2.184 +                p2m[i] = new_mfn;
   2.185 +                if (xc_add_mmu_update(xc_handle, mmu,
   2.186 +                                      (((unsigned long long)new_mfn)
   2.187 +                                       << PAGE_SHIFT) |
   2.188 +                                      MMU_MACHPHYS_UPDATE, i)) {
   2.189 +                    ERR("Couldn't m2p on PAE root pgdir");
   2.190 +                    goto out;
   2.191 +                }
   2.192 +
   2.193 +                l3tab = (uint64_t *)
   2.194 +                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
   2.195 +                                         PROT_READ | PROT_WRITE, p2m[i]);
   2.196 +
   2.197 +                for(j = 0; j < 4; j++)
   2.198 +                    l3tab[j] = l3ptes[j];
   2.199 +
   2.200 +                munmap(l3tab, PAGE_SIZE);
   2.201 +
   2.202 +            }
   2.203 +        }
   2.204 +
   2.205 +        /* Second pass: find all L1TABs and uncanonicalize them */
   2.206 +        j = 0;
   2.207 +
   2.208 +        for(i = 0; i < max_pfn; i++) {
   2.209 +
   2.210 +            if (((pfn_type[i] & LTABTYPE_MASK)==L1TAB)) {
   2.211 +                region_mfn[j] = p2m[i];
   2.212 +                j++;
   2.213 +            }
   2.214 +
   2.215 +            if(i == (max_pfn-1) || j == MAX_BATCH_SIZE) {
   2.216 +
   2.217 +                if (!(region_base = xc_map_foreign_batch(
   2.218 +                          xc_handle, dom, PROT_READ | PROT_WRITE,
   2.219 +                          region_mfn, j))) {
   2.220 +                    ERR("map batch failed");
   2.221 +                    goto out;
   2.222 +                }
   2.223 +
   2.224 +                for(k = 0; k < j; k++) {
   2.225 +                    if(!uncanonicalize_pagetable(L1TAB,
   2.226 +                                                 region_base + k*PAGE_SIZE)) {
   2.227 +                        ERR("failed uncanonicalize pt!");
   2.228 +                        goto out;
   2.229 +                    }
   2.230 +                }
   2.231 +
   2.232 +                munmap(region_base, j*PAGE_SIZE);
   2.233 +                j = 0;
   2.234 +            }
   2.235 +        }
   2.236 +
   2.237 +    }
   2.238 +
   2.239  
   2.240      if (xc_finish_mmu_updates(xc_handle, mmu)) {
   2.241          ERR("Error doing finish_mmu_updates()");
     3.1 --- a/tools/libxc/xc_linux_save.c	Fri Jun 02 19:14:44 2006 +0100
     3.2 +++ b/tools/libxc/xc_linux_save.c	Mon Jun 05 10:42:40 2006 +0100
     3.3 @@ -818,12 +818,33 @@ int xc_linux_save(int xc_handle, int io_
     3.4  
     3.5      /* Start writing out the saved-domain record. */
     3.6  
     3.7 -    if(!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) {
     3.8 +    if (!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) {
     3.9          ERR("write: max_pfn");
    3.10          goto out;
    3.11      }
    3.12  
    3.13 -    if(!write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) {
    3.14 +    /*
    3.15 +     * Write an extended-info structure to inform the restore code that
    3.16 +     * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
    3.17 +     * slow paths in the restore code.
    3.18 +     */
    3.19 +    if ((pt_levels == 3) &&
    3.20 +        (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3))) {
    3.21 +        unsigned long signature = ~0UL;
    3.22 +        uint32_t tot_sz   = sizeof(struct vcpu_guest_context) + 8;
    3.23 +        uint32_t chunk_sz = sizeof(struct vcpu_guest_context);
    3.24 +        char chunk_sig[]  = "vcpu";
    3.25 +        if (!write_exact(io_fd, &signature, sizeof(signature)) ||
    3.26 +            !write_exact(io_fd, &tot_sz,    sizeof(tot_sz)) ||
    3.27 +            !write_exact(io_fd, &chunk_sig, 4) ||
    3.28 +            !write_exact(io_fd, &chunk_sz,  sizeof(chunk_sz)) ||
    3.29 +            !write_exact(io_fd, &ctxt,      sizeof(ctxt))) {
    3.30 +            ERR("write: extended info");
    3.31 +            goto out;
    3.32 +        }
    3.33 +    }
    3.34 +
    3.35 +    if (!write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) {
    3.36          ERR("write: p2m_frame_list");
    3.37          goto out;
    3.38      }
     4.1 --- a/tools/libxc/xc_load_elf.c	Fri Jun 02 19:14:44 2006 +0100
     4.2 +++ b/tools/libxc/xc_load_elf.c	Mon Jun 05 10:42:40 2006 +0100
     4.3 @@ -122,8 +122,15 @@ static int parseelfimage(const char *ima
     4.4              ERROR("Actually saw: '%s'", guestinfo);
     4.5              return -EINVAL;
     4.6          }
     4.7 -        if ( (strstr(guestinfo, "PAE=yes") != NULL) )
     4.8 -            dsi->pae_kernel = 1;
     4.9 +
    4.10 +        dsi->pae_kernel = PAEKERN_no;
    4.11 +        p = strstr(guestinfo, "PAE=yes");
    4.12 +        if ( p != NULL )
    4.13 +        {
    4.14 +            dsi->pae_kernel = PAEKERN_yes;
    4.15 +            if ( !strncmp(p+7, "[extended-cr3]", 14) )
    4.16 +                dsi->pae_kernel = PAEKERN_extended_cr3;
    4.17 +        }
    4.18  
    4.19          break;
    4.20      }
     5.1 --- a/tools/libxc/xc_private.c	Fri Jun 02 19:14:44 2006 +0100
     5.2 +++ b/tools/libxc/xc_private.c	Mon Jun 05 10:42:40 2006 +0100
     5.3 @@ -430,6 +430,28 @@ int xc_version(int xc_handle, int cmd, v
     5.4      return rc;
     5.5  }
     5.6  
     5.7 +unsigned long xc_make_page_below_4G(
     5.8 +    int xc_handle, uint32_t domid, unsigned long mfn)
     5.9 +{
    5.10 +    unsigned long new_mfn;
    5.11 +
    5.12 +    if ( xc_domain_memory_decrease_reservation(
    5.13 +        xc_handle, domid, 1, 0, &mfn) != 0 )
    5.14 +    {
    5.15 +        fprintf(stderr,"xc_make_page_below_4G decrease failed. mfn=%lx\n",mfn);
    5.16 +        return 0;
    5.17 +    }
    5.18 +
    5.19 +    if ( xc_domain_memory_increase_reservation(
    5.20 +        xc_handle, domid, 1, 0, 32, &new_mfn) != 0 )
    5.21 +    {
    5.22 +        fprintf(stderr,"xc_make_page_below_4G increase failed. mfn=%lx\n",mfn);
    5.23 +        return 0;
    5.24 +    }
    5.25 +
    5.26 +    return new_mfn;
    5.27 +}
    5.28 +
    5.29  /*
    5.30   * Local variables:
    5.31   * mode: C
     6.1 --- a/tools/libxc/xenctrl.h	Fri Jun 02 19:14:44 2006 +0100
     6.2 +++ b/tools/libxc/xenctrl.h	Mon Jun 05 10:42:40 2006 +0100
     6.3 @@ -453,6 +453,9 @@ int xc_domain_iomem_permission(int xc_ha
     6.4                                 unsigned long nr_mfns,
     6.5                                 uint8_t allow_access);
     6.6  
     6.7 +unsigned long xc_make_page_below_4G(int xc_handle, uint32_t domid,
     6.8 +                                    unsigned long mfn);
     6.9 +
    6.10  typedef dom0_perfc_desc_t xc_perfc_desc_t;
    6.11  /* IMPORTANT: The caller is responsible for mlock()'ing the @desc array. */
    6.12  int xc_perfc_control(int xc_handle,
     7.1 --- a/tools/libxc/xg_private.h	Fri Jun 02 19:14:44 2006 +0100
     7.2 +++ b/tools/libxc/xg_private.h	Mon Jun 05 10:42:40 2006 +0100
     7.3 @@ -156,6 +156,9 @@ struct domain_setup_info
     7.4  
     7.5      unsigned long elf_paddr_offset;
     7.6  
     7.7 +#define PAEKERN_no           0
     7.8 +#define PAEKERN_yes          1
     7.9 +#define PAEKERN_extended_cr3 2
    7.10      unsigned int  pae_kernel;
    7.11  
    7.12      unsigned int  load_symtab;
     8.1 --- a/xen/arch/x86/domain_build.c	Fri Jun 02 19:14:44 2006 +0100
     8.2 +++ b/xen/arch/x86/domain_build.c	Mon Jun 05 10:42:40 2006 +0100
     8.3 @@ -302,6 +302,9 @@ int construct_dom0(struct domain *d,
     8.4          return -EINVAL;
     8.5      }
     8.6  
     8.7 +    if ( xen_pae && !!strstr(dsi.xen_section_string, "PAE=yes[extended-cr3]") )
     8.8 +        set_bit(VMASST_TYPE_pae_extended_cr3, &d->vm_assist);
     8.9 +
    8.10      if ( (p = strstr(dsi.xen_section_string, "FEATURES=")) != NULL )
    8.11      {
    8.12          parse_features(
     9.1 --- a/xen/arch/x86/mm.c	Fri Jun 02 19:14:44 2006 +0100
     9.2 +++ b/xen/arch/x86/mm.c	Mon Jun 05 10:42:40 2006 +0100
     9.3 @@ -997,6 +997,21 @@ static int alloc_l3_table(struct page_in
     9.4  
     9.5      ASSERT(!shadow_mode_refcounts(d));
     9.6  
     9.7 +#ifdef CONFIG_X86_PAE
     9.8 +    /*
     9.9 +     * PAE pgdirs above 4GB are unacceptable if the guest does not understand
    9.10 +     * the weird 'extended cr3' format for dealing with high-order address
    9.11 +     * bits. We cut some slack for control tools (before vcpu0 is initialised).
    9.12 +     */
    9.13 +    if ( (pfn >= 0x100000) &&
    9.14 +         unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
    9.15 +         d->vcpu[0] && test_bit(_VCPUF_initialised, &d->vcpu[0]->vcpu_flags) )
    9.16 +    {
    9.17 +        MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
    9.18 +        return 0;
    9.19 +    }
    9.20 +#endif
    9.21 +
    9.22      pl3e = map_domain_page(pfn);
    9.23      for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
    9.24      {
    10.1 --- a/xen/common/kernel.c	Fri Jun 02 19:14:44 2006 +0100
    10.2 +++ b/xen/common/kernel.c	Mon Jun 05 10:42:40 2006 +0100
    10.3 @@ -184,6 +184,7 @@ long do_xen_version(int cmd, XEN_GUEST_H
    10.4      case XENVER_get_features:
    10.5      {
    10.6          xen_feature_info_t fi;
    10.7 +        struct domain *d = current->domain;
    10.8  
    10.9          if ( copy_from_guest(&fi, arg, 1) )
   10.10              return -EFAULT;
   10.11 @@ -191,7 +192,9 @@ long do_xen_version(int cmd, XEN_GUEST_H
   10.12          switch ( fi.submap_idx )
   10.13          {
   10.14          case 0:
   10.15 -            fi.submap = (1U << XENFEAT_pae_pgdir_above_4gb);
   10.16 +            fi.submap = 0;
   10.17 +            if ( VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3) )
   10.18 +                fi.submap |= (1U << XENFEAT_pae_pgdir_above_4gb);
   10.19              if ( shadow_mode_translate(current->domain) )
   10.20                  fi.submap |= 
   10.21                      (1U << XENFEAT_writable_page_tables) |
    11.1 --- a/xen/common/keyhandler.c	Fri Jun 02 19:14:44 2006 +0100
    11.2 +++ b/xen/common/keyhandler.c	Mon Jun 05 10:42:40 2006 +0100
    11.3 @@ -128,11 +128,12 @@ static void dump_domains(unsigned char k
    11.4                 d->domain_flags, atomic_read(&d->refcnt),
    11.5                 d->tot_pages, d->xenheap_pages, cpuset);
    11.6          printk("    handle=%02x%02x%02x%02x-%02x%02x-%02x%02x-"
    11.7 -               "%02x%02x-%02x%02x%02x%02x%02x%02x\n",
    11.8 +               "%02x%02x-%02x%02x%02x%02x%02x%02x vm_assist=%08lx\n",
    11.9                 d->handle[ 0], d->handle[ 1], d->handle[ 2], d->handle[ 3],
   11.10                 d->handle[ 4], d->handle[ 5], d->handle[ 6], d->handle[ 7],
   11.11                 d->handle[ 8], d->handle[ 9], d->handle[10], d->handle[11],
   11.12 -               d->handle[12], d->handle[13], d->handle[14], d->handle[15]);
   11.13 +               d->handle[12], d->handle[13], d->handle[14], d->handle[15],
   11.14 +               d->vm_assist);
   11.15  
   11.16          arch_dump_domain_info(d);
   11.17  
    12.1 --- a/xen/include/public/xen.h	Fri Jun 02 19:14:44 2006 +0100
    12.2 +++ b/xen/include/public/xen.h	Mon Jun 05 10:42:40 2006 +0100
    12.3 @@ -234,10 +234,24 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
    12.4   */
    12.5  #define VMASST_CMD_enable                0
    12.6  #define VMASST_CMD_disable               1
    12.7 +
    12.8 +/* x86/32 guests: simulate full 4GB segment limits. */
    12.9  #define VMASST_TYPE_4gb_segments         0
   12.10 +
   12.11 +/* x86/32 guests: trap (vector 15) whenever above vmassist is used. */
   12.12  #define VMASST_TYPE_4gb_segments_notify  1
   12.13 +
   12.14 +/*
   12.15 + * x86 guests: support writes to bottom-level PTEs.
   12.16 + * NB1. Page-directory entries cannot be written.
   12.17 + * NB2. Guest must continue to remove all writable mappings of PTEs.
   12.18 + */
   12.19  #define VMASST_TYPE_writable_pagetables  2
   12.20 -#define MAX_VMASST_TYPE 2
   12.21 +
   12.22 +/* x86/PAE guests: support PDPTs above 4GB. */
   12.23 +#define VMASST_TYPE_pae_extended_cr3     3
   12.24 +
   12.25 +#define MAX_VMASST_TYPE                  3
   12.26  
   12.27  #ifndef __ASSEMBLY__
   12.28