ia64/xen-unstable

changeset 19010:292919f61238

x86-64: guest directed placement of initial p->m map

By adding another ELF note, the kernel can now direct the hypervisor
(for Dom0) and in the future also the tools (for DomU-s) to place the
initial phys->mach translation table at other than an address
immediately above the kernel/initrd images. This eliminates the size
restriction imposed on this table by Linux (the kernel loads above the
-2Gb boundary, and hence the entire initial mapping cannot reach or
even exceed 2Gb).

There are a few items in this patch I'm not particularly happy with,
but couldn't think of a better solution:
- there is a hidden assumption that pages allocated for the domain are
put on the domain's page list sequentially
- the way backward compatibility is maintained is by placing
  requirements on the kernel side that make the code somewhat
  convoluted (because it needs to check where the map is actually
  placed in quite a few places)
- code is there to use 1Gb mappings for the hypervisor-created table,
  but lacking a machine with 512G+ memory for immediate testing I
  can't verify this works; I know that 2Mb mappings work, and hence
  infer that 1Gb ones would too (of course, if the kernel replaces the
  table - like Linux does - it cannot use 2Mb/1Gb mappings or even try
  to re-use the page table entries, but I don't consider this a
  problem)

Signed-off-by: Jan Beulich <jbeulich@novell.com>
author Keir Fraser <keir.fraser@citrix.com>
date Thu Jan 08 11:32:39 2009 +0000 (2009-01-08)
parents 97f8d6453fda
children 275abe1c5d24
files tools/include/xen-foreign/reference.size xen/arch/x86/domain_build.c xen/arch/x86/mm.c xen/common/libelf/libelf-dominfo.c xen/include/public/elfnote.h xen/include/public/libelf.h xen/include/public/xen.h
line diff
     1.1 --- a/tools/include/xen-foreign/reference.size	Thu Jan 08 11:27:11 2009 +0000
     1.2 +++ b/tools/include/xen-foreign/reference.size	Thu Jan 08 11:32:39 2009 +0000
     1.3 @@ -1,7 +1,7 @@
     1.4  
     1.5  structs                   |  x86_32  x86_64    ia64
     1.6  
     1.7 -start_info                |    1104    1152    1152
     1.8 +start_info                |    1112    1168    1168
     1.9  trap_info                 |       8      16       -
    1.10  pt_fpreg                  |       -       -      16
    1.11  cpu_user_regs             |      68     200       -
     2.1 --- a/xen/arch/x86/domain_build.c	Thu Jan 08 11:27:11 2009 +0000
     2.2 +++ b/xen/arch/x86/domain_build.c	Thu Jan 08 11:32:39 2009 +0000
     2.3 @@ -341,6 +341,12 @@ int __init construct_dom0(
     2.4  #endif
     2.5      }
     2.6  
     2.7 +    if ( (parms.p2m_base != UNSET_ADDR) && elf_32bit(&elf) )
     2.8 +    {
     2.9 +        printk(XENLOG_WARNING "P2M table base ignored\n");
    2.10 +        parms.p2m_base = UNSET_ADDR;
    2.11 +    }
    2.12 +
    2.13      domain_set_alloc_bitsize(d);
    2.14  
    2.15      /*
    2.16 @@ -359,6 +365,8 @@ int __init construct_dom0(
    2.17      vphysmap_end     = vphysmap_start + (nr_pages * (!is_pv_32on64_domain(d) ?
    2.18                                                       sizeof(unsigned long) :
    2.19                                                       sizeof(unsigned int)));
    2.20 +    if ( parms.p2m_base != UNSET_ADDR )
    2.21 +        vphysmap_end = vphysmap_start;
    2.22      vstartinfo_start = round_pgup(vphysmap_end);
    2.23      vstartinfo_end   = (vstartinfo_start +
    2.24                          sizeof(struct start_info) +
    2.25 @@ -400,6 +408,11 @@ int __init construct_dom0(
    2.26      /* Ensure that our low-memory 1:1 mapping covers the allocation. */
    2.27      page = alloc_domheap_pages(d, order, MEMF_bits(30));
    2.28  #else
    2.29 +    if ( parms.p2m_base != UNSET_ADDR )
    2.30 +    {
    2.31 +        vphysmap_start = parms.p2m_base;
    2.32 +        vphysmap_end   = vphysmap_start + nr_pages * sizeof(unsigned long);
    2.33 +    }
    2.34      page = alloc_domheap_pages(d, order, 0);
    2.35  #endif
    2.36      if ( page == NULL )
    2.37 @@ -740,8 +753,109 @@ int __init construct_dom0(
    2.38      snprintf(si->magic, sizeof(si->magic), "xen-3.0-x86_%d%s",
    2.39               elf_64bit(&elf) ? 64 : 32, parms.pae ? "p" : "");
    2.40  
    2.41 +    count = d->tot_pages;
    2.42 +#ifdef __x86_64__
    2.43 +    /* Set up the phys->machine table if not part of the initial mapping. */
    2.44 +    if ( parms.p2m_base != UNSET_ADDR )
    2.45 +    {
    2.46 +        unsigned long va = vphysmap_start;
    2.47 +
    2.48 +        if ( v_start <= vphysmap_end && vphysmap_start <= v_end )
    2.49 +            panic("DOM0 P->M table overlaps initial mapping");
    2.50 +
    2.51 +        while ( va < vphysmap_end )
    2.52 +        {
    2.53 +            if ( d->tot_pages + ((round_pgup(vphysmap_end) - va)
    2.54 +                                 >> PAGE_SHIFT) + 3 > nr_pages )
    2.55 +                panic("Dom0 allocation too small for initial P->M table.\n");
    2.56 +
    2.57 +            l4tab = l4start + l4_table_offset(va);
    2.58 +            if ( !l4e_get_intpte(*l4tab) )
    2.59 +            {
    2.60 +                page = alloc_domheap_pages(d, 0, 0);
    2.61 +                if ( !page )
    2.62 +                    break;
    2.63 +                /* No mapping, PGC_allocated + page-table page. */
    2.64 +                page->count_info = PGC_allocated | 2;
    2.65 +                page->u.inuse.type_info =
    2.66 +                    PGT_l3_page_table | PGT_validated | 1;
    2.67 +                clear_page(page_to_virt(page));
    2.68 +                *l4tab = l4e_from_page(page, L4_PROT);
    2.69 +            }
    2.70 +            l3tab = page_to_virt(l4e_get_page(*l4tab));
    2.71 +            l3tab += l3_table_offset(va);
    2.72 +            if ( !l3e_get_intpte(*l3tab) )
    2.73 +            {
    2.74 +                if ( cpu_has_page1gb &&
    2.75 +                     !(va & ((1UL << L3_PAGETABLE_SHIFT) - 1)) &&
    2.76 +                     vphysmap_end >= va + (1UL << L3_PAGETABLE_SHIFT) &&
    2.77 +                     (page = alloc_domheap_pages(d,
    2.78 +                                                 L3_PAGETABLE_SHIFT -
    2.79 +                                                     PAGE_SHIFT,
    2.80 +                                                 0)) != NULL )
    2.81 +                {
    2.82 +                    *l3tab = l3e_from_page(page,
    2.83 +                                           L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
    2.84 +                    va += 1UL << L3_PAGETABLE_SHIFT;
    2.85 +                    continue;
    2.86 +                }
    2.87 +                if ( (page = alloc_domheap_pages(d, 0, 0)) == NULL )
    2.88 +                    break;
    2.89 +                else
    2.90 +                {
    2.91 +                    /* No mapping, PGC_allocated + page-table page. */
    2.92 +                    page->count_info = PGC_allocated | 2;
    2.93 +                    page->u.inuse.type_info =
    2.94 +                        PGT_l2_page_table | PGT_validated | 1;
    2.95 +                    clear_page(page_to_virt(page));
    2.96 +                    *l3tab = l3e_from_page(page, L3_PROT);
    2.97 +                }
    2.98 +            }
    2.99 +            l2tab = page_to_virt(l3e_get_page(*l3tab));
   2.100 +            l2tab += l2_table_offset(va);
   2.101 +            if ( !l2e_get_intpte(*l2tab) )
   2.102 +            {
   2.103 +                if ( !(va & ((1UL << L2_PAGETABLE_SHIFT) - 1)) &&
   2.104 +                     vphysmap_end >= va + (1UL << L2_PAGETABLE_SHIFT) &&
   2.105 +                     (page = alloc_domheap_pages(d,
   2.106 +                                                 L2_PAGETABLE_SHIFT -
   2.107 +                                                     PAGE_SHIFT,
   2.108 +                                                 0)) != NULL )
   2.109 +                {
   2.110 +                    *l2tab = l2e_from_page(page,
   2.111 +                                           L1_PROT|_PAGE_DIRTY|_PAGE_PSE);
   2.112 +                    va += 1UL << L2_PAGETABLE_SHIFT;
   2.113 +                    continue;
   2.114 +                }
   2.115 +                if ( (page = alloc_domheap_pages(d, 0, 0)) == NULL )
   2.116 +                    break;
   2.117 +                else
   2.118 +                {
   2.119 +                    /* No mapping, PGC_allocated + page-table page. */
   2.120 +                    page->count_info = PGC_allocated | 2;
   2.121 +                    page->u.inuse.type_info =
   2.122 +                        PGT_l1_page_table | PGT_validated | 1;
   2.123 +                    clear_page(page_to_virt(page));
   2.124 +                    *l2tab = l2e_from_page(page, L2_PROT);
   2.125 +                }
   2.126 +            }
   2.127 +            l1tab = page_to_virt(l2e_get_page(*l2tab));
   2.128 +            l1tab += l1_table_offset(va);
   2.129 +            BUG_ON(l1e_get_intpte(*l1tab));
   2.130 +            page = alloc_domheap_pages(d, 0, 0);
   2.131 +            if ( !page )
   2.132 +                break;
   2.133 +            *l1tab = l1e_from_page(page, L1_PROT|_PAGE_DIRTY);
   2.134 +            va += PAGE_SIZE;
   2.135 +            va &= PAGE_MASK;
   2.136 +        }
   2.137 +        if ( !page )
   2.138 +            panic("Not enough RAM for DOM0 P->M table.\n");
   2.139 +    }
   2.140 +#endif
   2.141 +
   2.142      /* Write the phys->machine and machine->phys table entries. */
   2.143 -    for ( pfn = 0; pfn < d->tot_pages; pfn++ )
   2.144 +    for ( pfn = 0; pfn < count; pfn++ )
   2.145      {
   2.146          mfn = pfn + alloc_spfn;
   2.147  #ifndef NDEBUG
   2.148 @@ -755,6 +869,26 @@ int __init construct_dom0(
   2.149              ((unsigned int *)vphysmap_start)[pfn] = mfn;
   2.150          set_gpfn_from_mfn(mfn, pfn);
   2.151      }
   2.152 +    si->first_p2m_pfn = pfn;
   2.153 +    si->nr_p2m_frames = d->tot_pages - count;
   2.154 +    list_for_each_entry ( page, &d->page_list, list )
   2.155 +    {
   2.156 +        mfn = page_to_mfn(page);
   2.157 +        if ( get_gpfn_from_mfn(mfn) >= count )
   2.158 +        {
   2.159 +            BUG_ON(is_pv_32bit_domain(d));
   2.160 +            if ( !page->u.inuse.type_info &&
   2.161 +                 !get_page_and_type(page, d, PGT_writable_page) )
   2.162 +                BUG();
   2.163 +            ((unsigned long *)vphysmap_start)[pfn] = mfn;
   2.164 +            set_gpfn_from_mfn(mfn, pfn);
   2.165 +            ++pfn;
   2.166 +#ifndef NDEBUG
   2.167 +            ++alloc_epfn;
   2.168 +#endif
   2.169 +        }
   2.170 +    }
   2.171 +    BUG_ON(pfn != d->tot_pages);
   2.172      while ( pfn < nr_pages )
   2.173      {
   2.174          if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL )
     3.1 --- a/xen/arch/x86/mm.c	Thu Jan 08 11:27:11 2009 +0000
     3.2 +++ b/xen/arch/x86/mm.c	Thu Jan 08 11:32:39 2009 +0000
     3.3 @@ -1013,7 +1013,8 @@ static int put_page_from_l2e(l2_pgentry_
     3.4      {
     3.5          unsigned long mfn = l2e_get_pfn(l2e), m = mfn;
     3.6          int writeable = l2e_get_flags(l2e) & _PAGE_RW;
     3.7 -        ASSERT(opt_allow_hugepage && !(mfn & (L1_PAGETABLE_ENTRIES-1)));
     3.8 +
     3.9 +        ASSERT(!(mfn & (L1_PAGETABLE_ENTRIES-1)));
    3.10          do {
    3.11              put_data_page(mfn_to_page(m), writeable);
    3.12          } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
    3.13 @@ -1031,14 +1032,28 @@ static int __put_page_type(struct page_i
    3.14  static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
    3.15                               int partial, int preemptible)
    3.16  {
    3.17 -    if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) && 
    3.18 -         (l3e_get_pfn(l3e) != pfn) )
    3.19 +    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
    3.20 +        return 1;
    3.21 +
    3.22 +#ifdef __x86_64__
    3.23 +    if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) )
    3.24      {
    3.25 -        if ( unlikely(partial > 0) )
    3.26 -            return __put_page_type(l3e_get_page(l3e), preemptible);
    3.27 -        return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
    3.28 +        unsigned long mfn = l3e_get_pfn(l3e);
    3.29 +        int writeable = l3e_get_flags(l3e) & _PAGE_RW;
    3.30 +
    3.31 +        ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
    3.32 +        do {
    3.33 +            put_data_page(mfn_to_page(mfn), writeable);
    3.34 +        } while ( ++mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) );
    3.35 +
    3.36 +        return 0;
    3.37      }
    3.38 -    return 1;
    3.39 +#endif
    3.40 +
    3.41 +    if ( unlikely(partial > 0) )
    3.42 +        return __put_page_type(l3e_get_page(l3e), preemptible);
    3.43 +
    3.44 +    return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
    3.45  }
    3.46  
    3.47  #if CONFIG_PAGING_LEVELS >= 4
     4.1 --- a/xen/common/libelf/libelf-dominfo.c	Thu Jan 08 11:27:11 2009 +0000
     4.2 +++ b/xen/common/libelf/libelf-dominfo.c	Thu Jan 08 11:32:39 2009 +0000
     4.3 @@ -90,6 +90,7 @@ int elf_xen_parse_note(struct elf_binary
     4.4          [XEN_ELFNOTE_ENTRY] = { "ENTRY", 0},
     4.5          [XEN_ELFNOTE_HYPERCALL_PAGE] = { "HYPERCALL_PAGE", 0},
     4.6          [XEN_ELFNOTE_VIRT_BASE] = { "VIRT_BASE", 0},
     4.7 +        [XEN_ELFNOTE_INIT_P2M] = { "INIT_P2M", 0},
     4.8          [XEN_ELFNOTE_PADDR_OFFSET] = { "PADDR_OFFSET", 0},
     4.9          [XEN_ELFNOTE_HV_START_LOW] = { "HV_START_LOW", 0},
    4.10          [XEN_ELFNOTE_XEN_VERSION] = { "XEN_VERSION", 1},
    4.11 @@ -164,6 +165,9 @@ int elf_xen_parse_note(struct elf_binary
    4.12      case XEN_ELFNOTE_ENTRY:
    4.13          parms->virt_entry = val;
    4.14          break;
    4.15 +    case XEN_ELFNOTE_INIT_P2M:
    4.16 +        parms->p2m_base = val;
    4.17 +        break;
    4.18      case XEN_ELFNOTE_PADDR_OFFSET:
    4.19          parms->elf_paddr_offset = val;
    4.20          break;
    4.21 @@ -392,6 +396,7 @@ static int elf_xen_addr_calc_check(struc
    4.22      elf_msg(elf, "    virt_kstart      = 0x%" PRIx64 "\n", parms->virt_kstart);
    4.23      elf_msg(elf, "    virt_kend        = 0x%" PRIx64 "\n", parms->virt_kend);
    4.24      elf_msg(elf, "    virt_entry       = 0x%" PRIx64 "\n", parms->virt_entry);
    4.25 +    elf_msg(elf, "    p2m_base         = 0x%" PRIx64 "\n", parms->p2m_base);
    4.26  
    4.27      if ( (parms->virt_kstart > parms->virt_kend) ||
    4.28           (parms->virt_entry < parms->virt_kstart) ||
    4.29 @@ -403,6 +408,15 @@ static int elf_xen_addr_calc_check(struc
    4.30          return -1;
    4.31      }
    4.32  
    4.33 +    if ( (parms->p2m_base != UNSET_ADDR) &&
    4.34 +         (parms->p2m_base >= parms->virt_kstart) &&
    4.35 +         (parms->p2m_base < parms->virt_kend) )
    4.36 +    {
    4.37 +        elf_err(elf, "%s: ERROR: P->M table base is out of bounds.\n",
    4.38 +                __FUNCTION__);
    4.39 +        return -1;
    4.40 +    }
    4.41 +
    4.42      return 0;
    4.43  }
    4.44  
    4.45 @@ -422,6 +436,7 @@ int elf_xen_parse(struct elf_binary *elf
    4.46      parms->virt_entry = UNSET_ADDR;
    4.47      parms->virt_hypercall = UNSET_ADDR;
    4.48      parms->virt_hv_start_low = UNSET_ADDR;
    4.49 +    parms->p2m_base = UNSET_ADDR;
    4.50      parms->elf_paddr_offset = UNSET_ADDR;
    4.51  
    4.52      /* Find and parse elf notes. */
     5.1 --- a/xen/include/public/elfnote.h	Thu Jan 08 11:27:11 2009 +0000
     5.2 +++ b/xen/include/public/elfnote.h	Thu Jan 08 11:32:39 2009 +0000
     5.3 @@ -162,9 +162,20 @@
     5.4  #define XEN_ELFNOTE_SUSPEND_CANCEL 14
     5.5  
     5.6  /*
     5.7 + * The (non-default) location the initial phys-to-machine map should be
     5.8 + * placed at by the hypervisor (Dom0) or the tools (DomU).
     5.9 + * The kernel must be prepared for this mapping to be established using
    5.10 + * large pages, despite such otherwise not being available to guests.
    5.11 + * The kernel must also be able to handle the page table pages used for
    5.12 + * this mapping not being accessible through the initial mapping.
    5.13 + * (Only x86-64 supports this at present.)
    5.14 + */
    5.15 +#define XEN_ELFNOTE_INIT_P2M      15
    5.16 +
    5.17 +/*
    5.18   * The number of the highest elfnote defined.
    5.19   */
    5.20 -#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUSPEND_CANCEL
    5.21 +#define XEN_ELFNOTE_MAX XEN_ELFNOTE_INIT_P2M
    5.22  
    5.23  /*
    5.24   * System information exported through crash notes.
     6.1 --- a/xen/include/public/libelf.h	Thu Jan 08 11:27:11 2009 +0000
     6.2 +++ b/xen/include/public/libelf.h	Thu Jan 08 11:32:39 2009 +0000
     6.3 @@ -232,6 +232,7 @@ struct elf_dom_parms {
     6.4      uint64_t virt_entry;
     6.5      uint64_t virt_hypercall;
     6.6      uint64_t virt_hv_start_low;
     6.7 +    uint64_t p2m_base;
     6.8      uint64_t elf_paddr_offset;
     6.9      uint32_t f_supported[XENFEAT_NR_SUBMAPS];
    6.10      uint32_t f_required[XENFEAT_NR_SUBMAPS];
     7.1 --- a/xen/include/public/xen.h	Thu Jan 08 11:27:11 2009 +0000
     7.2 +++ b/xen/include/public/xen.h	Thu Jan 08 11:32:39 2009 +0000
     7.3 @@ -513,6 +513,7 @@ typedef struct shared_info shared_info_t
     7.4   *      a. relocated kernel image
     7.5   *      b. initial ram disk              [mod_start, mod_len]
     7.6   *      c. list of allocated page frames [mfn_list, nr_pages]
     7.7 + *         (unless relocated due to XEN_ELFNOTE_INIT_P2M)
     7.8   *      d. start_info_t structure        [register ESI (x86)]
     7.9   *      e. bootstrap page tables         [pt_base, CR3 (x86)]
    7.10   *      f. bootstrap stack               [register ESP (x86)]
    7.11 @@ -554,6 +555,9 @@ struct start_info {
    7.12      unsigned long mod_start;    /* VIRTUAL address of pre-loaded module.  */
    7.13      unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     */
    7.14      int8_t cmd_line[MAX_GUEST_CMDLINE];
    7.15 +    /* The pfn range here covers both page table and p->m table frames.   */
    7.16 +    unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table.    */
    7.17 +    unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table.  */
    7.18  };
    7.19  typedef struct start_info start_info_t;
    7.20