ia64/xen-unstable

changeset 18775:5fd51e1e9c79

x86: PV support for hugepages

Hugepage support must be enabled via the hypervisor command line
option "allowhugepage". There is currently no support in the tools for
saving/restoring/migrating guests that use hugepages.

Signed-off-by: Dave McCracken <dave.mccracken@oracle.com>
author Keir Fraser <keir.fraser@citrix.com>
date Wed Nov 05 10:57:21 2008 +0000 (2008-11-05)
parents 1e437b5b418a
children 3af208e6f850
files xen/arch/x86/mm.c xen/arch/x86/traps.c xen/include/asm-x86/mm.h xen/include/asm-x86/x86_32/page.h xen/include/asm-x86/x86_64/page.h
line diff
     1.1 --- a/xen/arch/x86/mm.c	Wed Nov 05 10:26:19 2008 +0000
     1.2 +++ b/xen/arch/x86/mm.c	Wed Nov 05 10:57:21 2008 +0000
     1.3 @@ -160,6 +160,9 @@ unsigned long total_pages;
     1.4  
     1.5  #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
     1.6  
     1.7 +int opt_allow_hugepage;
     1.8 +boolean_param("allowhugepage", opt_allow_hugepage);
     1.9 +
    1.10  #define l1_disallow_mask(d)                                     \
    1.11      ((d != dom_io) &&                                           \
    1.12       (rangeset_is_empty((d)->iomem_caps) &&                     \
    1.13 @@ -586,6 +589,28 @@ static int get_page_and_type_from_pagenr
    1.14      return rc;
    1.15  }
    1.16  
    1.17 +static int get_data_page(
    1.18 +    struct page_info *page, struct domain *d, int writeable)
    1.19 +{
    1.20 +    int rc;
    1.21 +
    1.22 +    if ( writeable )
    1.23 +        rc = get_page_and_type(page, d, PGT_writable_page);
    1.24 +    else
    1.25 +        rc = get_page(page, d);
    1.26 +
    1.27 +    return rc;
    1.28 +}
    1.29 +
    1.30 +static void put_data_page(
    1.31 +    struct page_info *page, int writeable)
    1.32 +{
    1.33 +    if ( writeable )
    1.34 +        put_page_and_type(page);
    1.35 +    else
    1.36 +        put_page(page);
    1.37 +}
    1.38 +
    1.39  /*
    1.40   * We allow root tables to map each other (a.k.a. linear page tables). It
    1.41   * needs some special care with reference counts and access permissions:
    1.42 @@ -700,10 +725,9 @@ get_page_from_l1e(
    1.43       * contribute to writeable mapping refcounts.  (This allows the
    1.44       * qemu-dm helper process in dom0 to map the domain's memory without
    1.45       * messing up the count of "real" writable mappings.) */
    1.46 -    okay = (((l1f & _PAGE_RW) && 
    1.47 -             !(unlikely(paging_mode_external(d) && (d != curr->domain))))
    1.48 -            ? get_page_and_type(page, d, PGT_writable_page)
    1.49 -            : get_page(page, d));
    1.50 +    okay = get_data_page(
    1.51 +        page, d,
    1.52 +        (l1f & _PAGE_RW) && !(paging_mode_external(d) && (d != curr->domain)));
    1.53      if ( !okay )
    1.54      {
    1.55          MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
    1.56 @@ -751,6 +775,7 @@ static int
    1.57  get_page_from_l2e(
    1.58      l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
    1.59  {
    1.60 +    unsigned long mfn = l2e_get_pfn(l2e);
    1.61      int rc;
    1.62  
    1.63      if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
    1.64 @@ -762,10 +787,37 @@ get_page_from_l2e(
    1.65          return -EINVAL;
    1.66      }
    1.67  
    1.68 -    rc = get_page_and_type_from_pagenr(
    1.69 -        l2e_get_pfn(l2e), PGT_l1_page_table, d, 0, 0);
    1.70 -    if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
    1.71 -        rc = 0;
    1.72 +    if ( !(l2e_get_flags(l2e) & _PAGE_PSE) )
    1.73 +    {
    1.74 +        rc = get_page_and_type_from_pagenr(mfn, PGT_l1_page_table, d, 0, 0);
    1.75 +        if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
    1.76 +            rc = 0;
    1.77 +    }
    1.78 +    else if ( !opt_allow_hugepage || (mfn & (L1_PAGETABLE_ENTRIES-1)) )
    1.79 +    {
    1.80 +        rc = -EINVAL;
    1.81 +    }
    1.82 +    else
    1.83 +    {
    1.84 +        unsigned long m = mfn;
    1.85 +        int writeable = !!(l2e_get_flags(l2e) & _PAGE_RW);
    1.86 +  
    1.87 +        do {
    1.88 +            rc = get_data_page(mfn_to_page(m), d, writeable);
    1.89 +            if ( unlikely(!rc) )
    1.90 +            {
    1.91 +                while ( m-- > mfn )
    1.92 +                    put_data_page(mfn_to_page(m), writeable);
    1.93 +                return -EINVAL;
    1.94 +            }
    1.95 +        } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
    1.96 +
    1.97 +#ifdef __x86_64__
    1.98 +        map_pages_to_xen(
    1.99 +            (unsigned long)mfn_to_virt(mfn), mfn, L1_PAGETABLE_ENTRIES,
   1.100 +            PAGE_HYPERVISOR | l2e_get_flags(l2e));
   1.101 +#endif
   1.102 +    }
   1.103  
   1.104      return rc;
   1.105  }
   1.106 @@ -954,13 +1006,24 @@ void put_page_from_l1e(l1_pgentry_t l1e,
   1.107   */
   1.108  static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
   1.109  {
   1.110 -    if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) && 
   1.111 -         (l2e_get_pfn(l2e) != pfn) )
   1.112 +    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
   1.113 +        return 1;
   1.114 +
   1.115 +    if ( l2e_get_flags(l2e) & _PAGE_PSE )
   1.116 +    {
   1.117 +        unsigned long mfn = l2e_get_pfn(l2e), m = mfn;
   1.118 +        int writeable = l2e_get_flags(l2e) & _PAGE_RW;
   1.119 +        ASSERT(opt_allow_hugepage && !(mfn & (L1_PAGETABLE_ENTRIES-1)));
   1.120 +        do {
   1.121 +            put_data_page(mfn_to_page(m), writeable);
   1.122 +        } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
   1.123 +    }
   1.124 +    else
   1.125      {
   1.126          put_page_and_type(l2e_get_page(l2e));
   1.127 -        return 0;
   1.128      }
   1.129 -    return 1;
   1.130 +
   1.131 +    return 0;
   1.132  }
   1.133  
   1.134  static int __put_page_type(struct page_info *, int preemptible);
     2.1 --- a/xen/arch/x86/traps.c	Wed Nov 05 10:26:19 2008 +0000
     2.2 +++ b/xen/arch/x86/traps.c	Wed Nov 05 10:57:21 2008 +0000
     2.3 @@ -723,7 +723,8 @@ static void pv_cpuid(struct cpu_user_reg
     2.4      {
     2.5          /* Modify Feature Information. */
     2.6          __clear_bit(X86_FEATURE_VME, &d);
     2.7 -        __clear_bit(X86_FEATURE_PSE, &d);
     2.8 +        if ( !opt_allow_hugepage )
     2.9 +            __clear_bit(X86_FEATURE_PSE, &d);
    2.10          __clear_bit(X86_FEATURE_PGE, &d);
    2.11          __clear_bit(X86_FEATURE_MCE, &d);
    2.12          __clear_bit(X86_FEATURE_MCA, &d);
    2.13 @@ -2003,9 +2004,12 @@ static int emulate_privileged_op(struct 
    2.14          case 4: /* Read CR4 */
    2.15              /*
    2.16               * Guests can read CR4 to see what features Xen has enabled. We
    2.17 -             * therefore lie about PGE & PSE as they are unavailable to guests.
    2.18 +             * therefore lie about PGE as it is unavailable to guests.
    2.19 +             * Also disallow PSE if hugepages are not enabled.
    2.20               */
    2.21 -            *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
    2.22 +            *reg = read_cr4() & ~X86_CR4_PGE;
    2.23 +            if ( !opt_allow_hugepage )
    2.24 +                *reg &= ~X86_CR4_PSE;
    2.25              break;
    2.26  
    2.27          default:
     3.1 --- a/xen/include/asm-x86/mm.h	Wed Nov 05 10:26:19 2008 +0000
     3.2 +++ b/xen/include/asm-x86/mm.h	Wed Nov 05 10:57:21 2008 +0000
     3.3 @@ -263,6 +263,7 @@ pae_copy_root(struct vcpu *v, l3_pgentry
     3.4  
     3.5  int check_descriptor(const struct domain *, struct desc_struct *d);
     3.6  
     3.7 +extern int opt_allow_hugepage;
     3.8  
     3.9  /******************************************************************************
    3.10   * With shadow pagetables, the different kinds of address start 
     4.1 --- a/xen/include/asm-x86/x86_32/page.h	Wed Nov 05 10:26:19 2008 +0000
     4.2 +++ b/xen/include/asm-x86/x86_32/page.h	Wed Nov 05 10:57:21 2008 +0000
     4.3 @@ -112,7 +112,7 @@ extern unsigned int PAGE_HYPERVISOR_NOCA
     4.4  #define BASE_DISALLOW_MASK (0xFFFFF198U & ~_PAGE_NX)
     4.5  
     4.6  #define L1_DISALLOW_MASK (BASE_DISALLOW_MASK | _PAGE_GNTTAB)
     4.7 -#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK)
     4.8 +#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK & ~_PAGE_PSE)
     4.9  #define L3_DISALLOW_MASK 0xFFFFF1FEU /* must-be-zero */
    4.10  
    4.11  #endif /* __X86_32_PAGE_H__ */
     5.1 --- a/xen/include/asm-x86/x86_64/page.h	Wed Nov 05 10:26:19 2008 +0000
     5.2 +++ b/xen/include/asm-x86/x86_64/page.h	Wed Nov 05 10:57:21 2008 +0000
     5.3 @@ -115,7 +115,7 @@ typedef l4_pgentry_t root_pgentry_t;
     5.4  #define BASE_DISALLOW_MASK (0xFF800198U & ~_PAGE_NX)
     5.5  
     5.6  #define L1_DISALLOW_MASK (BASE_DISALLOW_MASK | _PAGE_GNTTAB)
     5.7 -#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK)
     5.8 +#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK & ~_PAGE_PSE)
     5.9  #define L3_DISALLOW_MASK (BASE_DISALLOW_MASK)
    5.10  #define L4_DISALLOW_MASK (BASE_DISALLOW_MASK)
    5.11