ia64/xen-unstable

changeset 6071:a1f7e01b0990

Fixes for x86/64 writable pagetables, including SMP
guest support.

NOTE: I removed some x86/64-specific tests from get_page_type().
I can't see what good could come of them -- if they caused
things to work then I'm pretty sure there is an underlying
bug, and that is what ought to be fixed.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Tue Aug 09 10:42:51 2005 +0000 (2005-08-09)
parents 663f0fb1e444
children 4125b9fea242 12deebfb7f87 f01ba22e044c
files xen/arch/x86/mm.c xen/include/asm-x86/page.h
line diff
     1.1 --- a/xen/arch/x86/mm.c	Tue Aug 09 09:34:06 2005 +0000
     1.2 +++ b/xen/arch/x86/mm.c	Tue Aug 09 10:42:51 2005 +0000
     1.3 @@ -138,7 +138,7 @@ static struct {
     1.4   * Returns the current foreign domain; defaults to the currently-executing
     1.5   * domain if a foreign override hasn't been specified.
     1.6   */
     1.7 -#define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
     1.8 +#define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ?: current->domain)
     1.9  
    1.10  /* Private domain structs for DOMID_XEN and DOMID_IO. */
    1.11  static struct domain *dom_xen, *dom_io;
    1.12 @@ -903,7 +903,8 @@ static int alloc_l4_table(struct pfn_inf
    1.13          return 1;
    1.14      ASSERT(!shadow_mode_refcounts(d));
    1.15  
    1.16 -    for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ ) {
    1.17 +    for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
    1.18 +    {
    1.19          if ( !l3_backptr(&vaddr, i, type) )
    1.20              goto fail;
    1.21  
    1.22 @@ -1122,10 +1123,9 @@ static int mod_l2_entry(l2_pgentry_t *pl
    1.23              return 0;
    1.24          }
    1.25      }
    1.26 -    else
    1.27 +    else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
    1.28      {
    1.29 -        if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
    1.30 -            return 0;
    1.31 +        return 0;
    1.32      }
    1.33  
    1.34      put_page_from_l2e(ol2e, pfn);
    1.35 @@ -1188,23 +1188,16 @@ static int mod_l3_entry(l3_pgentry_t *pl
    1.36  
    1.37          if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
    1.38          {
    1.39 -            BUG_ON(!create_pae_xen_mappings(pl3e));
    1.40              put_page_from_l3e(nl3e, pfn);
    1.41              return 0;
    1.42          }
    1.43 -
    1.44 -        put_page_from_l3e(ol3e, pfn);
    1.45 -        return 1;
    1.46      }
    1.47 -    else
    1.48 -   {
    1.49 -       if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
    1.50 -           {
    1.51 -               BUG_ON(!create_pae_xen_mappings(pl3e));
    1.52 -               return 0;
    1.53 -           }
    1.54 -   }
    1.55 -
    1.56 +    else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
    1.57 +    {
    1.58 +        return 0;
    1.59 +    }
    1.60 +
    1.61 +    BUG_ON(!create_pae_xen_mappings(pl3e));
    1.62      put_page_from_l3e(ol3e, pfn);
    1.63      return 1;
    1.64  }
    1.65 @@ -1254,11 +1247,10 @@ static int mod_l4_entry(l4_pgentry_t *pl
    1.66              return 0;
    1.67          }
    1.68      }
    1.69 -    else 
    1.70 +    else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
    1.71      {
    1.72 -        if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
    1.73 -            return 0;
    1.74 -     }
    1.75 +        return 0;
    1.76 +    }
    1.77  
    1.78      put_page_from_l4e(ol4e, pfn);
    1.79      return 1;
    1.80 @@ -1409,11 +1401,7 @@ int get_page_type(struct pfn_info *page,
    1.81          }
    1.82          else if ( unlikely((x & PGT_count_mask) == 0) )
    1.83          {
    1.84 -#ifdef CONFIG_X86_64
    1.85 -            if ( (x & (PGT_type_mask|PGT_va_mask)) != (type & ~PGT_va_mask))
    1.86 -#else
    1.87              if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
    1.88 -#endif
    1.89              {
    1.90                  if ( (x & PGT_type_mask) != (type & PGT_type_mask) )
    1.91                  {
    1.92 @@ -1445,17 +1433,14 @@ int get_page_type(struct pfn_info *page,
    1.93          }
    1.94          else
    1.95          {
    1.96 -#ifdef CONFIG_X86_64
    1.97 -            if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != (type & ~PGT_va_mask)) )
    1.98 -#else
    1.99              if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
   1.100 -#endif
   1.101              {
   1.102                  if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
   1.103                  {
   1.104                      if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
   1.105                           ((type & PGT_type_mask) != PGT_l1_page_table) )
   1.106 -                        MEM_LOG("Bad type (saw %" PRtype_info "!= exp %" PRtype_info ") for pfn %lx",
   1.107 +                        MEM_LOG("Bad type (saw %" PRtype_info
   1.108 +                                "!= exp %" PRtype_info ") for pfn %lx",
   1.109                                  x, type, page_to_pfn(page));
   1.110                      return 0;
   1.111                  }
   1.112 @@ -1718,9 +1703,6 @@ int do_mmuext_op(
   1.113              type = PGT_l1_page_table | PGT_va_mutable;
   1.114  
   1.115          pin_page:
   1.116 -#if CONFIG_PAGING_LEVELS >= 4
   1.117 -            type |= PGT_va_mutable;
   1.118 -#endif
   1.119              if ( shadow_mode_refcounts(FOREIGNDOM) )
   1.120                  type = PGT_writable_page;
   1.121  
   1.122 @@ -1744,16 +1726,16 @@ int do_mmuext_op(
   1.123  
   1.124  #ifndef CONFIG_X86_PAE /* Unsafe on PAE because of Xen-private mappings. */
   1.125          case MMUEXT_PIN_L2_TABLE:
   1.126 -            type = PGT_l2_page_table;
   1.127 +            type = PGT_l2_page_table | PGT_va_mutable;
   1.128              goto pin_page;
   1.129  #endif
   1.130  
   1.131          case MMUEXT_PIN_L3_TABLE:
   1.132 -            type = PGT_l3_page_table;
   1.133 +            type = PGT_l3_page_table | PGT_va_mutable;
   1.134              goto pin_page;
   1.135  
   1.136          case MMUEXT_PIN_L4_TABLE:
   1.137 -            type = PGT_l4_page_table;
   1.138 +            type = PGT_l4_page_table | PGT_va_mutable;
   1.139              goto pin_page;
   1.140  
   1.141          case MMUEXT_UNPIN_TABLE:
   1.142 @@ -1946,9 +1928,9 @@ int do_mmuext_op(
   1.143                       unlikely(_nd != _d) )
   1.144                  {
   1.145                      MEM_LOG("Bad page values %lx: ed=%p(%u), sd=%p,"
   1.146 -                            " caf=%08x, taf=%" PRtype_info "\n", page_to_pfn(page),
   1.147 -                            d, d->domain_id, unpickle_domptr(_nd), x,
   1.148 -                            page->u.inuse.type_info);
   1.149 +                            " caf=%08x, taf=%" PRtype_info "\n",
   1.150 +                            page_to_pfn(page), d, d->domain_id,
   1.151 +                            unpickle_domptr(_nd), x, page->u.inuse.type_info);
   1.152                      okay = 0;
   1.153                      goto reassign_fail;
   1.154                  }
   1.155 @@ -2111,7 +2093,8 @@ int do_mmu_update(
   1.156                      l1e = l1e_from_intpte(req.val);
   1.157                      okay = mod_l1_entry(va, l1e);
   1.158                      if ( okay && unlikely(shadow_mode_enabled(d)) )
   1.159 -                        shadow_l1_normal_pt_update(d, req.ptr, l1e, &sh_mapcache);
   1.160 +                        shadow_l1_normal_pt_update(
   1.161 +                            d, req.ptr, l1e, &sh_mapcache);
   1.162                      put_page_type(page);
   1.163                  }
   1.164                  break;
   1.165 @@ -2124,9 +2107,11 @@ int do_mmu_update(
   1.166  
   1.167                      /* FIXME: doesn't work with PAE */
   1.168                      l2e = l2e_from_intpte(req.val);
   1.169 -                    okay = mod_l2_entry((l2_pgentry_t *)va, l2e, mfn, type_info);
   1.170 +                    okay = mod_l2_entry(
   1.171 +                        (l2_pgentry_t *)va, l2e, mfn, type_info);
   1.172                      if ( okay && unlikely(shadow_mode_enabled(d)) )
   1.173 -                        shadow_l2_normal_pt_update(d, req.ptr, l2e, &sh_mapcache);
   1.174 +                        shadow_l2_normal_pt_update(
   1.175 +                            d, req.ptr, l2e, &sh_mapcache);
   1.176                      put_page_type(page);
   1.177                  }
   1.178                  break;
   1.179 @@ -2142,7 +2127,8 @@ int do_mmu_update(
   1.180                      l3e = l3e_from_intpte(req.val);
   1.181                      okay = mod_l3_entry(va, l3e, mfn, type_info);
   1.182                      if ( okay && unlikely(shadow_mode_enabled(d)) )
   1.183 -                        shadow_l3_normal_pt_update(d, req.ptr, l3e, &sh_mapcache);
   1.184 +                        shadow_l3_normal_pt_update(
   1.185 +                            d, req.ptr, l3e, &sh_mapcache);
   1.186                      put_page_type(page);
   1.187                  }
   1.188                  break;
   1.189 @@ -2158,7 +2144,8 @@ int do_mmu_update(
   1.190                      l4e = l4e_from_intpte(req.val);
   1.191                      okay = mod_l4_entry(va, l4e, mfn, type_info);
   1.192                      if ( okay && unlikely(shadow_mode_enabled(d)) )
   1.193 -                        shadow_l4_normal_pt_update(d, req.ptr, l4e, &sh_mapcache);
   1.194 +                        shadow_l4_normal_pt_update(
   1.195 +                            d, req.ptr, l4e, &sh_mapcache);
   1.196                      put_page_type(page);
   1.197                  }
   1.198                  break;
   1.199 @@ -2205,7 +2192,8 @@ int do_mmu_update(
   1.200              if ( unlikely(shadow_mode_translate(FOREIGNDOM) && IS_PRIV(d)) )
   1.201              {
   1.202                  shadow_lock(FOREIGNDOM);
   1.203 -                printk("privileged guest dom%d requests pfn=%lx to map mfn=%lx for dom%d\n",
   1.204 +                printk("privileged guest dom%d requests pfn=%lx to "
   1.205 +                       "map mfn=%lx for dom%d\n",
   1.206                         d->domain_id, gpfn, mfn, FOREIGNDOM->domain_id);
   1.207                  set_machinetophys(mfn, gpfn);
   1.208                  set_p2m_entry(FOREIGNDOM, gpfn, mfn, &sh_mapcache, &mapcache);
   1.209 @@ -2629,18 +2617,12 @@ int ptwr_debug = 0x0;
   1.210  #endif
   1.211  
   1.212  /* Re-validate a given p.t. page, given its prior snapshot */
   1.213 -int revalidate_l1(struct domain *d, l1_pgentry_t *l1page, l1_pgentry_t *snapshot)
   1.214 +int revalidate_l1(
   1.215 +    struct domain *d, l1_pgentry_t *l1page, l1_pgentry_t *snapshot)
   1.216  {
   1.217      l1_pgentry_t ol1e, nl1e;
   1.218      int modified = 0, i;
   1.219  
   1.220 -#if 0
   1.221 -    if ( d->domain_id )
   1.222 -        printk("%s: l1page mfn=%lx snapshot mfn=%lx\n", __func__,
   1.223 -               l1e_get_pfn(linear_pg_table[l1_linear_offset((unsigned long)l1page)]),
   1.224 -               l1e_get_pfn(linear_pg_table[l1_linear_offset((unsigned long)snapshot)]));
   1.225 -#endif
   1.226 -
   1.227      for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
   1.228      {
   1.229          ol1e = snapshot[i];
   1.230 @@ -2690,9 +2672,9 @@ void ptwr_flush(struct domain *d, const 
   1.231      l1_pgentry_t  *pl1e;
   1.232      l2_pgentry_t  *pl2e;
   1.233      unsigned int   modified;
   1.234 -#if defined(__x86_64__)
   1.235 +
   1.236 +#ifdef CONFIG_X86_64
   1.237      struct vcpu *v = current;
   1.238 -    /* If in user mode, switch to kernel mode just to read LDT mapping. */
   1.239      extern void toggle_guest_mode(struct vcpu *);
   1.240      int user_mode = !(v->arch.flags & TF_kernel_mode);
   1.241  #endif
   1.242 @@ -2700,8 +2682,10 @@ void ptwr_flush(struct domain *d, const 
   1.243      ASSERT(!shadow_mode_enabled(d));
   1.244  
   1.245      if ( unlikely(d->arch.ptwr[which].vcpu != current) )
   1.246 -        write_ptbase(d->arch.ptwr[which].vcpu);
   1.247 -    else 
   1.248 +        /* Don't use write_ptbase: it may switch to guest_user on x86/64! */
   1.249 +        write_cr3(pagetable_get_paddr(
   1.250 +            d->arch.ptwr[which].vcpu->arch.guest_table));
   1.251 +    else
   1.252          TOGGLE_MODE();
   1.253  
   1.254      l1va = d->arch.ptwr[which].l1va;
   1.255 @@ -2803,7 +2787,7 @@ static int ptwr_emulated_update(
   1.256          /* Align address; read full word. */
   1.257          addr &= ~(sizeof(physaddr_t)-1);
   1.258          if ( (rc = x86_emulate_read_std(addr, (unsigned long *)&full,
   1.259 -					sizeof(physaddr_t))) )
   1.260 +                                        sizeof(physaddr_t))) )
   1.261              return rc; 
   1.262          /* Mask out bits provided by caller. */
   1.263          full &= ~((((physaddr_t)1 << (bytes*8)) - 1) << (offset*8));
   1.264 @@ -2829,7 +2813,8 @@ static int ptwr_emulated_update(
   1.265           ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
   1.266           (page_get_owner(page) != d) )
   1.267      {
   1.268 -        MEM_LOG("ptwr_emulate: Page is mistyped or bad pte (%lx, %" PRtype_info ")\n",
   1.269 +        MEM_LOG("ptwr_emulate: Page is mistyped or bad pte "
   1.270 +                "(%lx, %" PRtype_info ")\n",
   1.271                  l1e_get_pfn(pte), page->u.inuse.type_info);
   1.272          return X86EMUL_UNHANDLEABLE;
   1.273      }
   1.274 @@ -2902,42 +2887,13 @@ static struct x86_mem_emulator ptwr_mem_
   1.275      .cmpxchg8b_emulated = ptwr_emulated_cmpxchg8b
   1.276  };
   1.277  
   1.278 -#if defined(__x86_64__)
   1.279 -/*
   1.280 - * Returns zero on if mapped, or -1 otherwise
   1.281 - */
   1.282 -static int __not_mapped(l2_pgentry_t *pl2e)
   1.283 -{
   1.284 -    unsigned long page = read_cr3();
   1.285 -
   1.286 -    page &= PAGE_MASK;
   1.287 -    page = ((unsigned long *) __va(page))[l4_table_offset((unsigned long)pl2e)];
   1.288 -    if ( !(page & _PAGE_PRESENT) ) 
   1.289 -        return -1;        
   1.290 -        
   1.291 -    page &= PAGE_MASK;
   1.292 -    page = ((unsigned long *) __va(page))[l3_table_offset((unsigned long)pl2e)];
   1.293 -    if ( !(page & _PAGE_PRESENT) ) 
   1.294 -        return -1;
   1.295 -
   1.296 -    page &= PAGE_MASK;
   1.297 -    page = ((unsigned long *) __va(page))[l2_table_offset((unsigned long)pl2e)];
   1.298 -    if ( !(page & _PAGE_PRESENT) )
   1.299 -        return -1;
   1.300 -
   1.301 -    return 0;
   1.302 -}
   1.303 -#else
   1.304 -#define __not_mapped(p) (0)
   1.305 -#endif
   1.306 -
   1.307  /* Write page fault handler: check if guest is trying to modify a PTE. */
   1.308  int ptwr_do_page_fault(struct domain *d, unsigned long addr)
   1.309  {
   1.310      unsigned long    pfn;
   1.311      struct pfn_info *page;
   1.312      l1_pgentry_t     pte;
   1.313 -    l2_pgentry_t    *pl2e;
   1.314 +    l2_pgentry_t    *pl2e, l2e;
   1.315      int              which;
   1.316      unsigned long    l2_idx;
   1.317  
   1.318 @@ -2984,10 +2940,7 @@ int ptwr_do_page_fault(struct domain *d,
   1.319      pl2e = &__linear_l2_table[l2_idx];
   1.320      which = PTWR_PT_INACTIVE;
   1.321  
   1.322 -    if ( unlikely(__not_mapped(pl2e)) )
   1.323 -        goto inactive;
   1.324 -
   1.325 -    if ( (l2e_get_pfn(*pl2e)) == pfn )
   1.326 +    if ( (__get_user(l2e.l2, &pl2e->l2) == 0) && (l2e_get_pfn(l2e) == pfn) )
   1.327      {
   1.328          /*
   1.329           * Check the PRESENT bit to set ACTIVE mode.
   1.330 @@ -2995,14 +2948,12 @@ int ptwr_do_page_fault(struct domain *d,
   1.331           * ACTIVE p.t. (it may be the same p.t. mapped at another virt addr).
   1.332           * The ptwr_flush call below will restore the PRESENT bit.
   1.333           */
   1.334 -        if ( likely(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ||
   1.335 +        if ( likely(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
   1.336               (d->arch.ptwr[PTWR_PT_ACTIVE].l1va &&
   1.337                (l2_idx == d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx)) )
   1.338              which = PTWR_PT_ACTIVE;
   1.339      }
   1.340  
   1.341 -  inactive:
   1.342 -
   1.343      /*
   1.344       * If this is a multi-processor guest then ensure that the page is hooked
   1.345       * into at most one L2 table, which must be the one running on this VCPU.
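The subtlest fix in mm.c is in ptwr_do_page_fault(): rather than pre-walking
the page tables with __not_mapped() to prove that pl2e is reachable, the new
code reads the L2 entry through the linear page table with __get_user(), which
returns non-zero if the access faults because some intermediate level is not
present. A minimal sketch of that pattern, assuming the Xen/Linux uaccess
convention for __get_user() and Xen's l2_pgentry_t type; the lookup_l2e()
helper is hypothetical, not part of this changeset:

    /* Sketch: snapshot an L2 entry without first walking the page tables. */
    static int lookup_l2e(l2_pgentry_t *pl2e, l2_pgentry_t *out)
    {
        l2_pgentry_t l2e;

        /*
         * __get_user() is assumed to return 0 on success and non-zero if
         * the read faults; a fault here means the linear-map slot is not
         * mapped at some level -- exactly the case __not_mapped() probed
         * for by hand.
         */
        if ( __get_user(l2e.l2, &pl2e->l2) != 0 )
            return -1;

        *out = l2e;  /* callers test flags/pfn on this snapshot, not *pl2e */
        return 0;
    }

Besides being shorter, this reads the entry exactly once into a local
snapshot, so the subsequent pfn and _PAGE_PRESENT tests cannot see two
different values if another VCPU updates the entry concurrently -- relevant
to the SMP guest support this changeset targets.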
     2.1 --- a/xen/include/asm-x86/page.h	Tue Aug 09 09:34:06 2005 +0000
     2.2 +++ b/xen/include/asm-x86/page.h	Tue Aug 09 10:42:51 2005 +0000
     2.3 @@ -208,21 +208,21 @@ typedef struct { u64 pfn; } pagetable_t;
     2.4       + DOMAIN_ENTRIES_PER_L4_PAGETABLE)
     2.5  #endif
     2.6  
     2.7 -#define VA_LINEAR_PT_VIRT_START (LINEAR_PT_VIRT_START & VADDR_MASK)
     2.8 -#define linear_l1_table                                                  \
     2.9 +#define LINEAR_PT_OFFSET (LINEAR_PT_VIRT_START & VADDR_MASK)
    2.10 +#define linear_l1_table                                             \
    2.11      ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
    2.12 -#define __linear_l2_table                                                \
    2.13 -    ((l2_pgentry_t *)(LINEAR_PT_VIRT_START +                             \
    2.14 -                     (VA_LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<0))))
    2.15 -#define __linear_l3_table                                                \
    2.16 -    ((l3_pgentry_t *)(LINEAR_PT_VIRT_START +                             \
    2.17 -                     (VA_LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<0)) + \
    2.18 -                     (VA_LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<1))))
    2.19 -#define __linear_l4_table                                                \
    2.20 -    ((l4_pgentry_t *)(LINEAR_PT_VIRT_START +                             \
    2.21 -                     (VA_LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<0)) + \
    2.22 -                     (VA_LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<1)) + \
    2.23 -                     (VA_LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<2))))
    2.24 +#define __linear_l2_table                                           \
    2.25 +    ((l2_pgentry_t *)(LINEAR_PT_VIRT_START +                        \
    2.26 +                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0))))
    2.27 +#define __linear_l3_table                                           \
    2.28 +    ((l3_pgentry_t *)(LINEAR_PT_VIRT_START +                        \
    2.29 +                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0)) +   \
    2.30 +                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<1))))
    2.31 +#define __linear_l4_table                                           \
    2.32 +    ((l4_pgentry_t *)(LINEAR_PT_VIRT_START +                        \
    2.33 +                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0)) +   \
    2.34 +                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<1)) +   \
    2.35 +                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<2))))
    2.36  
    2.37  #define linear_pg_table linear_l1_table
    2.38  #define linear_l2_table(_ed) ((_ed)->arch.guest_vtable)
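The page.h hunk is a rename plus re-wrap: VA_LINEAR_PT_VIRT_START becomes
LINEAR_PT_OFFSET, which better describes the value -- it is the self-map's
offset within the virtual address space, not another start address. Each
deeper view of the linear (recursive) page table is reached by pushing that
offset through one more level of translation, i.e. shifting it right by a
further PAGETABLE_ORDER bits (9 on x86/64). A standalone sketch of the
arithmetic, using hypothetical stand-in constants rather than Xen's actual
virtual-memory layout:

    /*
     * Sketch only: how the linear-map macros compose. The constants are
     * hypothetical, not Xen's real layout.
     */
    #include <stdio.h>

    #define PAGETABLE_ORDER   9                      /* 512 entries/table */
    #define LIN_PT_VIRT_START 0xffff828000000000ULL  /* hypothetical slot */
    #define VADDR_MASK        ((1ULL << 48) - 1)     /* 48-bit VA space   */
    #define LINEAR_PT_OFFSET  (LIN_PT_VIRT_START & VADDR_MASK)

    int main(void)
    {
        /* Each level feeds the offset through one more 9-bit translation. */
        unsigned long long l1 = LIN_PT_VIRT_START;
        unsigned long long l2 = l1 + (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER << 0));
        unsigned long long l3 = l2 + (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER << 1));
        unsigned long long l4 = l3 + (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER << 2));

        printf("l1 %#llx\nl2 %#llx\nl3 %#llx\nl4 %#llx\n", l1, l2, l3, l4);
        return 0;
    }

Indexing __linear_l2_table with an L2 slot number then yields the entry that
maps the corresponding L1 table, which is how ptwr_do_page_fault() obtains
pl2e in the mm.c hunk above.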