ia64/xen-unstable

changeset 2420:5d7a7c656fa3

bitkeeper revision 1.1159.69.3 (4138d7a65FvXU3lh0Vx8Nsl4KhPxGw)

Fix a potential security hole in the writeable pagetable implementation:
We weren't ensuring that an L1 page's VA backpointer is immutable
after the backpointer is initialised when the page first becomes
linked into a pagetable. The backpointer can only be released after
the type count drops to zero (or 1 if the page is pinned).
In summary: We now ensure that if an L1 page is used in multiple
pagetables it must be at the same virtual address in all of them,
and that L1 pages can only be used once in any given pagetable.
Neither of these extra rules should be a problem for any OS.
author iap10@labyrinth.cl.cam.ac.uk
date Fri Sep 03 20:44:22 2004 +0000 (2004-09-03)
parents 51f753d105f7
children 2c92a3484f05 ac282d5dd9d1
files xen/arch/x86/domain.c xen/arch/x86/memory.c xen/arch/x86/traps.c xen/include/asm-x86/mm.h
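
[Editorial note] For reference, a minimal sketch (not part of this
changeset) of how the va backpointer sits inside
page->u.inuse.type_info, reconstructed from the PGT_* constants in the
xen/include/asm-x86/mm.h hunk below; the va_backpointer() helper is
hypothetical:

    /* Bits 0-17: type use count; bits 18-27: 10-bit va backpointer
     * (the L2 slot, i.e. the top 10 bits of the virtual address at
     * which the L1 page is mapped). An all-ones va field doubles as
     * the 'still mutable' sentinel, hence PGT_va_mutable == PGT_va_mask. */
    #define PGT_va_shift    18
    #define PGT_va_mask     (((1 << 10) - 1) << PGT_va_shift)
    #define PGT_va_mutable  PGT_va_mask
    #define PGT_count_mask  ((1 << 18) - 1)

    static inline unsigned long va_backpointer(unsigned long type_info)
    {
        return (type_info & PGT_va_mask) >> PGT_va_shift;  /* L2 slot */
    }
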
line diff
     1.1 --- a/xen/arch/x86/domain.c	Fri Sep 03 12:05:52 2004 +0000
     1.2 +++ b/xen/arch/x86/domain.c	Fri Sep 03 20:44:22 2004 +0000
     1.3 @@ -745,6 +745,9 @@ int construct_dom0(struct domain *p,
     1.4          {
     1.5              page->u.inuse.type_info &= ~PGT_type_mask;
     1.6              page->u.inuse.type_info |= PGT_l1_page_table;
     1.7 +	    page->u.inuse.type_info |= 
     1.8 +		((v_start>>L2_PAGETABLE_SHIFT)+(count-1))<<PGT_va_shift;
     1.9 +
    1.10              get_page(page, p); /* an extra ref because of readable mapping */
    1.11          }
    1.12          l1tab++;
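
[Editorial note] A hedged worked example of the backpointer value set
above (v_start is whatever construct_dom0 was given; 0xC0000000 is only
an assumed, Linux-typical value): the count'th L1 page covers one L2
slot, starting at v_start's slot.

    unsigned long v_start = 0xC0000000UL;   /* assumed for illustration */
    unsigned long count   = 1;              /* first L1 page */
    /* L2_PAGETABLE_SHIFT == 22 on non-PAE x86-32 */
    unsigned long va_idx  = (v_start >> 22) + (count - 1);
    /* va_idx == 768, the slot mapping 0xC0000000-0xC03FFFFF; it is
     * stored as va_idx << PGT_va_shift in page->u.inuse.type_info. */
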
     2.1 --- a/xen/arch/x86/memory.c	Fri Sep 03 12:05:52 2004 +0000
     2.2 +++ b/xen/arch/x86/memory.c	Fri Sep 03 20:44:22 2004 +0000
     2.3 @@ -142,8 +142,6 @@ static struct domain *dom_xen, *dom_io;
     2.4  
     2.5  void arch_init_memory(void)
     2.6  {
     2.7 -    static void ptwr_init_backpointers(void);
     2.8 -    static void ptwr_disable(void);
     2.9      unsigned long mfn;
    2.10  
    2.11      /*
    2.12 @@ -165,10 +163,15 @@ void arch_init_memory(void)
    2.13  
    2.14      memset(percpu_info, 0, sizeof(percpu_info));
    2.15  
    2.16 +/* XXXX WRITEABLE PAGETABLES SHOULD BE A DOMAIN CREATION-TIME
    2.17 +   DECISION, NOT SOMETHING THAT IS CHANGED ON A RUNNING DOMAIN 
    2.18 +   !!! FIX ME !!!! 
    2.19 + */
    2.20 +
    2.21      vm_assist_info[VMASST_TYPE_writable_pagetables].enable =
    2.22 -        ptwr_init_backpointers;
    2.23 +        NULL;
    2.24      vm_assist_info[VMASST_TYPE_writable_pagetables].disable =
    2.25 -        ptwr_disable;
    2.26 +        NULL;
    2.27  
    2.28      for ( mfn = 0; mfn < max_page; mfn++ )
    2.29          frame_table[mfn].count_info |= PGC_always_set;
    2.30 @@ -319,17 +322,6 @@ static int get_page_and_type_from_pagenr
    2.31  }
    2.32  
    2.33  
    2.34 -static inline void set_l1_page_va(unsigned long pfn,
    2.35 -                                  unsigned long va_idx)
    2.36 -{
    2.37 -    struct pfn_info *page;
    2.38 -    
    2.39 -    page = &frame_table[pfn];
    2.40 -    page->u.inuse.type_info &= ~PGT_va_mask;
    2.41 -    page->u.inuse.type_info |= va_idx << PGT_va_shift;
    2.42 -}
    2.43 -
    2.44 -
    2.45  /*
     2.46   * We allow L2 tables to map each other (a.k.a. linear page tables). It
     2.47   * needs some special care with reference counts and access permissions:
    2.48 @@ -463,8 +455,10 @@ get_page_from_l1e(
    2.49  /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
    2.50  static int 
    2.51  get_page_from_l2e(
    2.52 -    l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
    2.53 +    l2_pgentry_t l2e, unsigned long pfn, struct domain *d, unsigned long va_idx)
    2.54  {
    2.55 +    int rc;
    2.56 +
    2.57      if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
    2.58          return 1;
    2.59  
    2.60 @@ -475,8 +469,11 @@ get_page_from_l2e(
    2.61          return 0;
    2.62      }
    2.63  
    2.64 -    if ( unlikely(!get_page_and_type_from_pagenr(
    2.65 -        l2_pgentry_to_pagenr(l2e), PGT_l1_page_table, d)) )
    2.66 +    rc = get_page_and_type_from_pagenr(
    2.67 +        l2_pgentry_to_pagenr(l2e), 
    2.68 +	PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
    2.69 +
    2.70 +    if ( unlikely(!rc) )
    2.71          return get_linear_pagetable(l2e, pfn, d);
    2.72  
    2.73      return 1;
    2.74 @@ -550,9 +547,8 @@ static int alloc_l2_table(struct pfn_inf
    2.75      pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
    2.76  
    2.77      for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ ) {
    2.78 -        if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d)) )
    2.79 +        if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
    2.80              goto fail;
    2.81 -        set_l1_page_va(l2_pgentry_val(pl2e[i]) >> PAGE_SHIFT, i);
    2.82      }
    2.83      
    2.84  #if defined(__i386__)
    2.85 @@ -674,11 +670,10 @@ static int mod_l2_entry(l2_pgentry_t *pl
    2.86          if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
    2.87              return update_l2e(pl2e, ol2e, nl2e);
    2.88  
    2.89 -        if ( unlikely(!get_page_from_l2e(nl2e, pfn, current)) )
    2.90 +        if ( unlikely(!get_page_from_l2e(nl2e, pfn, current, 
    2.91 +					((unsigned long)
    2.92 +					 pl2e & ~PAGE_MASK) >> 2 )) )
    2.93              return 0;
    2.94 -        
    2.95 -        set_l1_page_va(l2_pgentry_val(nl2e) >> PAGE_SHIFT,
    2.96 -                       ((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2);
    2.97  
    2.98          if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
    2.99          {
   2.100 @@ -833,10 +828,20 @@ static int do_extended_command(unsigned 
   2.101      {
   2.102      case MMUEXT_PIN_L1_TABLE:
   2.103      case MMUEXT_PIN_L2_TABLE:
   2.104 +
   2.105 +	/* When we pin an L1 page we now insist that the va
   2.106 +	   backpointer (used for writable page tables) must still be
   2.107 +	   mutable. This is an additional restriction even for guests
   2.108 +	   that don't use writable page tables, but I don't think it
    2.109 +	   will break anything, as guests typically pin pages before
    2.110 +	   they are used, hence the backpointer will still be mutable. */
   2.111 +
   2.112          okay = get_page_and_type_from_pagenr(
   2.113              pfn, 
   2.114 -            (cmd==MMUEXT_PIN_L2_TABLE) ? PGT_l2_page_table : PGT_l1_page_table,
   2.115 +            ((cmd==MMUEXT_PIN_L2_TABLE) ? 
   2.116 +	     PGT_l2_page_table : (PGT_l1_page_table | PGT_va_mutable) ) ,
   2.117              FOREIGNDOM);
   2.118 +
   2.119          if ( unlikely(!okay) )
   2.120          {
   2.121              MEM_LOG("Error while pinning pfn %08lx", pfn);
   2.122 @@ -894,8 +899,7 @@ static int do_extended_command(unsigned 
   2.123              /*
   2.124               * Note that we tick the clock /after/ dropping the old base's
   2.125               * reference count. If the page tables got freed then this will
   2.126 -             * avoid unnecessary TLB flushes when the pages are reused.
   2.127 -             */
   2.128 +             * avoid unnecessary TLB flushes when the pages are reused.  */
   2.129              tlb_clocktick();
   2.130          }
   2.131          else
   2.132 @@ -1230,7 +1234,7 @@ int do_mmu_update(mmu_update_t *ureqs, i
   2.133              switch ( (page->u.inuse.type_info & PGT_type_mask) )
   2.134              {
   2.135              case PGT_l1_page_table: 
   2.136 -                if ( likely(get_page_type(page, PGT_l1_page_table)) )
   2.137 +                if ( likely(passive_get_page_type(page, PGT_l1_page_table)) )
   2.138                  {
   2.139                      okay = mod_l1_entry((l1_pgentry_t *)va, 
   2.140                                          mk_l1_pgentry(req.val)); 
   2.141 @@ -1623,9 +1627,11 @@ int ptwr_do_page_fault(unsigned long add
   2.142      PTWR_PRINTK(("get user %p for va %08lx\n",
   2.143                   &linear_pg_table[addr>>PAGE_SHIFT], addr));
   2.144  #endif
   2.145 +
    2.146 +    /* Testing for _PAGE_PRESENT in the L2 avoids lots of unnecessary fixups */
   2.147      if ( (l2_pgentry_val(linear_l2_table[addr >> L2_PAGETABLE_SHIFT]) &
   2.148 -          _PAGE_PRESENT) &&
   2.149 -         (__get_user(pte, (unsigned long *)
   2.150 +      _PAGE_PRESENT) &&
   2.151 +	 (__get_user(pte, (unsigned long *)
   2.152                       &linear_pg_table[addr >> PAGE_SHIFT]) == 0) )
   2.153      {
   2.154          pfn = pte >> PAGE_SHIFT;
   2.155 @@ -1650,6 +1656,7 @@ int ptwr_do_page_fault(unsigned long add
   2.156  
   2.157              if ( l2_pgentry_val(*pl2e) >> PAGE_SHIFT != pfn )
   2.158              {
   2.159 +		/* this L1 is not in the current address space */
   2.160                  l1_pgentry_t *pl1e;
   2.161                  PTWR_PRINTK(("[I] freeing l1 page %p taf %08x/%08x\n", page,
   2.162                               page->u.inuse.type_info,
   2.163 @@ -1715,36 +1722,6 @@ int ptwr_do_page_fault(unsigned long add
   2.164      return 0;
   2.165  }
   2.166  
   2.167 -static void ptwr_init_backpointers(void)
   2.168 -{
   2.169 -    struct pfn_info *page;
   2.170 -    unsigned long pde;
   2.171 -    int va_idx;
   2.172 -
   2.173 -    for ( va_idx = 0; va_idx < DOMAIN_ENTRIES_PER_L2_PAGETABLE; va_idx++ )
   2.174 -    {
   2.175 -        /* check if entry valid */
   2.176 -        pde = l2_pgentry_val(linear_l2_table[va_idx]);
   2.177 -        if ( (pde & _PAGE_PRESENT) == 0 )
   2.178 -            continue;
   2.179 -
   2.180 -        page = &frame_table[pde >> PAGE_SHIFT];
   2.181 -        /* assert that page is an l1_page_table   XXXcl maybe l2? */
   2.182 -        if ( (page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table ) {
   2.183 -	    MEM_LOG("ptwr: Inconsistent pagetable: pde %lx not an l1 page\n",
   2.184 -		    pde >> PAGE_SHIFT);
   2.185 -	    domain_crash();
   2.186 -	}
   2.187 -        page->u.inuse.type_info &= ~PGT_va_mask;
   2.188 -        page->u.inuse.type_info |= va_idx << PGT_va_shift;
   2.189 -    }
   2.190 -}
   2.191 -
   2.192 -static void ptwr_disable(void)
   2.193 -{
   2.194 -    __cleanup_writable_pagetable(PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
   2.195 -}
   2.196 -
   2.197  #ifndef NDEBUG
   2.198  void ptwr_status(void)
   2.199  {
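
[Editorial note] A sketch of the index computation that mod_l2_entry now
passes to get_page_from_l2e (assuming x86-32 with a 4-byte l2_pgentry_t,
as in this tree; l2_slot_of() is a hypothetical helper): the entry's
byte offset within its L2 page, divided by the entry size, is the L2
slot, and therefore the top 10 bits of the virtual address at which the
new L1 will be mapped.

    static unsigned long l2_slot_of(l2_pgentry_t *pl2e)
    {
        /* offset within the L2 page / sizeof(l2_pgentry_t) */
        return ((unsigned long)pl2e & ~PAGE_MASK) >> 2;
    }
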
     3.1 --- a/xen/arch/x86/traps.c	Fri Sep 03 12:05:52 2004 +0000
     3.2 +++ b/xen/arch/x86/traps.c	Fri Sep 03 20:44:22 2004 +0000
     3.3 @@ -330,9 +330,9 @@ asmlinkage void do_page_fault(struct pt_
     3.4              return; /* successfully copied the mapping */
     3.5      }
     3.6  
     3.7 -    if ( unlikely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
     3.8 +    if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
     3.9      {
    3.10 -        if ( (addr >> L2_PAGETABLE_SHIFT) == ptwr_info[cpu].disconnected )
    3.11 +        if ( unlikely((addr >> L2_PAGETABLE_SHIFT) == ptwr_info[cpu].disconnected ))
    3.12          {
    3.13              ptwr_reconnect_disconnected(addr);
    3.14              return;
     4.1 --- a/xen/include/asm-x86/mm.h	Fri Sep 03 12:05:52 2004 +0000
     4.2 +++ b/xen/include/asm-x86/mm.h	Fri Sep 03 20:44:22 2004 +0000
     4.3 @@ -74,6 +74,7 @@ struct pfn_info
     4.4   /* 10-bit most significant bits of va address if used as l1 page table */
     4.5  #define PGT_va_shift        18
     4.6  #define PGT_va_mask         (((1<<10)-1)<<PGT_va_shift)
     4.7 +#define PGT_va_mutable      PGT_va_mask /* va backpointer is still mutable */
     4.8   /* 18-bit count of uses of this frame as its current type. */
     4.9  #define PGT_count_mask      ((1<<18)-1)
    4.10  
    4.11 @@ -198,6 +199,13 @@ static inline void put_page_type(struct 
    4.12                  nx &= ~PGT_validated;
    4.13              }
    4.14          }
    4.15 +	else if ( unlikely( ((nx & PGT_count_mask) == 1) && 
    4.16 +	    test_bit(_PGC_guest_pinned, &page->count_info)) )
    4.17 +	{
     4.18 +	    /* If the page is pinned but we're dropping the last reference
     4.19 +	       other than the pin's own, make the va backpointer mutable again. */
    4.20 +	    nx |= PGT_va_mutable;
    4.21 +	}
    4.22      }
    4.23      while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
    4.24  }
    4.25 @@ -217,22 +225,35 @@ static inline int get_page_type(struct p
    4.26          }
    4.27          else if ( unlikely((x & PGT_count_mask) == 0) )
    4.28          {
    4.29 -            if ( (x & PGT_type_mask) != type )
    4.30 +            if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
    4.31              {
    4.32 -                nx &= ~(PGT_type_mask | PGT_validated);
    4.33 +                nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
    4.34                  nx |= type;
    4.35                  /* No extra validation needed for writable pages. */
    4.36 -                if ( type == PGT_writable_page )
    4.37 +                if ( (type & PGT_type_mask) == PGT_writable_page )
    4.38                      nx |= PGT_validated;
    4.39              }
    4.40          }
    4.41 -        else if ( unlikely((x & PGT_type_mask) != type) )
    4.42 +        else if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
    4.43          {
    4.44              DPRINTK("Unexpected type (saw %08x != exp %08x) for pfn %08lx\n",
    4.45                      x & PGT_type_mask, type, page_to_pfn(page));
    4.46              return 0;
    4.47          }
    4.48 -        else if ( unlikely(!(x & PGT_validated)) )
    4.49 +	else if ( (x & PGT_va_mask) == PGT_va_mutable )
    4.50 +	{
     4.51 +	    /* The va backpointer is currently mutable, hence we update it. */
    4.52 +	    nx &= ~PGT_va_mask;
    4.53 +	    nx |= type; /* we know the actual type is correct */
    4.54 +	}
    4.55 +        else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask) ) )
    4.56 +        {
    4.57 +	    /* The va backpointer wasn't mutable, and is different :-( */
    4.58 +            DPRINTK("Unexpected va backpointer (saw %08x != exp %08x) for pfn %08lx\n",
    4.59 +                    x, type, page_to_pfn(page));
    4.60 +            return 0;
    4.61 +        }
    4.62 +	else if ( unlikely(!(x & PGT_validated)) )
    4.63          {
    4.64              /* Someone else is updating validation of this page. Wait... */
    4.65              while ( (y = page->u.inuse.type_info) != x )
    4.66 @@ -248,7 +269,7 @@ static inline int get_page_type(struct p
    4.67      if ( unlikely(!(nx & PGT_validated)) )
    4.68      {
    4.69          /* Try to validate page type; drop the new reference on failure. */
    4.70 -        if ( unlikely(!alloc_page_type(page, type)) )
    4.71 +        if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
    4.72          {
    4.73              DPRINTK("Error while validating pfn %08lx for type %08x."
    4.74                      " caf=%08x taf=%08x\n",
    4.75 @@ -258,12 +279,62 @@ static inline int get_page_type(struct p
    4.76              put_page_type(page);
    4.77              return 0;
    4.78          }
    4.79 +
    4.80          set_bit(_PGT_validated, &page->u.inuse.type_info);
    4.81      }
    4.82  
    4.83      return 1;
    4.84  }
    4.85  
    4.86 +/* This 'passive' version of get_page_type doesn't attempt to validate
    4.87 +the page, but just checks the type and increments the type count.  The
    4.88 +function is called while doing a NORMAL_PT_UPDATE of an entry in an L1
     4.89 +page table: We want to 'lock' the page for the brief period while
     4.90 +we're doing the update, but we're not actually linking it into a
    4.91 +pagetable. */
    4.92 +
    4.93 +static inline int passive_get_page_type(struct pfn_info *page, u32 type)
    4.94 +{
    4.95 +    u32 nx, x, y = page->u.inuse.type_info;
    4.96 + again:
    4.97 +    do {
    4.98 +        x  = y;
    4.99 +        nx = x + 1;
   4.100 +        if ( unlikely((nx & PGT_count_mask) == 0) )
   4.101 +        {
   4.102 +            DPRINTK("Type count overflow on pfn %08lx\n", page_to_pfn(page));
   4.103 +            return 0;
   4.104 +        }
   4.105 +        else if ( unlikely((x & PGT_count_mask) == 0) )
   4.106 +        {
   4.107 +            if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
   4.108 +            {
   4.109 +                nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
   4.110 +                nx |= type;
   4.111 +            }
   4.112 +        }
   4.113 +        else if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
   4.114 +        {
   4.115 +            DPRINTK("Unexpected type (saw %08x != exp %08x) for pfn %08lx\n",
   4.116 +                    x & PGT_type_mask, type, page_to_pfn(page));
   4.117 +            return 0;
   4.118 +        }
   4.119 +	else if ( unlikely(!(x & PGT_validated)) )
   4.120 +        {
   4.121 +            /* Someone else is updating validation of this page. Wait... */
   4.122 +            while ( (y = page->u.inuse.type_info) != x )
   4.123 +            {
   4.124 +                rep_nop();
   4.125 +                barrier();
   4.126 +            }
   4.127 +            goto again;
   4.128 +        }
   4.129 +    }
   4.130 +    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
   4.131 +
   4.132 +    return 1;
   4.133 +}
   4.134 +
   4.135  
   4.136  static inline void put_page_and_type(struct pfn_info *page)
   4.137  {
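
[Editorial note] Taken together, the new type rules can be exercised as
in this hedged usage sketch (pg and the slot numbers are hypothetical;
error handling omitted):

    /* First use at L2 slot 768: count 0->1 records the backpointer.   */
    get_page_type(pg, PGT_l1_page_table | (768UL << PGT_va_shift));  /* ok    */
    /* Linked into a second pagetable at the same virtual address: ok. */
    get_page_type(pg, PGT_l1_page_table | (768UL << PGT_va_shift));  /* ok    */
    /* Same L1 at a different virtual address: now refused.            */
    get_page_type(pg, PGT_l1_page_table | (769UL << PGT_va_shift));  /* fails */

    /* Pinning passes PGT_va_mutable, so it only succeeds while the
     * backpointer is still mutable; put_page_type() re-sets
     * PGT_va_mutable once a pinned page's type count falls back to 1. */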