ia64/xen-unstable

changeset 2460:9c7d7819508a

bitkeeper revision 1.1159.74.1 (413f431aIw1lNSoAl0H63fnzXoSZ8Q)

Fix TLB flushing on page-type changes. In particular, page-table pages
must trigger a flush when their type changes. To keep the code simple we
also flush for LDT/GDT pages; this may be unnecessary, but those pages
change type very infrequently.
author kaf24@freefall.cl.cam.ac.uk
date Wed Sep 08 17:36:26 2004 +0000 (2004-09-08)
parents d3c0c3c96dc0
children 77bc1f0ea51f
files xen/arch/x86/domain.c xen/arch/x86/memory.c xen/include/asm-x86/mm.h
line diff
     1.1 --- a/xen/arch/x86/domain.c	Wed Sep 08 13:17:25 2004 +0000
     1.2 +++ b/xen/arch/x86/domain.c	Wed Sep 08 17:36:26 2004 +0000
     1.3 @@ -733,7 +733,6 @@ int construct_dom0(struct domain *p,
     1.4          *l1tab++ = mk_l1_pgentry((mfn << PAGE_SHIFT) | L1_PROT);
     1.5          
     1.6          page = &frame_table[mfn];
     1.7 -        set_bit(_PGC_tlb_flush_on_type_change, &page->count_info);
     1.8          if ( !get_page_and_type(page, p, PGT_writable_page) )
     1.9              BUG();
    1.10  
     2.1 --- a/xen/arch/x86/memory.c	Wed Sep 08 13:17:25 2004 +0000
     2.2 +++ b/xen/arch/x86/memory.c	Wed Sep 08 17:36:26 2004 +0000
     2.3 @@ -462,7 +462,6 @@ get_page_from_l1e(
     2.4      {
     2.5          if ( unlikely(!get_page_type(page, PGT_writable_page)) )
     2.6              return 0;
     2.7 -        set_bit(_PGC_tlb_flush_on_type_change, &page->count_info);
     2.8      }
     2.9  
    2.10      return 1;
    2.11 @@ -774,18 +773,6 @@ static int mod_l1_entry(l1_pgentry_t *pl
    2.12  
    2.13  int alloc_page_type(struct pfn_info *page, unsigned int type)
    2.14  {
    2.15 -    if ( unlikely(test_and_clear_bit(_PGC_tlb_flush_on_type_change, 
    2.16 -                                     &page->count_info)) )
    2.17 -    {
    2.18 -        struct domain *p = page->u.inuse.domain;
    2.19 -        if ( unlikely(NEED_FLUSH(tlbflush_time[p->processor],
    2.20 -                                 page->tlbflush_timestamp)) )
    2.21 -        {
    2.22 -            perfc_incr(need_flush_tlb_flush);
    2.23 -            flush_tlb_cpu(p->processor);
    2.24 -        }
    2.25 -    }
    2.26 -
    2.27      switch ( type )
    2.28      {
    2.29      case PGT_l1_page_table:
    2.30 @@ -833,6 +820,151 @@ void free_page_type(struct pfn_info *pag
    2.31  }
    2.32  
    2.33  
    2.34 +void put_page_type(struct pfn_info *page)
    2.35 +{
    2.36 +    u32 nx, x, y = page->u.inuse.type_info;
    2.37 +
    2.38 + again:
    2.39 +    do {
    2.40 +        x  = y;
    2.41 +        nx = x - 1;
    2.42 +
    2.43 +        ASSERT((x & PGT_count_mask) != 0);
    2.44 +        ASSERT(x & PGT_validated);
    2.45 +
    2.46 +        if ( unlikely((nx & PGT_count_mask) == 0) )
    2.47 +        {
    2.48 +            /* Record TLB information for flush later. Races are harmless. */
    2.49 +            page->tlbflush_timestamp = tlbflush_clock;
    2.50 +            
    2.51 +            if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) )
    2.52 +            {
    2.53 +                /*
    2.54 +                 * Page-table pages must be unvalidated when count is zero. The
    2.55 +                 * 'free' is safe because the refcnt is non-zero and validated
    2.56 +                 * bit is clear => other ops will spin or fail.
    2.57 +                 */
    2.58 +                if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 
    2.59 +                                           x & ~PGT_validated)) != x) )
    2.60 +                    goto again;
    2.61 +                /* We cleared the 'valid bit' so we do the clear up. */
    2.62 +                free_page_type(page, x & PGT_type_mask);
    2.63 +                /* Carry on, but with the 'valid bit' now clear. */
    2.64 +                x  &= ~PGT_validated;
    2.65 +                nx &= ~PGT_validated;
    2.66 +            }
    2.67 +        }
    2.68 +	else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == 
    2.69 +                           (PGT_pinned | 1)) )
    2.70 +	{
    2.71 +            /* Page is now only pinned. Make the back pointer mutable again. */
    2.72 +	    nx |= PGT_va_mutable;
    2.73 +	}
    2.74 +    }
    2.75 +    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
    2.76 +}
    2.77 +
    2.78 +
    2.79 +int get_page_type(struct pfn_info *page, u32 type)
    2.80 +{
    2.81 +    u32 nx, x, y = page->u.inuse.type_info;
    2.82 +
    2.83 + again:
    2.84 +    do {
    2.85 +        x  = y;
    2.86 +        nx = x + 1;
    2.87 +        if ( unlikely((nx & PGT_count_mask) == 0) )
    2.88 +        {
    2.89 +            MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
    2.90 +            return 0;
    2.91 +        }
    2.92 +        else if ( unlikely((x & PGT_count_mask) == 0) )
    2.93 +        {
    2.94 +            if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
    2.95 +            {
    2.96 +                /*
    2.97 +                 * On type change we check to flush stale TLB entries. This 
    2.98 +                 * may be unnecessary (e.g., page was GDT/LDT) but those
    2.99 +                 * circumstances should be very rare.
   2.100 +                 */
   2.101 +                struct domain *d = page->u.inuse.domain;
   2.102 +                if ( unlikely(NEED_FLUSH(tlbflush_time[d->processor],
   2.103 +                                         page->tlbflush_timestamp)) )
   2.104 +                {
   2.105 +                    perfc_incr(need_flush_tlb_flush);
   2.106 +                    flush_tlb_cpu(d->processor);
   2.107 +                }
   2.108 +
   2.109 +                /* We lose existing type, back pointer, and validity. */
   2.110 +                nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
   2.111 +                nx |= type;
   2.112 +
   2.113 +                /* No special validation needed for writable pages. */
   2.114 +                /* Page tables and GDT/LDT need to be scanned for validity. */
   2.115 +                if ( type == PGT_writable_page )
   2.116 +                    nx |= PGT_validated;
   2.117 +            }
   2.118 +        }
   2.119 +        else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
   2.120 +        {
   2.121 +            if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
   2.122 +            {
   2.123 +                if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
   2.124 +                     ((type & PGT_type_mask) != PGT_l1_page_table) )
   2.125 +                    MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
   2.126 +                            x & PGT_type_mask, type, page_to_pfn(page));
   2.127 +                return 0;
   2.128 +            }
   2.129 +            else if ( (x & PGT_va_mask) == PGT_va_mutable )
   2.130 +            {
   2.131 +                /* The va backpointer is mutable, hence we update it. */
   2.132 +                nx &= ~PGT_va_mask;
   2.133 +                nx |= type; /* we know the actual type is correct */
   2.134 +            }
   2.135 +            else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
   2.136 +            {
   2.137 +                /* The va backpointer wasn't mutable, and is different. */
   2.138 +                MEM_LOG("Unexpected va backpointer (saw %08x != exp %08x)"
   2.139 +                        " for pfn %08lx\n", x, type, page_to_pfn(page));
   2.140 +                return 0;
   2.141 +            }
   2.142 +        }
   2.143 +	else if ( unlikely(!(x & PGT_validated)) )
   2.144 +        {
   2.145 +            /* Someone else is updating validation of this page. Wait... */
   2.146 +            while ( (y = page->u.inuse.type_info) == x )
   2.147 +            {
   2.148 +                rep_nop();
   2.149 +                barrier();
   2.150 +            }
   2.151 +            goto again;
   2.152 +        }
   2.153 +    }
   2.154 +    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
   2.155 +
   2.156 +    if ( unlikely(!(nx & PGT_validated)) )
   2.157 +    {
   2.158 +        /* Try to validate page type; drop the new reference on failure. */
   2.159 +        if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
   2.160 +        {
   2.161 +            MEM_LOG("Error while validating pfn %08lx for type %08x."
   2.162 +                    " caf=%08x taf=%08x\n",
   2.163 +                    page_to_pfn(page), type,
   2.164 +		    page->count_info,
   2.165 +		    page->u.inuse.type_info);
   2.166 +            /* Noone else can get a reference. We hold the only ref. */
   2.167 +            page->u.inuse.type_info = 0;
   2.168 +            return 0;
   2.169 +        }
   2.170 +
   2.171 +        /* Noone else is updating simultaneously. */
   2.172 +        __set_bit(_PGT_validated, &page->u.inuse.type_info);
   2.173 +    }
   2.174 +
   2.175 +    return 1;
   2.176 +}
   2.177 +
   2.178 +
   2.179  static int do_extended_command(unsigned long ptr, unsigned long val)
   2.180  {
   2.181      int okay = 1, cpu = smp_processor_id();
   2.182 @@ -1747,7 +1879,6 @@ int ptwr_do_page_fault(unsigned long add
   2.183  #ifndef NDEBUG
   2.184  void ptwr_status(void)
   2.185  {
   2.186 -    int i;
   2.187      unsigned long pte, pfn;
   2.188      struct pfn_info *page;
   2.189      l2_pgentry_t *pl2e;
     3.1 --- a/xen/include/asm-x86/mm.h	Wed Sep 08 13:17:25 2004 +0000
     3.2 +++ b/xen/include/asm-x86/mm.h	Wed Sep 08 17:36:26 2004 +0000
     3.3 @@ -81,17 +81,14 @@ struct pfn_info
     3.4   /* 17-bit count of uses of this frame as its current type. */
     3.5  #define PGT_count_mask      ((1<<17)-1)
     3.6  
     3.7 - /* For safety, force a TLB flush when this page's type changes. */
     3.8 -#define _PGC_tlb_flush_on_type_change 31
     3.9 -#define PGC_tlb_flush_on_type_change  (1<<_PGC_tlb_flush_on_type_change)
    3.10   /* Cleared when the owning guest 'frees' this page. */
    3.11 -#define _PGC_allocated                30
    3.12 +#define _PGC_allocated                31
    3.13  #define PGC_allocated                 (1<<_PGC_allocated)
    3.14   /* This bit is always set, guaranteeing that the count word is never zero. */
    3.15 -#define _PGC_always_set               29
    3.16 +#define _PGC_always_set               30
    3.17  #define PGC_always_set                (1<<_PGC_always_set)
    3.18 - /* 29-bit count of references to this frame. */
    3.19 -#define PGC_count_mask                ((1<<29)-1)
    3.20 + /* 30-bit count of references to this frame. */
    3.21 +#define PGC_count_mask                ((1<<30)-1)
    3.22  
    3.23  /* We trust the slab allocator in slab.c, and our use of it. */
    3.24  #define PageSlab(page)		(1)
    3.25 @@ -104,7 +101,7 @@ struct pfn_info
    3.26      do {                                                                    \
    3.27          (_pfn)->u.inuse.domain = (_dom);                                    \
    3.28          /* The incremented type count is intended to pin to 'writable'. */  \
    3.29 -        (_pfn)->u.inuse.type_info  = PGT_writable_page | PGT_validated | 1; \
    3.30 +        (_pfn)->u.inuse.type_info = PGT_writable_page | PGT_validated | 1;  \
    3.31          wmb(); /* install valid domain ptr before updating refcnt. */       \
    3.32          spin_lock(&(_dom)->page_alloc_lock);                                \
    3.33          /* _dom holds an allocation reference */                            \
    3.34 @@ -143,155 +140,34 @@ static inline int get_page(struct pfn_in
    3.35                             struct domain *domain)
    3.36  {
    3.37      u32 x, nx, y = page->count_info;
    3.38 -    struct domain *p, *np = page->u.inuse.domain;
    3.39 +    struct domain *d, *nd = page->u.inuse.domain;
    3.40  
    3.41      do {
    3.42          x  = y;
    3.43          nx = x + 1;
    3.44 -        p  = np;
    3.45 +        d  = nd;
    3.46          if ( unlikely((x & PGC_count_mask) == 0) ||  /* Not allocated? */
    3.47               unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
    3.48 -             unlikely(p != domain) )                 /* Wrong owner? */
    3.49 +             unlikely(d != domain) )                 /* Wrong owner? */
    3.50          {
    3.51              DPRINTK("Error pfn %08lx: ed=%p, sd=%p, caf=%08x, taf=%08x\n",
    3.52 -                    page_to_pfn(page), domain, p,
    3.53 +                    page_to_pfn(page), domain, d,
    3.54                      x, page->u.inuse.type_info);
    3.55              return 0;
    3.56          }
    3.57          __asm__ __volatile__(
    3.58              LOCK_PREFIX "cmpxchg8b %3"
    3.59 -            : "=d" (np), "=a" (y), "=c" (p),
    3.60 +            : "=d" (nd), "=a" (y), "=c" (d),
    3.61                "=m" (*(volatile u64 *)(&page->count_info))
    3.62 -            : "0" (p), "1" (x), "c" (p), "b" (nx) );
    3.63 +            : "0" (d), "1" (x), "c" (d), "b" (nx) );
    3.64      }
    3.65 -    while ( unlikely(np != p) || unlikely(y != x) );
    3.66 +    while ( unlikely(nd != d) || unlikely(y != x) );
    3.67  
    3.68      return 1;
    3.69  }
    3.70  
    3.71 -
    3.72 -static inline void put_page_type(struct pfn_info *page)
    3.73 -{
    3.74 -    u32 nx, x, y = page->u.inuse.type_info;
    3.75 -
    3.76 - again:
    3.77 -    do {
    3.78 -        x  = y;
    3.79 -        nx = x - 1;
    3.80 -        if ( unlikely((nx & PGT_count_mask) == 0) )
    3.81 -        {
    3.82 -            page->tlbflush_timestamp = tlbflush_clock;
    3.83 -            if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
    3.84 -                 likely(nx & PGT_validated) )
    3.85 -            {
    3.86 -                /*
    3.87 -                 * Page-table pages must be unvalidated when count is zero. The
    3.88 -                 * 'free' is safe because the refcnt is non-zero and the
    3.89 -                 * validated bit is clear => other ops will spin or fail.
    3.90 -                 */
    3.91 -                if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 
    3.92 -                                           x & ~PGT_validated)) != x) )
    3.93 -                    goto again;
    3.94 -                /* We cleared the 'valid bit' so we must do the clear up. */
    3.95 -                free_page_type(page, x & PGT_type_mask);
    3.96 -                /* Carry on as we were, but with the 'valid bit' now clear. */
    3.97 -                x  &= ~PGT_validated;
    3.98 -                nx &= ~PGT_validated;
    3.99 -            }
   3.100 -        }
   3.101 -	else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == 
   3.102 -                           (PGT_pinned | 1)) )
   3.103 -	{
   3.104 -            /* Page is now only pinned. Make the back pointer mutable again. */
   3.105 -	    nx |= PGT_va_mutable;
   3.106 -	}
   3.107 -    }
   3.108 -    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
   3.109 -}
   3.110 -
   3.111 -
   3.112 -static inline int get_page_type(struct pfn_info *page, u32 type)
   3.113 -{
   3.114 -    u32 nx, x, y = page->u.inuse.type_info;
   3.115 - again:
   3.116 -    do {
   3.117 -        x  = y;
   3.118 -        nx = x + 1;
   3.119 -        if ( unlikely((nx & PGT_count_mask) == 0) )
   3.120 -        {
   3.121 -            DPRINTK("Type count overflow on pfn %08lx\n", page_to_pfn(page));
   3.122 -            return 0;
   3.123 -        }
   3.124 -        else if ( unlikely((x & PGT_count_mask) == 0) )
   3.125 -        {
   3.126 -            if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
   3.127 -            {
   3.128 -                nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
   3.129 -                nx |= type;
   3.130 -                /* No extra validation needed for writable pages. */
   3.131 -                if ( type == PGT_writable_page )
   3.132 -                    nx |= PGT_validated;
   3.133 -            }
   3.134 -        }
   3.135 -        else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
   3.136 -        {
   3.137 -            if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
   3.138 -            {
   3.139 -#ifdef VERBOSE
   3.140 -                if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
   3.141 -                     ((type & PGT_type_mask) != PGT_l1_page_table) )
   3.142 -                    DPRINTK("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
   3.143 -                            x & PGT_type_mask, type, page_to_pfn(page));
   3.144 -#endif
   3.145 -                return 0;
   3.146 -            }
   3.147 -            else if ( (x & PGT_va_mask) == PGT_va_mutable )
   3.148 -            {
   3.149 -                /* The va backpointer is mutable, hence we update it. */
   3.150 -                nx &= ~PGT_va_mask;
   3.151 -                nx |= type; /* we know the actual type is correct */
   3.152 -            }
   3.153 -            else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
   3.154 -            {
   3.155 -                /* The va backpointer wasn't mutable, and is different. */
   3.156 -                DPRINTK("Unexpected va backpointer (saw %08x != exp %08x)"
   3.157 -                        " for pfn %08lx\n", x, type, page_to_pfn(page));
   3.158 -                return 0;
   3.159 -            }
   3.160 -        }
   3.161 -	else if ( unlikely(!(x & PGT_validated)) )
   3.162 -        {
   3.163 -            /* Someone else is updating validation of this page. Wait... */
   3.164 -            while ( (y = page->u.inuse.type_info) == x )
   3.165 -            {
   3.166 -                rep_nop();
   3.167 -                barrier();
   3.168 -            }
   3.169 -            goto again;
   3.170 -        }
   3.171 -    }
   3.172 -    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
   3.173 -
   3.174 -    if ( unlikely(!(nx & PGT_validated)) )
   3.175 -    {
   3.176 -        /* Try to validate page type; drop the new reference on failure. */
   3.177 -        if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
   3.178 -        {
   3.179 -            DPRINTK("Error while validating pfn %08lx for type %08x."
   3.180 -                    " caf=%08x taf=%08x\n",
   3.181 -                    page_to_pfn(page), type,
   3.182 -		    page->count_info,
   3.183 -		    page->u.inuse.type_info);
   3.184 -            put_page_type(page);
   3.185 -            return 0;
   3.186 -        }
   3.187 -
   3.188 -        set_bit(_PGT_validated, &page->u.inuse.type_info);
   3.189 -    }
   3.190 -
   3.191 -    return 1;
   3.192 -}
   3.193 -
   3.194 +void put_page_type(struct pfn_info *page);
   3.195 +int  get_page_type(struct pfn_info *page, u32 type);
   3.196  
   3.197  static inline void put_page_and_type(struct pfn_info *page)
   3.198  {