ia64/xen-unstable

changeset 1822:a2b2c8621a31

bitkeeper revision 1.1102.1.1 (40fba238l_-yBFQR6TV9GfynDkYi9A)

first go at writable pagetables
author cl349@freefall.cl.cam.ac.uk
date Mon Jul 19 10:28:08 2004 +0000 (2004-07-19)
parents bd80b2bba0ce
children 101465779482
files linux-2.4.26-xen-sparse/include/asm-xen/page.h linux-2.4.26-xen-sparse/include/asm-xen/pgalloc.h linux-2.4.26-xen-sparse/mm/memory.c xen/arch/x86/memory.c xen/arch/x86/traps.c xen/include/asm-x86/mm.h
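In outline, this changeset lets the guest write directly to its own L1 page tables instead of queueing every pte update through the hypervisor. On a write fault against a frame typed PGT_l1_page_table, Xen either unhooks that L1 from its L2 slot (recording the slot in 'disconnected') or, if the faulting L1 is not the one currently installed in that slot, queues it in writables[], and then makes the guest's mapping of the frame read/write. Before the next mmu_update, extended command, update_va_mapping, or a fault in the disconnected region, the queued pages are re-validated and write-protected again (ptwr_flush) and the unhooked slot is re-inserted (ptwr_reconnect). The stand-alone toy program below is only a sketch of that bookkeeping; frame_table, linear_pg_table and the real page-type machinery are replaced by plain arrays and printf, so it compiles and runs but is not hypervisor code.

    /* Toy model of the writable-pagetable bookkeeping introduced by this
     * changeset.  Not Xen code: the hypervisor structures are stubbed out
     * so the control flow can be followed and run stand-alone. */
    #include <stdio.h>

    #define NR_WRITABLES              4
    #define ENTRIES_PER_L2_PAGETABLE  1024

    static unsigned long disconnected = ENTRIES_PER_L2_PAGETABLE; /* none */
    static int writable_pfns[NR_WRITABLES];   /* stand-in for writables[] */
    static int writable_idx;

    /* ptwr_flush(): re-validate and write-protect every L1 frame that was
     * temporarily left writable. */
    static void ptwr_flush(void)
    {
        int i;
        for (i = 0; i < writable_idx; i++)
            printf("re-protect L1 frame %d\n", writable_pfns[i]);
        writable_idx = 0;
    }

    /* ptwr_reconnect(): re-insert the L1 page that was unhooked from its
     * L2 slot and write-protect it again. */
    static void ptwr_reconnect(void)
    {
        if (disconnected == ENTRIES_PER_L2_PAGETABLE)
            return;
        printf("reconnect and re-protect L2 slot %lu\n", disconnected);
        disconnected = ENTRIES_PER_L2_PAGETABLE;
    }

    /* A guest write hits a frame currently typed as an L1 page table. */
    static void fault_on_l1(int pfn, int installed_in_slot, unsigned long slot)
    {
        if (installed_in_slot) {
            /* Active L1: unhook it from its L2 slot first (only one slot
             * may be disconnected at a time). */
            ptwr_reconnect();
            disconnected = slot;
        } else {
            /* L1 not currently installed in that slot: just queue it for
             * re-validation on the next flush. */
            if (writable_idx == NR_WRITABLES)
                ptwr_flush();
            writable_pfns[writable_idx++] = pfn;
        }
        printf("frame %d now mapped read/write for the guest\n", pfn);
    }

    int main(void)
    {
        fault_on_l1(100, 1, 768);  /* write to a connected L1: unhook slot 768 */
        fault_on_l1(101, 0, 0);    /* write to a detached L1: queue for flush  */
        ptwr_reconnect();          /* the next hypercall or fault undoes both  */
        ptwr_flush();
        return 0;
    }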
line diff
     1.1 --- a/linux-2.4.26-xen-sparse/include/asm-xen/page.h	Fri Jul 16 12:33:42 2004 +0000
     1.2 +++ b/linux-2.4.26-xen-sparse/include/asm-xen/page.h	Mon Jul 19 10:28:08 2004 +0000
     1.3 @@ -78,7 +78,8 @@ typedef struct { unsigned long pgprot; }
     1.4  static inline unsigned long pmd_val(pmd_t x)
     1.5  {
     1.6      unsigned long ret = x.pmd;
     1.7 -    if ( (ret & 1) ) ret = machine_to_phys(ret);
     1.8 +    if (!(ret & 0x801) && ret) printk("pmd_val really invalid!!!\n");
     1.9 +    if (ret) ret = machine_to_phys(ret);
    1.10      return ret;
    1.11  }
    1.12  #define pgd_val(x)	({ BUG(); (unsigned long)0; })
     2.1 --- a/linux-2.4.26-xen-sparse/include/asm-xen/pgalloc.h	Fri Jul 16 12:33:42 2004 +0000
     2.2 +++ b/linux-2.4.26-xen-sparse/include/asm-xen/pgalloc.h	Mon Jul 19 10:28:08 2004 +0000
     2.3 @@ -135,7 +135,7 @@ static inline pte_t *pte_alloc_one(struc
     2.4      {
     2.5          clear_page(pte);
     2.6          __make_page_readonly(pte);
     2.7 -        queue_pte_pin(__pa(pte));
     2.8 +        // queue_pte_pin(__pa(pte));
     2.9      }
    2.10      return pte;
    2.11  
    2.12 @@ -154,7 +154,7 @@ static inline pte_t *pte_alloc_one_fast(
    2.13  
    2.14  static __inline__ void pte_free_slow(pte_t *pte)
    2.15  {
    2.16 -    queue_pte_unpin(__pa(pte));
    2.17 +    // queue_pte_unpin(__pa(pte));
    2.18      __make_page_writeable(pte);
    2.19      free_page((unsigned long)pte);
    2.20  }
     3.1 --- a/linux-2.4.26-xen-sparse/mm/memory.c	Fri Jul 16 12:33:42 2004 +0000
     3.2 +++ b/linux-2.4.26-xen-sparse/mm/memory.c	Mon Jul 19 10:28:08 2004 +0000
     3.3 @@ -163,6 +163,18 @@ void clear_page_tables(struct mm_struct 
     3.4  #define PTE_TABLE_MASK	((PTRS_PER_PTE-1) * sizeof(pte_t))
     3.5  #define PMD_TABLE_MASK	((PTRS_PER_PMD-1) * sizeof(pmd_t))
     3.6  
     3.7 +#undef set_pte
     3.8 +#define set_pte(pteptr, pteval) do { \
     3.9 +	(*(pteptr) = pteval); \
    3.10 +	/* printk("set_pte %p -> %08lx\n", pteptr, pteval); */ \
    3.11 +} while (0)
    3.12 +//void queue_l1_entry_update_queued(pte_t *ptr, unsigned long val);
    3.13 +//#define set_pte(pteptr, pteval) queue_l1_entry_update_queued(pteptr, (pteval).pte_low)
    3.14 +// #define ptep_get_and_clear(xp)	__pte(xchg(&(xp)->pte_low, 0))
    3.15 +//#undef pte_unmap
    3.16 +//#define pte_unmap(pte) xen_flush_page_update_queue()
    3.17 +#undef pmd_bad
    3.18 +#define	pmd_bad(x)	(((x).pmd & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT & ~0x800)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
    3.19  /*
    3.20   * copy one vm_area from one task to the other. Assumes the page tables
    3.21   * already present in the new task to be cleared in the whole range
    3.22 @@ -184,6 +196,8 @@ int copy_page_range(struct mm_struct *ds
    3.23  
    3.24  	src_pgd = pgd_offset(src, address)-1;
    3.25  	dst_pgd = pgd_offset(dst, address)-1;
    3.26 +        /* printk("copy_page_range src %p dst %p src_pgd %p dst_pgd %p %08lx-%08lx\n", */
    3.27 +/*                src, dst, src_pgd, dst_pgd, address, end); */
    3.28  
    3.29  	for (;;) {
    3.30  		pmd_t * src_pmd, * dst_pmd;
    3.31 @@ -205,6 +219,7 @@ skip_copy_pmd_range:	address = (address 
    3.32  
    3.33  		src_pmd = pmd_offset(src_pgd, address);
    3.34  		dst_pmd = pmd_alloc(dst, dst_pgd, address);
    3.35 +                /* printk("src_pmd %p dst_pmd %p\n", src_pmd, dst_pmd); */
    3.36  		if (!dst_pmd)
    3.37  			goto nomem;
    3.38  
    3.39 @@ -226,6 +241,8 @@ skip_copy_pte_range:		address = (address
    3.40  
    3.41  			src_pte = pte_offset(src_pmd, address);
    3.42  			dst_pte = pte_alloc(dst, dst_pmd, address);
    3.43 +                        /* printk("src_pte %p(%p,%08lx,%08lx, %08lx) dst_pte %p\n", */
    3.44 +/*                                src_pte, src_pmd, *src_pmd, pmd_page(*src_pmd), address, dst_pte); */
    3.45  			if (!dst_pte)
    3.46  				goto nomem;
    3.47  
    3.48 @@ -239,6 +256,8 @@ skip_copy_pte_range:		address = (address
    3.49  				if (pte_none(pte))
    3.50  					goto cont_copy_pte_range_noset;
    3.51  				if (!pte_present(pte)) {
    3.52 +                                    printk("swap_dup call %p:%08lx\n",
    3.53 +                                           src_pte, pte.pte_low);
    3.54  					swap_duplicate(pte_to_swp_entry(pte));
    3.55  					goto cont_copy_pte_range;
    3.56  				}
    3.57 @@ -249,10 +268,17 @@ skip_copy_pte_range:		address = (address
    3.58  
    3.59  				/* If it's a COW mapping, write protect it both in the parent and the child */
    3.60  				if (cow && pte_write(pte)) {
    3.61 +                                    /* printk("ptep_set_wrprotect %p was %08lx\n", src_pte, *src_pte); */
    3.62 +#if 0
    3.63  					/* XEN modification: modified ordering here to avoid RaW hazard. */
    3.64  					pte = *src_pte;
    3.65  					pte = pte_wrprotect(pte);
    3.66  					ptep_set_wrprotect(src_pte);
    3.67 +#else
    3.68 +                                        clear_bit(_PAGE_BIT_RW, src_pte); //ptep_set_wrprotect(src_pte);
    3.69 +					pte = *src_pte;
    3.70 +                                    /* printk("ptep_set_wrprotect %p now %08lx\n", src_pte, *src_pte); */
    3.71 +#endif
    3.72  				}
    3.73  
    3.74  				/* If it's a shared mapping, mark it clean in the child */
    3.75 @@ -278,10 +304,13 @@ cont_copy_pmd_range:	src_pmd++;
    3.76  out_unlock:
    3.77  	spin_unlock(&src->page_table_lock);
    3.78  out:
    3.79 +        /* printk("out\n"); */
    3.80  	return 0;
    3.81  nomem:
    3.82  	return -ENOMEM;
    3.83  }
    3.84 +#undef set_pte
    3.85 +#define set_pte(pteptr, pteval) queue_l1_entry_update(pteptr, (pteval).pte_low)
    3.86  
    3.87  /*
    3.88   * Return indicates whether a page was freed so caller can adjust rss
     4.1 --- a/xen/arch/x86/memory.c	Fri Jul 16 12:33:42 2004 +0000
     4.2 +++ b/xen/arch/x86/memory.c	Mon Jul 19 10:28:08 2004 +0000
     4.3 @@ -1,3 +1,7 @@
     4.4 +extern unsigned long disconnected;
     4.5 +extern void ptwr_reconnect(unsigned long);
     4.6 +extern int writable_idx;
     4.7 +extern void ptwr_flush(void);
     4.8  /******************************************************************************
     4.9   * arch/x86/memory.c
    4.10   * 
    4.11 @@ -117,7 +121,7 @@ static int get_page_and_type_from_pagenr
    4.12  static void free_l2_table(struct pfn_info *page);
    4.13  static void free_l1_table(struct pfn_info *page);
    4.14  
    4.15 -static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
    4.16 +int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
    4.17  static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
    4.18  
    4.19  /* Used to defer flushing of memory structures. */
    4.20 @@ -509,8 +513,20 @@ static inline int update_l2e(l2_pgentry_
    4.21  }
    4.22  
    4.23  
    4.24 +static inline void set_l1_page_va(unsigned long pfn,
    4.25 +                                  unsigned long va_idx)
    4.26 +{
    4.27 +    struct pfn_info *page;
    4.28 +    
    4.29 +    page = &frame_table[pfn];
    4.30 +    page->type_and_flags &= ~PGT_va_mask;
    4.31 +    page->type_and_flags |= va_idx << PGT_va_shift;
    4.32 +}
    4.33 +
    4.34 +
    4.35 +#define NPRINTK if (0) printk
    4.36  /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
    4.37 -static int mod_l2_entry(l2_pgentry_t *pl2e, 
    4.38 +int mod_l2_entry(l2_pgentry_t *pl2e, 
    4.39                          l2_pgentry_t nl2e, 
    4.40                          unsigned long pfn)
    4.41  {
    4.42 @@ -528,6 +544,8 @@ static int mod_l2_entry(l2_pgentry_t *pl
    4.43          return 0;
    4.44      ol2e = mk_l2_pgentry(_ol2e);
    4.45  
    4.46 +    NPRINTK("mod_l2_entry pl2e %p ol2e %08lx nl2e %08lx pfn %08lx\n",
    4.47 +            pl2e, l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), pfn);
    4.48      if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
    4.49      {
    4.50          /* Differ in mapping (bits 12-31) or presence (bit 0)? */
    4.51 @@ -537,6 +555,9 @@ static int mod_l2_entry(l2_pgentry_t *pl
    4.52          if ( unlikely(!get_page_from_l2e(nl2e, pfn)) )
    4.53              return 0;
    4.54          
    4.55 +        set_l1_page_va(l2_pgentry_val(nl2e) >> PAGE_SHIFT,
    4.56 +                       ((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2);
    4.57 +
    4.58          if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
    4.59          {
    4.60              put_page_from_l2e(nl2e, pfn);
    4.61 @@ -698,6 +719,11 @@ static int do_extended_command(unsigned 
    4.62      u32 x, y;
    4.63      domid_t domid;
    4.64  
    4.65 +    if (disconnected != ENTRIES_PER_L2_PAGETABLE)
    4.66 +        ptwr_reconnect(0L);
    4.67 +    if (writable_idx)
    4.68 +        ptwr_flush();
    4.69 +
    4.70      switch ( cmd )
    4.71      {
    4.72      case MMUEXT_PIN_L1_TABLE:
    4.73 @@ -946,6 +972,11 @@ int do_mmu_update(mmu_update_t *ureqs, i
    4.74      perfc_incrc(calls_to_mmu_update); 
    4.75      perfc_addc(num_page_updates, count);
    4.76  
    4.77 +    if (disconnected != ENTRIES_PER_L2_PAGETABLE)
    4.78 +        ptwr_reconnect(0L);
    4.79 +    if (writable_idx)
    4.80 +        ptwr_flush();
    4.81 +
    4.82      for ( i = 0; i < count; i++ )
    4.83      {
    4.84          if ( unlikely(copy_from_user(&req, ureqs, sizeof(req)) != 0) )
    4.85 @@ -1119,6 +1150,11 @@ int do_update_va_mapping(unsigned long p
    4.86  
    4.87      perfc_incrc(calls_to_update_va);
    4.88  
    4.89 +    if (disconnected != ENTRIES_PER_L2_PAGETABLE)
    4.90 +        ptwr_reconnect(0L);
    4.91 +    if (writable_idx)
    4.92 +        ptwr_flush();
    4.93 +
    4.94      if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
    4.95          return -EINVAL;
    4.96  
     5.1 --- a/xen/arch/x86/traps.c	Fri Jul 16 12:33:42 2004 +0000
     5.2 +++ b/xen/arch/x86/traps.c	Mon Jul 19 10:28:08 2004 +0000
     5.3 @@ -310,6 +310,82 @@ asmlinkage void do_double_fault(void)
     5.4      for ( ; ; ) ;
     5.5  }
     5.6  
     5.7 +extern int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
     5.8 +unsigned long disconnected = ENTRIES_PER_L2_PAGETABLE;
     5.9 +static unsigned long *writable_l1;
    5.10 +#define	NR_WRITABLES 4
    5.11 +static unsigned long *writables[NR_WRITABLES];
    5.12 +int writable_idx = 0;
    5.13 +#define PRINTK if (0) printk
    5.14 +#define NPRINTK if (0) printk
    5.15 +
    5.16 +void ptwr_reconnect(unsigned long addr)
    5.17 +{
    5.18 +    unsigned long pte;
    5.19 +    unsigned long pfn;
    5.20 +    struct pfn_info *page;
    5.21 +    l2_pgentry_t *pl2e;
    5.22 +    PRINTK("page fault in disconnected space: addr %08lx space %08lx\n",
    5.23 +           addr, disconnected << L2_PAGETABLE_SHIFT);
    5.24 +    pl2e = &linear_l2_table[disconnected];
    5.25 +
    5.26 +    if (__get_user(pte, writable_l1))
    5.27 +        BUG();
    5.28 +    pfn = pte >> PAGE_SHIFT;
    5.29 +    page = &frame_table[pfn];
    5.30 +
    5.31 +    /* reconnect l1 page */
    5.32 +    PRINTK("    pl2e %p l2e %08lx pfn %08lx taf %08x/%08x\n", pl2e,
    5.33 +           l2_pgentry_val(*pl2e),
    5.34 +           l1_pgentry_val(linear_pg_table[(unsigned long)pl2e >>
    5.35 +                                          PAGE_SHIFT]) >> PAGE_SHIFT,
    5.36 +           frame_table[l2_pgentry_to_pagenr(*pl2e)].type_and_flags,
    5.37 +           frame_table[pfn].type_and_flags);
    5.38 +    mod_l2_entry(pl2e, mk_l2_pgentry((l2_pgentry_val(*pl2e) & ~0x800) |
    5.39 +                                     _PAGE_PRESENT),
    5.40 +                 l1_pgentry_val(linear_pg_table[(unsigned long)pl2e >>
    5.41 +                                                PAGE_SHIFT]) >> PAGE_SHIFT);
    5.42 +    PRINTK("now pl2e %p l2e %08lx              taf %08x/%08x\n", pl2e,
    5.43 +           l2_pgentry_val(*pl2e),
    5.44 +           frame_table[l2_pgentry_to_pagenr(*pl2e)].type_and_flags,
    5.45 +           frame_table[pfn].type_and_flags);
    5.46 +    disconnected = ENTRIES_PER_L2_PAGETABLE;
    5.47 +    /* make pt page write protected */
    5.48 +    if (__get_user(pte, writable_l1))
    5.49 +        BUG();
    5.50 +    PRINTK("writable_l1 at %p is %08lx\n", writable_l1, pte);
    5.51 +    pte &= ~_PAGE_RW;
    5.52 +    if (__put_user(pte, writable_l1))
    5.53 +        BUG();
    5.54 +    PRINTK("writable_l1 at %p now %08lx\n", writable_l1, pte);
    5.55 +    /* and try again */
    5.56 +    return;
    5.57 +}
    5.58 +
    5.59 +void ptwr_flush(void)
    5.60 +{
    5.61 +    unsigned long pte, pfn;
    5.62 +    struct pfn_info *page;
    5.63 +    int i;
    5.64 +
    5.65 +    for (i = 0; i < writable_idx; i++) {
    5.66 +        if (__get_user(pte, writables[i]))
    5.67 +            BUG();
    5.68 +        pfn = pte >> PAGE_SHIFT;
    5.69 +        page = &frame_table[pfn];
    5.70 +        PRINTK("alloc l1 page %p\n", page);
    5.71 +        if (!get_page_type(page, PGT_l1_page_table))
    5.72 +            BUG();
    5.73 +        /* make pt page write protected again */
    5.74 +        PRINTK("writable_l1 at %p is %08lx\n", writables[i], pte);
    5.75 +        pte &= ~_PAGE_RW;
    5.76 +        if (__put_user(pte, writables[i]))
    5.77 +            BUG();
    5.78 +        PRINTK("writable_l1 at %p now %08lx\n", writables[i], pte);
    5.79 +    }
    5.80 +    writable_idx = 0;
    5.81 +}
    5.82 +
    5.83  asmlinkage void do_page_fault(struct pt_regs *regs, long error_code)
    5.84  {
    5.85      struct guest_trap_bounce *gtb = guest_trap_bounce+smp_processor_id();
    5.86 @@ -335,6 +411,80 @@ asmlinkage void do_page_fault(struct pt_
    5.87              return; /* successfully copied the mapping */
    5.88      }
    5.89  
    5.90 +    if ((addr >> L2_PAGETABLE_SHIFT) == disconnected) {
    5.91 +        ptwr_reconnect(addr);
    5.92 +        return;
    5.93 +    }
    5.94 +
    5.95 +    if (addr < PAGE_OFFSET && error_code & 2) {
    5.96 +        /* write page fault, check if we're trying to modify an l1
    5.97 +           page table */
    5.98 +        unsigned long pte, pfn;
    5.99 +        struct pfn_info *page;
   5.100 +        l2_pgentry_t *pl2e;
   5.101 +        NPRINTK("get user %p for va %08lx\n",
   5.102 +                &linear_pg_table[addr>>PAGE_SHIFT], addr);
   5.103 +        if (l2_pgentry_val(linear_l2_table[addr >> L2_PAGETABLE_SHIFT]) &
   5.104 +            _PAGE_PRESENT &&
   5.105 +            __get_user(pte, (unsigned long *)
   5.106 +                       &linear_pg_table[addr >> PAGE_SHIFT]) == 0) {
   5.107 +            pfn = pte >> PAGE_SHIFT;
   5.108 +            NPRINTK("check pte %08lx = pfn %08lx for va %08lx\n", pte, pfn, addr);
   5.109 +            page = &frame_table[pfn];
   5.110 +            if ((page->type_and_flags & PGT_type_mask) == PGT_l1_page_table) {
   5.111 +                pl2e = &linear_l2_table[(page->type_and_flags &
   5.112 +                                         PGT_va_mask) >> PGT_va_shift];
   5.113 +                PRINTK("page_fault on l1 pt at va %08lx, pt for %08x, pfn %08lx\n",
   5.114 +                       addr, ((page->type_and_flags & PGT_va_mask) >>
   5.115 +                              PGT_va_shift) << L2_PAGETABLE_SHIFT, pfn);
   5.116 +                if (l2_pgentry_val(*pl2e) >> PAGE_SHIFT != pfn) {
   5.117 +                    PRINTK("freeing l1 page %p\n", page);
   5.118 +                    if (writable_idx == NR_WRITABLES)
   5.119 +                        ptwr_flush();
   5.120 +                    writables[writable_idx++] = (unsigned long *)
   5.121 +                        &linear_pg_table[addr>>PAGE_SHIFT];
   5.122 +                    if ((page->type_and_flags & PGT_count_mask) != 1)
   5.123 +                        BUG();
   5.124 +                    put_page_type(page);
   5.125 +                } else {
   5.126 +                    if (disconnected != ENTRIES_PER_L2_PAGETABLE)
   5.127 +                        ptwr_reconnect(addr);
   5.128 +                    PRINTK("    pl2e %p l2e %08lx pfn %08lx taf %08x/%08x\n",
   5.129 +                           pl2e, l2_pgentry_val(*pl2e),
   5.130 +                           l1_pgentry_val(linear_pg_table[(unsigned long)pl2e
   5.131 +                                                          >> PAGE_SHIFT]) >>
   5.132 +                           PAGE_SHIFT,
   5.133 +                           frame_table[l2_pgentry_to_pagenr(*pl2e)].
   5.134 +                           type_and_flags, frame_table[pfn].type_and_flags);
   5.135 +                    /* disconnect l1 page */
   5.136 +                    mod_l2_entry(pl2e, mk_l2_pgentry((l2_pgentry_val(*pl2e) &
   5.137 +                                                      ~_PAGE_PRESENT) | 0x800),
   5.138 +                                 l1_pgentry_val(linear_pg_table
   5.139 +                                                [(unsigned long)pl2e
   5.140 +                                                 >> PAGE_SHIFT]) >>
   5.141 +                                 PAGE_SHIFT);
   5.142 +                    disconnected = (page->type_and_flags & PGT_va_mask) >>
   5.143 +                        PGT_va_shift;
   5.144 +                    PRINTK("now pl2e %p l2e %08lx              taf %08x/%08x\n",
   5.145 +                           pl2e, l2_pgentry_val(*pl2e),
   5.146 +                           frame_table[l2_pgentry_to_pagenr(*pl2e)].
   5.147 +                           type_and_flags,
   5.148 +                           frame_table[pfn].type_and_flags);
   5.149 +                    writable_l1 = (unsigned long *)
   5.150 +                        &linear_pg_table[addr>>PAGE_SHIFT];
   5.151 +                }
   5.152 +                /* make pt page writable */
   5.153 +                pte |= _PAGE_RW;
   5.154 +                PRINTK("update %p pte to %08lx\n",
   5.155 +                        &linear_pg_table[addr>>PAGE_SHIFT], pte);
   5.156 +                if (__put_user(pte, (unsigned long *)
   5.157 +                               &linear_pg_table[addr>>PAGE_SHIFT]))
   5.158 +                    BUG();
   5.159 +                return;
   5.160 +            }
   5.161 +        }
   5.162 +    }
   5.163 +
   5.164      if ( unlikely(p->mm.shadow_mode) && 
   5.165           (addr < PAGE_OFFSET) && shadow_fault(addr, error_code) )
   5.166          return; /* Returns TRUE if fault was handled. */
     6.1 --- a/xen/include/asm-x86/mm.h	Fri Jul 16 12:33:42 2004 +0000
     6.2 +++ b/xen/include/asm-x86/mm.h	Mon Jul 19 10:28:08 2004 +0000
     6.3 @@ -52,8 +52,11 @@ struct pfn_info
     6.4   /* Has this page been validated for use as its current type? */
     6.5  #define _PGT_validated      28
     6.6  #define PGT_validated       (1<<_PGT_validated)
     6.7 - /* 28-bit count of uses of this frame as its current type. */
     6.8 -#define PGT_count_mask      ((1<<28)-1)
     6.9 + /* 10-bit most significant bits of va address if used as l1 page table */
    6.10 +#define PGT_va_shift        18
    6.11 +#define PGT_va_mask         (((1<<10)-1)<<PGT_va_shift)
    6.12 + /* 18-bit count of uses of this frame as its current type. */
    6.13 +#define PGT_count_mask      ((1<<18)-1)
    6.14  
    6.15   /* For safety, force a TLB flush when this page's type changes. */
    6.16  #define _PGC_tlb_flush_on_type_change 31
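
The mm.h change above supports the fault handler's reverse lookup: ten bits are taken from the per-frame type-use count so that, for a frame used as an L1 page table, type_and_flags also records which L2 slot (which 4MB region of virtual address space) the table is installed in. set_l1_page_va() in memory.c stores the slot index whenever a present L2 entry is installed, and do_page_fault() in traps.c reads it back to locate the pl2e it must disconnect. A minimal stand-alone illustration of that packing, reusing the mask values from the hunk above (an illustrative sketch, not Xen code):

    #include <stdio.h>

    /* Same layout as the new mm.h: bits 0-17 type-use count,
     * bits 18-27 the L2 slot (va index) of an L1 page-table frame. */
    #define PGT_va_shift    18
    #define PGT_va_mask     (((1u << 10) - 1) << PGT_va_shift)
    #define PGT_count_mask  ((1u << 18) - 1)

    int main(void)
    {
        unsigned int taf = 0;
        unsigned int va_idx = 768;      /* e.g. 0xC0000000 >> 22 */

        /* What set_l1_page_va() does to page->type_and_flags. */
        taf = (taf & ~PGT_va_mask) | (va_idx << PGT_va_shift);
        /* One user of the frame's current type. */
        taf = (taf & ~PGT_count_mask) | 1;

        printf("va index %u, type count %u\n",
               (taf & PGT_va_mask) >> PGT_va_shift,
               taf & PGT_count_mask);
        return 0;
    }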