ia64/xen-unstable

changeset 7662:25599e222c33

Fix pagetable pinning logic for xen/i386 kernels. The pin
flag is now associated with the pgd rather than the mm -- this
avoids a race where a pgd is allocated from the pgd_cache but,
before it gets associated with an mm, the kernel suspends itself.
In that case the kernel mappings in that pgd will not get rewritten when
the kernel is resumed, and the system will fail.

A further advantage is that the code is slightly simpler and less
invasive (no changes to mm_context_t, for example).

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Mon Nov 07 18:14:45 2005 +0100 (2005-11-07)
parents 63aeaa2152d8
children 66dd96e90be4 055efdd6b7c5
files linux-2.6-xen-sparse/arch/xen/i386/kernel/ldt.c linux-2.6-xen-sparse/arch/xen/i386/mm/init.c linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c linux-2.6-xen-sparse/arch/xen/kernel/reboot.c linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu.h linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu_context.h linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgalloc.h
line diff
     1.1 --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/ldt.c	Mon Nov 07 16:37:58 2005 +0100
     1.2 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/ldt.c	Mon Nov 07 18:14:45 2005 +0100
     1.3 @@ -18,7 +18,6 @@
     1.4  #include <asm/system.h>
     1.5  #include <asm/ldt.h>
     1.6  #include <asm/desc.h>
     1.7 -#include <asm/mmu_context.h>
     1.8  
     1.9  #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
    1.10  static void flush_ldt(void *null)
    1.11 @@ -101,19 +100,14 @@ int init_new_context(struct task_struct 
    1.12  	struct mm_struct * old_mm;
    1.13  	int retval = 0;
    1.14  
    1.15 -	memset(&mm->context, 0, sizeof(mm->context));
    1.16  	init_MUTEX(&mm->context.sem);
    1.17 +	mm->context.size = 0;
    1.18  	old_mm = current->mm;
    1.19  	if (old_mm && old_mm->context.size > 0) {
    1.20  		down(&old_mm->context.sem);
    1.21  		retval = copy_ldt(&mm->context, &old_mm->context);
    1.22  		up(&old_mm->context.sem);
    1.23  	}
    1.24 -	if (retval == 0) {
    1.25 -		spin_lock(&mm_unpinned_lock);
    1.26 -		list_add(&mm->context.unpinned, &mm_unpinned);
    1.27 -		spin_unlock(&mm_unpinned_lock);
    1.28 -	}
    1.29  	return retval;
    1.30  }
    1.31  
    1.32 @@ -134,11 +128,6 @@ void destroy_context(struct mm_struct *m
    1.33  			kfree(mm->context.ldt);
    1.34  		mm->context.size = 0;
    1.35  	}
    1.36 -	if (!mm->context.pinned) {
    1.37 -		spin_lock(&mm_unpinned_lock);
    1.38 -		list_del(&mm->context.unpinned);
    1.39 -		spin_unlock(&mm_unpinned_lock);
    1.40 -	}
    1.41  }
    1.42  
    1.43  static int read_ldt(void __user * ptr, unsigned long bytecount)
     2.1 --- a/linux-2.6-xen-sparse/arch/xen/i386/mm/init.c	Mon Nov 07 16:37:58 2005 +0100
     2.2 +++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/init.c	Mon Nov 07 18:14:45 2005 +0100
     2.3 @@ -376,7 +376,6 @@ static void __init pagetable_init (void)
     2.4  		__PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
     2.5  	}
     2.6  
     2.7 -	init_mm.context.pinned = 1;
     2.8  	kernel_physical_mapping_init(pgd_base);
     2.9  	remap_numa_kva();
    2.10  
    2.11 @@ -689,6 +688,8 @@ void __init mem_init(void)
    2.12  #ifndef CONFIG_SMP
    2.13  	zap_low_mappings();
    2.14  #endif
    2.15 +
    2.16 +	set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags);
    2.17  }
    2.18  
    2.19  kmem_cache_t *pgd_cache;
     3.1 --- a/linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c	Mon Nov 07 16:37:58 2005 +0100
     3.2 +++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c	Mon Nov 07 18:14:45 2005 +0100
     3.3 @@ -27,6 +27,9 @@
     3.4  #include <asm-xen/foreign_page.h>
     3.5  #include <asm/hypervisor.h>
     3.6  
     3.7 +static void __pgd_pin(pgd_t *pgd);
     3.8 +static void __pgd_unpin(pgd_t *pgd);
     3.9 +
    3.10  void show_mem(void)
    3.11  {
    3.12  	int total = 0, reserved = 0;
    3.13 @@ -299,6 +302,8 @@ void pgd_dtor(void *pgd, kmem_cache_t *c
    3.14  {
    3.15  	unsigned long flags; /* can be called from interrupt context */
    3.16  
    3.17 +	BUG_ON(test_bit(PG_pinned, &virt_to_page(pgd)->flags));
    3.18 +
    3.19  	if (HAVE_SHARED_KERNEL_PMD)
    3.20  		return;
    3.21  
    3.22 @@ -312,6 +317,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
    3.23  	int i = 0;
    3.24  	pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
    3.25  
    3.26 +	BUG_ON(test_bit(PG_pinned, &virt_to_page(pgd)->flags));
    3.27 +
    3.28  	if (PTRS_PER_PMD == 1 || !pgd)
    3.29  		return pgd;
    3.30  
    3.31 @@ -351,15 +358,9 @@ out_oom:
    3.32  void pgd_free(pgd_t *pgd)
    3.33  {
    3.34  	int i;
    3.35 -	pte_t *ptep = virt_to_ptep(pgd);
    3.36  
    3.37 -	if (!pte_write(*ptep)) {
    3.38 -		xen_pgd_unpin(__pa(pgd));
    3.39 -		BUG_ON(HYPERVISOR_update_va_mapping(
    3.40 -			(unsigned long)pgd,
    3.41 -			pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
    3.42 -			0));
    3.43 -	}
    3.44 +	if (test_bit(PG_pinned, &virt_to_page(pgd)->flags))
    3.45 +		__pgd_unpin(pgd);
    3.46  
    3.47  	/* in the PAE case user pgd entries are overwritten before usage */
    3.48  	if (PTRS_PER_PMD > 1) {
    3.49 @@ -441,10 +442,7 @@ void make_pages_writable(void *va, unsig
    3.50  }
    3.51  #endif /* CONFIG_XEN_SHADOW_MODE */
    3.52  
    3.53 -LIST_HEAD(mm_unpinned);
    3.54 -DEFINE_SPINLOCK(mm_unpinned_lock);
    3.55 -
    3.56 -static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
    3.57 +static inline void pgd_walk_set_prot(void *pt, pgprot_t flags)
    3.58  {
    3.59  	struct page *page = virt_to_page(pt);
    3.60  	unsigned long pfn = page_to_pfn(page);
    3.61 @@ -456,103 +454,111 @@ static inline void mm_walk_set_prot(void
    3.62  		pfn_pte(pfn, flags), 0));
    3.63  }
    3.64  
    3.65 -static void mm_walk(struct mm_struct *mm, pgprot_t flags)
    3.66 +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
    3.67  {
    3.68 -	pgd_t       *pgd;
    3.69 -	pud_t       *pud;
    3.70 -	pmd_t       *pmd;
    3.71 -	pte_t       *pte;
    3.72 -	int          g,u,m;
    3.73 +	pgd_t *pgd = pgd_base;
    3.74 +	pud_t *pud;
    3.75 +	pmd_t *pmd;
    3.76 +	pte_t *pte;
    3.77 +	int    g, u, m;
    3.78  
    3.79 -	pgd = mm->pgd;
    3.80  	for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
    3.81  		if (pgd_none(*pgd))
    3.82  			continue;
    3.83  		pud = pud_offset(pgd, 0);
    3.84  		if (PTRS_PER_PUD > 1) /* not folded */
    3.85 -			mm_walk_set_prot(pud,flags);
    3.86 +			pgd_walk_set_prot(pud,flags);
    3.87  		for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
    3.88  			if (pud_none(*pud))
    3.89  				continue;
    3.90  			pmd = pmd_offset(pud, 0);
    3.91  			if (PTRS_PER_PMD > 1) /* not folded */
    3.92 -				mm_walk_set_prot(pmd,flags);
    3.93 +				pgd_walk_set_prot(pmd,flags);
    3.94  			for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
    3.95  				if (pmd_none(*pmd))
    3.96  					continue;
    3.97  				pte = pte_offset_kernel(pmd,0);
    3.98 -				mm_walk_set_prot(pte,flags);
    3.99 +				pgd_walk_set_prot(pte,flags);
   3.100  			}
   3.101  		}
   3.102  	}
   3.103 +
   3.104 +	BUG_ON(HYPERVISOR_update_va_mapping(
   3.105 +		(unsigned long)pgd_base,
   3.106 +		pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
   3.107 +		UVMF_TLB_FLUSH));
   3.108 +}
   3.109 +
   3.110 +static void __pgd_pin(pgd_t *pgd)
   3.111 +{
   3.112 +	pgd_walk(pgd, PAGE_KERNEL_RO);
   3.113 +	xen_pgd_pin(__pa(pgd));
   3.114 +	set_bit(PG_pinned, &virt_to_page(pgd)->flags);
   3.115 +}
   3.116 +
   3.117 +static void __pgd_unpin(pgd_t *pgd)
   3.118 +{
   3.119 +	xen_pgd_unpin(__pa(pgd));
   3.120 +	pgd_walk(pgd, PAGE_KERNEL);
   3.121 +	clear_bit(PG_pinned, &virt_to_page(pgd)->flags);
   3.122  }
   3.123  
   3.124  void mm_pin(struct mm_struct *mm)
   3.125  {
   3.126 -    spin_lock(&mm->page_table_lock);
   3.127 -
   3.128 -    mm_walk(mm, PAGE_KERNEL_RO);
   3.129 -    BUG_ON(HYPERVISOR_update_va_mapping(
   3.130 -        (unsigned long)mm->pgd,
   3.131 -        pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO),
   3.132 -        UVMF_TLB_FLUSH));
   3.133 -    xen_pgd_pin(__pa(mm->pgd));
   3.134 -    mm->context.pinned = 1;
   3.135 -    spin_lock(&mm_unpinned_lock);
   3.136 -    list_del(&mm->context.unpinned);
   3.137 -    spin_unlock(&mm_unpinned_lock);
   3.138 -
   3.139 -    spin_unlock(&mm->page_table_lock);
   3.140 +	spin_lock(&mm->page_table_lock);
   3.141 +	__pgd_pin(mm->pgd);
   3.142 +	spin_unlock(&mm->page_table_lock);
   3.143  }
   3.144  
   3.145  void mm_unpin(struct mm_struct *mm)
   3.146  {
   3.147 -    spin_lock(&mm->page_table_lock);
   3.148 -
   3.149 -    xen_pgd_unpin(__pa(mm->pgd));
   3.150 -    BUG_ON(HYPERVISOR_update_va_mapping(
   3.151 -        (unsigned long)mm->pgd,
   3.152 -        pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0));
   3.153 -    mm_walk(mm, PAGE_KERNEL);
   3.154 -    xen_tlb_flush();
   3.155 -    mm->context.pinned = 0;
   3.156 -    spin_lock(&mm_unpinned_lock);
   3.157 -    list_add(&mm->context.unpinned, &mm_unpinned);
   3.158 -    spin_unlock(&mm_unpinned_lock);
   3.159 -
   3.160 -    spin_unlock(&mm->page_table_lock);
   3.161 +	spin_lock(&mm->page_table_lock);
   3.162 +	__pgd_unpin(mm->pgd);
   3.163 +	spin_unlock(&mm->page_table_lock);
   3.164  }
   3.165  
   3.166  void mm_pin_all(void)
   3.167  {
   3.168 -    while (!list_empty(&mm_unpinned))	
   3.169 -	mm_pin(list_entry(mm_unpinned.next, struct mm_struct,
   3.170 -			  context.unpinned));
   3.171 +	struct page *page;
   3.172 +	for (page = pgd_list; page; page = (struct page *)page->index) {
   3.173 +		if (!test_bit(PG_pinned, &page->flags))
   3.174 +			__pgd_pin((pgd_t *)page_address(page));
   3.175 +	}
   3.176  }
   3.177  
   3.178  void _arch_exit_mmap(struct mm_struct *mm)
   3.179  {
   3.180 -    struct task_struct *tsk = current;
   3.181 -
   3.182 -    task_lock(tsk);
   3.183 +	struct task_struct *tsk = current;
   3.184  
   3.185 -    /*
   3.186 -     * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
   3.187 -     * *much* faster this way, as no tlb flushes means bigger wrpt batches.
   3.188 -     */
   3.189 -    if ( tsk->active_mm == mm )
   3.190 -    {
   3.191 -        tsk->active_mm = &init_mm;
   3.192 -        atomic_inc(&init_mm.mm_count);
   3.193 +	task_lock(tsk);
   3.194  
   3.195 -        switch_mm(mm, &init_mm, tsk);
   3.196 +	/*
   3.197 +	 * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
   3.198 +	 * *much* faster this way, as no tlb flushes means bigger wrpt batches.
   3.199 +	 */
   3.200 +	if (tsk->active_mm == mm) {
   3.201 +		tsk->active_mm = &init_mm;
   3.202 +		atomic_inc(&init_mm.mm_count);
   3.203  
   3.204 -        atomic_dec(&mm->mm_count);
   3.205 -        BUG_ON(atomic_read(&mm->mm_count) == 0);
   3.206 -    }
   3.207 +		switch_mm(mm, &init_mm, tsk);
   3.208  
   3.209 -    task_unlock(tsk);
   3.210 +		atomic_dec(&mm->mm_count);
   3.211 +		BUG_ON(atomic_read(&mm->mm_count) == 0);
   3.212 +	}
   3.213  
   3.214 -    if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) )
   3.215 -        mm_unpin(mm);
   3.216 +	task_unlock(tsk);
   3.217 +
   3.218 +	if (test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags) &&
   3.219 +	    (atomic_read(&mm->mm_count) == 1))
   3.220 +		mm_unpin(mm);
   3.221  }
   3.222 +
   3.223 +/*
   3.224 + * Local variables:
   3.225 + *  c-file-style: "linux"
   3.226 + *  indent-tabs-mode: t
   3.227 + *  c-indent-level: 8
   3.228 + *  c-basic-offset: 8
   3.229 + *  tab-width: 8
   3.230 + * End:
   3.231 + */
     4.1 --- a/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c	Mon Nov 07 16:37:58 2005 +0100
     4.2 +++ b/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c	Mon Nov 07 18:14:45 2005 +0100
     4.3 @@ -129,8 +129,8 @@ static int __do_suspend(void *ignore)
     4.4  	preempt_disable();
     4.5  
     4.6  #ifdef __i386__
     4.7 +	kmem_cache_shrink(pgd_cache);
     4.8  	mm_pin_all();
     4.9 -	kmem_cache_shrink(pgd_cache);
    4.10  #endif
    4.11  
    4.12  	__cli();
     5.1 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu.h	Mon Nov 07 16:37:58 2005 +0100
     5.2 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu.h	Mon Nov 07 18:14:45 2005 +0100
     5.3 @@ -12,13 +12,8 @@ typedef struct {
     5.4  	int size;
     5.5  	struct semaphore sem;
     5.6  	void *ldt;
     5.7 -	unsigned pinned:1;
     5.8 -	struct list_head unpinned;
     5.9  } mm_context_t;
    5.10  
    5.11 -extern struct list_head mm_unpinned;
    5.12 -extern spinlock_t mm_unpinned_lock;
    5.13 -
    5.14  /* mm/memory.c:exit_mmap hook */
    5.15  extern void _arch_exit_mmap(struct mm_struct *mm);
    5.16  #define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
     6.1 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu_context.h	Mon Nov 07 16:37:58 2005 +0100
     6.2 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu_context.h	Mon Nov 07 18:14:45 2005 +0100
     6.3 @@ -53,7 +53,7 @@ static inline void switch_mm(struct mm_s
     6.4  	struct mmuext_op _op[2], *op = _op;
     6.5  
     6.6  	if (likely(prev != next)) {
     6.7 -		if (!next->context.pinned)
     6.8 +		if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
     6.9  			mm_pin(next);
    6.10  
    6.11  		/* stop flush ipis for the previous mm */
     7.1 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgalloc.h	Mon Nov 07 16:37:58 2005 +0100
     7.2 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgalloc.h	Mon Nov 07 18:14:45 2005 +0100
     7.3 @@ -7,12 +7,15 @@
     7.4  #include <linux/mm.h>		/* for struct page */
     7.5  #include <asm/io.h>		/* for phys_to_virt and page_to_pseudophys */
     7.6  
     7.7 +/* Is this pagetable pinned? */
     7.8 +#define PG_pinned	PG_arch_1
     7.9 +
    7.10  #define pmd_populate_kernel(mm, pmd, pte) \
    7.11  		set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
    7.12  
    7.13  #define pmd_populate(mm, pmd, pte) 					\
    7.14  do {									\
    7.15 -	if (unlikely((mm)->context.pinned)) {				\
    7.16 +	if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) {	\
    7.17  		if (!PageHighMem(pte))					\
    7.18  			BUG_ON(HYPERVISOR_update_va_mapping(		\
    7.19  			  (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT),\