ia64/xen-unstable

changeset 10406:ee482dc60eab

[LINUX][PAE] More fixes to pgd allocation. Since allocating pmds
can sleep, we could race save/restore and end up with stale
machine addresses stored in pgd entries. Avoid this by
remembering virtual addresses and translating them to machine
addresses all at the end, protected by the pgd_lock.
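
To illustrate the idea outside the diff below, here is a minimal user-space
sketch of the same two-phase pattern. It is not kernel code: malloc stands in
for the sleeping kmem_cache allocations, the hypothetical va_to_ma() helper
stands in for the virtual-to-machine translation, and table_lock plays the
role of pgd_lock.

/*
 * Sketch only; all names are hypothetical stand-ins, not kernel APIs.
 *  Phase 1: do every allocation that may sleep, recording only the
 *           stable virtual addresses.
 *  Phase 2: under the lock, translate to the volatile "machine"
 *           addresses and publish them into the table.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_ENTRIES 4

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* Hypothetical virt-to-machine translation; in the real code this is
 * the machine mapping, which is only stable while pgd_lock is held. */
static uintptr_t va_to_ma(void *va)
{
	return (uintptr_t)va;	/* identity mapping for the sketch */
}

int main(void)
{
	void *page[NR_ENTRIES];
	uintptr_t table[NR_ENTRIES];
	int i;

	/* Phase 1: allocations may block; record virtual addresses only. */
	for (i = 0; i < NR_ENTRIES; i++) {
		page[i] = malloc(4096);
		if (!page[i])
			goto out_oom;
	}

	/* Phase 2: translate and publish while holding the lock. */
	pthread_mutex_lock(&table_lock);
	for (i = 0; i < NR_ENTRIES; i++)
		table[i] = va_to_ma(page[i]) + 1;  /* +1 ~ present bit, as in __pgd(1 + __pa(pmd)) */
	pthread_mutex_unlock(&table_lock);

	for (i = 0; i < NR_ENTRIES; i++)
		printf("entry %d -> %#lx\n", i, (unsigned long)table[i]);

	for (i = 0; i < NR_ENTRIES; i++)
		free(page[i]);
	return 0;

out_oom:
	while (i-- > 0)
		free(page[i]);
	return 1;
}

The point is only the ordering: nothing derived from a machine address is
computed before the lock is taken, so a save/restore that completes before
phase 2 cannot leave stale entries behind.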
Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Wed Jun 14 17:06:28 2006 +0100 (2006-06-14)
parents 63967ff8d459
children 161473836da3
files linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c
line diff
     1.1 --- a/linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c	Wed Jun 14 13:48:04 2006 +0100
     1.2 +++ b/linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c	Wed Jun 14 17:06:28 2006 +0100
     1.3 @@ -330,71 +330,90 @@ void pgd_dtor(void *pgd, kmem_cache_t *c
     1.4  pgd_t *pgd_alloc(struct mm_struct *mm)
     1.5  {
     1.6  	int i;
     1.7 -	pgd_t *pgd_tmp = NULL, *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
     1.8 +	pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
     1.9 +	pmd_t **pmd;
    1.10 +	unsigned long flags;
    1.11  
    1.12  	pgd_test_and_unpin(pgd);
    1.13  
    1.14  	if (PTRS_PER_PMD == 1 || !pgd)
    1.15  		return pgd;
    1.16  
    1.17 -	for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
    1.18 -		pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
    1.19 -		if (!pmd)
    1.20 -			goto out_oom;
    1.21 -		set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
    1.22 -	}
    1.23 -
    1.24 -	if (!HAVE_SHARED_KERNEL_PMD) {
    1.25 -		unsigned long flags;
    1.26 -
    1.27 -		for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
    1.28 +	if (HAVE_SHARED_KERNEL_PMD) {
    1.29 +		for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
    1.30  			pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
    1.31  			if (!pmd)
    1.32  				goto out_oom;
    1.33  			set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
    1.34  		}
    1.35 +		return pgd;
    1.36 +	}
    1.37  
    1.38 -		/* create_contig_region() loses page data. Make a temp copy. */
    1.39 -		if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
    1.40 -			pgd_tmp = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
    1.41 -			if (!pgd_tmp)
    1.42 -				goto out_oom;
    1.43 -			memcpy(pgd_tmp, pgd, PAGE_SIZE);
    1.44 -		}
    1.45 +	/*
    1.46 +	 * We can race save/restore (if we sleep during a GFP_KERNEL memory
    1.47 +	 * allocation). We therefore store virtual addresses of pmds as they
    1.48 +	 * do not change across save/restore, and poke the machine addresses
    1.49 +	 * into the pgdir under the pgd_lock.
    1.50 +	 */
    1.51 +	pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
    1.52 +	if (!pmd) {
    1.53 +		kmem_cache_free(pgd_cache, pgd);
    1.54 +		return NULL;
    1.55 +	}
    1.56  
    1.57 -		spin_lock_irqsave(&pgd_lock, flags);
    1.58 +	/* Allocate pmds, remember virtual addresses. */
    1.59 +	for (i = 0; i < PTRS_PER_PGD; ++i) {
    1.60 +		pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
    1.61 +		if (!pmd[i])
    1.62 +			goto out_oom;
    1.63 +	}
    1.64 +
    1.65 +	spin_lock_irqsave(&pgd_lock, flags);
    1.66  
    1.67 -		/* Protect against save/restore: move below 4GB with lock. */
    1.68 -		if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
    1.69 -			int rc = xen_create_contiguous_region(
    1.70 -				(unsigned long)pgd, 0, 32);
    1.71 -			memcpy(pgd, pgd_tmp, PAGE_SIZE);
    1.72 -			kmem_cache_free(pgd_cache, pgd_tmp);
    1.73 -			if (rc)
    1.74 -				goto out_oom;
    1.75 +	/* Protect against save/restore: move below 4GB under pgd_lock. */
    1.76 +	if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
    1.77 +		int rc = xen_create_contiguous_region(
    1.78 +			(unsigned long)pgd, 0, 32);
    1.79 +		if (rc) {
    1.80 +			spin_unlock_irqrestore(&pgd_lock, flags);
    1.81 +			goto out_oom;
    1.82  		}
    1.83 +	}
    1.84  
    1.85 -		for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
    1.86 -			unsigned long v = (unsigned long)i << PGDIR_SHIFT;
    1.87 -			pgd_t *kpgd = pgd_offset_k(v);
    1.88 -			pud_t *kpud = pud_offset(kpgd, v);
    1.89 -			pmd_t *kpmd = pmd_offset(kpud, v);
    1.90 -			pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
    1.91 -			memcpy(pmd, kpmd, PAGE_SIZE);
    1.92 -			make_lowmem_page_readonly(
    1.93 -				pmd, XENFEAT_writable_page_tables);
    1.94 -		}
    1.95 +	/* Copy kernel pmd contents and write-protect the new pmds. */
    1.96 +	for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
    1.97 +		unsigned long v = (unsigned long)i << PGDIR_SHIFT;
    1.98 +		pgd_t *kpgd = pgd_offset_k(v);
    1.99 +		pud_t *kpud = pud_offset(kpgd, v);
   1.100 +		pmd_t *kpmd = pmd_offset(kpud, v);
   1.101 +		memcpy(pmd[i], kpmd, PAGE_SIZE);
   1.102 +		make_lowmem_page_readonly(
   1.103 +			pmd[i], XENFEAT_writable_page_tables);
   1.104 +	}
   1.105  
   1.106 -		pgd_list_add(pgd);
    1.107 +	/* It is safe to poke machine addresses of pmds under the pgd_lock. */
   1.108 +	for (i = 0; i < PTRS_PER_PGD; i++)
   1.109 +		set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i])));
   1.110  
   1.111 -		spin_unlock_irqrestore(&pgd_lock, flags);
   1.112 -	}
   1.113 +	/* Ensure this pgd gets picked up and pinned on save/restore. */
   1.114 +	pgd_list_add(pgd);
   1.115 +
   1.116 +	spin_unlock_irqrestore(&pgd_lock, flags);
   1.117 +
   1.118 +	kfree(pmd);
   1.119  
   1.120  	return pgd;
   1.121  
   1.122  out_oom:
   1.123 -	for (i--; i >= 0; i--)
   1.124 -		kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
   1.125 +	if (HAVE_SHARED_KERNEL_PMD) {
   1.126 +		for (i--; i >= 0; i--)
   1.127 +			kmem_cache_free(pmd_cache,
   1.128 +					(void *)__va(pgd_val(pgd[i])-1));
   1.129 +	} else {
   1.130 +		for (i--; i >= 0; i--)
   1.131 +			kmem_cache_free(pmd_cache, pmd[i]);
   1.132 +		kfree(pmd);
   1.133 +	}
   1.134  	kmem_cache_free(pgd_cache, pgd);
   1.135  	return NULL;
   1.136  }
   1.137 @@ -403,6 +422,14 @@ void pgd_free(pgd_t *pgd)
   1.138  {
   1.139  	int i;
   1.140  
   1.141 +	/*
   1.142 +	 * After this the pgd should not be pinned for the duration of this
   1.143 +	 * function's execution. We should never sleep and thus never race:
   1.144 +	 *  1. User pmds will not become write-protected under our feet due
   1.145 +	 *     to a concurrent mm_pin_all().
   1.146 +	 *  2. The machine addresses in PGD entries will not become invalid
   1.147 +	 *     due to a concurrent save/restore.
   1.148 +	 */
   1.149  	pgd_test_and_unpin(pgd);
   1.150  
   1.151  	/* in the PAE case user pgd entries are overwritten before usage */
   1.152 @@ -418,8 +445,6 @@ void pgd_free(pgd_t *pgd)
   1.153  			pgd_list_del(pgd);
   1.154  			spin_unlock_irqrestore(&pgd_lock, flags);
   1.155  
   1.156 -			pgd_test_and_unpin(pgd);
   1.157 -
   1.158  			for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
   1.159  				pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
   1.160  				make_lowmem_page_writable(
   1.161 @@ -609,7 +634,7 @@ static void pgd_test_and_unpin(pgd_t *pg
   1.162  void mm_pin(struct mm_struct *mm)
   1.163  {
   1.164  	if (xen_feature(XENFEAT_writable_page_tables))
   1.165 -	    return;
   1.166 +		return;
   1.167  	spin_lock(&mm->page_table_lock);
   1.168  	__pgd_pin(mm->pgd);
   1.169  	spin_unlock(&mm->page_table_lock);
   1.170 @@ -618,7 +643,7 @@ void mm_pin(struct mm_struct *mm)
   1.171  void mm_unpin(struct mm_struct *mm)
   1.172  {
   1.173  	if (xen_feature(XENFEAT_writable_page_tables))
   1.174 -	    return;
   1.175 +		return;
   1.176  	spin_lock(&mm->page_table_lock);
   1.177  	__pgd_unpin(mm->pgd);
   1.178  	spin_unlock(&mm->page_table_lock);
   1.179 @@ -628,7 +653,7 @@ void mm_pin_all(void)
   1.180  {
   1.181  	struct page *page;
   1.182  	if (xen_feature(XENFEAT_writable_page_tables))
   1.183 -	    return;
   1.184 +		return;
   1.185  	for (page = pgd_list; page; page = (struct page *)page->index) {
   1.186  		if (!test_bit(PG_pinned, &page->flags))
   1.187  			__pgd_pin((pgd_t *)page_address(page));