From a842f3f13d1ff4e7a34fdc9b227df88af8dea05c Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Wed, 22 Sep 2010 08:57:21 +0100 Subject: [PATCH] xen/x86: synchronize vmalloc_sync_all() with mm_{,un}pin() As recently diagnosed by Citrix engineers, mm_{,un}pin() and vmalloc_sync_all() aren't properly synchronized. So we add a backlink to the referencing struct mm_struct to the pgd's struct page, and use this to lock the page table updates in vmalloc_sync_all(). Due to the way pgd-s get allocated and put on the global list on i386, we have to account for the backlink not to be set yet (in which case they cannot be subject to (un)pinning. Along the way, I found it necessary/desirable to also fix - a potential NULL dereference in i386's pgd_alloc(), - x86-64 adding not yet cleaned out pgd-s to the global list, and - x86-64 removing pgd-s from the global list rather late. Signed-off-by: Jan Beulich --- arch/i386/mm/fault-xen.c | 19 ++++++++++++++++--- arch/i386/mm/pgtable-xen.c | 9 ++++++++- arch/x86_64/mm/fault-xen.c | 6 ++++++ include/asm-x86_64/mach-xen/asm/pgalloc.h | 18 ++++++++++++------ 4 files changed, 42 insertions(+), 10 deletions(-) diff --git a/arch/i386/mm/fault-xen.c b/arch/i386/mm/fault-xen.c index 6c036c7c..22e7a208 100644 --- a/arch/i386/mm/fault-xen.c +++ b/arch/i386/mm/fault-xen.c @@ -762,12 +762,25 @@ void vmalloc_sync_all(void) return; } for (page = pgd_list; page; page = - (struct page *)page->index) - if (!vmalloc_sync_one(page_address(page), - address)) { + (struct page *)page->index) { + spinlock_t *lock = page->mapping + ? &((struct mm_struct *)page->mapping) + ->page_table_lock + : NULL; + pmd_t *pmd; + + if (lock) + spin_lock(lock); + pmd = vmalloc_sync_one(page_address(page), + address); + if (lock) + spin_unlock(lock); + + if (!pmd) { BUG_ON(page != pgd_list); break; } + } spin_unlock_irqrestore(&pgd_lock, flags); if (!page) set_bit(sync_index(address), insync); diff --git a/arch/i386/mm/pgtable-xen.c b/arch/i386/mm/pgtable-xen.c index fa783a43..9311d478 100644 --- a/arch/i386/mm/pgtable-xen.c +++ b/arch/i386/mm/pgtable-xen.c @@ -232,6 +232,7 @@ static inline void pgd_list_del(pgd_t *pgd) *pprev = next; if (next) set_page_private(next, (unsigned long)pprev); + page->mapping = NULL; } void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) @@ -273,9 +274,15 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pmd_t **pmd; unsigned long flags; + if (!pgd) + return NULL; + pgd_test_and_unpin(pgd); - if (PTRS_PER_PMD == 1 || !pgd) + /* Store a back link for vmalloc_sync_all(). */ + virt_to_page(pgd)->mapping = (void *)mm; + + if (PTRS_PER_PMD == 1) return pgd; if (HAVE_SHARED_KERNEL_PMD) { diff --git a/arch/x86_64/mm/fault-xen.c b/arch/x86_64/mm/fault-xen.c index 7a12f4a5..7a1e74ed 100644 --- a/arch/x86_64/mm/fault-xen.c +++ b/arch/x86_64/mm/fault-xen.c @@ -677,6 +677,9 @@ do_sigbus: DEFINE_SPINLOCK(pgd_lock); struct page *pgd_list; +#define pgd_page_table(what, pg) \ + spin_##what(&((struct mm_struct *)(pg)->mapping)->page_table_lock) + void vmalloc_sync_all(void) { /* Note that races in the updates of insync and start aren't @@ -699,10 +702,13 @@ void vmalloc_sync_all(void) page = (struct page *)page->index) { pgd_t *pgd; pgd = (pgd_t *)page_address(page) + pgd_index(address); + + pgd_page_table(lock, page); if (pgd_none(*pgd)) set_pgd(pgd, *pgd_ref); else BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref)); + pgd_page_table(unlock, page); } spin_unlock(&pgd_lock); set_bit(pgd_index(address), insync); diff --git a/include/asm-x86_64/mach-xen/asm/pgalloc.h b/include/asm-x86_64/mach-xen/asm/pgalloc.h index cff0fcf1..7f9757fd 100644 --- a/include/asm-x86_64/mach-xen/asm/pgalloc.h +++ b/include/asm-x86_64/mach-xen/asm/pgalloc.h @@ -95,10 +95,13 @@ static inline void pud_free(pud_t *pud) pte_free(virt_to_page(pud)); } -static inline void pgd_list_add(pgd_t *pgd) +static inline void pgd_list_add(pgd_t *pgd, void *mm) { struct page *page = virt_to_page(pgd); + /* Store a back link for vmalloc_sync_all(). */ + page->mapping = mm; + spin_lock(&pgd_lock); page->index = (pgoff_t)pgd_list; if (pgd_list) @@ -119,6 +122,8 @@ static inline void pgd_list_del(pgd_t *pgd) if (next) next->private = (unsigned long)pprev; spin_unlock(&pgd_lock); + + page->mapping = NULL; } static inline pgd_t *pgd_alloc(struct mm_struct *mm) @@ -127,22 +132,22 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) * We allocate two contiguous pages for kernel and user. */ unsigned boundary; - pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1); + pgd_t *pgd; + + pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 1); if (!pgd) return NULL; - pgd_list_add(pgd); + pgd_list_add(pgd, mm); /* * Copy kernel pointers in from init. * Could keep a freelist or slab cache of those because the kernel * part never changes. */ boundary = pgd_index(__PAGE_OFFSET); - memset(pgd, 0, boundary * sizeof(pgd_t)); memcpy(pgd + boundary, init_level4_pgt + boundary, (PTRS_PER_PGD - boundary) * sizeof(pgd_t)); - memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */ /* * Set level3_user_pgt for vsyscall area */ @@ -155,6 +160,8 @@ static inline void pgd_free(pgd_t *pgd) { pte_t *ptep = virt_to_ptep(pgd); + pgd_list_del(pgd); + if (!pte_write(*ptep)) { xen_pgd_unpin(__pa(pgd)); BUG_ON(HYPERVISOR_update_va_mapping( @@ -174,7 +181,6 @@ static inline void pgd_free(pgd_t *pgd) 0)); } - pgd_list_del(pgd); free_pages((unsigned long)pgd, 1); } -- 2.39.5