xenbits.xensource.com Git - legacy/linux-2.6.18-xen.git/commitdiff
linux: allow use of split page table locks
author    Keir Fraser <keir@xensource.com>
          Fri, 5 Oct 2007 09:49:06 +0000 (10:49 +0100)
committer Keir Fraser <keir@xensource.com>
          Fri, 5 Oct 2007 09:49:06 +0000 (10:49 +0100)
This fixes the race condition previously experienced between
(un)pinning and vmscan.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
arch/i386/mm/pgtable-xen.c
arch/x86_64/mm/pageattr-xen.c
mm/Kconfig
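For context on the race described above: with split PTE locks in effect, the reclaim path (vmscan/rmap) modifies PTEs under only the per-page-table-page lock returned by pte_lockptr(), never under mm->page_table_lock, so an (un)pin that holds only page_table_lock can race with those updates. A condensed sketch of that reclaim-side locking pattern follows; the helper name is hypothetical and the logic is simplified from the rmap code of this kernel generation (no TLB flushing, accounting or error handling):

#include <linux/mm.h>
#include <asm/pgtable.h>

/* Hypothetical, simplified reclaim-side PTE update: note that only the
 * per-PTE-page lock is taken, not mm->page_table_lock. */
static int clear_pte_for_reclaim(struct mm_struct *mm, pmd_t *pmd,
				 unsigned long address)
{
	spinlock_t *ptl = pte_lockptr(mm, pmd);
	pte_t *pte = pte_offset_map(pmd, address);
	int cleared = 0;

	spin_lock(ptl);
	if (pte_present(*pte)) {
		/* This write requires the PTE page to be in a consistent
		 * pinned+readonly or unpinned+writable state -- exactly what
		 * the new _pin_lock() below guarantees by also taking every
		 * pte lock around the pin/unpin transition. */
		pte_clear(mm, address, pte);
		cleared = 1;
	}
	pte_unmap_unlock(pte, ptl);
	return cleared;
}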

diff --git a/arch/i386/mm/pgtable-xen.c b/arch/i386/mm/pgtable-xen.c
index 24eb8248fd36349c1542ce4bc55d4206b4f150e6..085d7f323af019eadbac217b97a8a80e42de6984 100644
--- a/arch/i386/mm/pgtable-xen.c
+++ b/arch/i386/mm/pgtable-xen.c
@@ -494,6 +494,64 @@ void make_pages_writable(void *va, unsigned int nr, unsigned int feature)
        }
 }
 
+static void _pin_lock(struct mm_struct *mm, int lock) {
+       if (lock)
+               spin_lock(&mm->page_table_lock);
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+       /* While mm->page_table_lock protects us against insertions and
+        * removals of higher level page table pages, it doesn't protect
+        * against updates of PTEs. Such updates, however, require the
+        * PTE pages to be in a consistent state (unpinned+writable or
+        * pinned+readonly). The pinning and attribute changes cannot be
+        * done atomically, which is why such updates must be prevented
+        * from happening concurrently.
+        * Note that no pte lock can ever be acquired elsewhere nesting
+        * with an already acquired one in the same mm, or with the mm's
+        * page_table_lock already held, as that would break in the
+        * non-split case (where all of these resolve to the one
+        * page_table_lock). Thus acquiring all of them here cannot
+        * result in deadlocks, and the order of acquisition doesn't
+        * matter.
+        */
+       {
+               pgd_t *pgd = mm->pgd;
+               unsigned g;
+
+               for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
+                       pud_t *pud;
+                       unsigned u;
+
+                       if (pgd_none(*pgd))
+                               continue;
+                       pud = pud_offset(pgd, 0);
+                       for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+                               pmd_t *pmd;
+                               unsigned m;
+
+                               if (pud_none(*pud))
+                                       continue;
+                               pmd = pmd_offset(pud, 0);
+                               for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+                                       spinlock_t *ptl;
+
+                                       if (pmd_none(*pmd))
+                                               continue;
+                                       ptl = pte_lockptr(0, pmd);
+                                       if (lock)
+                                               spin_lock(ptl);
+                                       else
+                                               spin_unlock(ptl);
+                               }
+                       }
+               }
+       }
+#endif
+       if (!lock)
+               spin_unlock(&mm->page_table_lock);
+}
+#define pin_lock(mm) _pin_lock(mm, 1)
+#define pin_unlock(mm) _pin_lock(mm, 0)
+
 static inline void pgd_walk_set_prot(struct page *page, pgprot_t flags)
 {
        unsigned long pfn = page_to_pfn(page);
@@ -576,18 +634,18 @@ void mm_pin(struct mm_struct *mm)
 {
        if (xen_feature(XENFEAT_writable_page_tables))
                return;
-       spin_lock(&mm->page_table_lock);
+       pin_lock(mm);
        __pgd_pin(mm->pgd);
-       spin_unlock(&mm->page_table_lock);
+       pin_unlock(mm);
 }
 
 void mm_unpin(struct mm_struct *mm)
 {
        if (xen_feature(XENFEAT_writable_page_tables))
                return;
-       spin_lock(&mm->page_table_lock);
+       pin_lock(mm);
        __pgd_unpin(mm->pgd);
-       spin_unlock(&mm->page_table_lock);
+       pin_unlock(mm);
 }
 
 void mm_pin_all(void)
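The pte_lockptr() calls in the walk above are also why the whole pte-lock pass is compiled out below the split threshold, and why a literal 0 can be passed as the mm argument: in the split case the lock lives in the page-table page's struct page and the mm argument is ignored, while in the non-split case every pte lock aliases mm->page_table_lock, which this function already holds. Roughly, paraphrased from include/linux/mm.h of this kernel generation (exact form may differ):

#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
/* split case: one spinlock per page-table page, stored in struct page;
 * the mm argument is evaluated and discarded */
#define __pte_lockptr(page)	(&(page)->ptl)
#define pte_lockptr(mm, pmd)	({ (void)(mm); __pte_lockptr(pmd_page(*(pmd))); })
#else
/* non-split case: everything resolves to the single mm-wide lock */
#define pte_lockptr(mm, pmd)	({ (void)(pmd); &(mm)->page_table_lock; })
#endif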
diff --git a/arch/x86_64/mm/pageattr-xen.c b/arch/x86_64/mm/pageattr-xen.c
index 3b5fc26e844fafb0c09af5ced644d7367adac9ba..e87e2785f199d79f1ac637e94cd71dd4fefb1447 100644
--- a/arch/x86_64/mm/pageattr-xen.c
+++ b/arch/x86_64/mm/pageattr-xen.c
 LIST_HEAD(mm_unpinned);
 DEFINE_SPINLOCK(mm_unpinned_lock);
 
+static void _pin_lock(struct mm_struct *mm, int lock) {
+       if (lock)
+               spin_lock(&mm->page_table_lock);
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+       /* While mm->page_table_lock protects us against insertions and
+        * removals of higher level page table pages, it doesn't protect
+        * against updates of PTEs. Such updates, however, require the
+        * PTE pages to be in a consistent state (unpinned+writable or
+        * pinned+readonly). The pinning and attribute changes cannot be
+        * done atomically, which is why such updates must be prevented
+        * from happening concurrently.
+        * Note that no pte lock can ever be acquired elsewhere nesting
+        * with an already acquired one in the same mm, or with the mm's
+        * page_table_lock already held, as that would break in the
+        * non-split case (where all of these resolve to the one
+        * page_table_lock). Thus acquiring all of them here cannot
+        * result in deadlocks, and the order of acquisition doesn't
+        * matter.
+        */
+       {
+               pgd_t *pgd = mm->pgd;
+               unsigned g;
+
+               for (g = 0; g <= ((TASK_SIZE64-1) / PGDIR_SIZE); g++, pgd++) {
+                       pud_t *pud;
+                       unsigned u;
+
+                       if (pgd_none(*pgd))
+                               continue;
+                       pud = pud_offset(pgd, 0);
+                       for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+                               pmd_t *pmd;
+                               unsigned m;
+
+                               if (pud_none(*pud))
+                                       continue;
+                               pmd = pmd_offset(pud, 0);
+                               for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+                                       spinlock_t *ptl;
+
+                                       if (pmd_none(*pmd))
+                                               continue;
+                                       ptl = pte_lockptr(0, pmd);
+                                       if (lock)
+                                               spin_lock(ptl);
+                                       else
+                                               spin_unlock(ptl);
+                               }
+                       }
+               }
+       }
+#endif
+       if (!lock)
+               spin_unlock(&mm->page_table_lock);
+}
+#define pin_lock(mm) _pin_lock(mm, 1)
+#define pin_unlock(mm) _pin_lock(mm, 0)
+
 static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
 {
        struct page *page = virt_to_page(pt);
@@ -76,7 +134,7 @@ void mm_pin(struct mm_struct *mm)
        if (xen_feature(XENFEAT_writable_page_tables))
                return;
 
-       spin_lock(&mm->page_table_lock);
+       pin_lock(mm);
 
        mm_walk(mm, PAGE_KERNEL_RO);
        if (HYPERVISOR_update_va_mapping(
@@ -97,7 +155,7 @@ void mm_pin(struct mm_struct *mm)
        list_del(&mm->context.unpinned);
        spin_unlock(&mm_unpinned_lock);
 
-       spin_unlock(&mm->page_table_lock);
+       pin_unlock(mm);
 }
 
 void mm_unpin(struct mm_struct *mm)
@@ -105,7 +163,7 @@ void mm_unpin(struct mm_struct *mm)
        if (xen_feature(XENFEAT_writable_page_tables))
                return;
 
-       spin_lock(&mm->page_table_lock);
+       pin_lock(mm);
 
        xen_pgd_unpin(__pa(mm->pgd));
        xen_pgd_unpin(__pa(__user_pgd(mm->pgd)));
@@ -125,7 +183,7 @@ void mm_unpin(struct mm_struct *mm)
        list_add(&mm->context.unpinned, &mm_unpinned);
        spin_unlock(&mm_unpinned_lock);
 
-       spin_unlock(&mm->page_table_lock);
+       pin_unlock(mm);
 }
 
 void mm_pin_all(void)
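The x86_64 version of _pin_lock() differs from the i386 one only in the outer loop bound: (TASK_SIZE64-1)/PGDIR_SIZE instead of USER_PTRS_PER_PGD, covering the user portion of the PGD. A standalone sanity check of that arithmetic, with TASK_SIZE64 and PGDIR_SIZE values assumed from the x86_64 headers of this era:

#include <stdio.h>

int main(void)
{
	unsigned long long task_size64 = 1ULL << 47;	/* assumed user VA limit */
	unsigned long long pgdir_size  = 1ULL << 39;	/* assumed bytes per PGD slot */
	unsigned long long bound = (task_size64 - 1) / pgdir_size;

	/* the walk runs g = 0 .. bound inclusive */
	printf("last PGD slot walked: %llu, slots covered: %llu of 512\n",
	       bound, bound + 1);	/* 255 and 256: the user half of the PGD */
	return 0;
}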
diff --git a/mm/Kconfig b/mm/Kconfig
index 14492aaa5b3967f7ea119b37fa6f349b00758890..8f5b45615f7bf1b43c35253b8f28f1ef4871818e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -127,14 +127,11 @@ comment "Memory hotplug is currently incompatible with Software Suspend"
 # Default to 4 for wider testing, though 8 might be more appropriate.
 # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
 # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
-# XEN on x86 architecture uses the mapping field on pagetable pages to store a
-# pointer to the destructor. This conflicts with pte_lock_deinit().
 #
 config SPLIT_PTLOCK_CPUS
        int
        default "4096" if ARM && !CPU_CACHE_VIPT
        default "4096" if PARISC && !PA20
-       default "4096" if X86_XEN || X86_64_XEN
        default "4"
 
 #
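The two removed default lines drop the Xen-specific opt-out, so Xen x86 kernels now get split PTE locks at the normal NR_CPUS >= 4 threshold. The removed comment referred to where the split lock is stored: in this kernel generation the spinlock shares a union with page->private/page->mapping in struct page, which is why pte_lock_deinit() resets mapping and why a page-table-page destructor kept in mapping would collide with it. A rough sketch of the relevant definitions, paraphrased from memory (exact layout and field order may differ):

/* struct page, abridged: the split pte lock overlays private/mapping */
struct page {
	unsigned long flags;
	atomic_t _count;
	atomic_t _mapcount;
	union {
		struct {
			unsigned long private;
			struct address_space *mapping;
		};
#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
		spinlock_t ptl;		/* lock for PTEs mapped by this page */
#endif
	};
	/* ... */
};

#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
#define pte_lock_init(page)	spin_lock_init(__pte_lockptr(page))
#define pte_lock_deinit(page)	((page)->mapping = NULL)	/* clears the union */
#endif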