}
#endif
- ASSERT(p2m_locked_by_me(p2m));
+ ASSERT(pod_locked_by_me(p2m));
/*
* Pages from domain_alloc and returned by the balloon driver aren't
unmap_domain_page(b);
}
- lock_page_alloc(p2m);
-
/* First, take all pages off the domain list */
+ lock_page_alloc(p2m);
for(i=0; i < 1 << order ; i++)
{
p = page + i;
page_list_del(p, &d->page_list);
}
+ unlock_page_alloc(p2m);
+
/* Then add the first one to the appropriate populate-on-demand list */
switch(order)
{
BUG();
}
- /* Ensure that the PoD cache has never been emptied.
- * This may cause "zombie domains" since the page will never be freed. */
- BUG_ON( d->arch.relmem != RELMEM_not_started );
-
- unlock_page_alloc(p2m);
-
return 0;
}
/* Get a page of size order from the populate-on-demand cache. Will break
* down 2-meg pages into singleton pages automatically. Returns null if
- * a superpage is requested and no superpages are available. Must be called
- * with the d->page_lock held. */
+ * a superpage is requested and no superpages are available. */
static struct page_info * p2m_pod_cache_get(struct p2m_domain *p2m,
unsigned long order)
{
struct page_info *p = NULL;
int i;
+ ASSERT(pod_locked_by_me(p2m));
+
if ( order == PAGE_ORDER_2M && page_list_empty(&p2m->pod.super) )
{
return NULL;
case PAGE_ORDER_2M:
BUG_ON( page_list_empty(&p2m->pod.super) );
p = page_list_remove_head(&p2m->pod.super);
- p2m->pod.count -= 1 << order; /* Lock: page_alloc */
+ p2m->pod.count -= 1 << order;
break;
case PAGE_ORDER_4K:
BUG_ON( page_list_empty(&p2m->pod.single) );
}
/* Put the pages back on the domain page_list */
+ lock_page_alloc(p2m);
for ( i = 0 ; i < (1 << order); i++ )
{
BUG_ON(page_get_owner(p + i) != p2m->domain);
page_list_add_tail(p + i, &p2m->domain->page_list);
}
+ unlock_page_alloc(p2m);
return p;
}
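With this change the cache helpers assume the caller already holds the PoD lock (hence the new ASSERT(pod_locked_by_me(p2m)) checks) and only take the domain's page_alloc lock internally, narrowly around the page_list manipulation. A minimal caller sketch under that assumption; example_get_single_page() is a hypothetical wrapper, not part of the patch:

    /* Hypothetical caller: the PoD lock protects pod.count and the
     * pod.single/pod.super lists; p2m_pod_cache_get() takes and drops
     * the page_alloc lock itself around the page_list manipulation. */
    static struct page_info *example_get_single_page(struct p2m_domain *p2m)
    {
        struct page_info *p;

        pod_lock(p2m);
        p = p2m_pod_cache_get(p2m, PAGE_ORDER_4K); /* NULL if cache empty */
        pod_unlock(p2m);

        return p;
    }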
struct domain *d = p2m->domain;
int ret = 0;
+ ASSERT(pod_locked_by_me(p2m));
+
/* Increasing the target */
while ( pod_target > p2m->pod.count )
{
}
/* Decreasing the target */
- /* We hold the p2m lock here, so we don't need to worry about
+ /* We hold the pod lock here, so we don't need to worry about
* cache disappearing under our feet. */
while ( pod_target < p2m->pod.count )
{
struct page_info * page;
int order, i;
- /* Grab the lock before checking that pod.super is empty, or the last
- * entries may disappear before we grab the lock. */
- lock_page_alloc(p2m);
-
if ( (p2m->pod.count - pod_target) > SUPERPAGE_PAGES
&& !page_list_empty(&p2m->pod.super) )
order = PAGE_ORDER_2M;
ASSERT(page != NULL);
- unlock_page_alloc(p2m);
-
/* Then free them */
for ( i = 0 ; i < (1 << order) ; i++ )
{
int ret = 0;
unsigned long populated;
- p2m_lock(p2m);
+ pod_lock(p2m);
/* P == B: Nothing to do. */
if ( p2m->pod.entry_count == 0 )
ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
out:
- p2m_unlock(p2m);
+ pod_unlock(p2m);
return ret;
}
/* After this barrier no new PoD activities can happen. */
BUG_ON(!d->is_dying);
- spin_barrier(&p2m->lock.lock);
+ spin_barrier(&p2m->pod.lock.lock);
lock_page_alloc(p2m);
if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
return 0;
- lock_page_alloc(p2m);
+ pod_lock(p2m);
bmfn = mfn_x(page_to_mfn(p));
page_list_for_each_safe(q, tmp, &p2m->pod.super)
{
}
}
- unlock_page_alloc(p2m);
+ pod_unlock(p2m);
return 0;
pod_hit:
+ lock_page_alloc(p2m);
page_list_add_tail(p, &d->arch.relmem_list);
unlock_page_alloc(p2m);
+ pod_unlock(p2m);
return 1;
}
if ( unlikely(!p) )
return;
- p2m_lock(p2m);
+ pod_lock(p2m);
p2m_pod_cache_add(p2m, p, PAGE_ORDER_4K);
- p2m_unlock(p2m);
+ pod_unlock(p2m);
return;
}
int steal_for_cache = 0;
int pod = 0, nonpod = 0, ram = 0;
-
+
+ gfn_lock(p2m, gpfn, order);
+ pod_lock(p2m);
/* If we don't have any outstanding PoD entries, let things take their
* course */
if ( p2m->pod.entry_count == 0 )
- goto out;
+ goto out_unlock;
/* Figure out if we need to steal some freed memory for our cache */
steal_for_cache = ( p2m->pod.entry_count > p2m->pod.count );
- gfn_lock(p2m, gpfn, order);
-
if ( unlikely(d->is_dying) )
goto out_unlock;
/* All PoD: Mark the whole region invalid and tell caller
* we're done. */
set_p2m_entry(p2m, gpfn, _mfn(INVALID_MFN), order, p2m_invalid, p2m->default_access);
- p2m->pod.entry_count-=(1<<order); /* Lock: p2m */
+ p2m->pod.entry_count-=(1<<order);
BUG_ON(p2m->pod.entry_count < 0);
ret = 1;
goto out_entry_check;
if ( t == p2m_populate_on_demand )
{
set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid, p2m->default_access);
- p2m->pod.entry_count--; /* Lock: p2m */
+ p2m->pod.entry_count--;
BUG_ON(p2m->pod.entry_count < 0);
pod--;
}
}
out_unlock:
+ pod_unlock(p2m);
gfn_unlock(p2m, gpfn, order);
-
-out:
return ret;
}
/* Search for all-zero superpages to be reclaimed as superpages for the
- * PoD cache. Must be called w/ p2m lock held, page_alloc lock not held. */
+ * PoD cache. Must be called w/ pod lock held; the function itself takes
+ * the gfn lock on the superpage in the p2m. */
static int
p2m_pod_zero_check_superpage(struct p2m_domain *p2m, unsigned long gfn)
{
int max_ref = 1;
struct domain *d = p2m->domain;
+ ASSERT(pod_locked_by_me(p2m));
+
if ( !superpage_aligned(gfn) )
goto out;
if ( paging_mode_shadow(d) )
max_ref++;
+ /* NOTE: taking the gfn lock here, with the pod lock already held, is why
+ * we don't enforce an ordering constraint between the p2m and pod locks
+ * (see the nesting sketch after this function). */
+ gfn_lock(p2m, gfn, SUPERPAGE_ORDER);
+
/* Look up the mfns, checking to make sure they're the same mfn
* and aligned, and mapping them. */
for ( i=0; i<SUPERPAGE_PAGES; i++ )
set_p2m_entry(p2m, gfn, mfn0, 9, type0, p2m->default_access);
out:
+ gfn_unlock(p2m, gfn, SUPERPAGE_ORDER);
return ret;
}
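Note that the nesting here is the reverse of the decrease-reservation hunk above: there, gfn_lock() is taken first and pod_lock() inside it, while here the caller already holds the PoD lock and the gfn lock for the candidate superpage is taken inside the function. A sketch of a caller under this patch's rules; example_reclaim_superpage() is a hypothetical name:

    /* Hypothetical caller: PoD lock first; p2m_pod_zero_check_superpage()
     * takes the gfn lock on the superpage itself (the opposite nesting to
     * the reservation path, which is why no ordering is enforced). */
    static int example_reclaim_superpage(struct p2m_domain *p2m,
                                         unsigned long gfn)
    {
        int ret;

        pod_lock(p2m);
        ret = p2m_pod_zero_check_superpage(p2m, gfn);
        pod_unlock(p2m);

        return ret;
    }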
limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
/* FIXME: Figure out how to avoid superpages */
+ /* NOTE: Promote to globally locking the p2m. This will get complicated
+ * in a fine-grained scenario. If we lock each gfn individually we must be
+ * careful about spinlock recursion limits and POD_SWEEP_STRIDE. */
+ p2m_lock(p2m);
for ( i=p2m->pod.reclaim_single; i > 0 ; i-- )
{
p2m_access_t a;
/* Stop if we're past our limit and we have found *something*.
*
* NB that this is a zero-sum game; we're increasing our cache size
- * by re-increasing our 'debt'. Since we hold the p2m lock,
+ * by re-increasing our 'debt'. Since we hold the pod lock,
* (entry_count - count) must remain the same. */
if ( p2m->pod.count > 0 && i < limit )
break;
if ( j )
p2m_pod_zero_check(p2m, gfns, j);
+ p2m_unlock(p2m);
p2m->pod.reclaim_single = i ? i - 1 : i;
}
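The comment above flags the complications of fine-grained gfn locking during a sweep (spinlock recursion limits, POD_SWEEP_STRIDE), and the demand-populate hunks below suggest trylocking the gfns as the way to avoid deadlock. A rough sketch of that idea, assuming a hypothetical gfn_trylock() helper that does not exist in this patch:

    /* Hypothetical alternative sweep: take each gfn lock with a trylock
     * and skip contended entries, instead of holding the whole p2m lock
     * for the duration of the sweep. gfn_trylock() is assumed. */
    static void example_sweep_with_trylock(struct p2m_domain *p2m)
    {
        unsigned long i;

        for ( i = p2m->pod.reclaim_single; i > 0; i-- )
        {
            if ( !gfn_trylock(p2m, i, 0) )
                continue;           /* contended: skip rather than block */
            /* ... zero-check gfn i here, batching as
             * p2m_pod_emergency_sweep() does with POD_SWEEP_STRIDE ... */
            gfn_unlock(p2m, i, 0);
        }
    }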
int i;
ASSERT(gfn_locked_by_me(p2m, gfn));
+ pod_lock(p2m);
- /* This check is done with the p2m lock held. This will make sure that
+ /* This check is done with the pod lock held. This will make sure that
* even if d->is_dying changes under our feet, p2m_pod_empty_cache()
* won't start until we're done. */
if ( unlikely(d->is_dying) )
* 1GB region to 2MB chunks for a retry. */
if ( order == PAGE_ORDER_1G )
{
+ pod_unlock(p2m);
gfn_aligned = (gfn >> order) << order;
/* Note that we are supposed to call set_p2m_entry() 512 times to
* split 1GB into 512 2MB pages here. But we only do it once here because
/* If we're low, start a sweep */
if ( order == PAGE_ORDER_2M && page_list_empty(&p2m->pod.super) )
+ /* Note that sweeps scan other ranges in the p2m. In a scenario
+ * in which p2m locks are fine-grained, this may result in deadlock.
+ * Using trylock on the gfns as we sweep would avoid it. */
p2m_pod_emergency_sweep_super(p2m);
if ( page_list_empty(&p2m->pod.single) &&
( ( order == PAGE_ORDER_4K )
|| (order == PAGE_ORDER_2M && page_list_empty(&p2m->pod.super) ) ) )
+ /* Same comment regarding deadlock applies */
p2m_pod_emergency_sweep(p2m);
}
if ( q == p2m_guest && gfn > p2m->pod.max_guest )
p2m->pod.max_guest = gfn;
- lock_page_alloc(p2m);
-
if ( p2m->pod.count == 0 )
goto out_of_memory;
BUG_ON((mfn_x(mfn) & ((1 << order)-1)) != 0);
- unlock_page_alloc(p2m);
-
gfn_aligned = (gfn >> order) << order;
set_p2m_entry(p2m, gfn_aligned, mfn, order, p2m_ram_rw, p2m->default_access);
paging_mark_dirty(d, mfn_x(mfn) + i);
}
- p2m->pod.entry_count -= (1 << order); /* Lock: p2m */
+ p2m->pod.entry_count -= (1 << order);
BUG_ON(p2m->pod.entry_count < 0);
if ( tb_init_done )
__trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t);
}
+ pod_unlock(p2m);
return 0;
out_of_memory:
- unlock_page_alloc(p2m);
+ pod_unlock(p2m);
printk("%s: Out of populate-on-demand memory! tot_pages %" PRIu32 " pod_entries %" PRIi32 "\n",
__func__, d->tot_pages, p2m->pod.entry_count);
domain_crash(d);
out_fail:
+ pod_unlock(p2m);
return -1;
remap_and_retry:
BUG_ON(order != PAGE_ORDER_2M);
- unlock_page_alloc(p2m);
+ pod_unlock(p2m);
/* Remap this 2-meg region in singleton chunks */
+ /* NOTE: In a p2m fine-grained lock scenario this might
+ * require promoting the gfn lock from a single gfn to the 2M superpage */
gfn_aligned = (gfn>>order)<<order;
for(i=0; i<(1<<order); i++)
set_p2m_entry(p2m, gfn_aligned+i, _mfn(0), PAGE_ORDER_4K,
rc = -EINVAL;
else
{
- p2m->pod.entry_count += 1 << order; /* Lock: p2m */
+ pod_lock(p2m);
+ p2m->pod.entry_count += 1 << order;
p2m->pod.entry_count -= pod_count;
BUG_ON(p2m->pod.entry_count < 0);
+ pod_unlock(p2m);
}
gfn_unlock(p2m, gfn, order);
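This hunk shows the replacement pattern for the old "/* Lock: p2m */" counter annotations: pod.entry_count is now adjusted with the PoD lock taken as narrowly as possible, inside whatever gfn lock the path already holds. A condensed sketch of the pattern; example_mark_pod() is illustrative, not a quote of any one hunk:

    static void example_mark_pod(struct p2m_domain *p2m, unsigned long gfn,
                                 unsigned int order)
    {
        gfn_lock(p2m, gfn, order);
        /* ... p2m table changes for this gfn range would go here ... */
        pod_lock(p2m);                  /* counters are now "Lock: pod" */
        p2m->pod.entry_count += 1 << order;
        BUG_ON(p2m->pod.entry_count < 0);
        pod_unlock(p2m);
        gfn_unlock(p2m, gfn, order);
    }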
unsigned long max_mapped_pfn;
/* Populate-on-demand variables
- * NB on locking. {super,single,count} are
- * covered by d->page_alloc_lock, since they're almost always used in
- * conjunction with that functionality. {entry_count} is covered by
- * the domain p2m lock, since it's almost always used in conjunction
- * with changing the p2m tables.
- *
- * At this point, both locks are held in two places. In both,
- * the order is [p2m,page_alloc]:
- * + p2m_pod_decrease_reservation() calls p2m_pod_cache_add(),
- * which grabs page_alloc
- * + p2m_pod_demand_populate() grabs both; the p2m lock to avoid
- * double-demand-populating of pages, the page_alloc lock to
- * protect moving stuff from the PoD cache to the domain page list.
- *
- * We enforce this lock ordering through a construct in mm-locks.h.
- * This demands, however, that we store the previous lock-ordering
- * level in effect before grabbing the page_alloc lock. The unlock
- * level is stored in the arch section of the domain struct.
- */
+ * All variables are protected by the pod lock. We cannot rely on
+ * the p2m lock if it is turned into a fine-grained lock.
+ * We only use the domain page_alloc lock for additions to and
+ * deletions from the domain's page list. Because we take it nested
+ * within the PoD lock, we enforce its ordering (by remembering
+ * the unlock level in the arch_domain sub-struct). A sketch of the
+ * matching lock wrappers follows this struct. */
struct {
struct page_list_head super, /* List of superpages */
single; /* Non-super lists */
unsigned reclaim_super; /* Last gpfn of a scan */
unsigned reclaim_single; /* Last gpfn of a scan */
unsigned max_guest; /* gpfn of max guest demand-populate */
+ mm_lock_t lock; /* Locking of private pod structs, *
+ * not relying on the p2m lock. */
} pod;
};
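The new pod.lock field presumably comes with matching wrappers in mm-locks.h, following the declare_mm_lock() convention used by the other mm locks (the names pod_lock()/pod_unlock()/pod_locked_by_me() are the ones used throughout the hunks above). A sketch of what those definitions would look like; this is reconstructed under that assumption, not quoted from the patch:

    /* PoD lock (per-p2m-table): protects the private PoD counters and
     * page lists, deliberately separate from the p2m lock. */
    declare_mm_lock(pod)
    #define pod_lock(p)           mm_lock(pod, &(p)->pod.lock)
    #define pod_unlock(p)         mm_unlock(&(p)->pod.lock)
    #define pod_locked_by_me(p)   mm_locked_by_me(&(p)->pod.lock)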