int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
integer_param("sched_ratelimit_us", sched_ratelimit_us);
+/* Number of vcpus per struct sched_unit. */
+static unsigned int __read_mostly sched_granularity = 1;
+
/* Common lock for free cpus. */
static DEFINE_SPINLOCK(sched_free_cpu_lock);
if ( IS_ERR(domdata) )
return PTR_ERR(domdata);
- /* TODO: fix array size with multiple vcpus per unit. */
- unit_priv = xzalloc_array(void *, d->max_vcpus);
+ unit_priv = xzalloc_array(void *,
+ DIV_ROUND_UP(d->max_vcpus, sched_granularity));
if ( unit_priv == NULL )
{
sched_free_domdata(c->sched, domdata);
spin_unlock(&v->periodic_timer_lock);
}
-/*
- * The main function
- * - deschedule the current domain (scheduler independent).
- * - pick a new domain (scheduler dependent).
- */
-static void schedule(void)
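+/*
+ * Switch the scheduled unit of a scheduling resource: update the runstate
+ * accounting of prev and next and mark next as the running unit.
+ * Called with the scheduler lock held.
+ */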
+static void sched_switch_units(struct sched_resource *sr,
+ struct sched_unit *next, struct sched_unit *prev,
+ s_time_t now)
{
- struct sched_unit *prev = current->sched_unit, *next = NULL;
- s_time_t now;
- struct scheduler *sched;
- unsigned long *tasklet_work = &this_cpu(tasklet_work_to_do);
- bool tasklet_work_scheduled = false;
- struct sched_resource *sd;
- spinlock_t *lock;
- int cpu = smp_processor_id();
+ sr->curr = next;
- ASSERT_NOT_IN_ATOMIC();
+ TRACE_3D(TRC_SCHED_SWITCH_INFPREV, prev->domain->domain_id, prev->unit_id,
+ now - prev->state_entry_time);
+ TRACE_4D(TRC_SCHED_SWITCH_INFNEXT, next->domain->domain_id, next->unit_id,
+ (next->vcpu_list->runstate.state == RUNSTATE_runnable) ?
+ (now - next->state_entry_time) : 0, prev->next_time);
- SCHED_STAT_CRANK(sched_run);
+ ASSERT(prev->vcpu_list->runstate.state == RUNSTATE_running);
+
+ TRACE_4D(TRC_SCHED_SWITCH, prev->domain->domain_id, prev->unit_id,
+ next->domain->domain_id, next->unit_id);
+
+ sched_unit_runstate_change(prev, false, now);
+
+ ASSERT(next->vcpu_list->runstate.state != RUNSTATE_running);
+ sched_unit_runstate_change(next, true, now);
- sd = get_sched_res(cpu);
+ /*
+ * NB. Don't add any trace records from here until the actual context
+ * switch, else lost_records resume will not work properly.
+ */
+
+ ASSERT(!next->is_running);
+ next->vcpu_list->is_running = 1;
+ next->is_running = true;
+ next->state_entry_time = now;
+}
+
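+/* Check (and update) the tasklet scheduling status of a single cpu. */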
+static bool sched_tasklet_check_cpu(unsigned int cpu)
+{
+ unsigned long *tasklet_work = &per_cpu(tasklet_work_to_do, cpu);
- /* Update tasklet scheduling status. */
switch ( *tasklet_work )
{
case TASKLET_enqueued:
set_bit(_TASKLET_scheduled, tasklet_work);
/* fallthrough */
case TASKLET_enqueued|TASKLET_scheduled:
- tasklet_work_scheduled = true;
+ return true;
break;
case TASKLET_scheduled:
clear_bit(_TASKLET_scheduled, tasklet_work);
+ /* fallthrough */
case 0:
- /*tasklet_work_scheduled = false;*/
+ /* return false; */
break;
default:
BUG();
}
- lock = pcpu_schedule_lock_irq(cpu);
+ return false;
+}
- now = NOW();
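+/* Check for pending tasklet work on any cpu of the scheduling resource. */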
+static bool sched_tasklet_check(unsigned int cpu)
+{
+ bool tasklet_work_scheduled = false;
+ const cpumask_t *mask = get_sched_res(cpu)->cpus;
+ unsigned int cpu_iter;
+
+ for_each_cpu ( cpu_iter, mask )
+ if ( sched_tasklet_check_cpu(cpu_iter) )
+ tasklet_work_scheduled = true;
- stop_timer(&sd->s_timer);
+ return tasklet_work_scheduled;
+}
+
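+/*
+ * Ask the scheduler for a decision on the scheduling resource and perform
+ * the unit switch in case a unit different from the current one was chosen.
+ */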
+static struct sched_unit *do_schedule(struct sched_unit *prev, s_time_t now,
+ unsigned int cpu)
+{
+ struct scheduler *sched = per_cpu(scheduler, cpu);
+ struct sched_resource *sr = get_sched_res(cpu);
+ struct sched_unit *next;
/* get policy-specific decision on scheduling... */
- sched = this_cpu(scheduler);
- sched->do_schedule(sched, prev, now, tasklet_work_scheduled);
+ sched->do_schedule(sched, prev, now, sched_tasklet_check(cpu));
next = prev->next_task;
- sd->curr = next;
-
if ( prev->next_time >= 0 ) /* -ve means no limit */
- set_timer(&sd->s_timer, now + prev->next_time);
+ set_timer(&sr->s_timer, now + prev->next_time);
+
+ if ( likely(prev != next) )
+ sched_switch_units(sr, next, prev, now);
+
+ return next;
+}
+
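+/*
+ * Finish descheduling of the previous vcpu: clear its running flags after
+ * the context has been saved, notify the scheduler and finish a possibly
+ * pending migration of the unit.
+ */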
+static void context_saved(struct vcpu *prev)
+{
+ struct sched_unit *unit = prev->sched_unit;
+
+ /* Clear running flag /after/ writing context to memory. */
+ smp_wmb();
+
+ prev->is_running = 0;
+ unit->is_running = false;
+ unit->state_entry_time = NOW();
+
+ /* Check for migration request /after/ clearing running flag. */
+ smp_mb();
+
+ sched_context_saved(vcpu_scheduler(prev), unit);
- if ( unlikely(prev == next) )
+ sched_unit_migrate_finish(unit);
+}
+
+/*
+ * Rendezvous on end of context switch.
+ * As no lock is protecting this rendezvous function, we need to use atomic
+ * access functions on the counter.
+ * The counter will be 0 in case no rendezvous is needed. For the rendezvous
+ * case it is initialised to the number of cpus to rendezvous plus 1. Each
+ * member entering decrements the counter. The last one will decrement it to
+ * 1, perform the final needed action (a call of context_saved() if the vcpu
+ * was switched) and then set the counter to zero. The other members will
+ * spin until the counter becomes zero before they proceed.
+ */
+void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext)
+{
+ struct sched_unit *next = vnext->sched_unit;
+
+ if ( atomic_read(&next->rendezvous_out_cnt) )
+ {
+ int cnt = atomic_dec_return(&next->rendezvous_out_cnt);
+
+ /* Call context_saved() before releasing other waiters. */
+ if ( cnt == 1 )
+ {
+ if ( vprev != vnext )
+ context_saved(vprev);
+ atomic_set(&next->rendezvous_out_cnt, 0);
+ }
+ else
+ while ( atomic_read(&next->rendezvous_out_cnt) )
+ cpu_relax();
+ }
+ else if ( vprev != vnext )
+ context_saved(vprev);
+}
+
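+/*
+ * Switch to the selected vcpu: either continue running the previous one
+ * (in case no switch is needed) or perform the actual context switch.
+ */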
+static void sched_context_switch(struct vcpu *vprev, struct vcpu *vnext,
+ s_time_t now)
+{
+ if ( unlikely(vprev == vnext) )
{
- pcpu_schedule_unlock_irq(lock, cpu);
TRACE_4D(TRC_SCHED_SWITCH_INFCONT,
- next->domain->domain_id, next->unit_id,
- now - prev->state_entry_time,
- prev->next_time);
- trace_continue_running(next->vcpu_list);
- return continue_running(prev->vcpu_list);
+ vnext->domain->domain_id, vnext->sched_unit->unit_id,
+ now - vprev->runstate.state_entry_time,
+ vprev->sched_unit->next_time);
+ sched_context_switched(vprev, vnext);
+ trace_continue_running(vnext);
+ return continue_running(vprev);
}
- TRACE_3D(TRC_SCHED_SWITCH_INFPREV,
- prev->domain->domain_id, prev->unit_id,
- now - prev->state_entry_time);
- TRACE_4D(TRC_SCHED_SWITCH_INFNEXT,
- next->domain->domain_id, next->unit_id,
- (next->vcpu_list->runstate.state == RUNSTATE_runnable) ?
- (now - next->state_entry_time) : 0,
- prev->next_time);
+ SCHED_STAT_CRANK(sched_ctx);
- ASSERT(prev->vcpu_list->runstate.state == RUNSTATE_running);
+ stop_timer(&vprev->periodic_timer);
- TRACE_4D(TRC_SCHED_SWITCH,
- prev->domain->domain_id, prev->unit_id,
- next->domain->domain_id, next->unit_id);
+ if ( vnext->sched_unit->migrated )
+ vcpu_move_irqs(vnext);
- sched_unit_runstate_change(prev, false, now);
+ vcpu_periodic_timer_work(vnext);
- ASSERT(next->vcpu_list->runstate.state != RUNSTATE_running);
- sched_unit_runstate_change(next, true, now);
+ context_switch(vprev, vnext);
+}
- /*
- * NB. Don't add any trace records from here until the actual context
- * switch, else lost_records resume will not work properly.
- */
+/*
+ * Rendezvous before taking a scheduling decision.
+ * Called with the schedule lock held, so all accesses to the rendezvous
+ * counter can be normal ones (no atomic accesses needed).
+ * The counter is initialised to the number of cpus to rendezvous. Each cpu
+ * entering will decrement the counter. When the counter reaches zero,
+ * do_schedule() is called and the rendezvous counter for leaving
+ * context_switch() is set. All other members will wait until the counter
+ * becomes zero, dropping and re-taking the schedule lock while waiting.
+ */
+static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
+ spinlock_t **lock, int cpu,
+ s_time_t now)
+{
+ struct sched_unit *next;
- ASSERT(!next->is_running);
- next->vcpu_list->is_running = 1;
- next->is_running = true;
- next->state_entry_time = now;
+ if ( !--prev->rendezvous_in_cnt )
+ {
+ next = do_schedule(prev, now, cpu);
+ atomic_set(&next->rendezvous_out_cnt, sched_granularity + 1);
+ return next;
+ }
- pcpu_schedule_unlock_irq(lock, cpu);
+ while ( prev->rendezvous_in_cnt )
+ {
+ /*
+ * A cpu coming from idle might have tasklet work to do.
+ * In order to avoid deadlocks we can't do that here, but have to
+ * continue the idle loop.
+ * Undo the rendezvous_in_cnt decrement and schedule another call of
+ * sched_slave().
+ */
+ if ( is_idle_unit(prev) && sched_tasklet_check_cpu(cpu) )
+ {
+ struct vcpu *vprev = current;
- SCHED_STAT_CRANK(sched_ctx);
+ prev->rendezvous_in_cnt++;
+ atomic_set(&prev->rendezvous_out_cnt, 0);
+
+ pcpu_schedule_unlock_irq(*lock, cpu);
+
+ raise_softirq(SCHED_SLAVE_SOFTIRQ);
+ sched_context_switch(vprev, vprev, now);
+
+ return NULL; /* ARM only: context switching returns to the caller there. */
+ }
- stop_timer(&prev->vcpu_list->periodic_timer);
+ pcpu_schedule_unlock_irq(*lock, cpu);
- if ( next->migrated )
- vcpu_move_irqs(next->vcpu_list);
+ cpu_relax();
- vcpu_periodic_timer_work(next->vcpu_list);
+ *lock = pcpu_schedule_lock_irq(cpu);
+ }
- context_switch(prev->vcpu_list, next->vcpu_list);
+ return prev->next_task;
}
-void context_saved(struct vcpu *prev)
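+/*
+ * Handler for SCHED_SLAVE_SOFTIRQ: take part in an already started
+ * rendezvous of the cpu's scheduling resource and switch to the selected
+ * vcpu afterwards.
+ */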
+static void sched_slave(void)
{
- /* Clear running flag /after/ writing context to memory. */
- smp_wmb();
+ struct vcpu *vprev = current;
+ struct sched_unit *prev = vprev->sched_unit, *next;
+ s_time_t now;
+ spinlock_t *lock;
+ unsigned int cpu = smp_processor_id();
- prev->is_running = 0;
- prev->sched_unit->is_running = false;
- prev->sched_unit->state_entry_time = NOW();
+ ASSERT_NOT_IN_ATOMIC();
- /* Check for migration request /after/ clearing running flag. */
- smp_mb();
+ lock = pcpu_schedule_lock_irq(cpu);
- sched_context_saved(vcpu_scheduler(prev), prev->sched_unit);
+ now = NOW();
+
+ if ( !prev->rendezvous_in_cnt )
+ {
+ pcpu_schedule_unlock_irq(lock, cpu);
+ return;
+ }
+
+ stop_timer(&get_sched_res(cpu)->s_timer);
+
+ next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
+ if ( !next )
+ return;
+
+ pcpu_schedule_unlock_irq(lock, cpu);
- sched_unit_migrate_finish(prev->sched_unit);
+ sched_context_switch(vprev, next->vcpu_list, now);
+}
+
+/*
+ * The main scheduling function:
+ * - deschedule the currently running unit (scheduler independent).
+ * - pick a new unit to run (scheduler dependent).
+ */
+static void schedule(void)
+{
+ struct vcpu *vnext, *vprev = current;
+ struct sched_unit *prev = vprev->sched_unit, *next = NULL;
+ s_time_t now;
+ struct sched_resource *sr;
+ spinlock_t *lock;
+ int cpu = smp_processor_id();
+
+ ASSERT_NOT_IN_ATOMIC();
+
+ SCHED_STAT_CRANK(sched_run);
+
+ sr = get_sched_res(cpu);
+
+ lock = pcpu_schedule_lock_irq(cpu);
+
+ if ( prev->rendezvous_in_cnt )
+ {
+ /*
+ * We have a race: another cpu of our scheduling resource has already
+ * started a rendezvous, so sched_slave() has to run here instead. Do that
+ * now and raise the schedule softirq in order to re-enter schedule() later.
+ */
+ pcpu_schedule_unlock_irq(lock, cpu);
+
+ raise_softirq(SCHEDULE_SOFTIRQ);
+ return sched_slave();
+ }
+
+ stop_timer(&sr->s_timer);
+
+ now = NOW();
+
+ if ( sched_granularity > 1 )
+ {
+ cpumask_t mask;
+
+ prev->rendezvous_in_cnt = sched_granularity;
+ cpumask_andnot(&mask, sr->cpus, cpumask_of(cpu));
+ cpumask_raise_softirq(&mask, SCHED_SLAVE_SOFTIRQ);
+ next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
+ if ( !next )
+ return;
+ }
+ else
+ {
+ prev->rendezvous_in_cnt = 0;
+ next = do_schedule(prev, now, cpu);
+ atomic_set(&next->rendezvous_out_cnt, 0);
+ }
+
+ pcpu_schedule_unlock_irq(lock, cpu);
+
+ vnext = next->vcpu_list;
+ sched_context_switch(vprev, vnext, now);
}
/* The scheduler timer: force a run through the scheduler */
if ( sr == NULL )
return -ENOMEM;
sr->master_cpu = cpu;
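+ /* Initially a scheduling resource covers just its master cpu. */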
+ sr->cpus = cpumask_of(cpu);
set_sched_res(cpu, sr);
per_cpu(scheduler, cpu) = &sched_idle_ops;
if ( idle_vcpu[cpu] == NULL )
return -ENOMEM;
+ idle_vcpu[cpu]->sched_unit->rendezvous_in_cnt = 0;
+
/*
* No need to allocate any scheduler data, as cpus coming online are
* free initially and the idle scheduler doesn't need any data areas
int i;
open_softirq(SCHEDULE_SOFTIRQ, schedule);
+ open_softirq(SCHED_SLAVE_SOFTIRQ, sched_slave);
for ( i = 0; i < NUM_SCHEDULERS; i++)
{