ia64/xen-unstable

changeset 12267:bb6cd7ba259b

[XEN] Initial support for multi-core and multi-threaded CPU scheduling.
In multi-core and multi-threaded systems, not all idling "CPUs" are
equal: when there are idle execution vehicles, it is better to spread
VCPUs across sockets and cores before co-scheduling them on sibling
cores and threads.
Signed-off-by: Emmanuel Ackaouy <ack@xensource.com>
author Emmanuel Ackaouy <ack@xensource.com>
date Mon Nov 06 17:32:00 2006 +0000 (2006-11-06)
parents 32e4952c0638
children f8ffeb540ec1
files xen/common/sched_credit.c xen/common/sched_sedf.c xen/common/schedule.c xen/include/xen/sched-if.h
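
For illustration, here is a standalone C sketch (not part of this changeset) of the idler-comparison heuristic the patch adds as csched_idler_compare(): given two idle CPUs, prefer the one whose topological grouping contains more other idlers, so work spreads across sockets and cores before a core's threads are shared. The 64-bit bitmasks and the core_map/sibling_map arrays below are simplified stand-ins for Xen's cpumask_t, cpu_core_map and cpu_sibling_map.

/*
 * Standalone sketch of the idler-comparison heuristic (assumptions:
 * cpumasks modelled as 64-bit words; core_map[cpu] = CPUs sharing
 * cpu's socket, sibling_map[cpu] = cpu's hyperthread siblings, in
 * the spirit of cpu_core_map/cpu_sibling_map).
 */
#include <stdint.h>
#include <stdio.h>

/* Count set bits in a mask (number of CPUs in it). */
static int count_cpus(uint64_t m)
{
    int n = 0;

    while ( m != 0 )
    {
        m &= m - 1;
        n++;
    }
    return n;
}

/*
 * Compare two idle CPUs. Returns >0 if "one" is the better place to
 * run an extra VCPU, <0 if "two" is better, 0 if they are equivalent:
 * the winner is the CPU with more idling neighbours in its grouping.
 */
static int idler_compare(int one, int two, uint64_t idlers,
                         const uint64_t *core_map,
                         const uint64_t *sibling_map)
{
    uint64_t others = idlers & ~(1ULL << one) & ~(1ULL << two);
    uint64_t one_nbrs, two_nbrs;

    if ( core_map[two] & (1ULL << one) )
    {
        /* Same socket: compare each CPU's idle hyperthread siblings. */
        one_nbrs = others & sibling_map[one];
        two_nbrs = others & sibling_map[two];
    }
    else
    {
        /* Different sockets: compare idlers across each whole socket. */
        one_nbrs = others & core_map[one];
        two_nbrs = others & core_map[two];
    }

    return count_cpus(one_nbrs) - count_cpus(two_nbrs);
}

int main(void)
{
    /*
     * One socket, two cores, two threads per core:
     * CPUs {0,1} share a core, CPUs {2,3} share the other.
     * CPU 0 is busy; CPUs 1, 2 and 3 are idle.
     */
    const uint64_t core_map[4]    = { 0xf, 0xf, 0xf, 0xf };
    const uint64_t sibling_map[4] = { 0x3, 0x3, 0xc, 0xc };
    const uint64_t idlers = 0xe;

    int r = idler_compare(1, 2, idlers, core_map, sibling_map);

    /* CPU 2 wins: its sibling (CPU 3) idles too, while CPU 1 would
     * have to share its core with the busy CPU 0. */
    printf("prefer CPU %d\n", r > 0 ? 1 : 2);
    return 0;
}

On this example topology, CPU 2 is preferred over CPU 1 because CPU 1 would share a core with the busy CPU 0, while core {2,3} is entirely idle; this is exactly the "spread before co-schedule" behaviour the commit message describes.
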
line diff
     1.1 --- a/xen/common/sched_credit.c	Mon Nov 06 16:55:56 2006 +0000
     1.2 +++ b/xen/common/sched_credit.c	Mon Nov 06 17:32:00 2006 +0000
     1.3 @@ -115,6 +115,10 @@
     1.4      _MACRO(steal_peer_idle)                 \
     1.5      _MACRO(steal_peer_running)              \
     1.6      _MACRO(steal_peer_pinned)               \
     1.7 +    _MACRO(steal_peer_migrating)            \
     1.8 +    _MACRO(steal_peer_best_idler)           \
     1.9 +    _MACRO(steal_loner_candidate)           \
    1.10 +    _MACRO(steal_loner_signal)              \
    1.11      _MACRO(dom_init)                        \
    1.12      _MACRO(dom_destroy)                     \
    1.13      _MACRO(vcpu_init)                       \
    1.14 @@ -370,8 +374,42 @@ static inline void
    1.15  #define CSCHED_VCPU_CHECK(_vc)
    1.16  #endif
    1.17  
    1.18 +/*
    1.19 + * Indicates which of two given idlers is most efficient to run
    1.20 + * an additional VCPU.
    1.21 + *
    1.22 + * Returns:
    1.23 + *  0:           They are the same.
    1.24 + *  negative:    One is less efficient than Two.
    1.25 + *  positive:    One is more efficient than Two.
    1.26 + */
    1.27 +static int
    1.28 +csched_idler_compare(int one, int two)
    1.29 +{
    1.30 +    cpumask_t idlers;
    1.31 +    cpumask_t one_idlers;
    1.32 +    cpumask_t two_idlers;
    1.33 +
    1.34 +    idlers = csched_priv.idlers;
    1.35 +    cpu_clear(one, idlers);
    1.36 +    cpu_clear(two, idlers);
    1.37 +
    1.38 +    if ( cpu_isset(one, cpu_core_map[two]) )
    1.39 +    {
    1.40 +        cpus_and(one_idlers, idlers, cpu_sibling_map[one]);
    1.41 +        cpus_and(two_idlers, idlers, cpu_sibling_map[two]);
    1.42 +    }
    1.43 +    else
    1.44 +    {
    1.45 +        cpus_and(one_idlers, idlers, cpu_core_map[one]);
    1.46 +        cpus_and(two_idlers, idlers, cpu_core_map[two]);
    1.47 +    }
    1.48 +
    1.49 +    return cpus_weight(one_idlers) - cpus_weight(two_idlers);
    1.50 +}
    1.51 +
    1.52  static inline int
    1.53 -__csched_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
    1.54 +__csched_queued_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
    1.55  {
    1.56      /*
    1.57       * Don't pick up work that's in the peer's scheduling tail. Also only pick
    1.58 @@ -392,6 +430,32 @@ static inline int
    1.59      return 1;
    1.60  }
    1.61  
    1.62 +static inline int
    1.63 +__csched_running_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
    1.64 +{
    1.65 +    BUG_ON( is_idle_vcpu(vc) );
    1.66 +
    1.67 +    if ( unlikely(!cpu_isset(local_cpu, vc->cpu_affinity)) )
    1.68 +    {
    1.69 +        CSCHED_STAT_CRANK(steal_peer_pinned);
    1.70 +        return 0;
    1.71 +    }
    1.72 +
    1.73 +    if ( test_bit(_VCPUF_migrating, &vc->vcpu_flags) )
    1.74 +    {
    1.75 +        CSCHED_STAT_CRANK(steal_peer_migrating);
    1.76 +        return 0;
    1.77 +    }
    1.78 +
    1.79 +    if ( csched_idler_compare(local_cpu, vc->processor) <= 0 )
    1.80 +    {
    1.81 +        CSCHED_STAT_CRANK(steal_peer_best_idler);
    1.82 +        return 0;
    1.83 +    }
    1.84 +
    1.85 +    return 1;
    1.86 +}
    1.87 +
    1.88  static void
    1.89  csched_vcpu_acct(struct csched_vcpu *svc, int credit_dec)
    1.90  {
    1.91 @@ -652,6 +716,64 @@ csched_dom_destroy(struct domain *dom)
    1.92      xfree(sdom);
    1.93  }
    1.94  
    1.95 +static int
    1.96 +csched_cpu_pick(struct vcpu *vc)
    1.97 +{
    1.98 +    cpumask_t cpus;
    1.99 +    int cpu, nxt;
   1.100 +
   1.101 +    /*
   1.102 +     * Pick from online CPUs in VCPU's affinity mask, giving a
   1.103 +     * preference to its current processor if it's in there.
   1.104 +     */
   1.105 +    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
   1.106 +    ASSERT( !cpus_empty(cpus) );
   1.107 +    cpu = cpu_isset(vc->processor, cpus) ? vc->processor : first_cpu(cpus);
   1.108 +
   1.109 +    /*
   1.110 +     * Try to find an idle processor within the above constraints.
   1.111 +     */
   1.112 +    cpus_and(cpus, cpus, csched_priv.idlers);
   1.113 +    if ( !cpus_empty(cpus) )
   1.114 +    {
   1.115 +        cpu = cpu_isset(cpu, cpus) ? cpu : first_cpu(cpus);
   1.116 +        cpu_clear(cpu, cpus);
   1.117 +
   1.118 +        /*
   1.119 +         * In multi-core and multi-threaded CPUs, not all idle execution
   1.120 +         * vehicles are equal!
   1.121 +         *
   1.122 +         * We give preference to the idle execution vehicle with the most
   1.123 +         * idling neighbours in its grouping. This distributes work across
   1.124 +         * distinct cores first and guarantees we don't do something stupid
   1.125 +         * like run two VCPUs on co-hyperthreads while there are idle cores
   1.126 +         * or sockets.
   1.127 +         */
   1.128 +        while ( !cpus_empty(cpus) )
   1.129 +        {
   1.130 +            nxt = first_cpu(cpus);
   1.131 +
   1.132 +            if ( csched_idler_compare(cpu, nxt) < 0 )
   1.133 +            {
   1.134 +                cpu = nxt;
   1.135 +                cpu_clear(nxt, cpus);
   1.136 +            }
   1.137 +            else if ( cpu_isset(cpu, cpu_core_map[nxt]) )
   1.138 +            {
   1.139 +                cpus_andnot(cpus, cpus, cpu_sibling_map[nxt]);
   1.140 +            }
   1.141 +            else
   1.142 +            {
   1.143 +                cpus_andnot(cpus, cpus, cpu_core_map[nxt]);
   1.144 +            }
   1.145 +
   1.146 +            ASSERT( !cpu_isset(nxt, cpus) );
   1.147 +        }
   1.148 +    }
   1.149 +
   1.150 +    return cpu;
   1.151 +}
   1.152 +
   1.153  /*
   1.154   * This is a O(n) optimized sort of the runq.
   1.155   *
   1.156 @@ -939,7 +1061,7 @@ csched_runq_steal(struct csched_pcpu *sp
   1.157          vc = speer->vcpu;
   1.158          BUG_ON( is_idle_vcpu(vc) );
   1.159  
   1.160 -        if ( __csched_vcpu_is_stealable(cpu, vc) )
   1.161 +        if ( __csched_queued_vcpu_is_stealable(cpu, vc) )
   1.162          {
   1.163              /* We got a candidate. Grab it! */
   1.164              __runq_remove(speer);
   1.165 @@ -959,6 +1081,7 @@ csched_load_balance(int cpu, struct csch
   1.166      struct csched_pcpu *spc;
   1.167      struct vcpu *peer_vcpu;
   1.168      cpumask_t workers;
   1.169 +    cpumask_t loners;
   1.170      int peer_cpu;
   1.171  
   1.172      if ( snext->pri == CSCHED_PRI_IDLE )
   1.173 @@ -971,6 +1094,7 @@ csched_load_balance(int cpu, struct csch
   1.174      /*
   1.175       * Peek at non-idling CPUs in the system
   1.176       */
   1.177 +    cpus_clear(loners);
   1.178      cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
   1.179      cpu_clear(cpu, workers);
   1.180  
   1.181 @@ -999,13 +1123,12 @@ csched_load_balance(int cpu, struct csch
   1.182              continue;
   1.183          }
   1.184  
   1.185 +        peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
   1.186          spc = CSCHED_PCPU(peer_cpu);
   1.187 -        peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
   1.188  
   1.189          if ( unlikely(spc == NULL) )
   1.190          {
   1.191              CSCHED_STAT_CRANK(steal_peer_down);
   1.192 -            speer = NULL;
   1.193          }
   1.194          else if ( unlikely(is_idle_vcpu(peer_vcpu)) )
   1.195          {
   1.196 @@ -1014,26 +1137,72 @@ csched_load_balance(int cpu, struct csch
   1.197               * pick up work from it itself.
   1.198               */
   1.199              CSCHED_STAT_CRANK(steal_peer_idle);
   1.200 -            speer = NULL;
   1.201 +        }
   1.202 +        else if ( is_idle_vcpu(__runq_elem(spc->runq.next)->vcpu) )
   1.203 +        {
   1.204 +            if ( snext->pri == CSCHED_PRI_IDLE &&
   1.205 +                 __csched_running_vcpu_is_stealable(cpu, peer_vcpu) )
   1.206 +            {
   1.207 +                CSCHED_STAT_CRANK(steal_loner_candidate);
   1.208 +                cpu_set(peer_cpu, loners);
   1.209 +            }
   1.210          }
   1.211          else
   1.212          {
   1.213 -            /* Try to steal work from an online non-idle CPU. */
   1.214 +            /* Try to steal work from a remote CPU's runq. */
   1.215              speer = csched_runq_steal(spc, cpu, snext->pri);
   1.216 +            if ( speer != NULL )
   1.217 +            {
   1.218 +                spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
   1.219 +                CSCHED_STAT_CRANK(vcpu_migrate);
   1.220 +                speer->stats.migrate++;
   1.221 +                return speer;
   1.222 +            }
   1.223          }
   1.224  
   1.225          spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
   1.226 +    }
   1.227  
   1.228 -        /* Got one? */
   1.229 -        if ( speer )
   1.230 +    /*
   1.231 +     * If we failed to find any remotely queued VCPUs to move here,
   1.232 +     * see if it would be more efficient to move any of the running
   1.233 +     * remote VCPUs over here.
   1.234 +     */
   1.235 +    while ( !cpus_empty(loners) )
   1.236 +    {
   1.237 +        /* For each CPU of interest, starting with our neighbour... */
   1.238 +        peer_cpu = next_cpu(peer_cpu, loners);
   1.239 +        if ( peer_cpu == NR_CPUS )
   1.240 +            peer_cpu = first_cpu(loners);
   1.241 +
   1.242 +        cpu_clear(peer_cpu, loners);
   1.243 +
   1.244 +        if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
   1.245          {
   1.246 -            CSCHED_STAT_CRANK(vcpu_migrate);
   1.247 -            speer->stats.migrate++;
   1.248 -            return speer;
   1.249 +            CSCHED_STAT_CRANK(steal_trylock_failed);
   1.250 +            continue;
   1.251 +        }
   1.252 +
   1.253 +        peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
   1.254 +        spc = CSCHED_PCPU(peer_cpu);
   1.255 +
   1.256 +        if ( !is_idle_vcpu(peer_vcpu) &&
   1.257 +             is_idle_vcpu(__runq_elem(spc->runq.next)->vcpu) &&
   1.258 +             __csched_running_vcpu_is_stealable(cpu, peer_vcpu) )
   1.259 +        {
   1.260 +            set_bit(_VCPUF_migrating, &peer_vcpu->vcpu_flags);
   1.261 +            spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
   1.262 +
   1.263 +            CSCHED_STAT_CRANK(steal_loner_signal);
   1.264 +            cpu_raise_softirq(peer_cpu, SCHEDULE_SOFTIRQ);
   1.265 +        }
   1.266 +        else
   1.267 +        {
   1.268 +            spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
   1.269          }
   1.270      }
   1.271  
   1.272 -    /* Failed to find more important work */
   1.273 +    /* Failed to find more important work elsewhere... */
   1.274      __runq_remove(snext);
   1.275      return snext;
   1.276  }
   1.277 @@ -1139,9 +1308,11 @@ csched_dump_pcpu(int cpu)
   1.278      spc = CSCHED_PCPU(cpu);
   1.279      runq = &spc->runq;
   1.280  
   1.281 -    printk(" tick=%lu, sort=%d\n",
   1.282 +    printk(" tick=%lu, sort=%d, sibling=0x%lx, core=0x%lx\n",
   1.283              per_cpu(schedule_data, cpu).tick,
   1.284 -            spc->runq_sort_last);
   1.285 +            spc->runq_sort_last,
   1.286 +            cpu_sibling_map[cpu].bits[0],
   1.287 +            cpu_core_map[cpu].bits[0]);
   1.288  
   1.289      /* current VCPU */
   1.290      svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
   1.291 @@ -1247,6 +1418,7 @@ struct scheduler sched_credit_def = {
   1.292  
   1.293      .adjust         = csched_dom_cntl,
   1.294  
   1.295 +    .pick_cpu       = csched_cpu_pick,
   1.296      .tick           = csched_tick,
   1.297      .do_schedule    = csched_schedule,
   1.298  
     2.1 --- a/xen/common/sched_sedf.c	Mon Nov 06 16:55:56 2006 +0000
     2.2 +++ b/xen/common/sched_sedf.c	Mon Nov 06 17:32:00 2006 +0000
     2.3 @@ -411,6 +411,14 @@ static void sedf_destroy_domain(struct d
     2.4      xfree(d->sched_priv);
     2.5  }
     2.6  
     2.7 +static int sedf_pick_cpu(struct vcpu *v)
     2.8 +{
     2.9 +    cpumask_t online_affinity;
    2.10 +
    2.11 +    cpus_and(online_affinity, v->cpu_affinity, cpu_online_map);
    2.12 +    return first_cpu(online_affinity);
    2.13 +}
    2.14 +
    2.15  /*
    2.16   * Handles the rescheduling & bookkeeping of domains running in their
    2.17   * guaranteed timeslice.
    2.18 @@ -1436,6 +1444,7 @@ struct scheduler sched_sedf_def = {
    2.19      .destroy_vcpu   = sedf_destroy_vcpu,
    2.20  
    2.21      .do_schedule    = sedf_do_schedule,
    2.22 +    .pick_cpu       = sedf_pick_cpu,
    2.23      .dump_cpu_state = sedf_dump_cpu_state,
    2.24      .sleep          = sedf_sleep,
    2.25      .wake           = sedf_wake,
     3.1 --- a/xen/common/schedule.c	Mon Nov 06 16:55:56 2006 +0000
     3.2 +++ b/xen/common/schedule.c	Mon Nov 06 17:32:00 2006 +0000
     3.3 @@ -203,7 +203,6 @@ void vcpu_wake(struct vcpu *v)
     3.4  
     3.5  static void vcpu_migrate(struct vcpu *v)
     3.6  {
     3.7 -    cpumask_t online_affinity;
     3.8      unsigned long flags;
     3.9      int old_cpu;
    3.10  
    3.11 @@ -218,8 +217,7 @@ static void vcpu_migrate(struct vcpu *v)
    3.12  
    3.13      /* Switch to new CPU, then unlock old CPU. */
    3.14      old_cpu = v->processor;
    3.15 -    cpus_and(online_affinity, v->cpu_affinity, cpu_online_map);
    3.16 -    v->processor = first_cpu(online_affinity);
    3.17 +    v->processor = SCHED_OP(pick_cpu, v);
    3.18      spin_unlock_irqrestore(
    3.19          &per_cpu(schedule_data, old_cpu).schedule_lock, flags);
    3.20  
     4.1 --- a/xen/include/xen/sched-if.h	Mon Nov 06 16:55:56 2006 +0000
     4.2 +++ b/xen/include/xen/sched-if.h	Mon Nov 06 17:32:00 2006 +0000
     4.3 @@ -74,6 +74,7 @@ struct scheduler {
     4.4  
     4.5      struct task_slice (*do_schedule) (s_time_t);
     4.6  
     4.7 +    int          (*pick_cpu)       (struct vcpu *);
     4.8      int          (*adjust)         (struct domain *,
     4.9                                      struct xen_domctl_scheduler_op *);
    4.10      void         (*dump_settings)  (void);
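
As a complement, a minimal toy model (not from this changeset) of the new pick_cpu hook added to struct scheduler: the generic vcpu_migrate() path now defers CPU placement to the active scheduler, and sched_sedf keeps the old behaviour of picking the first online CPU in the VCPU's affinity mask. The toy_* types and simple_pick_cpu policy below are simplified assumptions, not Xen's real structures.

#include <stdint.h>
#include <stdio.h>

struct toy_vcpu {
    int processor;          /* CPU the VCPU last ran on   */
    uint64_t cpu_affinity;  /* mask of CPUs it may run on */
};

struct toy_scheduler {
    const char *name;
    /* New hook: return the CPU a migrating VCPU should land on. */
    int (*pick_cpu)(struct toy_vcpu *v);
};

static const uint64_t cpu_online_mask = 0xf;  /* assume CPUs 0-3 online */

/*
 * Simple placement policy in the spirit of sedf_pick_cpu: first
 * online CPU in the VCPU's affinity mask (falls back to CPU 0 if
 * the intersection is empty; the real code expects it never is).
 */
static int simple_pick_cpu(struct toy_vcpu *v)
{
    uint64_t ok = v->cpu_affinity & cpu_online_mask;
    int cpu = 0;

    while ( ok != 0 && !(ok & 1) )
    {
        ok >>= 1;
        cpu++;
    }
    return cpu;
}

static struct toy_scheduler sched = {
    .name     = "toy",
    .pick_cpu = simple_pick_cpu,
};

/* Generic migration path: defer placement to the active scheduler. */
static void toy_vcpu_migrate(struct toy_vcpu *v)
{
    v->processor = sched.pick_cpu(v);
}

int main(void)
{
    struct toy_vcpu v = { .processor = 0, .cpu_affinity = 0xc };

    toy_vcpu_migrate(&v);
    printf("%s scheduler placed VCPU on CPU %d\n", sched.name, v.processor);
    return 0;
}

In the patch itself, the credit scheduler's csched_cpu_pick() layers the topology-aware idler comparison on top of this basic affinity filtering, while sedf_pick_cpu() simply preserves the previous generic behaviour.
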