ia64/xen-unstable

changeset 12906:05e1863cc2a3

[XEN] Improve multi-core/thread scheduler support.
Relax the granularity at which running VCPUs are migrated onto
cores or sockets with more idle execution vehicles, and
simplify the code a good deal in the process.
Also clean up some scheduler stats while in there.
Signed-off-by: Emmanuel Ackaouy <ack@xensource.com>
author Emmanuel Ackaouy <ack@xensource.com>
date Tue Dec 12 19:37:21 2006 +0000 (2006-12-12)
parents 6b68a3688509
children 2fa06640a1c1
files xen/common/sched_credit.c
line diff
     1.1 --- a/xen/common/sched_credit.c	Tue Dec 12 16:52:13 2006 +0000
     1.2 +++ b/xen/common/sched_credit.c	Tue Dec 12 19:37:21 2006 +0000
     1.3 @@ -106,20 +106,13 @@
     1.4      _MACRO(tickle_local_other)              \
     1.5      _MACRO(tickle_idlers_none)              \
     1.6      _MACRO(tickle_idlers_some)              \
     1.7 -    _MACRO(vcpu_migrate)                    \
     1.8      _MACRO(load_balance_idle)               \
     1.9      _MACRO(load_balance_over)               \
    1.10      _MACRO(load_balance_other)              \
    1.11      _MACRO(steal_trylock_failed)            \
    1.12 -    _MACRO(steal_peer_down)                 \
    1.13      _MACRO(steal_peer_idle)                 \
    1.14 -    _MACRO(steal_peer_running)              \
    1.15 -    _MACRO(steal_peer_pinned)               \
    1.16 -    _MACRO(steal_peer_migrating)            \
    1.17 -    _MACRO(steal_peer_best_idler)           \
    1.18 -    _MACRO(steal_loner_candidate)           \
    1.19 -    _MACRO(steal_loner_signal)              \
    1.20 -    _MACRO(cpu_pick)                        \
    1.21 +    _MACRO(migrate_queued)                  \
    1.22 +    _MACRO(migrate_running)                 \
    1.23      _MACRO(dom_init)                        \
    1.24      _MACRO(dom_destroy)                     \
    1.25      _MACRO(vcpu_init)                       \
    1.26 @@ -146,7 +139,7 @@
    1.27      struct                                      \
    1.28      {                                           \
    1.29          CSCHED_STATS_EXPAND(CSCHED_STAT_DEFINE) \
    1.30 -    } stats
    1.31 +    } stats;
    1.32  
    1.33  #define CSCHED_STATS_PRINTK()                   \
    1.34      do                                          \
    1.35 @@ -155,14 +148,27 @@
    1.36          CSCHED_STATS_EXPAND(CSCHED_STAT_PRINTK) \
    1.37      } while ( 0 )
    1.38  
    1.39 -#define CSCHED_STAT_CRANK(_X)   (CSCHED_STAT(_X)++)
    1.40 +#define CSCHED_STAT_CRANK(_X)               (CSCHED_STAT(_X)++)
    1.41 +
    1.42 +#define CSCHED_VCPU_STATS_RESET(_V)                     \
    1.43 +    do                                                  \
    1.44 +    {                                                   \
    1.45 +        memset(&(_V)->stats, 0, sizeof((_V)->stats));   \
    1.46 +    } while ( 0 )
    1.47 +
    1.48 +#define CSCHED_VCPU_STAT_CRANK(_V, _X)      (((_V)->stats._X)++)
    1.49 +
    1.50 +#define CSCHED_VCPU_STAT_SET(_V, _X, _Y)    (((_V)->stats._X) = (_Y))
    1.51  
    1.52  #else /* CSCHED_STATS */
    1.53  
    1.54 -#define CSCHED_STATS_RESET()    do {} while ( 0 )
    1.55 -#define CSCHED_STATS_DEFINE()   do {} while ( 0 )
    1.56 -#define CSCHED_STATS_PRINTK()   do {} while ( 0 )
    1.57 -#define CSCHED_STAT_CRANK(_X)   do {} while ( 0 )
    1.58 +#define CSCHED_STATS_RESET()                do {} while ( 0 )
    1.59 +#define CSCHED_STATS_DEFINE()
    1.60 +#define CSCHED_STATS_PRINTK()               do {} while ( 0 )
    1.61 +#define CSCHED_STAT_CRANK(_X)               do {} while ( 0 )
    1.62 +#define CSCHED_VCPU_STATS_RESET(_V)         do {} while ( 0 )
    1.63 +#define CSCHED_VCPU_STAT_CRANK(_V, _X)      do {} while ( 0 )
    1.64 +#define CSCHED_VCPU_STAT_SET(_V, _X, _Y)    do {} while ( 0 )
    1.65  
    1.66  #endif /* CSCHED_STATS */
    1.67  
    1.68 @@ -185,13 +191,16 @@ struct csched_vcpu {
    1.69      struct vcpu *vcpu;
    1.70      atomic_t credit;
    1.71      int16_t pri;
    1.72 +#ifdef CSCHED_STATS
    1.73      struct {
    1.74          int credit_last;
    1.75          uint32_t credit_incr;
    1.76          uint32_t state_active;
    1.77          uint32_t state_idle;
    1.78 -        uint32_t migrate;
    1.79 +        uint32_t migrate_q;
    1.80 +        uint32_t migrate_r;
    1.81      } stats;
    1.82 +#endif
    1.83  };
    1.84  
    1.85  /*
    1.86 @@ -219,7 +228,7 @@ struct csched_private {
    1.87      uint32_t credit;
    1.88      int credit_balance;
    1.89      uint32_t runq_sort;
    1.90 -    CSCHED_STATS_DEFINE();
    1.91 +    CSCHED_STATS_DEFINE()
    1.92  };
    1.93  
    1.94  
    1.95 @@ -231,6 +240,15 @@ static struct csched_private csched_priv
    1.96  
    1.97  
    1.98  static inline int
    1.99 +__cycle_cpu(int cpu, const cpumask_t *mask)
   1.100 +{
   1.101 +    int nxt = next_cpu(cpu, *mask);
   1.102 +    if (nxt == NR_CPUS)
   1.103 +        nxt = first_cpu(*mask);
   1.104 +    return nxt;
   1.105 +}
   1.106 +
   1.107 +static inline int
   1.108  __vcpu_on_runq(struct csched_vcpu *svc)
   1.109  {
   1.110      return !list_empty(&svc->runq_elem);
   1.111 @@ -375,118 +393,138 @@ static inline void
   1.112  #define CSCHED_VCPU_CHECK(_vc)
   1.113  #endif
   1.114  
   1.115 -/*
   1.116 - * Indicates which of two given idlers is most efficient to run
   1.117 - * an additional VCPU.
   1.118 - *
   1.119 - * Returns:
   1.120 - *  0:           They are the same.
   1.121 - *  negative:    One is less efficient than Two.
   1.122 - *  positive:    One is more efficient than Two.
   1.123 - */
   1.124 -static int
   1.125 -csched_idler_compare(int one, int two)
   1.126 -{
   1.127 -    cpumask_t idlers;
   1.128 -    cpumask_t one_idlers;
   1.129 -    cpumask_t two_idlers;
   1.130 -
   1.131 -    idlers = csched_priv.idlers;
   1.132 -    cpu_clear(one, idlers);
   1.133 -    cpu_clear(two, idlers);
   1.134 -
   1.135 -    if ( cpu_isset(one, cpu_core_map[two]) )
   1.136 -    {
   1.137 -        cpus_and(one_idlers, idlers, cpu_sibling_map[one]);
   1.138 -        cpus_and(two_idlers, idlers, cpu_sibling_map[two]);
   1.139 -    }
   1.140 -    else
   1.141 -    {
   1.142 -        cpus_and(one_idlers, idlers, cpu_core_map[one]);
   1.143 -        cpus_and(two_idlers, idlers, cpu_core_map[two]);
   1.144 -    }
   1.145 -
   1.146 -    return cpus_weight(one_idlers) - cpus_weight(two_idlers);
   1.147 -}
   1.148 -
   1.149  static inline int
   1.150 -__csched_queued_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
   1.151 +__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
   1.152  {
   1.153      /*
   1.154       * Don't pick up work that's in the peer's scheduling tail. Also only pick
   1.155       * up work that's allowed to run on our CPU.
   1.156       */
   1.157 -    if ( unlikely(test_bit(_VCPUF_running, &vc->vcpu_flags)) )
   1.158 -    {
   1.159 -        CSCHED_STAT_CRANK(steal_peer_running);
   1.160 -        return 0;
   1.161 -    }
   1.162 -
   1.163 -    if ( unlikely(!cpu_isset(local_cpu, vc->cpu_affinity)) )
   1.164 -    {
   1.165 -        CSCHED_STAT_CRANK(steal_peer_pinned);
   1.166 -        return 0;
   1.167 -    }
   1.168 -
   1.169 -    return 1;
   1.170 +    return !test_bit(_VCPUF_running, &vc->vcpu_flags) &&
   1.171 +           cpu_isset(dest_cpu, vc->cpu_affinity);
   1.172  }
   1.173  
   1.174 -static inline int
   1.175 -__csched_running_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
   1.176 +static int
   1.177 +csched_cpu_pick(struct vcpu *vc)
   1.178  {
   1.179 -    BUG_ON( is_idle_vcpu(vc) );
   1.180 +    cpumask_t cpus;
   1.181 +    cpumask_t idlers;
   1.182 +    int cpu;
   1.183 +
   1.184 +    /*
   1.185 +     * Pick from online CPUs in VCPU's affinity mask, giving a
   1.186 +     * preference to its current processor if it's in there.
   1.187 +     */
   1.188 +    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
   1.189 +    cpu = cpu_isset(vc->processor, cpus)
   1.190 +            ? vc->processor
   1.191 +            : __cycle_cpu(vc->processor, &cpus);
   1.192 +    ASSERT( !cpus_empty(cpus) && cpu_isset(cpu, cpus) );
   1.193  
   1.194 -    if ( unlikely(!cpu_isset(local_cpu, vc->cpu_affinity)) )
   1.195 +    /*
   1.196 +     * Try to find an idle processor within the above constraints.
   1.197 +     *
   1.198 +     * In multi-core and multi-threaded CPUs, not all idle execution
   1.199 +     * vehicles are equal!
   1.200 +     *
   1.201 +     * We give preference to the idle execution vehicle with the most
   1.202 +     * idling neighbours in its grouping. This distributes work across
   1.203 +     * distinct cores first and guarantees we don't do something stupid
   1.204 +     * like run two VCPUs on co-hyperthreads while there are idle cores
   1.205 +     * or sockets.
   1.206 +     */
   1.207 +    idlers = csched_priv.idlers;
   1.208 +    cpu_set(cpu, idlers);
   1.209 +    cpus_and(cpus, cpus, idlers);
   1.210 +    cpu_clear(cpu, cpus);
   1.211 +
   1.212 +    while ( !cpus_empty(cpus) )
   1.213      {
   1.214 -        CSCHED_STAT_CRANK(steal_peer_pinned);
   1.215 -        return 0;
   1.216 +        cpumask_t cpu_idlers;
   1.217 +        cpumask_t nxt_idlers;
   1.218 +        int nxt;
   1.219 +
   1.220 +        nxt = __cycle_cpu(cpu, &cpus);
   1.221 +
   1.222 +        if ( cpu_isset(cpu, cpu_core_map[nxt]) )
   1.223 +        {
   1.224 +            ASSERT( cpu_isset(nxt, cpu_core_map[cpu]) );
   1.225 +            cpus_and(cpu_idlers, idlers, cpu_sibling_map[cpu]);
   1.226 +            cpus_and(nxt_idlers, idlers, cpu_sibling_map[nxt]);
   1.227 +        }
   1.228 +        else
   1.229 +        {
   1.230 +            ASSERT( !cpu_isset(nxt, cpu_core_map[cpu]) );
   1.231 +            cpus_and(cpu_idlers, idlers, cpu_core_map[cpu]);
   1.232 +            cpus_and(nxt_idlers, idlers, cpu_core_map[nxt]);
   1.233 +        }
   1.234 +
   1.235 +        if ( cpus_weight(cpu_idlers) < cpus_weight(nxt_idlers) )
   1.236 +        {
   1.237 +            cpu = nxt;
   1.238 +            cpu_clear(cpu, cpus);
   1.239 +        }
   1.240 +        else
   1.241 +        {
   1.242 +            cpus_andnot(cpus, cpus, nxt_idlers);
   1.243 +        }
   1.244      }
   1.245  
   1.246 -    if ( test_bit(_VCPUF_migrating, &vc->vcpu_flags) )
   1.247 -    {
   1.248 -        CSCHED_STAT_CRANK(steal_peer_migrating);
   1.249 -        return 0;
   1.250 -    }
   1.251 -
   1.252 -    if ( csched_idler_compare(local_cpu, vc->processor) <= 0 )
   1.253 -    {
   1.254 -        CSCHED_STAT_CRANK(steal_peer_best_idler);
   1.255 -        return 0;
   1.256 -    }
   1.257 -
   1.258 -    return 1;
   1.259 +    return cpu;
   1.260  }
   1.261  
   1.262 -static void
   1.263 -csched_vcpu_acct(struct csched_vcpu *svc, int credit_dec)
   1.264 +static inline void
   1.265 +__csched_vcpu_acct_start(struct csched_vcpu *svc)
   1.266  {
   1.267      struct csched_dom * const sdom = svc->sdom;
   1.268      unsigned long flags;
   1.269  
   1.270 -    /* Update credits */
   1.271 -    atomic_sub(credit_dec, &svc->credit);
   1.272 +    spin_lock_irqsave(&csched_priv.lock, flags);
   1.273  
   1.274 -    /* Put this VCPU and domain back on the active list if it was idling */
   1.275      if ( list_empty(&svc->active_vcpu_elem) )
   1.276      {
   1.277 -        spin_lock_irqsave(&csched_priv.lock, flags);
   1.278 +        CSCHED_VCPU_STAT_CRANK(svc, state_active);
   1.279 +        CSCHED_STAT_CRANK(acct_vcpu_active);
   1.280  
   1.281 -        if ( list_empty(&svc->active_vcpu_elem) )
   1.282 +        sdom->active_vcpu_count++;
   1.283 +        list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
   1.284 +        if ( list_empty(&sdom->active_sdom_elem) )
   1.285          {
   1.286 -            CSCHED_STAT_CRANK(acct_vcpu_active);
   1.287 -            svc->stats.state_active++;
   1.288 +            list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
   1.289 +            csched_priv.weight += sdom->weight;
   1.290 +        }
   1.291 +    }
   1.292 +
   1.293 +    spin_unlock_irqrestore(&csched_priv.lock, flags);
   1.294 +}
   1.295 +
   1.296 +static inline void
   1.297 +__csched_vcpu_acct_stop_locked(struct csched_vcpu *svc)
   1.298 +{
   1.299 +    struct csched_dom * const sdom = svc->sdom;
   1.300  
   1.301 -            sdom->active_vcpu_count++;
   1.302 -            list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
   1.303 -            if ( list_empty(&sdom->active_sdom_elem) )
   1.304 -            {
   1.305 -                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
   1.306 -                csched_priv.weight += sdom->weight;
   1.307 -            }
   1.308 -        }
   1.309 +    BUG_ON( list_empty(&svc->active_vcpu_elem) );
   1.310 +
   1.311 +    CSCHED_VCPU_STAT_CRANK(svc, state_idle);
   1.312 +    CSCHED_STAT_CRANK(acct_vcpu_idle);
   1.313  
   1.314 -        spin_unlock_irqrestore(&csched_priv.lock, flags);
   1.315 +    sdom->active_vcpu_count--;
   1.316 +    list_del_init(&svc->active_vcpu_elem);
   1.317 +    if ( list_empty(&sdom->active_vcpu) )
   1.318 +    {
   1.319 +        BUG_ON( csched_priv.weight < sdom->weight );
   1.320 +        list_del_init(&sdom->active_sdom_elem);
   1.321 +        csched_priv.weight -= sdom->weight;
   1.322      }
   1.323 +}
   1.324 +
   1.325 +static void
   1.326 +csched_vcpu_acct(unsigned int cpu)
   1.327 +{
   1.328 +    struct csched_vcpu * const svc = CSCHED_VCPU(current);
   1.329 +
   1.330 +    ASSERT( current->processor == cpu );
   1.331 +    ASSERT( svc->sdom != NULL );
   1.332  
   1.333      /*
   1.334       * If this VCPU's priority was boosted when it last awoke, reset it.
   1.335 @@ -495,25 +533,30 @@ csched_vcpu_acct(struct csched_vcpu *svc
   1.336       */
   1.337      if ( svc->pri == CSCHED_PRI_TS_BOOST )
   1.338          svc->pri = CSCHED_PRI_TS_UNDER;
   1.339 -}
   1.340  
   1.341 -static inline void
   1.342 -__csched_vcpu_acct_idle_locked(struct csched_vcpu *svc)
   1.343 -{
   1.344 -    struct csched_dom * const sdom = svc->sdom;
   1.345 -
   1.346 -    BUG_ON( list_empty(&svc->active_vcpu_elem) );
   1.347 +    /*
   1.348 +     * Update credits
   1.349 +     */
   1.350 +    atomic_sub(CSCHED_CREDITS_PER_TICK, &svc->credit);
   1.351  
   1.352 -    CSCHED_STAT_CRANK(acct_vcpu_idle);
   1.353 -    svc->stats.state_idle++;
   1.354 -
   1.355 -    sdom->active_vcpu_count--;
   1.356 -    list_del_init(&svc->active_vcpu_elem);
   1.357 -    if ( list_empty(&sdom->active_vcpu) )
   1.358 +    /*
   1.359 +     * Put this VCPU and domain back on the active list if it was
   1.360 +     * idling.
   1.361 +     *
   1.362 +     * If it's been active a while, check if we'd be better off
   1.363 +     * migrating it to run elsewhere (see multi-core and multi-thread
   1.364 +     * support in csched_cpu_pick()).
   1.365 +     */
   1.366 +    if ( list_empty(&svc->active_vcpu_elem) )
   1.367      {
   1.368 -        BUG_ON( csched_priv.weight < sdom->weight );
   1.369 -        list_del_init(&sdom->active_sdom_elem);
   1.370 -        csched_priv.weight -= sdom->weight;
   1.371 +        __csched_vcpu_acct_start(svc);
   1.372 +    }
   1.373 +    else if ( csched_cpu_pick(current) != cpu )
   1.374 +    {
   1.375 +        CSCHED_VCPU_STAT_CRANK(svc, migrate_r);
   1.376 +        CSCHED_STAT_CRANK(migrate_running);
   1.377 +        set_bit(_VCPUF_migrating, &current->vcpu_flags);
   1.378 +        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
   1.379      }
   1.380  }
   1.381  
   1.382 @@ -537,15 +580,11 @@ csched_vcpu_init(struct vcpu *vc)
   1.383      svc->vcpu = vc;
   1.384      atomic_set(&svc->credit, 0);
   1.385      svc->pri = is_idle_domain(dom) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
   1.386 -    memset(&svc->stats, 0, sizeof(svc->stats));
   1.387 +    CSCHED_VCPU_STATS_RESET(svc);
   1.388      vc->sched_priv = svc;
   1.389  
   1.390      CSCHED_VCPU_CHECK(vc);
   1.391  
   1.392 -    /* Attach fair-share VCPUs to the accounting list */
   1.393 -    if ( likely(sdom != NULL) )
   1.394 -        csched_vcpu_acct(svc, 0);
   1.395 -
   1.396      /* Allocate per-PCPU info */
   1.397      if ( unlikely(!CSCHED_PCPU(vc->processor)) )
   1.398      {
   1.399 @@ -573,7 +612,7 @@ csched_vcpu_destroy(struct vcpu *vc)
   1.400      spin_lock_irqsave(&csched_priv.lock, flags);
   1.401  
   1.402      if ( !list_empty(&svc->active_vcpu_elem) )
   1.403 -        __csched_vcpu_acct_idle_locked(svc);
   1.404 +        __csched_vcpu_acct_stop_locked(svc);
   1.405  
   1.406      spin_unlock_irqrestore(&csched_priv.lock, flags);
   1.407  
   1.408 @@ -717,66 +756,6 @@ csched_dom_destroy(struct domain *dom)
   1.409      xfree(sdom);
   1.410  }
   1.411  
   1.412 -static int
   1.413 -csched_cpu_pick(struct vcpu *vc)
   1.414 -{
   1.415 -    cpumask_t cpus;
   1.416 -    int cpu, nxt;
   1.417 -
   1.418 -    CSCHED_STAT_CRANK(cpu_pick);
   1.419 -
   1.420 -    /*
   1.421 -     * Pick from online CPUs in VCPU's affinity mask, giving a
   1.422 -     * preference to its current processor if it's in there.
   1.423 -     */
   1.424 -    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
   1.425 -    ASSERT( !cpus_empty(cpus) );
   1.426 -    cpu = cpu_isset(vc->processor, cpus) ? vc->processor : first_cpu(cpus);
   1.427 -
   1.428 -    /*
   1.429 -     * Try to find an idle processor within the above constraints.
   1.430 -     */
   1.431 -    cpus_and(cpus, cpus, csched_priv.idlers);
   1.432 -    if ( !cpus_empty(cpus) )
   1.433 -    {
   1.434 -        cpu = cpu_isset(cpu, cpus) ? cpu : first_cpu(cpus);
   1.435 -        cpu_clear(cpu, cpus);
   1.436 -
   1.437 -        /*
   1.438 -         * In multi-core and multi-threaded CPUs, not all idle execution
   1.439 -         * vehicles are equal!
   1.440 -         *
   1.441 -         * We give preference to the idle execution vehicle with the most
   1.442 -         * idling neighbours in its grouping. This distributes work across
   1.443 -         * distinct cores first and guarantees we don't do something stupid
   1.444 -         * like run two VCPUs on co-hyperthreads while there are idle cores
   1.445 -         * or sockets.
   1.446 -         */
   1.447 -        while ( !cpus_empty(cpus) )
   1.448 -        {
   1.449 -            nxt = first_cpu(cpus);
   1.450 -
   1.451 -            if ( csched_idler_compare(cpu, nxt) < 0 )
   1.452 -            {
   1.453 -                cpu = nxt;
   1.454 -                cpu_clear(nxt, cpus);
   1.455 -            }
   1.456 -            else if ( cpu_isset(cpu, cpu_core_map[nxt]) )
   1.457 -            {
   1.458 -                cpus_andnot(cpus, cpus, cpu_sibling_map[nxt]);
   1.459 -            }
   1.460 -            else
   1.461 -            {
   1.462 -                cpus_andnot(cpus, cpus, cpu_core_map[nxt]);
   1.463 -            }
   1.464 -
   1.465 -            ASSERT( !cpu_isset(nxt, cpus) );
   1.466 -        }
   1.467 -    }
   1.468 -
   1.469 -    return cpu;
   1.470 -}
   1.471 -
   1.472  /*
   1.473   * This is a O(n) optimized sort of the runq.
   1.474   *
   1.475 @@ -981,14 +960,14 @@ csched_acct(void)
   1.476  
   1.477                  if ( credit > CSCHED_CREDITS_PER_TSLICE )
   1.478                  {
   1.479 -                    __csched_vcpu_acct_idle_locked(svc);
   1.480 +                    __csched_vcpu_acct_stop_locked(svc);
   1.481                      credit = 0;
   1.482                      atomic_set(&svc->credit, credit);
   1.483                  }
   1.484              }
   1.485  
   1.486 -            svc->stats.credit_last = credit;
   1.487 -            svc->stats.credit_incr = credit_fair;
   1.488 +            CSCHED_VCPU_STAT_SET(svc, credit_last, credit);
   1.489 +            CSCHED_VCPU_STAT_SET(svc, credit_incr, credit_fair);
   1.490              credit_balance += credit;
   1.491          }
   1.492      }
   1.493 @@ -1004,21 +983,14 @@ csched_acct(void)
   1.494  static void
   1.495  csched_tick(unsigned int cpu)
   1.496  {
   1.497 -    struct csched_vcpu * const svc = CSCHED_VCPU(current);
   1.498 -    struct csched_dom * const sdom = svc->sdom;
   1.499 -
   1.500      /*
   1.501       * Accounting for running VCPU
   1.502 -     *
   1.503 -     * Note: Some VCPUs, such as the idle tasks, are not credit scheduled.
   1.504       */
   1.505 -    if ( likely(sdom != NULL) )
   1.506 -    {
   1.507 -        csched_vcpu_acct(svc, CSCHED_CREDITS_PER_TICK);
   1.508 -    }
   1.509 +    if ( !is_idle_vcpu(current) )
   1.510 +        csched_vcpu_acct(cpu);
   1.511  
   1.512      /*
   1.513 -     * Accounting duty
   1.514 +     * Host-wide accounting duty
   1.515       *
   1.516       * Note: Currently, this is always done by the master boot CPU. Eventually,
   1.517       * we could distribute or at the very least cycle the duty.
   1.518 @@ -1040,40 +1012,48 @@ csched_tick(unsigned int cpu)
   1.519  }
   1.520  
   1.521  static struct csched_vcpu *
   1.522 -csched_runq_steal(struct csched_pcpu *spc, int cpu, int pri)
   1.523 +csched_runq_steal(int peer_cpu, int cpu, int pri)
   1.524  {
   1.525 +    const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu);
   1.526 +    const struct vcpu * const peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
   1.527 +    struct csched_vcpu *speer;
   1.528      struct list_head *iter;
   1.529 -    struct csched_vcpu *speer;
   1.530      struct vcpu *vc;
   1.531  
   1.532 -    list_for_each( iter, &spc->runq )
   1.533 +    /*
   1.534 +     * Don't steal from an idle CPU's runq because it's about to
   1.535 +     * pick up work from it itself.
   1.536 +     */
   1.537 +    if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) )
   1.538      {
   1.539 -        speer = __runq_elem(iter);
   1.540 -
   1.541 -        /*
   1.542 -         * If next available VCPU here is not of higher priority than ours,
   1.543 -         * this PCPU is useless to us.
   1.544 -         */
   1.545 -        if ( speer->pri <= CSCHED_PRI_IDLE || speer->pri <= pri )
   1.546 +        list_for_each( iter, &peer_pcpu->runq )
   1.547          {
   1.548 -            CSCHED_STAT_CRANK(steal_peer_idle);
   1.549 -            break;
   1.550 -        }
   1.551 +            speer = __runq_elem(iter);
   1.552  
   1.553 -        /* Is this VCPU is runnable on our PCPU? */
   1.554 -        vc = speer->vcpu;
   1.555 -        BUG_ON( is_idle_vcpu(vc) );
   1.556 +            /*
   1.557 +             * If next available VCPU here is not of higher priority
   1.558 +             * than ours, this PCPU is useless to us.
   1.559 +             */
   1.560 +            if ( speer->pri <= CSCHED_PRI_IDLE || speer->pri <= pri )
   1.561 +                break;
   1.562  
   1.563 -        if ( __csched_queued_vcpu_is_stealable(cpu, vc) )
   1.564 -        {
   1.565 -            /* We got a candidate. Grab it! */
   1.566 -            __runq_remove(speer);
   1.567 -            vc->processor = cpu;
   1.568 +            /* Is this VCPU is runnable on our PCPU? */
   1.569 +            vc = speer->vcpu;
   1.570 +            BUG_ON( is_idle_vcpu(vc) );
   1.571  
   1.572 -            return speer;
   1.573 +            if (__csched_vcpu_is_migrateable(vc, cpu))
   1.574 +            {
   1.575 +                /* We got a candidate. Grab it! */
   1.576 +                CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
   1.577 +                CSCHED_STAT_CRANK(migrate_queued);
   1.578 +                __runq_remove(speer);
   1.579 +                vc->processor = cpu;
   1.580 +                return speer;
   1.581 +            }
   1.582          }
   1.583      }
   1.584  
   1.585 +    CSCHED_STAT_CRANK(steal_peer_idle);
   1.586      return NULL;
   1.587  }
   1.588  
   1.589 @@ -1081,12 +1061,11 @@ static struct csched_vcpu *
   1.590  csched_load_balance(int cpu, struct csched_vcpu *snext)
   1.591  {
   1.592      struct csched_vcpu *speer;
   1.593 -    struct csched_pcpu *spc;
   1.594 -    struct vcpu *peer_vcpu;
   1.595      cpumask_t workers;
   1.596 -    cpumask_t loners;
   1.597      int peer_cpu;
   1.598  
   1.599 +    BUG_ON( cpu != snext->vcpu->processor );
   1.600 +
   1.601      if ( snext->pri == CSCHED_PRI_IDLE )
   1.602          CSCHED_STAT_CRANK(load_balance_idle);
   1.603      else if ( snext->pri == CSCHED_PRI_TS_OVER )
   1.604 @@ -1095,22 +1074,16 @@ csched_load_balance(int cpu, struct csch
   1.605          CSCHED_STAT_CRANK(load_balance_other);
   1.606  
   1.607      /*
   1.608 -     * Peek at non-idling CPUs in the system
   1.609 +     * Peek at non-idling CPUs in the system, starting with our
   1.610 +     * immediate neighbour.
   1.611       */
   1.612 -    cpus_clear(loners);
   1.613      cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
   1.614      cpu_clear(cpu, workers);
   1.615 -
   1.616      peer_cpu = cpu;
   1.617 -    BUG_ON( peer_cpu != snext->vcpu->processor );
   1.618  
   1.619      while ( !cpus_empty(workers) )
   1.620      {
   1.621 -        /* For each CPU of interest, starting with our neighbour... */
   1.622 -        peer_cpu = next_cpu(peer_cpu, workers);
   1.623 -        if ( peer_cpu == NR_CPUS )
   1.624 -            peer_cpu = first_cpu(workers);
   1.625 -
   1.626 +        peer_cpu = __cycle_cpu(peer_cpu, &workers);
   1.627          cpu_clear(peer_cpu, workers);
   1.628  
   1.629          /*
   1.630 @@ -1126,83 +1099,13 @@ csched_load_balance(int cpu, struct csch
   1.631              continue;
   1.632          }
   1.633  
   1.634 -        peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
   1.635 -        spc = CSCHED_PCPU(peer_cpu);
   1.636 -
   1.637 -        if ( unlikely(spc == NULL) )
   1.638 -        {
   1.639 -            CSCHED_STAT_CRANK(steal_peer_down);
   1.640 -        }
   1.641 -        else if ( unlikely(is_idle_vcpu(peer_vcpu)) )
   1.642 -        {
   1.643 -            /*
   1.644 -             * Don't steal from an idle CPU's runq because it's about to
   1.645 -             * pick up work from it itself.
   1.646 -             */
   1.647 -            CSCHED_STAT_CRANK(steal_peer_idle);
   1.648 -        }
   1.649 -        else if ( is_idle_vcpu(__runq_elem(spc->runq.next)->vcpu) )
   1.650 -        {
   1.651 -            if ( snext->pri == CSCHED_PRI_IDLE &&
   1.652 -                 __csched_running_vcpu_is_stealable(cpu, peer_vcpu) )
   1.653 -            {
   1.654 -                CSCHED_STAT_CRANK(steal_loner_candidate);
   1.655 -                cpu_set(peer_cpu, loners);
   1.656 -            }
   1.657 -        }
   1.658 -        else
   1.659 -        {
   1.660 -            /* Try to steal work from a remote CPU's runq. */
   1.661 -            speer = csched_runq_steal(spc, cpu, snext->pri);
   1.662 -            if ( speer != NULL )
   1.663 -            {
   1.664 -                spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
   1.665 -                CSCHED_STAT_CRANK(vcpu_migrate);
   1.666 -                speer->stats.migrate++;
   1.667 -                return speer;
   1.668 -            }
   1.669 -        }
   1.670 -
   1.671 +        /*
   1.672 +         * Any work over there to steal?
   1.673 +         */
   1.674 +        speer = csched_runq_steal(peer_cpu, cpu, snext->pri);
   1.675          spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
   1.676 -    }
   1.677 -
   1.678 -    /*
   1.679 -     * If we failed to find any remotely queued VCPUs to move here,
   1.680 -     * see if it would be more efficient to move any of the running
   1.681 -     * remote VCPUs over here.
   1.682 -     */
   1.683 -    while ( !cpus_empty(loners) )
   1.684 -    {
   1.685 -        /* For each CPU of interest, starting with our neighbour... */
   1.686 -        peer_cpu = next_cpu(peer_cpu, loners);
   1.687 -        if ( peer_cpu == NR_CPUS )
   1.688 -            peer_cpu = first_cpu(loners);
   1.689 -
   1.690 -        cpu_clear(peer_cpu, loners);
   1.691 -
   1.692 -        if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
   1.693 -        {
   1.694 -            CSCHED_STAT_CRANK(steal_trylock_failed);
   1.695 -            continue;
   1.696 -        }
   1.697 -
   1.698 -        peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
   1.699 -        spc = CSCHED_PCPU(peer_cpu);
   1.700 -
   1.701 -        /* Signal the first candidate only. */
   1.702 -        if ( !is_idle_vcpu(peer_vcpu) &&
   1.703 -             is_idle_vcpu(__runq_elem(spc->runq.next)->vcpu) &&
   1.704 -             __csched_running_vcpu_is_stealable(cpu, peer_vcpu) )
   1.705 -        {
   1.706 -            set_bit(_VCPUF_migrating, &peer_vcpu->vcpu_flags);
   1.707 -            spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
   1.708 -
   1.709 -            CSCHED_STAT_CRANK(steal_loner_signal);
   1.710 -            cpu_raise_softirq(peer_cpu, SCHEDULE_SOFTIRQ);
   1.711 -            break;
   1.712 -        }
   1.713 -
   1.714 -        spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
   1.715 +        if ( speer != NULL )
   1.716 +            return speer;
   1.717      }
   1.718  
   1.719      /* Failed to find more important work elsewhere... */
   1.720 @@ -1270,7 +1173,6 @@ csched_schedule(s_time_t now)
   1.721      ret.task = snext->vcpu;
   1.722  
   1.723      CSCHED_VCPU_CHECK(ret.task);
   1.724 -
   1.725      return ret;
   1.726  }
   1.727  
   1.728 @@ -1287,14 +1189,16 @@ csched_dump_vcpu(struct csched_vcpu *svc
   1.729  
   1.730      if ( sdom )
   1.731      {
   1.732 -        printk(" credit=%i (%d+%u) {a/i=%u/%u m=%u w=%u}",
   1.733 -            atomic_read(&svc->credit),
   1.734 -            svc->stats.credit_last,
   1.735 -            svc->stats.credit_incr,
   1.736 -            svc->stats.state_active,
   1.737 -            svc->stats.state_idle,
   1.738 -            svc->stats.migrate,
   1.739 -            sdom->weight);
   1.740 +        printk(" credit=%i [w=%u]", atomic_read(&svc->credit), sdom->weight);
   1.741 +#ifdef CSCHED_STATS
   1.742 +        printk(" (%d+%u) {a/i=%u/%u m=%u+%u}",
   1.743 +                svc->stats.credit_last,
   1.744 +                svc->stats.credit_incr,
   1.745 +                svc->stats.state_active,
   1.746 +                svc->stats.state_idle,
   1.747 +                svc->stats.migrate_q,
   1.748 +                svc->stats.migrate_r);
   1.749 +#endif
   1.750      }
   1.751  
   1.752      printk("\n");
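
For reference, the heart of the reworked csched_cpu_pick() above is the rule
"prefer the idle CPU whose hyperthread/core grouping contains the most other
idlers". The standalone sketch below reproduces that rule outside of Xen so it
can be compiled and experimented with directly. It is an approximation, not
hypervisor code: cpumask_t is replaced by a plain uint64_t bitmask, pick_cpu()
and cycle_cpu() stand in for csched_cpu_pick() and __cycle_cpu(), and the
sibling_map/core_map tables are made-up test data for a hypothetical
single-socket, 4-core, 2-thread package.

    /*
     * Not Xen code: a compilable approximation of csched_cpu_pick()'s
     * "most idling neighbours" rule. cpumask_t becomes uint64_t, and the
     * topology below is a made-up 4-core / 2-thread, single-socket box.
     */
    #include <stdint.h>
    #include <stdio.h>

    #define NR_CPUS 8

    typedef uint64_t mask_t;

    /* Hyperthread pairs: {0,1} {2,3} {4,5} {6,7}. */
    static const mask_t sibling_map[NR_CPUS] = {
        0x03, 0x03, 0x0c, 0x0c, 0x30, 0x30, 0xc0, 0xc0
    };
    /* One socket, so every CPU shares the same core map. */
    static const mask_t core_map[NR_CPUS] = {
        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
    };

    static int weight(mask_t m)    { return __builtin_popcountll(m); }
    static int first_cpu(mask_t m) { return m ? __builtin_ctzll(m) : NR_CPUS; }

    /* Wrap-around successor of cpu within m, like __cycle_cpu() above. */
    static int cycle_cpu(int cpu, mask_t m)
    {
        mask_t above = m & ~((2ULL << cpu) - 1);
        return above ? __builtin_ctzll(above) : first_cpu(m);
    }

    /*
     * Pick a CPU for a VCPU currently on 'cur', restricted to 'allowed':
     * prefer the idler whose hyperthread/core grouping has the most other
     * idlers, so work spreads across cores before co-hyperthreads.
     */
    static int pick_cpu(int cur, mask_t allowed, mask_t idlers)
    {
        int cpu = (allowed & (1ULL << cur)) ? cur : cycle_cpu(cur, allowed);
        mask_t cpus;

        idlers |= 1ULL << cpu;          /* treat our starting point as idle */
        cpus = allowed & idlers & ~(1ULL << cpu);

        while (cpus) {
            int nxt = cycle_cpu(cpu, cpus);
            int same_socket = (core_map[nxt] >> cpu) & 1;
            mask_t cpu_grp = idlers & (same_socket ? sibling_map[cpu]
                                                   : core_map[cpu]);
            mask_t nxt_grp = idlers & (same_socket ? sibling_map[nxt]
                                                   : core_map[nxt]);

            if (weight(cpu_grp) < weight(nxt_grp)) {
                cpu = nxt;              /* nxt sits in a more idle grouping */
                cpus &= ~(1ULL << cpu);
            } else {
                cpus &= ~nxt_grp;       /* nxt's whole grouping is no better */
            }
        }
        return cpu;
    }

    int main(void)
    {
        /* CPUs 2-7 idle, CPUs 0-1 busy; the VCPU currently runs on CPU 0. */
        mask_t idlers = 0xfc, allowed = 0xff;
        printf("picked CPU %d\n", pick_cpu(0, allowed, idlers));  /* -> 2 */
        return 0;
    }

Running the sketch prints "picked CPU 2": starting from CPU 0, whose
hyperthread sibling (CPU 1) is busy, the loop prefers CPU 2 because its core
has two idle threads, mirroring the "spread across distinct cores before
co-hyperthreads" behaviour described in the comment block added by this
changeset.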