ia64/xen-unstable

changeset 12906:05e1863cc2a3

[XEN] Improve multi-core/thread scheduler support.
Relax granularity at which running VCPUs are migrated onto
cores or sockets with more idle execution vehicles and
simplify code a good deal in the process.
Also, some scheduler stat cleanup while in there.
Signed-off-by: Emmanuel Ackaouy <ack@xensource.com>
author Emmanuel Ackaouy <ack@xensource.com>
date Tue Dec 12 19:37:21 2006 +0000 (2006-12-12)
parents 6b68a3688509
children 2fa06640a1c1
files xen/common/sched_credit.c
line diff
     1.1 --- a/xen/common/sched_credit.c	Tue Dec 12 16:52:13 2006 +0000
     1.2 +++ b/xen/common/sched_credit.c	Tue Dec 12 19:37:21 2006 +0000
     1.3 @@ -106,20 +106,13 @@
     1.4      _MACRO(tickle_local_other)              \
     1.5      _MACRO(tickle_idlers_none)              \
     1.6      _MACRO(tickle_idlers_some)              \
     1.7 -    _MACRO(vcpu_migrate)                    \
     1.8      _MACRO(load_balance_idle)               \
     1.9      _MACRO(load_balance_over)               \
    1.10      _MACRO(load_balance_other)              \
    1.11      _MACRO(steal_trylock_failed)            \
    1.12 -    _MACRO(steal_peer_down)                 \
    1.13      _MACRO(steal_peer_idle)                 \
    1.14 -    _MACRO(steal_peer_running)              \
    1.15 -    _MACRO(steal_peer_pinned)               \
    1.16 -    _MACRO(steal_peer_migrating)            \
    1.17 -    _MACRO(steal_peer_best_idler)           \
    1.18 -    _MACRO(steal_loner_candidate)           \
    1.19 -    _MACRO(steal_loner_signal)              \
    1.20 -    _MACRO(cpu_pick)                        \
    1.21 +    _MACRO(migrate_queued)                  \
    1.22 +    _MACRO(migrate_running)                 \
    1.23      _MACRO(dom_init)                        \
    1.24      _MACRO(dom_destroy)                     \
    1.25      _MACRO(vcpu_init)                       \
    1.26 @@ -146,7 +139,7 @@
    1.27      struct                                      \
    1.28      {                                           \
    1.29          CSCHED_STATS_EXPAND(CSCHED_STAT_DEFINE) \
    1.30 -    } stats
    1.31 +    } stats;
    1.32  
    1.33  #define CSCHED_STATS_PRINTK()                   \
    1.34      do                                          \
    1.35 @@ -155,14 +148,27 @@
    1.36          CSCHED_STATS_EXPAND(CSCHED_STAT_PRINTK) \
    1.37      } while ( 0 )
    1.38  
    1.39 -#define CSCHED_STAT_CRANK(_X)   (CSCHED_STAT(_X)++)
    1.40 +#define CSCHED_STAT_CRANK(_X)               (CSCHED_STAT(_X)++)
    1.41 +
    1.42 +#define CSCHED_VCPU_STATS_RESET(_V)                     \
    1.43 +    do                                                  \
    1.44 +    {                                                   \
    1.45 +        memset(&(_V)->stats, 0, sizeof((_V)->stats));   \
    1.46 +    } while ( 0 )
    1.47 +
    1.48 +#define CSCHED_VCPU_STAT_CRANK(_V, _X)      (((_V)->stats._X)++)
    1.49 +
    1.50 +#define CSCHED_VCPU_STAT_SET(_V, _X, _Y)    (((_V)->stats._X) = (_Y))
    1.51  
    1.52  #else /* CSCHED_STATS */
    1.53  
    1.54 -#define CSCHED_STATS_RESET()    do {} while ( 0 )
    1.55 -#define CSCHED_STATS_DEFINE()   do {} while ( 0 )
    1.56 -#define CSCHED_STATS_PRINTK()   do {} while ( 0 )
    1.57 -#define CSCHED_STAT_CRANK(_X)   do {} while ( 0 )
    1.58 +#define CSCHED_STATS_RESET()                do {} while ( 0 )
    1.59 +#define CSCHED_STATS_DEFINE()
    1.60 +#define CSCHED_STATS_PRINTK()               do {} while ( 0 )
    1.61 +#define CSCHED_STAT_CRANK(_X)               do {} while ( 0 )
    1.62 +#define CSCHED_VCPU_STATS_RESET(_V)         do {} while ( 0 )
    1.63 +#define CSCHED_VCPU_STAT_CRANK(_V, _X)      do {} while ( 0 )
    1.64 +#define CSCHED_VCPU_STAT_SET(_V, _X, _Y)    do {} while ( 0 )
    1.65  
    1.66  #endif /* CSCHED_STATS */
    1.67  
    1.68 @@ -185,13 +191,16 @@ struct csched_vcpu {
    1.69      struct vcpu *vcpu;
    1.70      atomic_t credit;
    1.71      int16_t pri;
    1.72 +#ifdef CSCHED_STATS
    1.73      struct {
    1.74          int credit_last;
    1.75          uint32_t credit_incr;
    1.76          uint32_t state_active;
    1.77          uint32_t state_idle;
    1.78 -        uint32_t migrate;
    1.79 +        uint32_t migrate_q;
    1.80 +        uint32_t migrate_r;
    1.81      } stats;
    1.82 +#endif
    1.83  };
    1.84  
    1.85  /*
    1.86 @@ -219,7 +228,7 @@ struct csched_private {
    1.87      uint32_t credit;
    1.88      int credit_balance;
    1.89      uint32_t runq_sort;
    1.90 -    CSCHED_STATS_DEFINE();
    1.91 +    CSCHED_STATS_DEFINE()
    1.92  };
    1.93  
    1.94  
    1.95 @@ -231,6 +240,15 @@ static struct csched_private csched_priv
    1.96  
    1.97  
    1.98  static inline int
    1.99 +__cycle_cpu(int cpu, const cpumask_t *mask)
   1.100 +{
   1.101 +    int nxt = next_cpu(cpu, *mask);
   1.102 +    if (nxt == NR_CPUS)
   1.103 +        nxt = first_cpu(*mask);
   1.104 +    return nxt;
   1.105 +}
   1.106 +
   1.107 +static inline int
   1.108  __vcpu_on_runq(struct csched_vcpu *svc)
   1.109  {
   1.110      return !list_empty(&svc->runq_elem);
   1.111 @@ -375,118 +393,138 @@ static inline void
   1.112  #define CSCHED_VCPU_CHECK(_vc)
   1.113  #endif
   1.114  
   1.115 -/*
   1.116 - * Indicates which of two given idlers is most efficient to run
   1.117 - * an additional VCPU.
   1.118 - *
   1.119 - * Returns:
   1.120 - *  0:           They are the same.
   1.121 - *  negative:    One is less efficient than Two.
   1.122 - *  positive:    One is more efficient than Two.
   1.123 - */
   1.124 -static int
   1.125 -csched_idler_compare(int one, int two)
   1.126 -{
   1.127 -    cpumask_t idlers;
   1.128 -    cpumask_t one_idlers;
   1.129 -    cpumask_t two_idlers;
   1.130 -
   1.131 -    idlers = csched_priv.idlers;
   1.132 -    cpu_clear(one, idlers);
   1.133 -    cpu_clear(two, idlers);
   1.134 -
   1.135 -    if ( cpu_isset(one, cpu_core_map[two]) )
   1.136 -    {
   1.137 -        cpus_and(one_idlers, idlers, cpu_sibling_map[one]);
   1.138 -        cpus_and(two_idlers, idlers, cpu_sibling_map[two]);
   1.139 -    }
   1.140 -    else
   1.141 -    {
   1.142 -        cpus_and(one_idlers, idlers, cpu_core_map[one]);
   1.143 -        cpus_and(two_idlers, idlers, cpu_core_map[two]);
   1.144 -    }
   1.145 -
   1.146 -    return cpus_weight(one_idlers) - cpus_weight(two_idlers);
   1.147 -}
   1.148 -
   1.149  static inline int
   1.150 -__csched_queued_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
   1.151 +__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
   1.152  {
   1.153      /*
   1.154       * Don't pick up work that's in the peer's scheduling tail. Also only pick
   1.155       * up work that's allowed to run on our CPU.
   1.156       */
   1.157 -    if ( unlikely(test_bit(_VCPUF_running, &vc->vcpu_flags)) )
   1.158 -    {
   1.159 -        CSCHED_STAT_CRANK(steal_peer_running);
   1.160 -        return 0;
   1.161 -    }
   1.162 -
   1.163 -    if ( unlikely(!cpu_isset(local_cpu, vc->cpu_affinity)) )
   1.164 -    {
   1.165 -        CSCHED_STAT_CRANK(steal_peer_pinned);
   1.166 -        return 0;
   1.167 -    }
   1.168 -
   1.169 -    return 1;
   1.170 +    return !test_bit(_VCPUF_running, &vc->vcpu_flags) &&
   1.171 +           cpu_isset(dest_cpu, vc->cpu_affinity);
   1.172  }
   1.173  
   1.174 -static inline int
   1.175 -__csched_running_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
   1.176 +static int
   1.177 +csched_cpu_pick(struct vcpu *vc)
   1.178  {
   1.179 -    BUG_ON( is_idle_vcpu(vc) );
   1.180 +    cpumask_t cpus;
   1.181 +    cpumask_t idlers;
   1.182 +    int cpu;
   1.183  
   1.184 -    if ( unlikely(!cpu_isset(local_cpu, vc->cpu_affinity)) )
   1.185 +    /*
   1.186 +     * Pick from online CPUs in VCPU's affinity mask, giving a
   1.187 +     * preference to its current processor if it's in there.
   1.188 +     */
   1.189 +    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
   1.190 +    cpu = cpu_isset(vc->processor, cpus)
   1.191 +            ? vc->processor
   1.192 +            : __cycle_cpu(vc->processor, &cpus);
   1.193 +    ASSERT( !cpus_empty(cpus) && cpu_isset(cpu, cpus) );
   1.194 +
   1.195 +    /*
   1.196 +     * Try to find an idle processor within the above constraints.
   1.197 +     *
   1.198 +     * In multi-core and multi-threaded CPUs, not all idle execution
   1.199 +     * vehicles are equal!
   1.200 +     *
   1.201 +     * We give preference to the idle execution vehicle with the most
   1.202 +     * idling neighbours in its grouping. This distributes work across
   1.203 +     * distinct cores first and guarantees we don't do something stupid
   1.204 +     * like run two VCPUs on co-hyperthreads while there are idle cores
   1.205 +     * or sockets.
   1.206 +     */
   1.207 +    idlers = csched_priv.idlers;
   1.208 +    cpu_set(cpu, idlers);
   1.209 +    cpus_and(cpus, cpus, idlers);
   1.210 +    cpu_clear(cpu, cpus);
   1.211 +
   1.212 +    while ( !cpus_empty(cpus) )
   1.213      {
   1.214 -        CSCHED_STAT_CRANK(steal_peer_pinned);
   1.215 -        return 0;
   1.216 +        cpumask_t cpu_idlers;
   1.217 +        cpumask_t nxt_idlers;
   1.218 +        int nxt;
   1.219 +
   1.220 +        nxt = __cycle_cpu(cpu, &cpus);
   1.221 +
   1.222 +        if ( cpu_isset(cpu, cpu_core_map[nxt]) )
   1.223 +        {
   1.224 +            ASSERT( cpu_isset(nxt, cpu_core_map[cpu]) );
   1.225 +            cpus_and(cpu_idlers, idlers, cpu_sibling_map[cpu]);
   1.226 +            cpus_and(nxt_idlers, idlers, cpu_sibling_map[nxt]);
   1.227 +        }
   1.228 +        else
   1.229 +        {
   1.230 +            ASSERT( !cpu_isset(nxt, cpu_core_map[cpu]) );
   1.231 +            cpus_and(cpu_idlers, idlers, cpu_core_map[cpu]);
   1.232 +            cpus_and(nxt_idlers, idlers, cpu_core_map[nxt]);
   1.233 +        }
   1.234 +
   1.235 +        if ( cpus_weight(cpu_idlers) < cpus_weight(nxt_idlers) )
   1.236 +        {
   1.237 +            cpu = nxt;
   1.238 +            cpu_clear(cpu, cpus);
   1.239 +        }
   1.240 +        else
   1.241 +        {
   1.242 +            cpus_andnot(cpus, cpus, nxt_idlers);
   1.243 +        }
   1.244      }
   1.245  
   1.246 -    if ( test_bit(_VCPUF_migrating, &vc->vcpu_flags) )
   1.247 -    {
   1.248 -        CSCHED_STAT_CRANK(steal_peer_migrating);
   1.249 -        return 0;
   1.250 -    }
   1.251 -
   1.252 -    if ( csched_idler_compare(local_cpu, vc->processor) <= 0 )
   1.253 -    {
   1.254 -        CSCHED_STAT_CRANK(steal_peer_best_idler);
   1.255 -        return 0;
   1.256 -    }
   1.257 -
   1.258 -    return 1;
   1.259 +    return cpu;
   1.260  }
   1.261  
   1.262 -static void
   1.263 -csched_vcpu_acct(struct csched_vcpu *svc, int credit_dec)
   1.264 +static inline void
   1.265 +__csched_vcpu_acct_start(struct csched_vcpu *svc)
   1.266  {
   1.267      struct csched_dom * const sdom = svc->sdom;
   1.268      unsigned long flags;
   1.269  
   1.270 -    /* Update credits */
   1.271 -    atomic_sub(credit_dec, &svc->credit);
   1.272 +    spin_lock_irqsave(&csched_priv.lock, flags);
   1.273  
   1.274 -    /* Put this VCPU and domain back on the active list if it was idling */
   1.275      if ( list_empty(&svc->active_vcpu_elem) )
   1.276      {
   1.277 -        spin_lock_irqsave(&csched_priv.lock, flags);
   1.278 -
   1.279 -        if ( list_empty(&svc->active_vcpu_elem) )
   1.280 -        {
   1.281 -            CSCHED_STAT_CRANK(acct_vcpu_active);
   1.282 -            svc->stats.state_active++;
   1.283 +        CSCHED_VCPU_STAT_CRANK(svc, state_active);
   1.284 +        CSCHED_STAT_CRANK(acct_vcpu_active);
   1.285  
   1.286 -            sdom->active_vcpu_count++;
   1.287 -            list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
   1.288 -            if ( list_empty(&sdom->active_sdom_elem) )
   1.289 -            {
   1.290 -                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
   1.291 -                csched_priv.weight += sdom->weight;
   1.292 -            }
   1.293 +        sdom->active_vcpu_count++;
   1.294 +        list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
   1.295 +        if ( list_empty(&sdom->active_sdom_elem) )
   1.296 +        {
   1.297 +            list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
   1.298 +            csched_priv.weight += sdom->weight;
   1.299          }
   1.300 +    }
   1.301  
   1.302 -        spin_unlock_irqrestore(&csched_priv.lock, flags);
   1.303 +    spin_unlock_irqrestore(&csched_priv.lock, flags);
   1.304 +}
   1.305 +
   1.306 +static inline void
   1.307 +__csched_vcpu_acct_stop_locked(struct csched_vcpu *svc)
   1.308 +{
   1.309 +    struct csched_dom * const sdom = svc->sdom;
   1.310 +
   1.311 +    BUG_ON( list_empty(&svc->active_vcpu_elem) );
   1.312 +
   1.313 +    CSCHED_VCPU_STAT_CRANK(svc, state_idle);
   1.314 +    CSCHED_STAT_CRANK(acct_vcpu_idle);
   1.315 +
   1.316 +    sdom->active_vcpu_count--;
   1.317 +    list_del_init(&svc->active_vcpu_elem);
   1.318 +    if ( list_empty(&sdom->active_vcpu) )
   1.319 +    {
   1.320 +        BUG_ON( csched_priv.weight < sdom->weight );
   1.321 +        list_del_init(&sdom->active_sdom_elem);
   1.322 +        csched_priv.weight -= sdom->weight;
   1.323      }
   1.324 +}
   1.325 +
   1.326 +static void
   1.327 +csched_vcpu_acct(unsigned int cpu)
   1.328 +{
   1.329 +    struct csched_vcpu * const svc = CSCHED_VCPU(current);
   1.330 +
   1.331 +    ASSERT( current->processor == cpu );
   1.332 +    ASSERT( svc->sdom != NULL );
   1.333  
   1.334      /*
   1.335       * If this VCPU's priority was boosted when it last awoke, reset it.
   1.336 @@ -495,25 +533,30 @@ csched_vcpu_acct(struct csched_vcpu *svc
   1.337       */
   1.338      if ( svc->pri == CSCHED_PRI_TS_BOOST )
   1.339          svc->pri = CSCHED_PRI_TS_UNDER;
   1.340 -}
   1.341 -
   1.342 -static inline void
   1.343 -__csched_vcpu_acct_idle_locked(struct csched_vcpu *svc)
   1.344 -{
   1.345 -    struct csched_dom * const sdom = svc->sdom;
   1.346 -
   1.347 -    BUG_ON( list_empty(&svc->active_vcpu_elem) );
   1.348  
   1.349 -    CSCHED_STAT_CRANK(acct_vcpu_idle);
   1.350 -    svc->stats.state_idle++;
   1.351 +    /*
   1.352 +     * Update credits
   1.353 +     */
   1.354 +    atomic_sub(CSCHED_CREDITS_PER_TICK, &svc->credit);
   1.355  
   1.356 -    sdom->active_vcpu_count--;
   1.357 -    list_del_init(&svc->active_vcpu_elem);
   1.358 -    if ( list_empty(&sdom->active_vcpu) )
   1.359 +    /*
   1.360 +     * Put this VCPU and domain back on the active list if it was
   1.361 +     * idling.
   1.362 +     *
   1.363 +     * If it's been active a while, check if we'd be better off
   1.364 +     * migrating it to run elsewhere (see multi-core and multi-thread
   1.365 +     * support in csched_cpu_pick()).
   1.366 +     */
   1.367 +    if ( list_empty(&svc->active_vcpu_elem) )
   1.368      {
   1.369 -        BUG_ON( csched_priv.weight < sdom->weight );
   1.370 -        list_del_init(&sdom->active_sdom_elem);
   1.371 -        csched_priv.weight -= sdom->weight;
   1.372 +        __csched_vcpu_acct_start(svc);
   1.373 +    }
   1.374 +    else if ( csched_cpu_pick(current) != cpu )
   1.375 +    {
   1.376 +        CSCHED_VCPU_STAT_CRANK(svc, migrate_r);
   1.377 +        CSCHED_STAT_CRANK(migrate_running);
   1.378 +        set_bit(_VCPUF_migrating, &current->vcpu_flags);
   1.379 +        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
   1.380      }
   1.381  }
   1.382  
   1.383 @@ -537,15 +580,11 @@ csched_vcpu_init(struct vcpu *vc)
   1.384      svc->vcpu = vc;
   1.385      atomic_set(&svc->credit, 0);
   1.386      svc->pri = is_idle_domain(dom) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
   1.387 -    memset(&svc->stats, 0, sizeof(svc->stats));
   1.388 +    CSCHED_VCPU_STATS_RESET(svc);
   1.389      vc->sched_priv = svc;
   1.390  
   1.391      CSCHED_VCPU_CHECK(vc);
   1.392  
   1.393 -    /* Attach fair-share VCPUs to the accounting list */
   1.394 -    if ( likely(sdom != NULL) )
   1.395 -        csched_vcpu_acct(svc, 0);
   1.396 -
   1.397      /* Allocate per-PCPU info */
   1.398      if ( unlikely(!CSCHED_PCPU(vc->processor)) )
   1.399      {
   1.400 @@ -573,7 +612,7 @@ csched_vcpu_destroy(struct vcpu *vc)
   1.401      spin_lock_irqsave(&csched_priv.lock, flags);
   1.402  
   1.403      if ( !list_empty(&svc->active_vcpu_elem) )
   1.404 -        __csched_vcpu_acct_idle_locked(svc);
   1.405 +        __csched_vcpu_acct_stop_locked(svc);
   1.406  
   1.407      spin_unlock_irqrestore(&csched_priv.lock, flags);
   1.408  
   1.409 @@ -717,66 +756,6 @@ csched_dom_destroy(struct domain *dom)
   1.410      xfree(sdom);
   1.411  }
   1.412  
   1.413 -static int
   1.414 -csched_cpu_pick(struct vcpu *vc)
   1.415 -{
   1.416 -    cpumask_t cpus;
   1.417 -    int cpu, nxt;
   1.418 -
   1.419 -    CSCHED_STAT_CRANK(cpu_pick);
   1.420 -
   1.421 -    /*
   1.422 -     * Pick from online CPUs in VCPU's affinity mask, giving a
   1.423 -     * preference to its current processor if it's in there.
   1.424 -     */
   1.425 -    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
   1.426 -    ASSERT( !cpus_empty(cpus) );
   1.427 -    cpu = cpu_isset(vc->processor, cpus) ? vc->processor : first_cpu(cpus);
   1.428 -
   1.429 -    /*
   1.430 -     * Try to find an idle processor within the above constraints.
   1.431 -     */
   1.432 -    cpus_and(cpus, cpus, csched_priv.idlers);
   1.433 -    if ( !cpus_empty(cpus) )
   1.434 -    {
   1.435 -        cpu = cpu_isset(cpu, cpus) ? cpu : first_cpu(cpus);
   1.436 -        cpu_clear(cpu, cpus);
   1.437 -
   1.438 -        /*
   1.439 -         * In multi-core and multi-threaded CPUs, not all idle execution
   1.440 -         * vehicles are equal!
   1.441 -         *
   1.442 -         * We give preference to the idle execution vehicle with the most
   1.443 -         * idling neighbours in its grouping. This distributes work across
   1.444 -         * distinct cores first and guarantees we don't do something stupid
   1.445 -         * like run two VCPUs on co-hyperthreads while there are idle cores
   1.446 -         * or sockets.
   1.447 -         */
   1.448 -        while ( !cpus_empty(cpus) )
   1.449 -        {
   1.450 -            nxt = first_cpu(cpus);
   1.451 -
   1.452 -            if ( csched_idler_compare(cpu, nxt) < 0 )
   1.453 -            {
   1.454 -                cpu = nxt;
   1.455 -                cpu_clear(nxt, cpus);
   1.456 -            }
   1.457 -            else if ( cpu_isset(cpu, cpu_core_map[nxt]) )
   1.458 -            {
   1.459 -                cpus_andnot(cpus, cpus, cpu_sibling_map[nxt]);
   1.460 -            }
   1.461 -            else
   1.462 -            {
   1.463 -                cpus_andnot(cpus, cpus, cpu_core_map[nxt]);
   1.464 -            }
   1.465 -
   1.466 -            ASSERT( !cpu_isset(nxt, cpus) );
   1.467 -        }
   1.468 -    }
   1.469 -
   1.470 -    return cpu;
   1.471 -}
   1.472 -
   1.473  /*
   1.474   * This is a O(n) optimized sort of the runq.
   1.475   *
   1.476 @@ -981,14 +960,14 @@ csched_acct(void)
   1.477  
   1.478                  if ( credit > CSCHED_CREDITS_PER_TSLICE )
   1.479                  {
   1.480 -                    __csched_vcpu_acct_idle_locked(svc);
   1.481 +                    __csched_vcpu_acct_stop_locked(svc);
   1.482                      credit = 0;
   1.483                      atomic_set(&svc->credit, credit);
   1.484                  }
   1.485              }
   1.486  
   1.487 -            svc->stats.credit_last = credit;
   1.488 -            svc->stats.credit_incr = credit_fair;
   1.489 +            CSCHED_VCPU_STAT_SET(svc, credit_last, credit);
   1.490 +            CSCHED_VCPU_STAT_SET(svc, credit_incr, credit_fair);
   1.491              credit_balance += credit;
   1.492          }
   1.493      }
   1.494 @@ -1004,21 +983,14 @@ csched_acct(void)
   1.495  static void
   1.496  csched_tick(unsigned int cpu)
   1.497  {
   1.498 -    struct csched_vcpu * const svc = CSCHED_VCPU(current);
   1.499 -    struct csched_dom * const sdom = svc->sdom;
   1.500 -
   1.501      /*
   1.502       * Accounting for running VCPU
   1.503 -     *
   1.504 -     * Note: Some VCPUs, such as the idle tasks, are not credit scheduled.
   1.505       */
   1.506 -    if ( likely(sdom != NULL) )
   1.507 -    {
   1.508 -        csched_vcpu_acct(svc, CSCHED_CREDITS_PER_TICK);
   1.509 -    }
   1.510 +    if ( !is_idle_vcpu(current) )
   1.511 +        csched_vcpu_acct(cpu);
   1.512  
   1.513      /*
   1.514 -     * Accounting duty
   1.515 +     * Host-wide accounting duty
   1.516       *
   1.517       * Note: Currently, this is always done by the master boot CPU. Eventually,
   1.518       * we could distribute or at the very least cycle the duty.
   1.519 @@ -1040,40 +1012,48 @@ csched_tick(unsigned int cpu)
   1.520  }
   1.521  
   1.522  static struct csched_vcpu *
   1.523 -csched_runq_steal(struct csched_pcpu *spc, int cpu, int pri)
   1.524 +csched_runq_steal(int peer_cpu, int cpu, int pri)
   1.525  {
   1.526 +    const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu);
   1.527 +    const struct vcpu * const peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
   1.528 +    struct csched_vcpu *speer;
   1.529      struct list_head *iter;
   1.530 -    struct csched_vcpu *speer;
   1.531      struct vcpu *vc;
   1.532  
   1.533 -    list_for_each( iter, &spc->runq )
   1.534 +    /*
   1.535 +     * Don't steal from an idle CPU's runq because it's about to
   1.536 +     * pick up work from it itself.
   1.537 +     */
   1.538 +    if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) )
   1.539      {
   1.540 -        speer = __runq_elem(iter);
   1.541 -
   1.542 -        /*
   1.543 -         * If next available VCPU here is not of higher priority than ours,
   1.544 -         * this PCPU is useless to us.
   1.545 -         */
   1.546 -        if ( speer->pri <= CSCHED_PRI_IDLE || speer->pri <= pri )
   1.547 +        list_for_each( iter, &peer_pcpu->runq )
   1.548          {
   1.549 -            CSCHED_STAT_CRANK(steal_peer_idle);
   1.550 -            break;
   1.551 -        }
   1.552 +            speer = __runq_elem(iter);
   1.553  
   1.554 -        /* Is this VCPU is runnable on our PCPU? */
   1.555 -        vc = speer->vcpu;
   1.556 -        BUG_ON( is_idle_vcpu(vc) );
   1.557 +            /*
   1.558 +             * If next available VCPU here is not of higher priority
   1.559 +             * than ours, this PCPU is useless to us.
   1.560 +             */
   1.561 +            if ( speer->pri <= CSCHED_PRI_IDLE || speer->pri <= pri )
   1.562 +                break;
   1.563  
   1.564 -        if ( __csched_queued_vcpu_is_stealable(cpu, vc) )
   1.565 -        {
   1.566 -            /* We got a candidate. Grab it! */
   1.567 -            __runq_remove(speer);
   1.568 -            vc->processor = cpu;
    1.569 +            /* Is this VCPU runnable on our PCPU? */
   1.570 +            vc = speer->vcpu;
   1.571 +            BUG_ON( is_idle_vcpu(vc) );
   1.572  
   1.573 -            return speer;
   1.574 +            if (__csched_vcpu_is_migrateable(vc, cpu))
   1.575 +            {
   1.576 +                /* We got a candidate. Grab it! */
   1.577 +                CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
   1.578 +                CSCHED_STAT_CRANK(migrate_queued);
   1.579 +                __runq_remove(speer);
   1.580 +                vc->processor = cpu;
   1.581 +                return speer;
   1.582 +            }
   1.583          }
   1.584      }
   1.585  
   1.586 +    CSCHED_STAT_CRANK(steal_peer_idle);
   1.587      return NULL;
   1.588  }
   1.589  
   1.590 @@ -1081,12 +1061,11 @@ static struct csched_vcpu *
   1.591  csched_load_balance(int cpu, struct csched_vcpu *snext)
   1.592  {
   1.593      struct csched_vcpu *speer;
   1.594 -    struct csched_pcpu *spc;
   1.595 -    struct vcpu *peer_vcpu;
   1.596      cpumask_t workers;
   1.597 -    cpumask_t loners;
   1.598      int peer_cpu;
   1.599  
   1.600 +    BUG_ON( cpu != snext->vcpu->processor );
   1.601 +
   1.602      if ( snext->pri == CSCHED_PRI_IDLE )
   1.603          CSCHED_STAT_CRANK(load_balance_idle);
   1.604      else if ( snext->pri == CSCHED_PRI_TS_OVER )
   1.605 @@ -1095,22 +1074,16 @@ csched_load_balance(int cpu, struct csch
   1.606          CSCHED_STAT_CRANK(load_balance_other);
   1.607  
   1.608      /*
   1.609 -     * Peek at non-idling CPUs in the system
   1.610 +     * Peek at non-idling CPUs in the system, starting with our
   1.611 +     * immediate neighbour.
   1.612       */
   1.613 -    cpus_clear(loners);
   1.614      cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
   1.615      cpu_clear(cpu, workers);
   1.616 -
   1.617      peer_cpu = cpu;
   1.618 -    BUG_ON( peer_cpu != snext->vcpu->processor );
   1.619  
   1.620      while ( !cpus_empty(workers) )
   1.621      {
   1.622 -        /* For each CPU of interest, starting with our neighbour... */
   1.623 -        peer_cpu = next_cpu(peer_cpu, workers);
   1.624 -        if ( peer_cpu == NR_CPUS )
   1.625 -            peer_cpu = first_cpu(workers);
   1.626 -
   1.627 +        peer_cpu = __cycle_cpu(peer_cpu, &workers);
   1.628          cpu_clear(peer_cpu, workers);
   1.629  
   1.630          /*
   1.631 @@ -1126,83 +1099,13 @@ csched_load_balance(int cpu, struct csch
   1.632              continue;
   1.633          }
   1.634  
   1.635 -        peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
   1.636 -        spc = CSCHED_PCPU(peer_cpu);
   1.637 -
   1.638 -        if ( unlikely(spc == NULL) )
   1.639 -        {
   1.640 -            CSCHED_STAT_CRANK(steal_peer_down);
   1.641 -        }
   1.642 -        else if ( unlikely(is_idle_vcpu(peer_vcpu)) )
   1.643 -        {
   1.644 -            /*
   1.645 -             * Don't steal from an idle CPU's runq because it's about to
   1.646 -             * pick up work from it itself.
   1.647 -             */
   1.648 -            CSCHED_STAT_CRANK(steal_peer_idle);
   1.649 -        }
   1.650 -        else if ( is_idle_vcpu(__runq_elem(spc->runq.next)->vcpu) )
   1.651 -        {
   1.652 -            if ( snext->pri == CSCHED_PRI_IDLE &&
   1.653 -                 __csched_running_vcpu_is_stealable(cpu, peer_vcpu) )
   1.654 -            {
   1.655 -                CSCHED_STAT_CRANK(steal_loner_candidate);
   1.656 -                cpu_set(peer_cpu, loners);
   1.657 -            }
   1.658 -        }
   1.659 -        else
   1.660 -        {
   1.661 -            /* Try to steal work from a remote CPU's runq. */
   1.662 -            speer = csched_runq_steal(spc, cpu, snext->pri);
   1.663 -            if ( speer != NULL )
   1.664 -            {
   1.665 -                spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
   1.666 -                CSCHED_STAT_CRANK(vcpu_migrate);
   1.667 -                speer->stats.migrate++;
   1.668 -                return speer;
   1.669 -            }
   1.670 -        }
   1.671 -
   1.672 +        /*
   1.673 +         * Any work over there to steal?
   1.674 +         */
   1.675 +        speer = csched_runq_steal(peer_cpu, cpu, snext->pri);
   1.676          spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
   1.677 -    }
   1.678 -
   1.679 -    /*
   1.680 -     * If we failed to find any remotely queued VCPUs to move here,
   1.681 -     * see if it would be more efficient to move any of the running
   1.682 -     * remote VCPUs over here.
   1.683 -     */
   1.684 -    while ( !cpus_empty(loners) )
   1.685 -    {
   1.686 -        /* For each CPU of interest, starting with our neighbour... */
   1.687 -        peer_cpu = next_cpu(peer_cpu, loners);
   1.688 -        if ( peer_cpu == NR_CPUS )
   1.689 -            peer_cpu = first_cpu(loners);
   1.690 -
   1.691 -        cpu_clear(peer_cpu, loners);
   1.692 -
   1.693 -        if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
   1.694 -        {
   1.695 -            CSCHED_STAT_CRANK(steal_trylock_failed);
   1.696 -            continue;
   1.697 -        }
   1.698 -
   1.699 -        peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
   1.700 -        spc = CSCHED_PCPU(peer_cpu);
   1.701 -
   1.702 -        /* Signal the first candidate only. */
   1.703 -        if ( !is_idle_vcpu(peer_vcpu) &&
   1.704 -             is_idle_vcpu(__runq_elem(spc->runq.next)->vcpu) &&
   1.705 -             __csched_running_vcpu_is_stealable(cpu, peer_vcpu) )
   1.706 -        {
   1.707 -            set_bit(_VCPUF_migrating, &peer_vcpu->vcpu_flags);
   1.708 -            spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
   1.709 -
   1.710 -            CSCHED_STAT_CRANK(steal_loner_signal);
   1.711 -            cpu_raise_softirq(peer_cpu, SCHEDULE_SOFTIRQ);
   1.712 -            break;
   1.713 -        }
   1.714 -
   1.715 -        spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
   1.716 +        if ( speer != NULL )
   1.717 +            return speer;
   1.718      }
   1.719  
   1.720      /* Failed to find more important work elsewhere... */
   1.721 @@ -1270,7 +1173,6 @@ csched_schedule(s_time_t now)
   1.722      ret.task = snext->vcpu;
   1.723  
   1.724      CSCHED_VCPU_CHECK(ret.task);
   1.725 -
   1.726      return ret;
   1.727  }
   1.728  
   1.729 @@ -1287,14 +1189,16 @@ csched_dump_vcpu(struct csched_vcpu *svc
   1.730  
   1.731      if ( sdom )
   1.732      {
   1.733 -        printk(" credit=%i (%d+%u) {a/i=%u/%u m=%u w=%u}",
   1.734 -            atomic_read(&svc->credit),
   1.735 -            svc->stats.credit_last,
   1.736 -            svc->stats.credit_incr,
   1.737 -            svc->stats.state_active,
   1.738 -            svc->stats.state_idle,
   1.739 -            svc->stats.migrate,
   1.740 -            sdom->weight);
   1.741 +        printk(" credit=%i [w=%u]", atomic_read(&svc->credit), sdom->weight);
   1.742 +#ifdef CSCHED_STATS
   1.743 +        printk(" (%d+%u) {a/i=%u/%u m=%u+%u}",
   1.744 +                svc->stats.credit_last,
   1.745 +                svc->stats.credit_incr,
   1.746 +                svc->stats.state_active,
   1.747 +                svc->stats.state_idle,
   1.748 +                svc->stats.migrate_q,
   1.749 +                svc->stats.migrate_r);
   1.750 +#endif
   1.751      }
   1.752  
   1.753      printk("\n");