direct-io.hg

changeset 10168:e539abd27a0f

New weighted fair-share CPU scheduler w/ automatic SMP load balancing
Signed-off-by: Emmanuel Ackaouy <ack@xensource.com>
author ack@kneesa.uk.xensource.com
date Fri May 26 11:14:36 2006 +0100 (2006-05-26)
parents b6937b931419
children 96ce5961b5eb
files tools/libxc/Makefile tools/libxc/xc_csched.c tools/libxc/xenctrl.h tools/python/xen/lowlevel/xc/xc.c tools/python/xen/xend/XendDomain.py tools/python/xen/xend/server/SrvDomain.py tools/python/xen/xm/main.py xen/common/Makefile xen/common/sched_credit.c xen/common/schedule.c xen/include/public/sched_ctl.h xen/include/xen/sched-if.h xen/include/xen/softirq.h
line diff
     1.1 --- a/tools/libxc/Makefile	Fri May 26 09:44:29 2006 +0100
     1.2 +++ b/tools/libxc/Makefile	Fri May 26 11:14:36 2006 +0100
     1.3 @@ -20,6 +20,7 @@ SRCS       += xc_acm.c
     1.4  SRCS       += xc_physdev.c
     1.5  SRCS       += xc_private.c
     1.6  SRCS       += xc_sedf.c
     1.7 +SRCS       += xc_csched.c
     1.8  SRCS       += xc_tbuf.c
     1.9  
    1.10  ifeq ($(patsubst x86%,x86,$(XEN_TARGET_ARCH)),x86)
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/tools/libxc/xc_csched.c	Fri May 26 11:14:36 2006 +0100
     2.3 @@ -0,0 +1,50 @@
     2.4 +/****************************************************************************
     2.5 + * (C) 2006 - Emmanuel Ackaouy - XenSource Inc.
     2.6 + ****************************************************************************
     2.7 + *
     2.8 + *        File: xc_csched.c
     2.9 + *      Author: Emmanuel Ackaouy
    2.10 + *
    2.11 + * Description: XC Interface to the credit scheduler
    2.12 + *
    2.13 + */
    2.14 +#include "xc_private.h"
    2.15 +
    2.16 +
    2.17 +int
    2.18 +xc_csched_domain_set(
    2.19 +    int xc_handle,
    2.20 +    uint32_t domid,
    2.21 +    struct csched_domain *sdom)
    2.22 +{
    2.23 +    DECLARE_DOM0_OP;
    2.24 +
    2.25 +    op.cmd = DOM0_ADJUSTDOM;    
    2.26 +    op.u.adjustdom.domain = (domid_t) domid;
    2.27 +    op.u.adjustdom.sched_id = SCHED_CREDIT;
    2.28 +    op.u.adjustdom.direction = SCHED_INFO_PUT;
    2.29 +    op.u.adjustdom.u.credit = *sdom;
    2.30 +
    2.31 +    return do_dom0_op(xc_handle, &op);
    2.32 +}
    2.33 +
    2.34 +int
    2.35 +xc_csched_domain_get(
    2.36 +    int xc_handle,
    2.37 +    uint32_t domid,
    2.38 +    struct csched_domain *sdom)
    2.39 +{
    2.40 +    DECLARE_DOM0_OP;
    2.41 +    int err;
    2.42 +
    2.43 +    op.cmd = DOM0_ADJUSTDOM;    
    2.44 +    op.u.adjustdom.domain = (domid_t) domid;
    2.45 +    op.u.adjustdom.sched_id = SCHED_CREDIT;
    2.46 +    op.u.adjustdom.direction = SCHED_INFO_GET;
    2.47 +
    2.48 +    err = do_dom0_op(xc_handle, &op);
    2.49 +    if ( err == 0 )
    2.50 +        *sdom = op.u.adjustdom.u.credit;
    2.51 +
    2.52 +    return err;
    2.53 +}
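
A minimal sketch of a caller for the two entry points above (not part of the
changeset; example_double_weight is a hypothetical name). It assumes this
era's int-valued libxc handles (xc_interface_open/xc_interface_close) and the
struct csched_domain added to sched_ctl.h further down. Weight 0 and cap
(uint16_t)~0U are the "leave unchanged" sentinels honoured by
csched_dom_cntl() in the hypervisor.

    #include "xenctrl.h"

    /* Double a domain's share without touching its cap. */
    int example_double_weight(uint32_t domid)
    {
        struct csched_domain sdom;
        int xc_handle, err;

        xc_handle = xc_interface_open();
        if ( xc_handle < 0 )
            return -1;

        err = xc_csched_domain_get(xc_handle, domid, &sdom);
        if ( err == 0 )
        {
            sdom.weight *= 2;         /* relative share; default is 256 */
            sdom.cap = (uint16_t)~0U; /* sentinel: leave cap unchanged  */
            err = xc_csched_domain_set(xc_handle, domid, &sdom);
        }

        xc_interface_close(xc_handle);
        return err;
    }
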
     3.1 --- a/tools/libxc/xenctrl.h	Fri May 26 09:44:29 2006 +0100
     3.2 +++ b/tools/libxc/xenctrl.h	Fri May 26 11:14:36 2006 +0100
     3.3 @@ -354,6 +354,14 @@ int xc_sedf_domain_get(int xc_handle,
     3.4                         uint64_t *latency, uint16_t *extratime,
     3.5                         uint16_t *weight);
     3.6  
     3.7 +int xc_csched_domain_set(int xc_handle,
     3.8 +                         uint32_t domid,
     3.9 +                         struct csched_domain *sdom);
    3.10 +
    3.11 +int xc_csched_domain_get(int xc_handle,
    3.12 +                         uint32_t domid,
    3.13 +                         struct csched_domain *sdom);
    3.14 +
    3.15  typedef evtchn_status_t xc_evtchn_status_t;
    3.16  
    3.17  /*
     4.1 --- a/tools/python/xen/lowlevel/xc/xc.c	Fri May 26 09:44:29 2006 +0100
     4.2 +++ b/tools/python/xen/lowlevel/xc/xc.c	Fri May 26 11:14:36 2006 +0100
     4.3 @@ -716,6 +716,49 @@ static PyObject *pyxc_sedf_domain_get(Xc
     4.4                           "weight",    weight);
     4.5  }
     4.6  
     4.7 +static PyObject *pyxc_csched_domain_set(XcObject *self,
     4.8 +                                        PyObject *args,
     4.9 +                                        PyObject *kwds)
    4.10 +{
    4.11 +    uint32_t domid;
    4.12 +    uint16_t weight;
    4.13 +    uint16_t cap;
    4.14 +    static char *kwd_list[] = { "dom", "weight", "cap", NULL };
    4.15 +    static char kwd_type[] = "I|HH";
    4.16 +    struct csched_domain sdom;
    4.17 +    
    4.18 +    weight = 0;
    4.19 +    cap = (uint16_t)~0U;
    4.20 +    if( !PyArg_ParseTupleAndKeywords(args, kwds, kwd_type, kwd_list, 
    4.21 +                                     &domid, &weight, &cap) )
    4.22 +        return NULL;
    4.23 +
    4.24 +    sdom.weight = weight;
    4.25 +    sdom.cap = cap;
    4.26 +
    4.27 +    if ( xc_csched_domain_set(self->xc_handle, domid, &sdom) != 0 )
    4.28 +        return PyErr_SetFromErrno(xc_error);
    4.29 +
    4.30 +    Py_INCREF(zero);
    4.31 +    return zero;
    4.32 +}
    4.33 +
    4.34 +static PyObject *pyxc_csched_domain_get(XcObject *self, PyObject *args)
    4.35 +{
    4.36 +    uint32_t domid;
    4.37 +    struct csched_domain sdom;
    4.38 +    
    4.39 +    if( !PyArg_ParseTuple(args, "I", &domid) )
    4.40 +        return NULL;
    4.41 +    
    4.42 +    if ( xc_csched_domain_get(self->xc_handle, domid, &sdom) != 0 )
    4.43 +        return PyErr_SetFromErrno(xc_error);
    4.44 +
    4.45 +    return Py_BuildValue("{s:H,s:H}",
    4.46 +                         "weight",  sdom.weight,
    4.47 +                         "cap",     sdom.cap);
    4.48 +}
    4.49 +
    4.50  static PyObject *pyxc_domain_setmaxmem(XcObject *self, PyObject *args)
    4.51  {
    4.52      uint32_t dom;
    4.53 @@ -1040,6 +1083,24 @@ static PyMethodDef pyxc_methods[] = {
    4.54        " slice     [long]: CPU reservation per period\n"
    4.55        " latency   [long]: domain's wakeup latency hint\n"
    4.56        " extratime [int]:  domain aware of extratime?\n"},
    4.57 +    
    4.58 +    { "csched_domain_set",
    4.59 +      (PyCFunction)pyxc_csched_domain_set,
    4.60 +      METH_KEYWORDS, "\n"
    4.61 +      "Set the scheduling parameters for a domain when running with the\n"
    4.62 +      "SMP credit scheduler.\n"
    4.63 +      " domid     [int]:   domain id to set\n"
     4.64 +      " weight    [short]: domain's scheduling weight\n"
          +      " cap       [short]: domain's scheduling cap\n"
    4.65 +      "Returns: [int] 0 on success; -1 on error.\n" },
    4.66 +
    4.67 +    { "csched_domain_get",
    4.68 +      (PyCFunction)pyxc_csched_domain_get,
    4.69 +      METH_VARARGS, "\n"
    4.70 +      "Get the scheduling parameters for a domain when running with the\n"
    4.71 +      "SMP credit scheduler.\n"
    4.72 +      " domid     [int]:   domain id to get\n"
    4.73 +      "Returns:   [dict]\n"
     4.74 +      " weight    [short]: domain's scheduling weight\n"
          +      " cap       [short]: domain's scheduling cap\n"},
    4.75  
    4.76      { "evtchn_alloc_unbound", 
    4.77        (PyCFunction)pyxc_evtchn_alloc_unbound,
     5.1 --- a/tools/python/xen/xend/XendDomain.py	Fri May 26 09:44:29 2006 +0100
     5.2 +++ b/tools/python/xen/xend/XendDomain.py	Fri May 26 11:14:36 2006 +0100
     5.3 @@ -522,6 +522,28 @@ class XendDomain:
     5.4          except Exception, ex:
     5.5              raise XendError(str(ex))
     5.6  
     5.7 +    def domain_csched_get(self, domid):
     5.8 +        """Get credit scheduler parameters for a domain.
     5.9 +        """
    5.10 +        dominfo = self.domain_lookup_by_name_or_id_nr(domid)
    5.11 +        if not dominfo:
    5.12 +            raise XendInvalidDomain(str(domid))
    5.13 +        try:
    5.14 +            return xc.csched_domain_get(dominfo.getDomid())
    5.15 +        except Exception, ex:
    5.16 +            raise XendError(str(ex))
    5.17 +    
    5.18 +    def domain_csched_set(self, domid, weight, cap):
    5.19 +        """Set credit scheduler parameters for a domain.
    5.20 +        """
    5.21 +        dominfo = self.domain_lookup_by_name_or_id_nr(domid)
    5.22 +        if not dominfo:
    5.23 +            raise XendInvalidDomain(str(domid))
    5.24 +        try:
    5.25 +            return xc.csched_domain_set(dominfo.getDomid(), weight, cap)
    5.26 +        except Exception, ex:
    5.27 +            raise XendError(str(ex))
    5.28 +
    5.29      def domain_maxmem_set(self, domid, mem):
    5.30          """Set the memory limit for a domain.
    5.31  
     6.1 --- a/tools/python/xen/xend/server/SrvDomain.py	Fri May 26 09:44:29 2006 +0100
     6.2 +++ b/tools/python/xen/xend/server/SrvDomain.py	Fri May 26 11:14:36 2006 +0100
     6.3 @@ -131,6 +131,20 @@ class SrvDomain(SrvDir):
     6.4  		     ['weight', 'int']])
     6.5          val = fn(req.args, {'dom': self.dom.domid})
     6.6          return val
     6.7 +    
     6.8 +    def op_domain_csched_get(self, _, req):
     6.9 +        fn = FormFn(self.xd.domain_csched_get,
    6.10 +                    [['dom', 'int']])
    6.11 +        val = fn(req.args, {'dom': self.dom.domid})
    6.12 +        return val
    6.13 +
    6.14 +
    6.15 +    def op_domain_csched_set(self, _, req):
    6.16 +        fn = FormFn(self.xd.domain_csched_set,
    6.17 +                    [['dom', 'int'],
     6.18 +                     ['weight', 'int'],
          +                     ['cap', 'int']])
    6.19 +        val = fn(req.args, {'dom': self.dom.domid})
    6.20 +        return val
    6.21  
    6.22      def op_maxmem_set(self, _, req):
    6.23          fn = FormFn(self.xd.domain_maxmem_set,
     7.1 --- a/tools/python/xen/xm/main.py	Fri May 26 09:44:29 2006 +0100
     7.2 +++ b/tools/python/xen/xm/main.py	Fri May 26 11:14:36 2006 +0100
     7.3 @@ -99,6 +99,7 @@ sched_sedf_help = "sched-sedf [DOM] [OPT
     7.4                                      specifies another way of setting a domain's\n\
     7.5                                      cpu period/slice."
     7.6  
     7.7 +csched_help = "csched                           Set or get credit scheduler parameters"
     7.8  block_attach_help = """block-attach <DomId> <BackDev> <FrontDev> <Mode>
     7.9                  [BackDomId]         Create a new virtual block device"""
    7.10  block_detach_help = """block-detach  <DomId> <DevId>    Destroy a domain's virtual block device,
    7.11 @@ -174,6 +175,7 @@ host_commands = [
    7.12      ]
    7.13  
    7.14  scheduler_commands = [
    7.15 +    "csched",
    7.16      "sched-bvt",
    7.17      "sched-bvt-ctxallow",
    7.18      "sched-sedf",
    7.19 @@ -735,6 +737,48 @@ def xm_sched_sedf(args):
    7.20          else:
    7.21              print_sedf(sedf_info)
    7.22  
    7.23 +def xm_csched(args):
    7.24 +    usage_msg = """Csched:     Set or get credit scheduler parameters
    7.25 + Usage:
    7.26 +
    7.27 +        csched -d domain [-w weight] [-c cap]
    7.28 +    """
    7.29 +    try:
    7.30 +        opts, args = getopt.getopt(args[0:], "d:w:c:",
    7.31 +            ["domain=", "weight=", "cap="])
    7.32 +    except getopt.GetoptError:
    7.33 +        # print help information and exit:
    7.34 +        print usage_msg
    7.35 +        sys.exit(1)
    7.36 +
    7.37 +    domain = None
    7.38 +    weight = None
    7.39 +    cap = None
    7.40 +
    7.41 +    for o, a in opts:
    7.42 +        if o == "-d":
    7.43 +            domain = a
    7.44 +        elif o == "-w":
    7.45 +            weight = int(a)
    7.46 +        elif o == "-c":
    7.47 +            cap = int(a)
    7.48 +
    7.49 +    if domain is None:
    7.50 +        # placeholder for system-wide scheduler parameters
    7.51 +        print usage_msg
    7.52 +        sys.exit(1)
    7.53 +
    7.54 +    if weight is None and cap is None:
    7.55 +        print server.xend.domain.csched_get(domain)
    7.56 +    else:
    7.57 +        if weight is None:
    7.58 +            weight = int(0)
    7.59 +        if cap is None:
    7.60 +            cap = int(~0)
    7.61 +
    7.62 +        err = server.xend.domain.csched_set(domain, weight, cap)
    7.63 +        if err != 0:
    7.64 +            print err
    7.65  
    7.66  def xm_info(args):
    7.67      arg_check(args, "info", 0)
    7.68 @@ -1032,6 +1076,7 @@ commands = {
    7.69      "sched-bvt": xm_sched_bvt,
    7.70      "sched-bvt-ctxallow": xm_sched_bvt_ctxallow,
    7.71      "sched-sedf": xm_sched_sedf,
    7.72 +    "csched": xm_csched,
    7.73      # block
    7.74      "block-attach": xm_block_attach,
    7.75      "block-detach": xm_block_detach,
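
The new subcommand, as wired up above, would be invoked along these lines
(mydomain is a hypothetical domain name; on the hypervisor side the cap value
reads as a percentage of a single physical CPU, per the credit_cap
computation in csched_acct() below):

    xm csched -d mydomain               # print current weight and cap
    xm csched -d mydomain -w 512        # twice the default weight of 256
    xm csched -d mydomain -c 50         # cap the domain at half a PCPU
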
     8.1 --- a/xen/common/Makefile	Fri May 26 09:44:29 2006 +0100
     8.2 +++ b/xen/common/Makefile	Fri May 26 11:14:36 2006 +0100
     8.3 @@ -13,6 +13,7 @@ obj-y += multicall.o
     8.4  obj-y += page_alloc.o
     8.5  obj-y += rangeset.o
     8.6  obj-y += sched_bvt.o
     8.7 +obj-y += sched_credit.o
     8.8  obj-y += sched_sedf.o
     8.9  obj-y += schedule.o
    8.10  obj-y += softirq.o
     9.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.2 +++ b/xen/common/sched_credit.c	Fri May 26 11:14:36 2006 +0100
     9.3 @@ -0,0 +1,1233 @@
     9.4 +/****************************************************************************
     9.5 + * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
     9.6 + ****************************************************************************
     9.7 + *
     9.8 + *        File: common/sched_credit.c
     9.9 + *      Author: Emmanuel Ackaouy
    9.10 + *
    9.11 + * Description: Credit-based SMP CPU scheduler
    9.12 + */
    9.13 +
    9.14 +#include <xen/config.h>
    9.15 +#include <xen/init.h>
    9.16 +#include <xen/lib.h>
    9.17 +#include <xen/sched.h>
    9.18 +#include <xen/domain.h>
    9.19 +#include <xen/delay.h>
    9.20 +#include <xen/event.h>
    9.21 +#include <xen/time.h>
    9.22 +#include <xen/perfc.h>
    9.23 +#include <xen/sched-if.h>
    9.24 +#include <xen/softirq.h>
    9.25 +#include <asm/atomic.h>
    9.26 +
    9.27 +
    9.28 +/*
    9.29 + * CSCHED_STATS
    9.30 + *
    9.31 + * Manage very basic counters and stats.
    9.32 + *
    9.33 + * Useful for debugging live systems. The stats are displayed
    9.34 + * with runq dumps ('r' on the Xen console).
    9.35 + */
    9.36 +#define CSCHED_STATS
    9.37 +
    9.38 +
    9.39 +/*
    9.40 + * Basic constants
    9.41 + */
    9.42 +#define CSCHED_TICK             10      /* milliseconds */
    9.43 +#define CSCHED_TSLICE           30      /* milliseconds */
    9.44 +#define CSCHED_ACCT_NTICKS      3
    9.45 +#define CSCHED_ACCT_PERIOD      (CSCHED_ACCT_NTICKS * CSCHED_TICK)
    9.46 +#define CSCHED_DEFAULT_WEIGHT   256
    9.47 +
    9.48 +
    9.49 +/*
    9.50 + * Priorities
    9.51 + */
    9.52 +#define CSCHED_PRI_TS_UNDER     -1      /* time-share w/ credits */
    9.53 +#define CSCHED_PRI_TS_OVER      -2      /* time-share w/o credits */
    9.54 +#define CSCHED_PRI_IDLE         -64     /* idle */
    9.55 +#define CSCHED_PRI_TS_PARKED    -65     /* time-share w/ capped credits */
    9.56 +
    9.57 +
    9.58 +/*
    9.59 + * Useful macros
    9.60 + */
    9.61 +#define CSCHED_PCPU(_c)     ((struct csched_pcpu *)schedule_data[_c].sched_priv)
    9.62 +#define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
    9.63 +#define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
    9.64 +#define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))
    9.65 +
    9.66 +
    9.67 +/*
    9.68 + * Stats
    9.69 + */
    9.70 +#ifdef CSCHED_STATS
    9.71 +
    9.72 +#define CSCHED_STAT(_X)         (csched_priv.stats._X)
    9.73 +#define CSCHED_STAT_DEFINE(_X)  uint32_t _X;
    9.74 +#define CSCHED_STAT_PRINTK(_X)                                  \
    9.75 +    do                                                          \
    9.76 +    {                                                           \
    9.77 +        printk("\t%-30s = %u\n", #_X, CSCHED_STAT(_X));  \
    9.78 +    } while ( 0 );
    9.79 +
    9.80 +#define CSCHED_STATS_EXPAND_SCHED(_MACRO)   \
    9.81 +    _MACRO(vcpu_alloc)                      \
    9.82 +    _MACRO(vcpu_add)                        \
    9.83 +    _MACRO(vcpu_sleep)                      \
    9.84 +    _MACRO(vcpu_wake_running)               \
    9.85 +    _MACRO(vcpu_wake_onrunq)                \
    9.86 +    _MACRO(vcpu_wake_runnable)              \
    9.87 +    _MACRO(vcpu_wake_not_runnable)          \
    9.88 +    _MACRO(dom_free)                        \
    9.89 +    _MACRO(schedule)                        \
    9.90 +    _MACRO(tickle_local_idler)              \
    9.91 +    _MACRO(tickle_local_over)               \
    9.92 +    _MACRO(tickle_local_under)              \
    9.93 +    _MACRO(tickle_local_other)              \
    9.94 +    _MACRO(acct_run)                        \
    9.95 +    _MACRO(acct_no_work)                    \
    9.96 +    _MACRO(acct_balance)                    \
    9.97 +    _MACRO(acct_reorder)                    \
    9.98 +    _MACRO(acct_min_credit)                 \
    9.99 +    _MACRO(acct_vcpu_active)                \
   9.100 +    _MACRO(acct_vcpu_idle)                  \
   9.101 +    _MACRO(acct_vcpu_credit_min)
   9.102 +
   9.103 +#define CSCHED_STATS_EXPAND_SMP_LOAD_BALANCE(_MACRO)    \
   9.104 +    _MACRO(vcpu_migrate)                                \
   9.105 +    _MACRO(load_balance_idle)                           \
   9.106 +    _MACRO(load_balance_over)                           \
   9.107 +    _MACRO(load_balance_other)                          \
   9.108 +    _MACRO(steal_trylock_failed)                        \
   9.109 +    _MACRO(steal_peer_down)                             \
   9.110 +    _MACRO(steal_peer_idle)                             \
   9.111 +    _MACRO(steal_peer_running)                          \
   9.112 +    _MACRO(steal_peer_pinned)                           \
   9.113 +    _MACRO(tickle_idlers_none)                          \
   9.114 +    _MACRO(tickle_idlers_some)
   9.115 +
   9.116 +#ifndef NDEBUG
   9.117 +#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)  \
   9.118 +    _MACRO(vcpu_check)
   9.119 +#else
   9.120 +#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)
   9.121 +#endif
   9.122 +
   9.123 +#define CSCHED_STATS_EXPAND(_MACRO)                 \
   9.124 +    CSCHED_STATS_EXPAND_SCHED(_MACRO)               \
   9.125 +    CSCHED_STATS_EXPAND_SMP_LOAD_BALANCE(_MACRO)    \
   9.126 +    CSCHED_STATS_EXPAND_CHECKS(_MACRO)
   9.127 +
   9.128 +#define CSCHED_STATS_RESET()                                        \
   9.129 +    do                                                              \
   9.130 +    {                                                               \
   9.131 +        memset(&csched_priv.stats, 0, sizeof(csched_priv.stats));   \
   9.132 +    } while ( 0 )
   9.133 +
   9.134 +#define CSCHED_STATS_DEFINE()                   \
   9.135 +    struct                                      \
   9.136 +    {                                           \
   9.137 +        CSCHED_STATS_EXPAND(CSCHED_STAT_DEFINE) \
   9.138 +    } stats
   9.139 +
   9.140 +#define CSCHED_STATS_PRINTK()                   \
   9.141 +    do                                          \
   9.142 +    {                                           \
   9.143 +        printk("stats:\n");                     \
   9.144 +        CSCHED_STATS_EXPAND(CSCHED_STAT_PRINTK) \
   9.145 +    } while ( 0 )
   9.146 +
   9.147 +#define CSCHED_STAT_CRANK(_X)   (CSCHED_STAT(_X)++)
   9.148 +
   9.149 +#else /* CSCHED_STATS */
   9.150 +
   9.151 +#define CSCHED_STATS_RESET()    do {} while ( 0 )
   9.152 +#define CSCHED_STATS_DEFINE()   do {} while ( 0 )
   9.153 +#define CSCHED_STATS_PRINTK()   do {} while ( 0 )
   9.154 +#define CSCHED_STAT_CRANK(_X)   do {} while ( 0 )
   9.155 +
   9.156 +#endif /* CSCHED_STATS */
   9.157 +
   9.158 +
   9.159 +/*
   9.160 + * Physical CPU
   9.161 + */
   9.162 +struct csched_pcpu {
   9.163 +    struct list_head runq;
   9.164 +    uint32_t runq_sort_last;
   9.165 +};
   9.166 +
   9.167 +/*
   9.168 + * Virtual CPU
   9.169 + */
   9.170 +struct csched_vcpu {
   9.171 +    struct list_head runq_elem;
   9.172 +    struct list_head active_vcpu_elem;
   9.173 +    struct csched_dom *sdom;
   9.174 +    struct vcpu *vcpu;
   9.175 +    atomic_t credit;
   9.176 +    int credit_last;
   9.177 +    uint32_t credit_incr;
   9.178 +    uint32_t state_active;
   9.179 +    uint32_t state_idle;
   9.180 +    int16_t pri;
   9.181 +};
   9.182 +
   9.183 +/*
   9.184 + * Domain
   9.185 + */
   9.186 +struct csched_dom {
   9.187 +    struct list_head active_vcpu;
   9.188 +    struct list_head active_sdom_elem;
   9.189 +    struct domain *dom;
   9.190 +    uint16_t active_vcpu_count;
   9.191 +    uint16_t weight;
   9.192 +    uint16_t cap;
   9.193 +};
   9.194 +
   9.195 +/*
   9.196 + * System-wide private data
   9.197 + */
   9.198 +struct csched_private {
   9.199 +    spinlock_t lock;
   9.200 +    struct list_head active_sdom;
   9.201 +    uint32_t ncpus;
   9.202 +    unsigned int master;
   9.203 +    cpumask_t idlers;
   9.204 +    uint32_t weight;
   9.205 +    uint32_t credit;
   9.206 +    int credit_balance;
   9.207 +    uint32_t runq_sort;
   9.208 +    CSCHED_STATS_DEFINE();
   9.209 +};
   9.210 +
   9.211 +
   9.212 +/*
   9.213 + * Global variables
   9.214 + */
   9.215 +static struct csched_private csched_priv;
   9.216 +
   9.217 +
   9.218 +
   9.219 +static inline int
   9.220 +__vcpu_on_runq(struct csched_vcpu *svc)
   9.221 +{
   9.222 +    return !list_empty(&svc->runq_elem);
   9.223 +}
   9.224 +
   9.225 +static inline struct csched_vcpu *
   9.226 +__runq_elem(struct list_head *elem)
   9.227 +{
   9.228 +    return list_entry(elem, struct csched_vcpu, runq_elem);
   9.229 +}
   9.230 +
   9.231 +static inline void
   9.232 +__runq_insert(unsigned int cpu, struct csched_vcpu *svc)
   9.233 +{
   9.234 +    const struct list_head * const runq = RUNQ(cpu);
   9.235 +    struct list_head *iter;
   9.236 +
   9.237 +    BUG_ON( __vcpu_on_runq(svc) );
   9.238 +    BUG_ON( cpu != svc->vcpu->processor );
   9.239 +
   9.240 +    list_for_each( iter, runq )
   9.241 +    {
   9.242 +        const struct csched_vcpu * const iter_svc = __runq_elem(iter);
   9.243 +        if ( svc->pri > iter_svc->pri )
   9.244 +            break;
   9.245 +    }
   9.246 +
   9.247 +    list_add_tail(&svc->runq_elem, iter);
   9.248 +}
   9.249 +
   9.250 +static inline void
   9.251 +__runq_remove(struct csched_vcpu *svc)
   9.252 +{
   9.253 +    BUG_ON( !__vcpu_on_runq(svc) );
   9.254 +    list_del_init(&svc->runq_elem);
   9.255 +}
   9.256 +
   9.257 +static inline void
   9.258 +__runq_tickle(unsigned int cpu, struct csched_vcpu *new)
   9.259 +{
   9.260 +    struct csched_vcpu * const cur = CSCHED_VCPU(schedule_data[cpu].curr);
   9.261 +    cpumask_t mask;
   9.262 +
   9.263 +    ASSERT(cur);
   9.264 +    cpus_clear(mask);
   9.265 +
   9.266 +    /* If strictly higher priority than current VCPU, signal the CPU */
   9.267 +    if ( new->pri > cur->pri )
   9.268 +    {
   9.269 +        if ( cur->pri == CSCHED_PRI_IDLE )
   9.270 +            CSCHED_STAT_CRANK(tickle_local_idler);
   9.271 +        else if ( cur->pri == CSCHED_PRI_TS_OVER )
   9.272 +            CSCHED_STAT_CRANK(tickle_local_over);
   9.273 +        else if ( cur->pri == CSCHED_PRI_TS_UNDER )
   9.274 +            CSCHED_STAT_CRANK(tickle_local_under);
   9.275 +        else
   9.276 +            CSCHED_STAT_CRANK(tickle_local_other);
   9.277 +
   9.278 +        cpu_set(cpu, mask);
   9.279 +    }
   9.280 +
   9.281 +    /*
   9.282 +     * If this CPU has at least two runnable VCPUs, we tickle any idlers to
   9.283 +     * let them know there is runnable work in the system...
   9.284 +     */
   9.285 +    if ( cur->pri > CSCHED_PRI_IDLE )
   9.286 +    {
   9.287 +        if ( cpus_empty(csched_priv.idlers) )
   9.288 +        {
   9.289 +            CSCHED_STAT_CRANK(tickle_idlers_none);
   9.290 +        }
   9.291 +        else
   9.292 +        {
   9.293 +            CSCHED_STAT_CRANK(tickle_idlers_some);
   9.294 +            cpus_or(mask, mask, csched_priv.idlers);
   9.295 +        }
   9.296 +    }
   9.297 +
   9.298 +    /* Send scheduler interrupts to designated CPUs */
   9.299 +    if ( !cpus_empty(mask) )
   9.300 +        cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
   9.301 +}
   9.302 +
   9.303 +static void
   9.304 +csched_pcpu_init(int cpu)
   9.305 +{
   9.306 +    struct csched_pcpu *spc;
   9.307 +    unsigned long flags;
   9.308 +
   9.309 +    spin_lock_irqsave(&csched_priv.lock, flags);
   9.310 +
   9.311 +    /* Initialize/update system-wide config */
   9.312 +    csched_priv.credit += CSCHED_ACCT_PERIOD;
   9.313 +    if ( csched_priv.ncpus <= cpu )
   9.314 +        csched_priv.ncpus = cpu + 1;
   9.315 +    if ( csched_priv.master >= csched_priv.ncpus )
   9.316 +        csched_priv.master = cpu;
   9.317 +
   9.318 +    /* Allocate per-PCPU info */
   9.319 +    spc = xmalloc(struct csched_pcpu);
   9.320 +    BUG_ON( spc == NULL );
   9.321 +    INIT_LIST_HEAD(&spc->runq);
   9.322 +    spc->runq_sort_last = csched_priv.runq_sort;
   9.323 +    schedule_data[cpu].sched_priv = spc;
   9.324 +
   9.325 +    /* Start off idling... */
   9.326 +    BUG_ON( !is_idle_vcpu(schedule_data[cpu].curr) );
   9.327 +    cpu_set(cpu, csched_priv.idlers);
   9.328 +
   9.329 +    spin_unlock_irqrestore(&csched_priv.lock, flags);
   9.330 +}
   9.331 +
   9.332 +#ifndef NDEBUG
   9.333 +static inline void
   9.334 +__csched_vcpu_check(struct vcpu *vc)
   9.335 +{
   9.336 +    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
   9.337 +    struct csched_dom * const sdom = svc->sdom;
   9.338 +
   9.339 +    BUG_ON( svc->vcpu != vc );
   9.340 +    BUG_ON( sdom != CSCHED_DOM(vc->domain) );
   9.341 +    if ( sdom )
   9.342 +    {
   9.343 +        BUG_ON( is_idle_vcpu(vc) );
   9.344 +        BUG_ON( sdom->dom != vc->domain );
   9.345 +    }
   9.346 +    else
   9.347 +    {
   9.348 +        BUG_ON( !is_idle_vcpu(vc) );
   9.349 +    }
   9.350 +
   9.351 +    CSCHED_STAT_CRANK(vcpu_check);
   9.352 +}
   9.353 +#define CSCHED_VCPU_CHECK(_vc)  (__csched_vcpu_check(_vc))
   9.354 +#else
   9.355 +#define CSCHED_VCPU_CHECK(_vc)
   9.356 +#endif
   9.357 +
   9.358 +static inline int
   9.359 +__csched_vcpu_is_stealable(int local_cpu, struct vcpu *vc)
   9.360 +{
   9.361 +    /*
   9.362 +     * Don't pick up work that's in the peer's scheduling tail. Also only pick
   9.363 +     * up work that's allowed to run on our CPU.
   9.364 +     */
   9.365 +    if ( unlikely(test_bit(_VCPUF_running, &vc->vcpu_flags)) )
   9.366 +    {
   9.367 +        CSCHED_STAT_CRANK(steal_peer_running);
   9.368 +        return 0;
   9.369 +    }
   9.370 +
   9.371 +    if ( unlikely(!cpu_isset(local_cpu, vc->cpu_affinity)) )
   9.372 +    {
   9.373 +        CSCHED_STAT_CRANK(steal_peer_pinned);
   9.374 +        return 0;
   9.375 +    }
   9.376 +
   9.377 +    return 1;
   9.378 +}
   9.379 +
   9.380 +static void
   9.381 +csched_vcpu_acct(struct csched_vcpu *svc, int credit_dec)
   9.382 +{
   9.383 +    struct csched_dom * const sdom = svc->sdom;
   9.384 +    unsigned long flags;
   9.385 +
   9.386 +    /* Update credits */
   9.387 +    atomic_sub(credit_dec, &svc->credit);
   9.388 +
   9.389 +    /* Put this VCPU and domain back on the active list if it was idling */
   9.390 +    if ( list_empty(&svc->active_vcpu_elem) )
   9.391 +    {
   9.392 +        spin_lock_irqsave(&csched_priv.lock, flags);
   9.393 +
   9.394 +        if ( list_empty(&svc->active_vcpu_elem) )
   9.395 +        {
   9.396 +            CSCHED_STAT_CRANK(acct_vcpu_active);
   9.397 +            svc->state_active++;
   9.398 +
   9.399 +            sdom->active_vcpu_count++;
   9.400 +            list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
   9.401 +            if ( list_empty(&sdom->active_sdom_elem) )
   9.402 +            {
   9.403 +                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
   9.404 +                csched_priv.weight += sdom->weight;
   9.405 +            }
   9.406 +        }
   9.407 +
   9.408 +        spin_unlock_irqrestore(&csched_priv.lock, flags);
   9.409 +    }
   9.410 +}
   9.411 +
   9.412 +static inline void
   9.413 +__csched_vcpu_acct_idle_locked(struct csched_vcpu *svc)
   9.414 +{
   9.415 +    struct csched_dom * const sdom = svc->sdom;
   9.416 +
   9.417 +    BUG_ON( list_empty(&svc->active_vcpu_elem) );
   9.418 +
   9.419 +    CSCHED_STAT_CRANK(acct_vcpu_idle);
   9.420 +    svc->state_idle++;
   9.421 +
   9.422 +    sdom->active_vcpu_count--;
   9.423 +    list_del_init(&svc->active_vcpu_elem);
   9.424 +    if ( list_empty(&sdom->active_vcpu) )
   9.425 +    {
   9.426 +        BUG_ON( csched_priv.weight < sdom->weight );
   9.427 +        list_del_init(&sdom->active_sdom_elem);
   9.428 +        csched_priv.weight -= sdom->weight;
   9.429 +    }
   9.430 +
   9.431 +    atomic_set(&svc->credit, 0);
   9.432 +}
   9.433 +
   9.434 +static int
   9.435 +csched_vcpu_alloc(struct vcpu *vc)
   9.436 +{
   9.437 +    struct domain * const dom = vc->domain;
   9.438 +    struct csched_dom *sdom;
   9.439 +    struct csched_vcpu *svc;
   9.440 +    int16_t pri;
   9.441 +
   9.442 +    CSCHED_STAT_CRANK(vcpu_alloc);
   9.443 +
   9.444 +    /* Allocate, if appropriate, per-domain info */
   9.445 +    if ( is_idle_vcpu(vc) )
   9.446 +    {
   9.447 +        sdom = NULL;
   9.448 +        pri = CSCHED_PRI_IDLE;
   9.449 +    }
   9.450 +    else if ( CSCHED_DOM(dom) )
   9.451 +    {
   9.452 +        sdom = CSCHED_DOM(dom);
   9.453 +        pri = CSCHED_PRI_TS_UNDER;
   9.454 +    }
   9.455 +    else 
   9.456 +    {
   9.457 +        sdom = xmalloc(struct csched_dom);
   9.458 +        if ( !sdom )
   9.459 +            return -1;
   9.460 +
   9.461 +        /* Initialize credit and weight */
   9.462 +        INIT_LIST_HEAD(&sdom->active_vcpu);
   9.463 +        sdom->active_vcpu_count = 0;
   9.464 +        INIT_LIST_HEAD(&sdom->active_sdom_elem);
   9.465 +        sdom->dom = dom;
   9.466 +        sdom->weight = CSCHED_DEFAULT_WEIGHT;
   9.467 +        sdom->cap = 0U;
   9.468 +        dom->sched_priv = sdom;
   9.469 +        pri = CSCHED_PRI_TS_UNDER;
   9.470 +    }
   9.471 +
   9.472 +    /* Allocate per-VCPU info */
   9.473 +    svc = xmalloc(struct csched_vcpu);
   9.474 +    if ( !svc )
   9.475 +        return -1;
   9.476 +
   9.477 +    INIT_LIST_HEAD(&svc->runq_elem);
   9.478 +    INIT_LIST_HEAD(&svc->active_vcpu_elem);
   9.479 +    svc->sdom = sdom;
   9.480 +    svc->vcpu = vc;
   9.481 +    atomic_set(&svc->credit, 0);
   9.482 +    svc->credit_last = 0;
   9.483 +    svc->credit_incr = 0U;
   9.484 +    svc->state_active = 0U;
   9.485 +    svc->state_idle = 0U;
   9.486 +    svc->pri = pri;
   9.487 +    vc->sched_priv = svc;
   9.488 +
   9.489 +    CSCHED_VCPU_CHECK(vc);
   9.490 +
   9.491 +    /* Attach fair-share VCPUs to the accounting list */
   9.492 +    if ( likely(sdom != NULL) )
   9.493 +        csched_vcpu_acct(svc, 0);
   9.494 +
   9.495 +    return 0;
   9.496 +}
   9.497 +
   9.498 +static void
   9.499 +csched_vcpu_add(struct vcpu *vc) 
   9.500 +{
   9.501 +    CSCHED_STAT_CRANK(vcpu_add);
   9.502 +
   9.503 +    /* Allocate per-PCPU info */
   9.504 +    if ( unlikely(!CSCHED_PCPU(vc->processor)) )
   9.505 +        csched_pcpu_init(vc->processor);
   9.506 +
   9.507 +    CSCHED_VCPU_CHECK(vc);
   9.508 +}
   9.509 +
   9.510 +static void
   9.511 +csched_vcpu_free(struct vcpu *vc)
   9.512 +{
   9.513 +    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
   9.514 +    struct csched_dom * const sdom = svc->sdom;
   9.515 +    unsigned long flags;
   9.516 +
   9.517 +    BUG_ON( sdom == NULL );
   9.518 +    BUG_ON( !list_empty(&svc->runq_elem) );
   9.519 +
   9.520 +    spin_lock_irqsave(&csched_priv.lock, flags);
   9.521 +
   9.522 +    if ( !list_empty(&svc->active_vcpu_elem) )
   9.523 +        __csched_vcpu_acct_idle_locked(svc);
   9.524 +
   9.525 +    spin_unlock_irqrestore(&csched_priv.lock, flags);
   9.526 +
   9.527 +    xfree(svc);
   9.528 +}
   9.529 +
   9.530 +static void
   9.531 +csched_vcpu_sleep(struct vcpu *vc)
   9.532 +{
   9.533 +    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
   9.534 +
   9.535 +    CSCHED_STAT_CRANK(vcpu_sleep);
   9.536 +
   9.537 +    BUG_ON( is_idle_vcpu(vc) );
   9.538 +
   9.539 +    if ( schedule_data[vc->processor].curr == vc )
   9.540 +        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
   9.541 +    else if ( __vcpu_on_runq(svc) )
   9.542 +        __runq_remove(svc);
   9.543 +}
   9.544 +
   9.545 +static void
   9.546 +csched_vcpu_wake(struct vcpu *vc)
   9.547 +{
   9.548 +    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
   9.549 +    const unsigned int cpu = vc->processor;
   9.550 +
   9.551 +    BUG_ON( is_idle_vcpu(vc) );
   9.552 +
   9.553 +    if ( unlikely(schedule_data[cpu].curr == vc) )
   9.554 +    {
   9.555 +        CSCHED_STAT_CRANK(vcpu_wake_running);
   9.556 +        return;
   9.557 +    }
   9.558 +    if ( unlikely(__vcpu_on_runq(svc)) )
   9.559 +    {
   9.560 +        CSCHED_STAT_CRANK(vcpu_wake_onrunq);
   9.561 +        return;
   9.562 +    }
   9.563 +
   9.564 +    if ( likely(vcpu_runnable(vc)) )
   9.565 +        CSCHED_STAT_CRANK(vcpu_wake_runnable);
   9.566 +    else
   9.567 +        CSCHED_STAT_CRANK(vcpu_wake_not_runnable);
   9.568 +
   9.569 +    /* Put the VCPU on the runq and tickle CPUs */
   9.570 +    __runq_insert(cpu, svc);
   9.571 +    __runq_tickle(cpu, svc);
   9.572 +}
   9.573 +
   9.574 +static int
   9.575 +csched_vcpu_set_affinity(struct vcpu *vc, cpumask_t *affinity)
   9.576 +{
   9.577 +    unsigned long flags;
   9.578 +    int lcpu;
   9.579 +
   9.580 +    if ( vc == current )
   9.581 +    {
   9.582 +        /* No locking needed but also can't move on the spot... */
   9.583 +        if ( !cpu_isset(vc->processor, *affinity) )
   9.584 +            return -EBUSY;
   9.585 +
   9.586 +        vc->cpu_affinity = *affinity;
   9.587 +    }
   9.588 +    else
   9.589 +    {
   9.590 +        /* Pause, modify, and unpause. */
   9.591 +        vcpu_pause(vc);
   9.592 +
   9.593 +        vc->cpu_affinity = *affinity;
   9.594 +        if ( !cpu_isset(vc->processor, vc->cpu_affinity) )
   9.595 +        {
   9.596 +            /*
   9.597 +             * We must grab the scheduler lock for the CPU currently owning
   9.598 +             * this VCPU before changing its ownership.
   9.599 +             */
   9.600 +            vcpu_schedule_lock_irqsave(vc, flags);
   9.601 +            lcpu = vc->processor;
   9.602 +
   9.603 +            vc->processor = first_cpu(vc->cpu_affinity);
   9.604 +
   9.605 +            spin_unlock_irqrestore(&schedule_data[lcpu].schedule_lock, flags);
   9.606 +        }
   9.607 +
   9.608 +        vcpu_unpause(vc);
   9.609 +    }
   9.610 +
   9.611 +    return 0;
   9.612 +}
   9.613 +
   9.614 +static int
   9.615 +csched_dom_cntl(
   9.616 +    struct domain *d,
   9.617 +    struct sched_adjdom_cmd *cmd)
   9.618 +{
   9.619 +    struct csched_dom * const sdom = CSCHED_DOM(d);
   9.620 +    unsigned long flags;
   9.621 +
   9.622 +    if ( cmd->direction == SCHED_INFO_GET )
   9.623 +    {
   9.624 +        cmd->u.credit.weight = sdom->weight;
   9.625 +        cmd->u.credit.cap = sdom->cap;
   9.626 +    }
   9.627 +    else
   9.628 +    {
   9.629 +        ASSERT( cmd->direction == SCHED_INFO_PUT );
   9.630 +
   9.631 +        spin_lock_irqsave(&csched_priv.lock, flags);
   9.632 +
   9.633 +        if ( cmd->u.credit.weight != 0 )
   9.634 +        {
   9.635 +            csched_priv.weight -= sdom->weight;
   9.636 +            sdom->weight = cmd->u.credit.weight;
   9.637 +            csched_priv.weight += sdom->weight;
   9.638 +        }
   9.639 +
   9.640 +        if ( cmd->u.credit.cap != (uint16_t)~0U )
   9.641 +            sdom->cap = cmd->u.credit.cap;
   9.642 +
   9.643 +        spin_unlock_irqrestore(&csched_priv.lock, flags);
   9.644 +    }
   9.645 +
   9.646 +    return 0;
   9.647 +}
   9.648 +
   9.649 +static void
   9.650 +csched_dom_free(struct domain *dom)
   9.651 +{
   9.652 +    struct csched_dom * const sdom = CSCHED_DOM(dom);
   9.653 +    int i;
   9.654 +
   9.655 +    CSCHED_STAT_CRANK(dom_free);
   9.656 +
   9.657 +    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
   9.658 +    {
   9.659 +        if ( dom->vcpu[i] )
   9.660 +            csched_vcpu_free(dom->vcpu[i]);
   9.661 +    }
   9.662 +
   9.663 +    xfree(sdom);
   9.664 +}
   9.665 +
   9.666 +/*
    9.667 + * This is an O(n) optimized sort of the runq.
    9.668 + *
    9.669 + * Time-share VCPUs can only be one of two priorities, UNDER or OVER. We walk
    9.670 + * through the runq and move up any UNDERs that are preceded by OVERs. We
    9.671 + * remember the last UNDER to make the move-up operation O(1).
   9.672 + */
   9.673 +static void
   9.674 +csched_runq_sort(unsigned int cpu)
   9.675 +{
   9.676 +    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
   9.677 +    struct list_head *runq, *elem, *next, *last_under;
   9.678 +    struct csched_vcpu *svc_elem;
   9.679 +    unsigned long flags;
   9.680 +    int sort_epoch;
   9.681 +
   9.682 +    sort_epoch = csched_priv.runq_sort;
   9.683 +    if ( sort_epoch == spc->runq_sort_last )
   9.684 +        return;
   9.685 +
   9.686 +    spc->runq_sort_last = sort_epoch;
   9.687 +
   9.688 +    spin_lock_irqsave(&schedule_data[cpu].schedule_lock, flags);
   9.689 +
   9.690 +    runq = &spc->runq;
   9.691 +    elem = runq->next;
   9.692 +    last_under = runq;
   9.693 +
   9.694 +    while ( elem != runq )
   9.695 +    {
   9.696 +        next = elem->next;
   9.697 +        svc_elem = __runq_elem(elem);
   9.698 +
   9.699 +        if ( svc_elem->pri == CSCHED_PRI_TS_UNDER )
   9.700 +        {
   9.701 +            /* does elem need to move up the runq? */
   9.702 +            if ( elem->prev != last_under )
   9.703 +            {
   9.704 +                list_del(elem);
   9.705 +                list_add(elem, last_under);
   9.706 +            }
   9.707 +            last_under = elem;
   9.708 +        }
   9.709 +
   9.710 +        elem = next;
   9.711 +    }
   9.712 +
   9.713 +    spin_unlock_irqrestore(&schedule_data[cpu].schedule_lock, flags);
   9.714 +}
   9.715 +
   9.716 +static void
   9.717 +csched_acct(void)
   9.718 +{
   9.719 +    unsigned long flags;
   9.720 +    struct list_head *iter_vcpu, *next_vcpu;
   9.721 +    struct list_head *iter_sdom, *next_sdom;
   9.722 +    struct csched_vcpu *svc;
   9.723 +    struct csched_dom *sdom;
   9.724 +    uint32_t credit_total;
   9.725 +    uint32_t weight_total;
   9.726 +    uint32_t weight_left;
   9.727 +    uint32_t credit_fair;
   9.728 +    uint32_t credit_peak;
   9.729 +    int credit_balance;
   9.730 +    int credit_xtra;
   9.731 +    int credit;
   9.732 +
   9.733 +
   9.734 +    spin_lock_irqsave(&csched_priv.lock, flags);
   9.735 +
   9.736 +    weight_total = csched_priv.weight;
   9.737 +    credit_total = csched_priv.credit;
   9.738 +
   9.739 +    /* Converge balance towards 0 when it drops negative */
   9.740 +    if ( csched_priv.credit_balance < 0 )
   9.741 +    {
   9.742 +        credit_total -= csched_priv.credit_balance;
   9.743 +        CSCHED_STAT_CRANK(acct_balance);
   9.744 +    }
   9.745 +
   9.746 +    if ( unlikely(weight_total == 0) )
   9.747 +    {
   9.748 +        csched_priv.credit_balance = 0;
   9.749 +        spin_unlock_irqrestore(&csched_priv.lock, flags);
   9.750 +        CSCHED_STAT_CRANK(acct_no_work);
   9.751 +        return;
   9.752 +    }
   9.753 +
   9.754 +    CSCHED_STAT_CRANK(acct_run);
   9.755 +
   9.756 +    weight_left = weight_total;
   9.757 +    credit_balance = 0;
   9.758 +    credit_xtra = 0;
   9.759 +
   9.760 +    list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
   9.761 +    {
   9.762 +        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
   9.763 +
   9.764 +        BUG_ON( is_idle_domain(sdom->dom) );
   9.765 +        BUG_ON( sdom->active_vcpu_count == 0 );
   9.766 +        BUG_ON( sdom->weight == 0 );
   9.767 +        BUG_ON( sdom->weight > weight_left );
   9.768 +
   9.769 +        weight_left -= sdom->weight;
   9.770 +
   9.771 +        /*
   9.772 +         * A domain's fair share is computed using its weight in competition
   9.773 +         * with that of all other active domains.
   9.774 +         *
   9.775 +         * At most, a domain can use credits to run all its active VCPUs
   9.776 +         * for one full accounting period. We allow a domain to earn more
   9.777 +         * only when the system-wide credit balance is negative.
   9.778 +         */
   9.779 +        credit_peak = sdom->active_vcpu_count * CSCHED_ACCT_PERIOD;
   9.780 +        if ( csched_priv.credit_balance < 0 )
   9.781 +        {
   9.782 +            credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
   9.783 +                             (weight_total - 1)
   9.784 +                           ) / weight_total;
   9.785 +        }
   9.786 +        if ( sdom->cap != 0U )
   9.787 +        {
   9.788 +            uint32_t credit_cap = ((sdom->cap * CSCHED_ACCT_PERIOD) + 99) / 100;
   9.789 +            if ( credit_cap < credit_peak )
   9.790 +                credit_peak = credit_cap;
   9.791 +        }
   9.792 +
   9.793 +        credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1)
   9.794 +                      ) / weight_total;
   9.795 +
   9.796 +        if ( credit_fair < credit_peak )
   9.797 +        {
   9.798 +            credit_xtra = 1;
   9.799 +        }
   9.800 +        else
   9.801 +        {
   9.802 +            if ( weight_left != 0U )
   9.803 +            {
   9.804 +                /* Give other domains a chance at unused credits */
   9.805 +                credit_total += ( ( ( credit_fair - credit_peak
   9.806 +                                    ) * weight_total
   9.807 +                                  ) + ( weight_left - 1 )
   9.808 +                                ) / weight_left;
   9.809 +            }
   9.810 +
   9.811 +            if ( credit_xtra )
   9.812 +            {
   9.813 +                /*
   9.814 +                 * Lazily keep domains with extra credits at the head of
   9.815 +                 * the queue to give others a chance at them in future
   9.816 +                 * accounting periods.
   9.817 +                 */
   9.818 +                CSCHED_STAT_CRANK(acct_reorder);
   9.819 +                list_del(&sdom->active_sdom_elem);
   9.820 +                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
   9.821 +            }
   9.822 +
   9.823 +            credit_fair = credit_peak;
   9.824 +        }
   9.825 +
   9.826 +        /* Compute fair share per VCPU */
   9.827 +        credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
   9.828 +                      ) / sdom->active_vcpu_count;
   9.829 +
   9.830 +
   9.831 +        list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
   9.832 +        {
   9.833 +            svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
   9.834 +            BUG_ON( sdom != svc->sdom );
   9.835 +
   9.836 +            /* Increment credit */
   9.837 +            atomic_add(credit_fair, &svc->credit);
   9.838 +            credit = atomic_read(&svc->credit);
   9.839 +
   9.840 +            /*
   9.841 +             * Recompute priority or, if VCPU is idling, remove it from
   9.842 +             * the active list.
   9.843 +             */
   9.844 +            if ( credit < 0 )
   9.845 +            {
   9.846 +                if ( sdom->cap == 0U )
   9.847 +                    svc->pri = CSCHED_PRI_TS_OVER;
   9.848 +                else
   9.849 +                    svc->pri = CSCHED_PRI_TS_PARKED;
   9.850 +
   9.851 +                if ( credit < -CSCHED_TSLICE )
   9.852 +                {
   9.853 +                    CSCHED_STAT_CRANK(acct_min_credit);
   9.854 +                    credit = -CSCHED_TSLICE;
   9.855 +                    atomic_set(&svc->credit, credit);
   9.856 +                }
   9.857 +            }
   9.858 +            else
   9.859 +            {
   9.860 +                svc->pri = CSCHED_PRI_TS_UNDER;
   9.861 +
   9.862 +                if ( credit > CSCHED_TSLICE )
   9.863 +                    __csched_vcpu_acct_idle_locked(svc);
   9.864 +            }
   9.865 +
   9.866 +            svc->credit_last = credit;
   9.867 +            svc->credit_incr = credit_fair;
   9.868 +            credit_balance += credit;
   9.869 +        }
   9.870 +    }
   9.871 +
   9.872 +    csched_priv.credit_balance = credit_balance;
   9.873 +
   9.874 +    spin_unlock_irqrestore(&csched_priv.lock, flags);
   9.875 +
   9.876 +    /* Inform each CPU that its runq needs to be sorted */
   9.877 +    csched_priv.runq_sort++;
   9.878 +}
   9.879 +
   9.880 +static void
   9.881 +csched_tick(unsigned int cpu)
   9.882 +{
   9.883 +    struct csched_vcpu * const svc = CSCHED_VCPU(current);
   9.884 +    struct csched_dom * const sdom = svc->sdom;
   9.885 +
   9.886 +    /*
   9.887 +     * Accounting for running VCPU
   9.888 +     *
   9.889 +     * Note: Some VCPUs, such as the idle tasks, are not credit scheduled.
   9.890 +     */
   9.891 +    if ( likely(sdom != NULL) )
   9.892 +    {
   9.893 +        csched_vcpu_acct(svc, CSCHED_TICK);
   9.894 +    }
   9.895 +
   9.896 +    /*
   9.897 +     * Accounting duty
   9.898 +     *
   9.899 +     * Note: Currently, this is always done by the master boot CPU. Eventually,
   9.900 +     * we could distribute or at the very least cycle the duty.
   9.901 +     */
   9.902 +    if ( (csched_priv.master == cpu) &&
   9.903 +         (schedule_data[cpu].tick % CSCHED_ACCT_NTICKS) == 0 )
   9.904 +    {
   9.905 +        csched_acct();
   9.906 +    }
   9.907 +
   9.908 +    /*
   9.909 +     * Check if runq needs to be sorted
   9.910 +     *
   9.911 +     * Every physical CPU resorts the runq after the accounting master has
   9.912 +     * modified priorities. This is a special O(n) sort and runs at most
   9.913 +     * once per accounting period (currently 30 milliseconds).
   9.914 +     */
   9.915 +    csched_runq_sort(cpu);
   9.916 +}
   9.917 +
   9.918 +static struct csched_vcpu *
   9.919 +csched_runq_steal(struct csched_pcpu *spc, int cpu, int pri)
   9.920 +{
   9.921 +    struct list_head *iter;
   9.922 +    struct csched_vcpu *speer;
   9.923 +    struct vcpu *vc;
   9.924 +
   9.925 +    list_for_each( iter, &spc->runq )
   9.926 +    {
   9.927 +        speer = __runq_elem(iter);
   9.928 +
   9.929 +        /*
    9.930 +         * If the next available VCPU here is not of higher priority than ours,
   9.931 +         * this PCPU is useless to us.
   9.932 +         */
   9.933 +        if ( speer->pri <= CSCHED_PRI_IDLE || speer->pri <= pri )
   9.934 +        {
   9.935 +            CSCHED_STAT_CRANK(steal_peer_idle);
   9.936 +            break;
   9.937 +        }
   9.938 +
    9.939 +        /* Is this VCPU runnable on our PCPU? */
   9.940 +        vc = speer->vcpu;
   9.941 +        BUG_ON( is_idle_vcpu(vc) );
   9.942 +
   9.943 +        if ( __csched_vcpu_is_stealable(cpu, vc) )
   9.944 +        {
   9.945 +            /* We got a candidate. Grab it! */
   9.946 +            __runq_remove(speer);
   9.947 +            vc->processor = cpu;
   9.948 +
   9.949 +            return speer;
   9.950 +        }
   9.951 +    }
   9.952 +
   9.953 +    return NULL;
   9.954 +}
   9.955 +
   9.956 +static struct csched_vcpu *
   9.957 +csched_load_balance(int cpu, struct csched_vcpu *snext)
   9.958 +{
   9.959 +    struct csched_pcpu *spc;
   9.960 +    struct csched_vcpu *speer;
   9.961 +    int peer_cpu;
   9.962 +
   9.963 +    if ( snext->pri == CSCHED_PRI_IDLE )
   9.964 +        CSCHED_STAT_CRANK(load_balance_idle);
   9.965 +    else if ( snext->pri == CSCHED_PRI_TS_OVER )
   9.966 +        CSCHED_STAT_CRANK(load_balance_over);
   9.967 +    else
   9.968 +        CSCHED_STAT_CRANK(load_balance_other);
   9.969 +
   9.970 +    peer_cpu = cpu;
   9.971 +    BUG_ON( peer_cpu != snext->vcpu->processor );
   9.972 +
   9.973 +    while ( 1 )
   9.974 +    {
   9.975 +        /* For each PCPU in the system starting with our neighbour... */
   9.976 +        peer_cpu = (peer_cpu + 1) % csched_priv.ncpus;
   9.977 +        if ( peer_cpu == cpu )
   9.978 +            break;
   9.979 +
   9.980 +        BUG_ON( peer_cpu >= csched_priv.ncpus );
   9.981 +        BUG_ON( peer_cpu == cpu );
   9.982 +
   9.983 +        /*
   9.984 +         * Get ahold of the scheduler lock for this peer CPU.
   9.985 +         *
   9.986 +         * Note: We don't spin on this lock but simply try it. Spinning could
   9.987 +         * cause a deadlock if the peer CPU is also load balancing and trying
   9.988 +         * to lock this CPU.
   9.989 +         */
   9.990 +        if ( spin_trylock(&schedule_data[peer_cpu].schedule_lock) )
   9.991 +        {
   9.992 +
   9.993 +            spc = CSCHED_PCPU(peer_cpu);
   9.994 +            if ( unlikely(spc == NULL) )
   9.995 +            {
   9.996 +                CSCHED_STAT_CRANK(steal_peer_down);
   9.997 +                speer = NULL;
   9.998 +            }
   9.999 +            else
  9.1000 +            {
  9.1001 +                speer = csched_runq_steal(spc, cpu, snext->pri);
  9.1002 +            }
  9.1003 +
  9.1004 +            spin_unlock(&schedule_data[peer_cpu].schedule_lock);
  9.1005 +
  9.1006 +            /* Got one! */
  9.1007 +            if ( speer )
  9.1008 +            {
  9.1009 +                CSCHED_STAT_CRANK(vcpu_migrate);
  9.1010 +                return speer;
  9.1011 +            }
  9.1012 +        }
  9.1013 +        else
  9.1014 +        {
  9.1015 +            CSCHED_STAT_CRANK(steal_trylock_failed);
  9.1016 +        }
  9.1017 +    }
  9.1018 +
  9.1019 +
  9.1020 +    /* Failed to find more important work */
  9.1021 +    __runq_remove(snext);
  9.1022 +    return snext;
  9.1023 +}
  9.1024 +
  9.1025 +/*
  9.1026 + * This function is in the critical path. It is designed to be simple and
  9.1027 + * fast for the common case.
  9.1028 + */
  9.1029 +static struct task_slice
  9.1030 +csched_schedule(s_time_t now)
  9.1031 +{
  9.1032 +    const int cpu = smp_processor_id();
  9.1033 +    struct list_head * const runq = RUNQ(cpu);
  9.1034 +    struct csched_vcpu * const scurr = CSCHED_VCPU(current);
  9.1035 +    struct csched_vcpu *snext;
  9.1036 +    struct task_slice ret;
  9.1037 +
  9.1038 +    CSCHED_STAT_CRANK(schedule);
  9.1039 +    CSCHED_VCPU_CHECK(current);
  9.1040 +
  9.1041 +    /*
  9.1042 +     * Select next runnable local VCPU (ie top of local runq)
  9.1043 +     */
  9.1044 +    if ( vcpu_runnable(current) )
  9.1045 +        __runq_insert(cpu, scurr);
  9.1046 +    else
  9.1047 +        BUG_ON( is_idle_vcpu(current) || list_empty(runq) );
  9.1048 +
  9.1049 +    snext = __runq_elem(runq->next);
  9.1050 +
  9.1051 +    /*
  9.1052 +     * SMP Load balance:
  9.1053 +     *
  9.1054 +     * If the next highest priority local runnable VCPU has already eaten
  9.1055 +     * through its credits, look on other PCPUs to see if we have more
   9.1056 +     * urgent work... If not, csched_load_balance() will return snext,
   9.1057 +     * already removed from the runq.
  9.1058 +     */
  9.1059 +    if ( snext->pri > CSCHED_PRI_TS_OVER )
  9.1060 +        __runq_remove(snext);
  9.1061 +    else
  9.1062 +        snext = csched_load_balance(cpu, snext);
  9.1063 +
  9.1064 +    /*
  9.1065 +     * Update idlers mask if necessary. When we're idling, other CPUs
  9.1066 +     * will tickle us when they get extra work.
  9.1067 +     */
  9.1068 +    if ( snext->pri == CSCHED_PRI_IDLE )
  9.1069 +    {
  9.1070 +        if ( !cpu_isset(cpu, csched_priv.idlers) )
  9.1071 +            cpu_set(cpu, csched_priv.idlers);
  9.1072 +    }
  9.1073 +    else if ( cpu_isset(cpu, csched_priv.idlers) )
  9.1074 +    {
  9.1075 +        cpu_clear(cpu, csched_priv.idlers);
  9.1076 +    }
  9.1077 +
  9.1078 +    /*
  9.1079 +     * Return task to run next...
  9.1080 +     */
  9.1081 +    ret.time = MILLISECS(CSCHED_TSLICE);
  9.1082 +    ret.task = snext->vcpu;
  9.1083 +
  9.1084 +    CSCHED_VCPU_CHECK(ret.task);
  9.1085 +    BUG_ON( !vcpu_runnable(ret.task) );
  9.1086 +
  9.1087 +    return ret;
  9.1088 +}
  9.1089 +
  9.1090 +static void
  9.1091 +csched_dump_vcpu(struct csched_vcpu *svc)
  9.1092 +{
  9.1093 +    struct csched_dom * const sdom = svc->sdom;
  9.1094 +
  9.1095 +    printk("[%i.%i] pri=%i cpu=%i",
  9.1096 +            svc->vcpu->domain->domain_id,
  9.1097 +            svc->vcpu->vcpu_id,
  9.1098 +            svc->pri,
  9.1099 +            svc->vcpu->processor);
  9.1100 +
  9.1101 +    if ( sdom )
  9.1102 +    {
  9.1103 +        printk(" credit=%i (%d+%u) {a=%u i=%u w=%u}",
  9.1104 +            atomic_read(&svc->credit),
  9.1105 +            svc->credit_last,
  9.1106 +            svc->credit_incr,
  9.1107 +            svc->state_active,
  9.1108 +            svc->state_idle,
  9.1109 +            sdom->weight);
  9.1110 +    }
  9.1111 +
  9.1112 +    printk("\n");
  9.1113 +}
  9.1114 +
  9.1115 +static void
  9.1116 +csched_dump_pcpu(int cpu)
  9.1117 +{
  9.1118 +    struct list_head *runq, *iter;
  9.1119 +    struct csched_pcpu *spc;
  9.1120 +    struct csched_vcpu *svc;
  9.1121 +    int loop;
  9.1122 +
  9.1123 +    spc = CSCHED_PCPU(cpu);
  9.1124 +    runq = &spc->runq;
  9.1125 +
  9.1126 +    printk(" tick=%lu, sort=%d\n",
  9.1127 +            schedule_data[cpu].tick,
  9.1128 +            spc->runq_sort_last);
  9.1129 +
  9.1130 +    /* current VCPU */
  9.1131 +    svc = CSCHED_VCPU(schedule_data[cpu].curr);
  9.1132 +    if ( svc )
  9.1133 +    {
  9.1134 +        printk("\trun: ");
  9.1135 +        csched_dump_vcpu(svc);
  9.1136 +    }
  9.1137 +
  9.1138 +    loop = 0;
  9.1139 +    list_for_each( iter, runq )
  9.1140 +    {
  9.1141 +        svc = __runq_elem(iter);
  9.1142 +        if ( svc )
  9.1143 +        {
  9.1144 +            printk("\t%3d: ", ++loop);
  9.1145 +            csched_dump_vcpu(svc);
  9.1146 +        }
  9.1147 +    }
  9.1148 +}
  9.1149 +
  9.1150 +static void
  9.1151 +csched_dump(void)
  9.1152 +{
  9.1153 +    struct list_head *iter_sdom, *iter_svc;
  9.1154 +    int loop;
  9.1155 +
  9.1156 +    printk("info:\n"
  9.1157 +           "\tncpus              = %u\n"
  9.1158 +           "\tmaster             = %u\n"
  9.1159 +           "\tcredit             = %u\n"
  9.1160 +           "\tcredit balance     = %d\n"
  9.1161 +           "\tweight             = %u\n"
  9.1162 +           "\trunq_sort          = %u\n"
  9.1163 +           "\ttick               = %dms\n"
  9.1164 +           "\ttslice             = %dms\n"
  9.1165 +           "\taccounting period  = %dms\n"
  9.1166 +           "\tdefault-weight     = %d\n",
  9.1167 +           csched_priv.ncpus,
  9.1168 +           csched_priv.master,
  9.1169 +           csched_priv.credit,
  9.1170 +           csched_priv.credit_balance,
  9.1171 +           csched_priv.weight,
  9.1172 +           csched_priv.runq_sort,
  9.1173 +           CSCHED_TICK,
  9.1174 +           CSCHED_TSLICE,
  9.1175 +           CSCHED_ACCT_PERIOD,
  9.1176 +           CSCHED_DEFAULT_WEIGHT);
  9.1177 +
  9.1178 +    printk("idlers: 0x%lx\n", csched_priv.idlers.bits[0]);
  9.1179 +
  9.1180 +    CSCHED_STATS_PRINTK();
  9.1181 +
  9.1182 +    printk("active vcpus:\n");
  9.1183 +    loop = 0;
  9.1184 +    list_for_each( iter_sdom, &csched_priv.active_sdom )
  9.1185 +    {
  9.1186 +        struct csched_dom *sdom;
  9.1187 +        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
  9.1188 +
  9.1189 +        list_for_each( iter_svc, &sdom->active_vcpu )
  9.1190 +        {
  9.1191 +            struct csched_vcpu *svc;
  9.1192 +            svc = list_entry(iter_svc, struct csched_vcpu, active_vcpu_elem);
  9.1193 +
  9.1194 +            printk("\t%3d: ", ++loop);
  9.1195 +            csched_dump_vcpu(svc);
  9.1196 +        }
  9.1197 +    }
  9.1198 +}
  9.1199 +
  9.1200 +static void
  9.1201 +csched_init(void)
  9.1202 +{
  9.1203 +    spin_lock_init(&csched_priv.lock);
  9.1204 +    INIT_LIST_HEAD(&csched_priv.active_sdom);
  9.1205 +    csched_priv.ncpus = 0;
  9.1206 +    csched_priv.master = UINT_MAX;
  9.1207 +    cpus_clear(csched_priv.idlers);
  9.1208 +    csched_priv.weight = 0U;
  9.1209 +    csched_priv.credit = 0U;
  9.1210 +    csched_priv.credit_balance = 0;
  9.1211 +    csched_priv.runq_sort = 0U;
  9.1212 +    CSCHED_STATS_RESET();
  9.1213 +}
  9.1214 +
  9.1215 +
  9.1216 +struct scheduler sched_credit_def = {
  9.1217 +    .name           = "SMP Credit Scheduler",
  9.1218 +    .opt_name       = "credit",
  9.1219 +    .sched_id       = SCHED_CREDIT,
  9.1220 +
  9.1221 +    .alloc_task     = csched_vcpu_alloc,
  9.1222 +    .add_task       = csched_vcpu_add,
  9.1223 +    .sleep          = csched_vcpu_sleep,
  9.1224 +    .wake           = csched_vcpu_wake,
  9.1225 +    .set_affinity   = csched_vcpu_set_affinity,
  9.1226 +
  9.1227 +    .adjdom         = csched_dom_cntl,
  9.1228 +    .free_task      = csched_dom_free,
  9.1229 +
  9.1230 +    .tick           = csched_tick,
  9.1231 +    .do_schedule    = csched_schedule,
  9.1232 +
  9.1233 +    .dump_cpu_state = csched_dump_pcpu,
  9.1234 +    .dump_settings  = csched_dump,
  9.1235 +    .init           = csched_init,
  9.1236 +};
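
The arithmetic in csched_acct() is all integer ceiling division. A
self-contained sketch with illustrative numbers (two PCPUs; domain A at
weight 512 with one active VCPU, domain B at weight 256), not part of the
changeset, shows the clamp-and-redistribute behaviour described in the
comments above:

    #include <stdint.h>
    #include <stdio.h>

    /* Ceiling division, written inline in csched_acct() as
     * ((a) + (b) - 1) / (b). */
    static uint32_t div_ceil(uint32_t a, uint32_t b)
    {
        return (a + b - 1) / b;
    }

    int main(void)
    {
        /* Each of 2 PCPUs contributes CSCHED_ACCT_PERIOD (30ms) of credit. */
        uint32_t credit_total = 2 * 30;
        uint32_t weight_total = 512 + 256;   /* domain A + domain B */

        /* Domain A: weight 512, one active VCPU. */
        uint32_t fair_a = div_ceil(credit_total * 512, weight_total); /* 40 */
        uint32_t peak_a = 1 * 30;  /* one VCPU can use at most one period */

        /* fair_a > peak_a, so A is clamped to its peak of 30 and the 10
         * surplus credits are scaled back into the pool for the remaining
         * weight (256): credit_total grows by ceil(10 * 768 / 256) = 30. */
        uint32_t pool_b = credit_total
            + div_ceil((fair_a - peak_a) * weight_total, 256);        /* 90 */
        uint32_t fair_b = div_ceil(pool_b * 256, weight_total);       /* 30 */

        printf("A gets %u, B gets %u\n", peak_a, fair_b);
        return 0;
    }
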
    10.1 --- a/xen/common/schedule.c	Fri May 26 09:44:29 2006 +0100
    10.2 +++ b/xen/common/schedule.c	Fri May 26 11:14:36 2006 +0100
    10.3 @@ -50,9 +50,11 @@ struct schedule_data schedule_data[NR_CP
    10.4  
    10.5  extern struct scheduler sched_bvt_def;
    10.6  extern struct scheduler sched_sedf_def;
    10.7 +extern struct scheduler sched_credit_def;
    10.8  static struct scheduler *schedulers[] = { 
    10.9      &sched_bvt_def,
   10.10      &sched_sedf_def,
   10.11 +    &sched_credit_def,
   10.12      NULL
   10.13  };
   10.14  
   10.15 @@ -639,6 +641,8 @@ static void t_timer_fn(void *unused)
   10.16  
   10.17      page_scrub_schedule_work();
   10.18  
   10.19 +    SCHED_OP(tick, cpu);
   10.20 +
   10.21      set_timer(&t_timer[cpu], NOW() + MILLISECS(10));
   10.22  }
   10.23  
   10.24 @@ -681,6 +685,7 @@ void __init scheduler_init(void)
   10.25          printk("Could not find scheduler: %s\n", opt_sched);
   10.26  
   10.27      printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
   10.28 +    SCHED_OP(init);
   10.29  
   10.30      if ( idle_vcpu[0] != NULL )
   10.31      {
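
Both new call sites go through the SCHED_OP() dispatch macro already defined
in schedule.c, which no-ops when the active scheduler leaves a hook NULL;
BVT and SEDF define neither .tick nor .init and are therefore unaffected.
The macro is roughly of this shape in this era's tree (a sketch, not the
verbatim definition):

    #define SCHED_OP(fn, ...)                                 \
        (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )           \
          : (typeof(ops.fn(__VA_ARGS__)))0 )
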
    11.1 --- a/xen/include/public/sched_ctl.h	Fri May 26 09:44:29 2006 +0100
    11.2 +++ b/xen/include/public/sched_ctl.h	Fri May 26 11:14:36 2006 +0100
    11.3 @@ -10,6 +10,7 @@
    11.4  /* Scheduler types. */
    11.5  #define SCHED_BVT      0
    11.6  #define SCHED_SEDF     4
    11.7 +#define SCHED_CREDIT   5
    11.8  
    11.9  /* Set or get info? */
   11.10  #define SCHED_INFO_PUT 0
   11.11 @@ -48,6 +49,10 @@ struct sched_adjdom_cmd {
   11.12              uint32_t extratime;
   11.13              uint32_t weight;
   11.14          } sedf;
   11.15 +        struct csched_domain {
   11.16 +            uint16_t weight;
   11.17 +            uint16_t cap;
   11.18 +        } credit;
   11.19      } u;
   11.20  };
   11.21  
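
A request using the new union member would be filled in as below (domid and
the values are hypothetical; the sentinel semantics come from
csched_dom_cntl() in sched_credit.c above, where weight 0 and cap
(uint16_t)~0U mean "leave unchanged", and cap reads as a percentage of one
physical CPU):

    struct sched_adjdom_cmd cmd;

    cmd.domain = domid;            /* hypothetical caller-supplied id   */
    cmd.sched_id = SCHED_CREDIT;
    cmd.direction = SCHED_INFO_PUT;
    cmd.u.credit.weight = 0;       /* sentinel: keep the current weight */
    cmd.u.credit.cap = 50;         /* at most 50% of one physical CPU   */
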
    12.1 --- a/xen/include/xen/sched-if.h	Fri May 26 09:44:29 2006 +0100
    12.2 +++ b/xen/include/xen/sched-if.h	Fri May 26 11:14:36 2006 +0100
    12.3 @@ -58,6 +58,8 @@ struct scheduler {
    12.4      char *opt_name;         /* option name for this scheduler    */
    12.5      unsigned int sched_id;  /* ID for this scheduler             */
    12.6  
    12.7 +    void         (*init)           (void);
    12.8 +    void         (*tick)           (unsigned int cpu);
    12.9      int          (*alloc_task)     (struct vcpu *);
   12.10      void         (*add_task)       (struct vcpu *);
   12.11      void         (*free_task)      (struct domain *);
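
The two hooks are optional: a scheduler opts in by filling the new fields of
its struct scheduler, exactly as sched_credit_def does above. A minimal
hypothetical registration (my_init/my_tick are illustrative names):

    static void my_init(void)
    {
        /* One-time setup; reached from scheduler_init() via SCHED_OP(init). */
    }

    static void my_tick(unsigned int cpu)
    {
        /* Periodic work; reached from t_timer_fn() every 10ms. */
    }

    struct scheduler sched_example_def = {
        .name     = "Example Scheduler",
        .opt_name = "example",
        .init     = my_init,
        .tick     = my_tick,
        /* ... remaining hooks as in sched_credit_def ... */
    };
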
    13.1 --- a/xen/include/xen/softirq.h	Fri May 26 09:44:29 2006 +0100
    13.2 +++ b/xen/include/xen/softirq.h	Fri May 26 11:14:36 2006 +0100
    13.3 @@ -26,6 +26,19 @@ typedef void (*softirq_handler)(void);
    13.4  asmlinkage void do_softirq(void);
    13.5  extern void open_softirq(int nr, softirq_handler handler);
    13.6  
    13.7 +static inline void cpumask_raise_softirq(cpumask_t mask, unsigned int nr)
    13.8 +{
    13.9 +    int cpu;
   13.10 +
   13.11 +    for_each_cpu_mask(cpu, mask)
   13.12 +    {
   13.13 +        if ( test_and_set_bit(nr, &softirq_pending(cpu)) )
   13.14 +            cpu_clear(cpu, mask);
   13.15 +    }
   13.16 +
   13.17 +    smp_send_event_check_mask(mask);
   13.18 +}
   13.19 +
   13.20  static inline void cpu_raise_softirq(unsigned int cpu, unsigned int nr)
   13.21  {
   13.22      if ( !test_and_set_bit(nr, &softirq_pending(cpu)) )
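
In the new cpumask_raise_softirq() above, test_and_set_bit() returns the
previous value of the bit, so any CPU that already had the softirq pending is
dropped from the mask before smp_send_event_check_mask() fires: only CPUs
that actually need waking receive an IPI. The credit scheduler's
__runq_tickle() is the intended caller:

    /* As used in sched_credit.c's __runq_tickle(): */
    cpumask_raise_softirq(mask, SCHEDULE_SOFTIRQ);
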