ia64/xen-unstable

changeset 1194:814cfcf225c4

bitkeeper revision 1.808 (4058996anVCLQRr3o_Adf9GqJybYSg)

Various updates related to the new generic scheduler API.

The BVT scheduler has been ported to this API and a simple Round Robin
scheduler has been added. There's a new generic control interface for
setting scheduling parameters from userspace.

Use the sched=xxx option at boot time to choose the scheduler. Default
is BVT. The possibilities are "bvt" and "rrobin".
author mwilli2@equilibrium.research.intel-research.net
date Wed Mar 17 18:31:06 2004 +0000 (2004-03-17)
parents afe38e38642a
children aaaaffbe35aa
files .rootkeys docs/interface.tex tools/xc/lib/xc.h tools/xc/lib/xc_atropos.c tools/xc/lib/xc_bvtsched.c tools/xc/lib/xc_private.h tools/xc/lib/xc_rrobin.c tools/xc/py/Xc.c xen/common/dom0_ops.c xen/common/domain.c xen/common/kernel.c xen/common/keyhandler.c xen/common/sched_bvt.c xen/common/sched_rrobin.c xen/common/schedule.c xen/include/hypervisor-ifs/dom0_ops.h xen/include/hypervisor-ifs/sched-ctl.h xen/include/xeno/sched-if.h xen/include/xeno/sched.h
line diff
     1.1 --- a/.rootkeys	Wed Mar 17 17:13:18 2004 +0000
     1.2 +++ b/.rootkeys	Wed Mar 17 18:31:06 2004 +0000
     1.3 @@ -71,6 +71,7 @@ 3fbca441SjQr8vJwTQIgH1laysaWog tools/xc/
     1.4  3fbba6dbDfYvJSsw9500b4SZyUhxjQ tools/xc/lib/Makefile
     1.5  3fbba6dc1uU7U3IFeF6A-XEOYF2MkQ tools/xc/lib/rpm.spec
     1.6  3fbba6dcrNxtygEcgJYAJJ1gCQqfsA tools/xc/lib/xc.h
     1.7 +40589968oCfoUlXd460CjVAkBE8IBA tools/xc/lib/xc_atropos.c
     1.8  3fbba6dbEVkVMX0JuDFzap9jeaucGA tools/xc/lib/xc_bvtsched.c
     1.9  3fbba6dbasJQV-MVElDC0DGSHMiL5w tools/xc/lib/xc_domain.c
    1.10  40278d99BLsfUv3qxv0I8C1sClZ0ow tools/xc/lib/xc_elf.h
    1.11 @@ -83,6 +84,7 @@ 40278d9ctaHVDaEuwhXI3Om2JOjx9w tools/xc/
    1.12  4051bce6CHAsYh8P5t2OHDtRWOP9og tools/xc/lib/xc_physdev.c
    1.13  3fbba6dctWRWlFJkYb6hdix2X4WMuw tools/xc/lib/xc_private.c
    1.14  3fbba6dcbVrG2hPzEzwdeV_UC8kydQ tools/xc/lib/xc_private.h
    1.15 +40589968UQFnJeOMn8UIFLbXBuwXjw tools/xc/lib/xc_rrobin.c
    1.16  3fbba6dcoGq9hQlksrBUfC2P5F6sGg tools/xc/lib/xc_vbd.c
    1.17  3fbba6dc38q-ioRlwSR_quw4G3qUeQ tools/xc/lib/xc_vif.c
    1.18  3fbd0a3dTwnDcfdw0-v46dPbX98zDw tools/xc/py/Makefile
    1.19 @@ -169,6 +171,8 @@ 3e54c38dkHAev597bPr71-hGzTdocg xen/commo
    1.20  4051bcecFeq4DE70p4zGO5setf47CA xen/common/physdev.c
    1.21  4006e659i9j-doVxY7DKOGU4XVin1Q xen/common/rbtree.c
    1.22  3ddb79bdHqdQpATqC0rmUZNbsb6L6A xen/common/resource.c
    1.23 +40589968dD2D1aejwSOvrROg7fOvGQ xen/common/sched_bvt.c
    1.24 +40589968be_t_n0-w6ggceW7h-sx0w xen/common/sched_rrobin.c
    1.25  3e397e6619PgAfBbw2XFbXkewvUWgw xen/common/schedule.c
    1.26  3ddb79bdB9RNMnkQnUyZ5C9hhMSQQw xen/common/slab.c
    1.27  3ddb79bd0gVQYmL2zvuJnldvD0AGxQ xen/common/softirq.c
    1.28 @@ -497,6 +501,7 @@ 3ddb79c25UE59iu4JJcbRalx95mvcg xen/inclu
    1.29  3ead095dE_VF-QA88rl_5cWYRWtRVQ xen/include/hypervisor-ifs/kbd.h
    1.30  3ddb79c2oRPrzClk3zbTkRHlpumzKA xen/include/hypervisor-ifs/network.h
    1.31  4051db79512nOCGweabrFWO2M2h5ng xen/include/hypervisor-ifs/physdev.h
    1.32 +40589968wmhPmV5-ENbBYmMjnedgKw xen/include/hypervisor-ifs/sched-ctl.h
    1.33  404f3d2eR2Owk-ZcGOx9ULGHg3nrww xen/include/hypervisor-ifs/trace.h
    1.34  3f0d22cbroqp_BkoDPwkfRJhaw1LiQ xen/include/hypervisor-ifs/vbd.h
    1.35  3ddb79c4qbCoOFHrv9sCGshbWzBVlQ xen/include/scsi/scsi.h
    1.36 @@ -562,6 +567,7 @@ 3e54c38de9SUSYSAwxDf_DwkpAnQFA xen/inclu
    1.37  3ddb79c04nQVR3EYM5L4zxDV_MCo1g xen/include/xeno/prefetch.h
    1.38  4006e65fWMwLqcocgik6wbF0Eeh0Og xen/include/xeno/rbtree.h
    1.39  3e4540ccU1sgCx8seIMGlahmMfv7yQ xen/include/xeno/reboot.h
    1.40 +40589969nPq3DMzv24RDb5LXE9brHw xen/include/xeno/sched-if.h
    1.41  3ddb79c0LzqqS0LhAQ50ekgj4oGl7Q xen/include/xeno/sched.h
    1.42  403a06a7H0hpHcKpAiDe5BPnaXWTlA xen/include/xeno/serial.h
    1.43  3ddb79c0VDeD-Oft5eNfMneTU3D1dQ xen/include/xeno/skbuff.h
     2.1 --- a/docs/interface.tex	Wed Mar 17 17:13:18 2004 +0000
     2.2 +++ b/docs/interface.tex	Wed Mar 17 18:31:06 2004 +0000
     2.3 @@ -353,7 +353,7 @@ create ``virtual disks'' on demand.
     2.4  \subsection{Virtual Disk Management}
     2.5  The VD management code consists of a set of python libraries. It can therefore
     2.6  be accessed by custom scripts as well as the convenience scripts provided. The
     2.7 -VD database is a SQLite database in /var/db/xen\_vdisk.sqlite.
     2.8 +VD database is a SQLite database in /var/db/xen\_vdisks.sqlite.
     2.9  
    2.10  The VD scripts and general VD usage are documented in the VBD-HOWTO.txt.
    2.11  
    2.12 @@ -379,6 +379,307 @@ giving the page back to the hypervisor, 
    2.13  and providing control interfaces for managing scheduling, networking, and
    2.14  blocks.
    2.15  
    2.16 +\chapter{CPU Scheduler}
    2.17 +
    2.18 +Xen offers a uniform API for CPU schedulers.  It is possible to choose
    2.19 +from a number of schedulers at boot and it should be easy to add more.
    2.20 +
    2.21 +\paragraph*{Note: SMP host support}
    2.22 +Xen has always supported SMP host systems.  Domains are statically assigned to
    2.23 +CPUs, either at creation time or when manually pinning to a particular CPU.
    2.24 +The current schedulers then run locally on each CPU to decide which of the
    2.25 +assigned domains should be run there.
    2.26 +
    2.27 +\section{Standard Schedulers}
    2.28 +
     2.29 +The BVT and Round Robin schedulers are part of the normal Xen
    2.30 +distribution.  A port of the Atropos scheduler from the Nemesis
    2.31 +operating system is almost complete and will be added shortly.
    2.32 +
    2.33 +\subsection{Borrowed Virtual Time (BVT)}
    2.34 +
    2.35 +This was the original Xen scheduler.  BVT is designed for general-purpose
    2.36 +environments but also provides support for latency-sensitive threads.  It
    2.37 +provides long-term weighted sharing but allows tasks a limited ability to
    2.38 +``warp back'' in virtual time so that they are dispatched earlier.
    2.39 +
    2.40 +BVT can be activated by specifying {\tt sched=bvt} as a boot argument to Xen.
    2.41 +
    2.42 +\subsection{Round Robin}
    2.43 +
    2.44 +The round robin scheduler is a very simple example of some of the basic parts
    2.45 +of the scheduler API.
    2.46 +
    2.47 +Round robin can be activated by specifying {\tt sched=rrobin} as a boot
    2.48 +argument to Xen.
    2.49 +
    2.50 +\section{Scheduling API}
    2.51 +
    2.52 +The scheduling API is used by both the schedulers described above and should
    2.53 +also be used by any new schedulers.  It provides a generic interface and also
    2.54 +implements much of the ``boilerplate'' code.
    2.55 +
    2.56 +\paragraph*{Note:} the scheduler API is currently undergoing active development,
    2.57 +so there may be some changes to this API, although they are expected to be small.
    2.58 +
    2.59 +Schedulers conforming to this API are described by the following
    2.60 +structure:
    2.61 +
    2.62 +\begin{verbatim}
    2.63 +struct scheduler
    2.64 +{
    2.65 +    char *name;             /* full name for this scheduler      */
    2.66 +    char *opt_name;         /* option name for this scheduler    */
    2.67 +    unsigned int sched_id;  /* ID for this scheduler             */
    2.68 +
    2.69 +    int          (*init_scheduler) ();
    2.70 +    int          (*alloc_task)     (struct task_struct *);
    2.71 +    void         (*add_task)       (struct task_struct *);
    2.72 +    void         (*free_task)      (struct task_struct *);
    2.73 +    void         (*rem_task)       (struct task_struct *);
    2.74 +    void         (*wake_up)        (struct task_struct *);
    2.75 +    long         (*do_block)       (struct task_struct *);
    2.76 +    task_slice_t (*do_schedule)    (s_time_t);
    2.77 +    int          (*control)        (struct sched_ctl_cmd *);
    2.78 +    int          (*adjdom)         (struct task_struct *,
    2.79 +                                    struct sched_adjdom_cmd *);
    2.80 +    s32          (*reschedule)     (struct task_struct *);
    2.81 +    void         (*dump_settings)  (void);
    2.82 +    void         (*dump_cpu_state) (int);
    2.83 +    void         (*dump_runq_el)   (struct task_struct *);
    2.84 +};
    2.85 +\end{verbatim}
    2.86 +
    2.87 +The only method that {\em must} be implemented is
    2.88 +{\tt do\_schedule()}.  However, if there is not some implementation for the
    2.89 +{\tt wake\_up()} method then waking tasks will not get put on the runqueue!
    2.90 +
    2.91 +The fields of the above structure are described in more detail below.
    2.92 +
    2.93 +\subsubsection{name}
    2.94 +
    2.95 +The name field is an arbitrary descriptive ASCII string.
    2.96 +
    2.97 +\subsubsection{opt\_name}
    2.98 +
    2.99 +This field is the value of the {\tt sched=} boot-time option that will select
   2.100 +this scheduler.
   2.101 +
   2.102 +\subsubsection{sched\_id}
   2.103 +
   2.104 +This is an integer that uniquely identifies this scheduler.  There should be a
    2.105 +macro corresponding to this scheduler ID in {\tt <hypervisor-ifs/sched-if.h>}.
   2.106 +
   2.107 +\subsubsection{init\_scheduler}
   2.108 +
   2.109 +\paragraph*{Purpose}
   2.110 +
   2.111 +This is a function for performing any scheduler-specific initialisation.  For
   2.112 +instance, it might allocate memory for per-CPU scheduler data and initialise it
   2.113 +appropriately.
   2.114 +
   2.115 +\paragraph*{Call environment}
   2.116 +
   2.117 +This function is called after the initialisation performed by the generic
   2.118 +layer.  The function is called exactly once, for the scheduler that has been
   2.119 +selected.
   2.120 +
   2.121 +\paragraph*{Return values}
   2.122 +
   2.123 +This should return negative on failure --- failure to initialise the scheduler
   2.124 +will cause an immediate panic.
   2.125 +
   2.126 +\subsubsection{alloc\_task}
   2.127 +
   2.128 +\paragraph*{Purpose}
   2.129 +This is called when a {\tt task\_struct} is allocated by the generic scheduler
   2.130 +layer.  A particular scheduler implementation may use this method to allocate
   2.131 +per-task data for this task.  It may use the {\tt sched\_priv} pointer in the
   2.132 +{\tt task\_struct} to point to this data.
   2.133 +
   2.134 +\paragraph*{Call environment}
   2.135 +The generic layer guarantees that the {\tt sched\_priv} field will
   2.136 +remain intact from the time this method is called until the task is
   2.137 +deallocated (so long as the scheduler implementation does not change
   2.138 +it!).
   2.139 +
   2.140 +\paragraph*{Return values}
   2.141 +Negative on failure.
   2.142 +
   2.143 +\subsubsection{add\_task}
   2.144 +
   2.145 +\paragraph*{Purpose}
   2.146 +
   2.147 +Called when a task is initially added by the generic layer.
   2.148 +
   2.149 +\paragraph*{Call environment}
   2.150 +
   2.151 +The fields in the {\tt task\_struct} are now filled out and available for use.
   2.152 +Schedulers should implement appropriate initialisation of any per-task private
   2.153 +information in this method.
   2.154 +
   2.155 +\subsubsection{free\_task}
   2.156 +
   2.157 +\paragraph*{Purpose}
   2.158 +
   2.159 +Schedulers should free the space used by any associated private data
   2.160 +structures.
   2.161 +
   2.162 +\paragraph*{Call environment}
   2.163 +
   2.164 +This is called when a {\tt task\_struct} is about to be deallocated.
   2.165 +The generic layer will have done generic task removal operations and
   2.166 +(if implemented) called the scheduler's {\tt rem\_task} method before
   2.167 +this method is called.
   2.168 +
   2.169 +\subsubsection{rem\_task}
   2.170 +
   2.171 +\paragraph*{Purpose}
   2.172 +
   2.173 +This is called when a task is being removed from scheduling.
   2.174 +
   2.175 +\subsubsection{wake\_up}
   2.176 +
   2.177 +\paragraph*{Purpose}
   2.178 +
   2.179 +Called when a task is woken up, this method should put the task on the runqueue
   2.180 +(or do the scheduler-specific equivalent action).
   2.181 +
   2.182 +\paragraph*{Call environment}
   2.183 +
   2.184 +The generic layer guarantees that the task is already in state
   2.185 +RUNNING.
   2.186 +
   2.187 +\subsubsection{do\_block}
   2.188 +
   2.189 +\paragraph*{Purpose}
   2.190 +
   2.191 +This function is called when a task is blocked.  This function should
   2.192 +not remove the task from the runqueue.
   2.193 +
   2.194 +\paragraph*{Call environment}
   2.195 +
   2.196 +The EVENTS\_MASTER\_ENABLE\_BIT is already set and the task state changed to
   2.197 +TASK\_INTERRUPTIBLE on entry to this method.
   2.198 +
   2.199 +\subsubsection{do\_schedule}
   2.200 +
   2.201 +This method must be implemented.
   2.202 +
   2.203 +\paragraph*{Purpose}
   2.204 +
   2.205 +The method is called each time a new task must be chosen for scheduling on the
    2.206 +current CPU.  The current time is passed as the single argument (the current
   2.207 +task can be found using the {\tt current} variable).
   2.208 +
    2.209 +This method should select the next task to run on this CPU and set its minimum
   2.210 +time to run as well as returning the data described below.
   2.211 +
   2.212 +This method should also take the appropriate action if the previous
   2.213 +task has blocked, e.g. removing it from the runqueue.
   2.214 +
   2.215 +\paragraph*{Call environment}
   2.216 +
   2.217 +The other fields in the {\tt task\_struct} are updated by the generic layer,
   2.218 +which also performs all Xen-specific tasks and performs the actual task switch
   2.219 +(unless the previous task has been chosen again).
   2.220 +
   2.221 +This method is called with the {\tt schedule\_lock} held for the current CPU
   2.222 +and with interrupts disabled.
   2.223 +
   2.224 +\paragraph*{Return values}
   2.225 +
   2.226 +Must return a {\tt struct task\_slice} describing what task to run and how long
   2.227 +for (at maximum).
   2.228 +
   2.229 +\subsubsection{control}
   2.230 +
   2.231 +\paragraph*{Purpose}
   2.232 +
   2.233 +This method is called for global scheduler control operations.  It takes a
   2.234 +pointer to a {\tt struct sched\_ctl\_cmd}, from which it should select the
   2.235 +appropriate command data.
   2.236 +
   2.237 +\paragraph*{Call environment}
   2.238 +
   2.239 +The generic layer guarantees that when this method is called, the caller was
   2.240 +using the same control interface version and that the caller selected the
   2.241 +correct scheduler ID, hence the scheduler's implementation does not need to
   2.242 +sanity-check these parts of the call.
   2.243 +
   2.244 +\paragraph*{Return values}
   2.245 +
   2.246 +This function should return the value to be passed back to user space, hence it
   2.247 +should either be 0 or an appropriate errno value.
   2.248 +
    2.249 +\subsubsection{adjdom}
   2.250 +
   2.251 +\paragraph*{Purpose}
   2.252 +
   2.253 +This method is called to adjust the scheduling parameters of a particular
   2.254 +domain.
   2.255 +
   2.256 +\paragraph*{Call environment}
   2.257 +
   2.258 +The generic layer guarantees that the caller has specified the correct
   2.259 +control interface version and scheduler ID and that the supplied {\tt
   2.260 +task\_struct} will not be deallocated during the call (hence it is not
   2.261 +necessary to {\tt get\_task\_struct}).
   2.262 +
   2.263 +\paragraph*{Return values}
   2.264 +
   2.265 +This function should return the value to be passed back to user space, hence it
   2.266 +should either be 0 or an appropriate errno value.
   2.267 +
   2.268 +\subsubsection{reschedule}
   2.269 +
   2.270 +\paragraph*{Purpose}
   2.271 +
   2.272 +This method is called to determine if a reschedule is required as a result of a
   2.273 +particular task.
   2.274 +
   2.275 +\paragraph*{Call environment}
   2.276 +The generic layer will cause a reschedule if the current domain is the idle
   2.277 +task or it has exceeded its minimum time slice before a reschedule.  The
   2.278 +generic layer guarantees that the task passed is not currently running but is
   2.279 +on the runqueue.
   2.280 +
   2.281 +\paragraph*{Return values}
   2.282 +
   2.283 +Should return a mask of CPUs to cause a reschedule on.
   2.284 +
   2.285 +\subsubsection{dump\_settings}
   2.286 +
   2.287 +\paragraph*{Purpose}
   2.288 +
   2.289 +If implemented, this should dump any private global settings for this
   2.290 +scheduler to the console.
   2.291 +
   2.292 +\paragraph*{Call environment}
   2.293 +
   2.294 +This function is called with interrupts enabled.
   2.295 +
   2.296 +\subsubsection{dump\_cpu\_state}
   2.297 +
   2.298 +\paragraph*{Purpose}
   2.299 +
   2.300 +This method should dump any private settings for the specified CPU.
   2.301 +
   2.302 +\paragraph*{Call environment}
   2.303 +
   2.304 +This function is called with interrupts disabled and the {\tt schedule\_lock}
   2.305 +for the specified CPU held.
   2.306 +
   2.307 +\subsubsection{dump\_runq\_el}
   2.308 +
   2.309 +\paragraph*{Purpose}
   2.310 +
   2.311 +This method should dump any private settings for the specified task.
   2.312 +
   2.313 +\paragraph*{Call environment}
   2.314 +
   2.315 +This function is called with interrupts disabled and the {\tt schedule\_lock}
   2.316 +for the task's CPU held.
   2.317  
   2.318  \chapter{Debugging}
   2.319  
     3.1 --- a/tools/xc/lib/xc.h	Wed Mar 17 17:13:18 2004 +0000
     3.2 +++ b/tools/xc/lib/xc.h	Wed Mar 17 18:31:06 2004 +0000
     3.3 @@ -81,6 +81,13 @@ int xc_bvtsched_domain_set(int xc_handle
     3.4                             unsigned long warpl,
     3.5                             unsigned long warpu);
     3.6  
     3.7 +int xc_atropos_domain_set(int xc_handle,
     3.8 +			  u64 domid,
     3.9 +			  int xtratime);
    3.10 +
    3.11 +int xc_rrobin_global_set(int xc_handle,
    3.12 +			 u64 slice);
    3.13 +
    3.14  typedef struct {
    3.15      unsigned long credit_bytes;
    3.16      unsigned long credit_usec;
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/tools/xc/lib/xc_atropos.c	Wed Mar 17 18:31:06 2004 +0000
     4.3 @@ -0,0 +1,38 @@
     4.4 +/******************************************************************************
     4.5 + * xc_atropos.c
     4.6 + * 
     4.7 + * API for manipulating parameters of the Atropos scheduler.
     4.8 + * 
     4.9 + * by Mark Williamson, Copyright (c) 2004 Intel Research Cambridge.
    4.10 + */
    4.11 +
    4.12 +#include "xc_private.h"
    4.13 +
    4.14 +int xc_atropos_global_set(int xc_handle,
    4.15 +			  unsigned long ctx_allow)
    4.16 +{
    4.17 +    dom0_op_t op;
    4.18 +    op.cmd = DOM0_SCHEDCTL;
    4.19 +    op.u.schedctl.if_ver = SCHED_CTL_IF_VER;
    4.20 +    op.u.schedctl.sched_id = SCHED_BVT;
    4.21 +
    4.22 +    op.u.schedctl.u.bvt.ctx_allow = ctx_allow;
    4.23 +    return do_dom0_op(xc_handle, &op);
    4.24 +}
    4.25 +
    4.26 +int xc_atropos_domain_set(int xc_handle,
    4.27 +			  u64 domid, int xtratime)
    4.28 +{
    4.29 +    dom0_op_t op;
    4.30 +    op.cmd = DOM0_ADJUSTDOM;
    4.31 +
    4.32 +    op.u.adjustdom.domain  = (domid_t)domid;
    4.33 +    op.u.adjustdom.if_ver = SCHED_CTL_IF_VER;
    4.34 +    op.u.adjustdom.sched_id = SCHED_ATROPOS;
    4.35 +
    4.36 +    op.u.adjustdom.u.atropos.xtratime = xtratime;
    4.37 +
    4.38 +    printf("Doing dom0 op!\n");
    4.39 +
    4.40 +    return do_dom0_op(xc_handle, &op);
    4.41 +}
     5.1 --- a/tools/xc/lib/xc_bvtsched.c	Wed Mar 17 17:13:18 2004 +0000
     5.2 +++ b/tools/xc/lib/xc_bvtsched.c	Wed Mar 17 18:31:06 2004 +0000
     5.3 @@ -12,8 +12,10 @@ int xc_bvtsched_global_set(int xc_handle
     5.4                             unsigned long ctx_allow)
     5.5  {
     5.6      dom0_op_t op;
     5.7 -    op.cmd = DOM0_BVTCTL;
     5.8 -    op.u.bvtctl.ctx_allow = ctx_allow;
     5.9 +    op.cmd = DOM0_SCHEDCTL;
    5.10 +    op.u.schedctl.sched_id = SCHED_BVT;
    5.11 +    op.u.schedctl.if_ver = SCHED_CTL_IF_VER;
    5.12 +    op.u.schedctl.u.bvt.ctx_allow = ctx_allow;
    5.13      return do_dom0_op(xc_handle, &op);
    5.14  }
    5.15  
    5.16 @@ -25,11 +27,16 @@ int xc_bvtsched_domain_set(int xc_handle
    5.17                             unsigned long warpu)
    5.18  {
    5.19      dom0_op_t op;
    5.20 +    struct bvt_adjdom *adjptr = &op.u.adjustdom.u.bvt;
    5.21 +
    5.22      op.cmd = DOM0_ADJUSTDOM;
    5.23 +    op.u.adjustdom.sched_id = SCHED_BVT;
    5.24 +    op.u.adjustdom.if_ver   = SCHED_CTL_IF_VER;
    5.25      op.u.adjustdom.domain  = (domid_t)domid;
    5.26 -    op.u.adjustdom.mcu_adv = mcuadv;
    5.27 -    op.u.adjustdom.warp    = warp;
    5.28 -    op.u.adjustdom.warpl   = warpl;
    5.29 -    op.u.adjustdom.warpu   = warpu;
    5.30 +
    5.31 +    adjptr->mcu_adv = mcuadv;
    5.32 +    adjptr->warp    = warp;
    5.33 +    adjptr->warpl   = warpl;
    5.34 +    adjptr->warpu   = warpu;
    5.35      return do_dom0_op(xc_handle, &op);
    5.36  }
     6.1 --- a/tools/xc/lib/xc_private.h	Wed Mar 17 17:13:18 2004 +0000
     6.2 +++ b/tools/xc/lib/xc_private.h	Wed Mar 17 18:31:06 2004 +0000
     6.3 @@ -23,6 +23,7 @@
     6.4  #include <dom0_ops.h>
     6.5  #include <vbd.h>
     6.6  #include <event_channel.h>
     6.7 +#include <sched-ctl.h>
     6.8  
     6.9  #define _PAGE_PRESENT   0x001
    6.10  #define _PAGE_RW        0x002
     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/tools/xc/lib/xc_rrobin.c	Wed Mar 17 18:31:06 2004 +0000
     7.3 @@ -0,0 +1,20 @@
     7.4 +/******************************************************************************
     7.5 + * xc_rrobin.c
     7.6 + * 
     7.7 + * API for manipulating parameters of the Round Robin scheduler
     7.8 + * 
     7.9 + * by Mark Williamson, Copyright (c) 2004 Intel Research Cambridge.
    7.10 + */
    7.11 +
    7.12 +#include "xc_private.h"
    7.13 +
    7.14 +int xc_rrobin_global_set(int xc_handle, u64 slice)
    7.15 +{
    7.16 +    dom0_op_t op;
    7.17 +    op.cmd = DOM0_SCHEDCTL;
    7.18 +    op.u.schedctl.if_ver = SCHED_CTL_IF_VER;
    7.19 +    op.u.schedctl.sched_id = SCHED_RROBIN;
    7.20 +
    7.21 +    op.u.schedctl.u.rrobin.slice = slice;
    7.22 +    return do_dom0_op(xc_handle, &op);
    7.23 +}
     8.1 --- a/tools/xc/py/Xc.c	Wed Mar 17 17:13:18 2004 +0000
     8.2 +++ b/tools/xc/py/Xc.c	Wed Mar 17 18:31:06 2004 +0000
     8.3 @@ -290,10 +290,10 @@ static PyObject *pyxc_bvtsched_domain_se
     8.4      u64           dom;
     8.5      unsigned long mcuadv, warp, warpl, warpu;
     8.6  
     8.7 -    static char *kwd_list[] = { "dom", "mcuadv", "warp", "warpl", 
     8.8 +    static char *kwd_list[] = { "dom", "mcuadv", "warp", "warpl",
     8.9                                  "warpu", NULL };
    8.10  
    8.11 -    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "Lllll", kwd_list, 
    8.12 +    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "Lllll", kwd_list,
    8.13                                        &dom, &mcuadv, &warp, &warpl, &warpu) )
    8.14          return NULL;
    8.15  
    8.16 @@ -862,6 +862,49 @@ static PyObject *pyxc_physinfo(PyObject 
    8.17                           "cpu_khz",     info.cpu_khz);
    8.18  }
    8.19  
    8.20 +static PyObject *pyxc_atropos_domain_set(PyObject *self,
    8.21 +                                         PyObject *args,
    8.22 +                                         PyObject *kwds)
    8.23 +{
    8.24 +    XcObject *xc = (XcObject *)self;
    8.25 +    PyObject *ret_obj;
    8.26 +    int xtratime;
    8.27 +    u64 domid;
    8.28 +
    8.29 +    static char *kwd_list[] = { "dom", "xtratime", NULL };
    8.30 +    
    8.31 +    if( !PyArg_ParseTupleAndKeywords(args, kwds, "Li", kwd_list, &domid,
    8.32 +                                     &xtratime) )
    8.33 +        return NULL;
    8.34 +   
    8.35 +    if ( xc_atropos_domain_set(xc->xc_handle, domid, xtratime) != 0 )
    8.36 +        return PyErr_SetFromErrno(xc_error);
    8.37 +
    8.38 +    Py_INCREF(zero);
    8.39 +    return zero;
    8.40 +}
    8.41 +
    8.42 +static PyObject *pyxc_rrobin_global_set(PyObject *self,
    8.43 +                                        PyObject *args,
    8.44 +                                        PyObject *kwds)
    8.45 +{
    8.46 +    XcObject *xc = (XcObject *)self;
    8.47 +    PyObject *ret_obj;
    8.48 +    u64 slice;
    8.49 +    
    8.50 +    static char *kwd_list[] = { "slice", NULL };
    8.51 +
    8.52 +    if( !PyArg_ParseTupleAndKeywords(args, kwds, "L", kwd_list, &slice) )
    8.53 +        return NULL;
    8.54 +    
    8.55 +    if ( xc_rrobin_global_set(xc->xc_handle, slice) != 0 )
    8.56 +        return PyErr_SetFromErrno(xc_error);
    8.57 +    
    8.58 +    Py_INCREF(zero);
    8.59 +    return zero;
    8.60 +}
    8.61 +
    8.62 +
    8.63  static PyMethodDef pyxc_methods[] = {
    8.64      { "domain_create", 
    8.65        (PyCFunction)pyxc_domain_create, 
    8.66 @@ -955,15 +998,15 @@ static PyMethodDef pyxc_methods[] = {
    8.67        " cmdline [str, n/a]: Kernel parameters, if any.\n\n"
    8.68        "Returns: [int] 0 on success; -1 on error.\n" },
    8.69  
    8.70 -    { "bvtsched_global_set", 
    8.71 -      (PyCFunction)pyxc_bvtsched_global_set, 
    8.72 +    { "bvtsched_global_set",
    8.73 +      (PyCFunction)pyxc_bvtsched_global_set,
    8.74        METH_VARARGS | METH_KEYWORDS, "\n"
    8.75        "Set global tuning parameters for Borrowed Virtual Time scheduler.\n"
    8.76        " ctx_allow [int]: Minimal guaranteed quantum (I think!).\n\n"
    8.77        "Returns: [int] 0 on success; -1 on error.\n" },
    8.78  
    8.79 -    { "bvtsched_domain_set", 
    8.80 -      (PyCFunction)pyxc_bvtsched_domain_set, 
    8.81 +    { "bvtsched_domain_set",
    8.82 +      (PyCFunction)pyxc_bvtsched_domain_set,
    8.83        METH_VARARGS | METH_KEYWORDS, "\n"
    8.84        "Set per-domain tuning parameters for Borrowed Virtual Time scheduler.\n"
    8.85        " dom    [long]: Identifier of domain to be tuned.\n"
    8.86 @@ -973,6 +1016,22 @@ static PyMethodDef pyxc_methods[] = {
    8.87        " warpu  [int]:  Internal BVT parameter.\n\n"
    8.88        "Returns: [int] 0 on success; -1 on error.\n" },
    8.89  
    8.90 +    { "atropos_domain_set",
    8.91 +      (PyCFunction)pyxc_atropos_domain_set,
    8.92 +      METH_VARARGS | METH_KEYWORDS, "\n"
    8.93 +      "Set the extra time flag for a domain when running with Atropos.\n"
    8.94 +      " dom [long]: domain to set\n"
    8.95 +      " xtratime [int]: boolean\n"
    8.96 +      "Returns: [int] 0 on success; -1 on error.\n" },
    8.97 +
    8.98 +    { "rrobin_global_set",
    8.99 +      (PyCFunction)pyxc_rrobin_global_set,
   8.100 +      METH_KEYWORDS, "\n"
   8.101 +      "Set Round Robin scheduler slice.\n"
   8.102 +      " slice [long]: Round Robin scheduler slice\n"
   8.103 +      "Returns: [int] 0 on success, throws an exception on failure\n"
   8.104 +    },
   8.105 +
   8.106      { "vif_scheduler_set", 
   8.107        (PyCFunction)pyxc_vif_scheduler_set, 
   8.108        METH_VARARGS | METH_KEYWORDS, "\n"
     9.1 --- a/xen/common/dom0_ops.c	Wed Mar 17 17:13:18 2004 +0000
     9.2 +++ b/xen/common/dom0_ops.c	Wed Mar 17 18:31:06 2004 +0000
     9.3 @@ -18,6 +18,7 @@
     9.4  #include <asm/pdb.h>
     9.5  #include <xeno/trace.h>
     9.6  #include <xeno/console.h>
     9.7 +#include <hypervisor-ifs/sched-ctl.h>
     9.8  
     9.9  extern unsigned int alloc_new_dom_mem(struct task_struct *, unsigned int);
    9.10  
    9.11 @@ -196,22 +197,15 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
    9.12      }
    9.13      break;
    9.14  
    9.15 -    case DOM0_BVTCTL:
    9.16 +    case DOM0_SCHEDCTL:
    9.17      {
    9.18 -        unsigned long  ctx_allow = op->u.bvtctl.ctx_allow;
    9.19 -        ret = sched_bvtctl(ctx_allow);        
    9.20 +        ret = sched_ctl(&op->u.schedctl);
    9.21      }
    9.22      break;
    9.23  
    9.24      case DOM0_ADJUSTDOM:
    9.25      {
    9.26 -        domid_t        dom     = op->u.adjustdom.domain;
    9.27 -        unsigned long  mcu_adv = op->u.adjustdom.mcu_adv;
    9.28 -        unsigned long  warp    = op->u.adjustdom.warp;
    9.29 -        unsigned long  warpl   = op->u.adjustdom.warpl;
    9.30 -        unsigned long  warpu   = op->u.adjustdom.warpu;
    9.31 -
    9.32 -        ret = sched_adjdom(dom, mcu_adv, warp, warpl, warpu);
    9.33 +        ret = sched_adjdom(&op->u.adjustdom);
    9.34      }
    9.35      break;
    9.36  
    9.37 @@ -281,7 +275,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
    9.38              if ( (p->state == TASK_STOPPED) || (p->state == TASK_DYING) )
    9.39                  op->u.getdomaininfo.state = DOMSTATE_STOPPED;
    9.40              op->u.getdomaininfo.hyp_events  = p->hyp_events;
    9.41 -            op->u.getdomaininfo.mcu_advance = p->mcu_advance;
    9.42 +//            op->u.getdomaininfo.mcu_advance = p->mcu_advance;
    9.43              op->u.getdomaininfo.tot_pages   = p->tot_pages;
    9.44              op->u.getdomaininfo.cpu_time    = p->cpu_time;
    9.45              op->u.getdomaininfo.shared_info_frame = 
    10.1 --- a/xen/common/domain.c	Wed Mar 17 17:13:18 2004 +0000
    10.2 +++ b/xen/common/domain.c	Wed Mar 17 18:31:06 2004 +0000
    10.3 @@ -43,7 +43,6 @@ struct task_struct *do_createdomain(domi
    10.4  
    10.5      if ( (p = alloc_task_struct()) == NULL )
    10.6          return NULL;
    10.7 -    memset(p, 0, sizeof(*p));
    10.8  
    10.9      atomic_set(&p->refcnt, 1);
   10.10  
   10.11 @@ -496,7 +495,7 @@ void release_task(struct task_struct *p)
   10.12      UNSHARE_PFN(virt_to_page(p->shared_info));
   10.13      free_all_dom_mem(p);
   10.14  
   10.15 -    kmem_cache_free(task_struct_cachep, p);
   10.16 +    free_task_struct(p);
   10.17  }
   10.18  
   10.19  
    11.1 --- a/xen/common/kernel.c	Wed Mar 17 17:13:18 2004 +0000
    11.2 +++ b/xen/common/kernel.c	Wed Mar 17 18:31:06 2004 +0000
    11.3 @@ -71,6 +71,8 @@ int opt_watchdog=0;
    11.4  unsigned char opt_pdb[10] = "none";
    11.5  /* opt_tbuf_size: trace buffer size (in pages) */
    11.6  unsigned int opt_tbuf_size = 1;
    11.7 +/* opt_sched: scheduler - default to Borrowed Virtual Time */
    11.8 +char opt_sched[10] = "bvt";
    11.9  
   11.10  static struct {
   11.11      unsigned char *name;
   11.12 @@ -91,6 +93,7 @@ static struct {
   11.13      { "watchdog",         OPT_BOOL, &opt_watchdog },
   11.14      { "pdb",              OPT_STR,  &opt_pdb },
   11.15      { "tbuf_size",        OPT_UINT, &opt_tbuf_size },
   11.16 +    { "sched",            OPT_STR,  &opt_sched },
   11.17      { NULL,               0,        NULL     }
   11.18  };
   11.19  
    12.1 --- a/xen/common/keyhandler.c	Wed Mar 17 17:13:18 2004 +0000
    12.2 +++ b/xen/common/keyhandler.c	Wed Mar 17 18:31:06 2004 +0000
    12.3 @@ -86,7 +86,15 @@ static char *task_states[] =
    12.4      NULL,
    12.5      NULL,
    12.6      NULL,
    12.7 -    "Dying     ", 
    12.8 +    "Dying     ",
    12.9 +    NULL,
   12.10 +    NULL,
   12.11 +    NULL,
   12.12 +    NULL,
   12.13 +    NULL,
   12.14 +    NULL,
   12.15 +    NULL,
   12.16 +    "Sched priv"
   12.17  }; 
   12.18  
   12.19  void do_task_queues(u_char key, void *dev_id, struct pt_regs *regs) 
    13.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.2 +++ b/xen/common/sched_bvt.c	Wed Mar 17 18:31:06 2004 +0000
    13.3 @@ -0,0 +1,427 @@
    13.4 +/* -*-  Mode:C; c-basic-offset:4; tab-width:4 -*-
    13.5 + ****************************************************************************
    13.6 + * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
    13.7 + * (C) 2002-2003 University of Cambridge
    13.8 + * (C) 2004      - Mark Williamson - Intel Research Cambridge
    13.9 + ****************************************************************************
   13.10 + *
    13.11 + *        File: common/sched_bvt.c
   13.12 + *      Author: Rolf Neugebauer & Keir Fraser
   13.13 + *              Updated for generic API by Mark Williamson
   13.14 + *
   13.15 + * Description: CPU scheduling
   13.16 + *              implements A Borrowed Virtual Time scheduler.
   13.17 + *              (see Duda & Cheriton SOSP'99)
   13.18 + */
   13.19 +
   13.20 +#include <xeno/config.h>
   13.21 +#include <xeno/init.h>
   13.22 +#include <xeno/lib.h>
   13.23 +#include <xeno/sched.h>
   13.24 +#include <xeno/delay.h>
   13.25 +#include <xeno/event.h>
   13.26 +#include <xeno/time.h>
   13.27 +#include <xeno/ac_timer.h>
   13.28 +#include <xeno/interrupt.h>
   13.29 +#include <xeno/timer.h>
   13.30 +#include <xeno/perfc.h>
   13.31 +#include <xeno/sched-if.h>
   13.32 +#include <xeno/slab.h>
   13.33 +
   13.34 +/* all per-domain BVT-specific scheduling info is stored here */
   13.35 +struct bvt_dom_info
   13.36 +{
   13.37 +    unsigned long mcu_advance;      /* inverse of weight */
   13.38 +    u32           avt;              /* actual virtual time */
   13.39 +    u32           evt;              /* effective virtual time */
   13.40 +    int           warpback;         /* warp?  */
   13.41 +    long          warp;             /* virtual time warp */
   13.42 +    long          warpl;            /* warp limit */
   13.43 +    long          warpu;            /* unwarp time requirement */
   13.44 +    s_time_t      warped;           /* time it ran warped last time */
   13.45 +    s_time_t      uwarped;          /* time it ran unwarped last time */
   13.46 +};
   13.47 +
   13.48 +struct bvt_cpu_info
   13.49 +{
   13.50 +    unsigned long svt; /* XXX check this is unsigned long! */
   13.51 +};
   13.52 +
   13.53 +
   13.54 +#define DOM_INF(p) 	((struct bvt_dom_info *)(p)->sched_priv)
   13.55 +#define CPU_INF(cpu)  ((struct bvt_cpu_info *)(schedule_data[cpu]).sched_priv)
   13.56 +#define CPU_SVT(cpu)  (CPU_INF(cpu)->svt)
   13.57 +
   13.58 +#define MCU            (s32)MICROSECS(100)    /* Minimum unit */
   13.59 +#define MCU_ADVANCE    10                     /* default weight */
   13.60 +#define TIME_SLOP      (s32)MICROSECS(50)     /* allow time to slip a bit */
   13.61 +static s32 ctx_allow = (s32)MILLISECS(5);     /* context switch allowance */
   13.62 +
   13.63 +/* SLAB cache for struct bvt_dom_info objects */
   13.64 +static kmem_cache_t *dom_info_cache;
   13.65 +
   13.66 +/*
   13.67 + * Calculate the effective virtual time for a domain. Take into account 
   13.68 + * warping limits
   13.69 + */
   13.70 +static void __calc_evt(struct bvt_dom_info *inf)
   13.71 +{
   13.72 +    s_time_t now = NOW();
   13.73 +
   13.74 +    if ( inf->warpback ) 
   13.75 +    {
   13.76 +        if ( ((now - inf->warped) < inf->warpl) &&
   13.77 +             ((now - inf->uwarped) > inf->warpu) )
   13.78 +        {
   13.79 +            /* allowed to warp */
   13.80 +            inf->evt = inf->avt - inf->warp;
   13.81 +        } 
   13.82 +        else 
   13.83 +        {
   13.84 +            /* warped for too long -> unwarp */
   13.85 +            inf->evt      = inf->avt;
   13.86 +            inf->uwarped  = now;
   13.87 +            inf->warpback = 0;
   13.88 +        }
   13.89 +    } 
   13.90 +    else 
   13.91 +    {
   13.92 +        inf->evt = inf->avt;
   13.93 +    }
   13.94 +}
   13.95 +
   13.96 +/**
   13.97 + * bvt_alloc_task - allocate BVT private structures for a task
   13.98 + * @p:              task to allocate private structures for
   13.99 + *
  13.100 + * Returns non-zero on failure.
  13.101 + */
  13.102 +int bvt_alloc_task(struct task_struct *p)
  13.103 +{
  13.104 +    DOM_INF(p)
  13.105 +        = (struct bvt_dom_info *)kmem_cache_alloc(dom_info_cache,GFP_KERNEL);
  13.106 +	
  13.107 +	if ( DOM_INF(p) == NULL )
  13.108 +        return -1;
  13.109 +    
  13.110 +    return 0;
  13.111 +}
  13.112 +
  13.113 +/*
  13.114 + * Add and remove a domain
  13.115 + */
  13.116 +void bvt_add_task(struct task_struct *p) 
  13.117 +{
  13.118 +    struct bvt_dom_info *inf = DOM_INF(p);
  13.119 +
  13.120 +    ASSERT(inf != NULL);
  13.121 +    ASSERT(p   != NULL);
  13.122 +
  13.123 +    inf->mcu_advance = MCU_ADVANCE;
  13.124 +
  13.125 +    if ( p->domain == IDLE_DOMAIN_ID )
  13.126 +    {
  13.127 +        inf->avt = inf->evt = ~0U;
  13.128 +    } 
  13.129 +    else 
  13.130 +    {
  13.131 +        /* Set avt and evt to system virtual time. */
  13.132 +        inf->avt         = CPU_SVT(p->processor);
  13.133 +        inf->evt         = CPU_SVT(p->processor);
  13.134 +        /* Set some default values here. */
  13.135 +        inf->warpback    = 0;
  13.136 +        inf->warp        = 0;
  13.137 +        inf->warpl       = 0;
  13.138 +        inf->warpu       = 0;
  13.139 +    }
  13.140 +
  13.141 +    return;
  13.142 +}
  13.143 +
  13.144 +/**
  13.145 + * bvt_free_task - free BVT private structures for a task
  13.146 + * @p:             task
  13.147 + */
  13.148 +void bvt_free_task(struct task_struct *p)
  13.149 +{
  13.150 +    ASSERT( p->sched_priv != NULL );
  13.151 +    kmem_cache_free( dom_info_cache, p->sched_priv );
  13.152 +}
  13.153 +
  13.154 +
  13.155 +void bvt_wake_up(struct task_struct *p)
  13.156 +{
  13.157 +    struct bvt_dom_info *inf = DOM_INF(p);
  13.158 +
  13.159 +    ASSERT(inf != NULL);
  13.160 +    
  13.161 +
  13.162 +    /* set the BVT parameters */
  13.163 +    if (inf->avt < CPU_SVT(p->processor))
  13.164 +        inf->avt = CPU_SVT(p->processor);
  13.165 +
  13.166 +    /* deal with warping here */
  13.167 +    inf->warpback  = 1;
  13.168 +    inf->warped    = NOW();
  13.169 +    __calc_evt(inf);
  13.170 +    __add_to_runqueue_head(p);
  13.171 +}
  13.172 +
  13.173 +/* 
  13.174 + * Block the currently-executing domain until a pertinent event occurs.
  13.175 + */
  13.176 +static long bvt_do_block(struct task_struct *p)
  13.177 +{
  13.178 +    DOM_INF(p)->warpback = 0; 
  13.179 +    return 0;
  13.180 +}
  13.181 +
  13.182 +/* Control the scheduler. */
  13.183 +int bvt_ctl(struct sched_ctl_cmd *cmd)
  13.184 +{
  13.185 +    struct bvt_ctl *params = &cmd->u.bvt;
  13.186 +    
  13.187 +    ctx_allow = params->ctx_allow;
  13.188 +
  13.189 +    return 0;
  13.190 +}
  13.191 +
  13.192 +/* Adjust scheduling parameter for a given domain. */
  13.193 +int bvt_adjdom(struct task_struct *p,
  13.194 +               struct sched_adjdom_cmd *cmd)
  13.195 +{
  13.196 +    struct bvt_adjdom *params = &cmd->u.bvt;
  13.197 +    unsigned long mcu_adv = params->mcu_adv,
  13.198 +                    warp  = params->warp,
  13.199 +                    warpl = params->warpl,
  13.200 +                    warpu = params->warpu;
  13.201 +    
  13.202 +    struct bvt_dom_info *inf = DOM_INF(p);
  13.203 +
  13.204 +    /* Sanity -- this can avoid divide-by-zero. */
  13.205 +    if ( mcu_adv == 0 )
  13.206 +        return -EINVAL;
  13.207 +
  13.208 +    spin_lock_irq(&schedule_lock[p->processor]);   
  13.209 +    inf->mcu_advance = mcu_adv;
  13.210 +    inf->warp = warp;
  13.211 +    inf->warpl = warpl;
  13.212 +    inf->warpu = warpu;
  13.213 +    spin_unlock_irq(&schedule_lock[p->processor]); 
  13.214 +
  13.215 +    return 0;
  13.216 +}
  13.217 +
  13.218 +
  13.219 +/* 
  13.220 + * The main function
  13.221 + * - deschedule the current domain.
  13.222 + * - pick a new domain.
  13.223 + *   i.e., the domain with lowest EVT.
  13.224 + *   The runqueue should be ordered by EVT so that is easy.
  13.225 + */
  13.226 +static task_slice_t bvt_do_schedule(s_time_t now)
  13.227 +{
  13.228 +    struct task_struct *prev = current, *next = NULL, *next_prime, *p;
  13.229 +    struct list_head   *tmp;
  13.230 +    int                 cpu = prev->processor;
  13.231 +    s32                 r_time;     /* time for new dom to run */
  13.232 +    s32                 ranfor;     /* assume we never run longer than 2.1s! */
  13.233 +    s32                 mcus;
  13.234 +    u32                 next_evt, next_prime_evt, min_avt;
  13.235 +    struct bvt_dom_info *prev_inf       = DOM_INF(prev),
  13.236 +                        *p_inf          = NULL,
  13.237 +                        *next_inf       = NULL,
  13.238 +                        *next_prime_inf = NULL;
  13.239 +    task_slice_t        ret;
  13.240 +
  13.241 +    ASSERT(prev->sched_priv != NULL);
  13.242 +    ASSERT(prev_inf != NULL);
  13.243 +
  13.244 +    if ( likely(!is_idle_task(prev)) ) 
  13.245 +    {
  13.246 +        ranfor = (s32)(now - prev->lastschd);
  13.247 +        /* Calculate mcu and update avt. */
  13.248 +        mcus = (ranfor + MCU - 1) / MCU;
  13.249 +        prev_inf->avt += mcus * prev_inf->mcu_advance;
  13.250 +        
  13.251 +        __calc_evt(prev_inf);
  13.252 +        
  13.253 +        __del_from_runqueue(prev);
  13.254 +        
  13.255 +        if ( likely(prev->state == TASK_RUNNING) )
  13.256 +            __add_to_runqueue_tail(prev);
  13.257 +    }
  13.258 +
  13.259 +    /* We should at least have the idle task */
  13.260 +    ASSERT(!list_empty(&schedule_data[cpu].runqueue));
  13.261 +
  13.262 +    /*
  13.263 +     * scan through the run queue and pick the task with the lowest evt
  13.264 +     * *and* the task the second lowest evt.
  13.265 +     * this code is O(n) but we expect n to be small.
  13.266 +     */
  13.267 +    next       = schedule_data[cpu].idle;
  13.268 +    next_prime = NULL;
  13.269 +
  13.270 +    next_evt       = ~0U;
  13.271 +    next_prime_evt = ~0U;
  13.272 +    min_avt        = ~0U;
  13.273 +
  13.274 +    list_for_each ( tmp, &schedule_data[cpu].runqueue )
  13.275 +    {
  13.276 +        p     = list_entry(tmp, struct task_struct, run_list);
  13.277 +        p_inf = DOM_INF(p);
  13.278 +
  13.279 +        if ( p_inf->evt < next_evt )
  13.280 +        {
  13.281 +            next_prime     = next;
  13.282 +            next_prime_evt = next_evt;
  13.283 +            next = p;
  13.284 +            next_evt = p_inf->evt;
  13.285 +        } 
  13.286 +        else if ( next_prime_evt == ~0U )
  13.287 +        {
  13.288 +            next_prime_evt = p_inf->evt;
  13.289 +            next_prime     = p;
  13.290 +        } 
  13.291 +        else if ( p_inf->evt < next_prime_evt )
  13.292 +        {
  13.293 +            next_prime_evt = p_inf->evt;
  13.294 +            next_prime     = p;
  13.295 +        }
  13.296 +
  13.297 +        /* Determine system virtual time. */
  13.298 +        if ( p_inf->avt < min_avt )
  13.299 +            min_avt = p_inf->avt;
  13.300 +    }
  13.301 +
  13.302 +    /* Update system virtual time. */
  13.303 +    if ( min_avt != ~0U )
  13.304 +        CPU_SVT(cpu) = min_avt;
  13.305 +
  13.306 +    /* check for virtual time overrun on this cpu */
  13.307 +    if ( CPU_SVT(cpu) >= 0xf0000000 )
  13.308 +    {
  13.309 +        u_long t_flags; 
  13.310 +        write_lock_irqsave(&tasklist_lock, t_flags); 
  13.311 +        for_each_domain ( p )
  13.312 +        {
  13.313 +            if ( p->processor == cpu )
  13.314 +            {
  13.315 +                p_inf->evt -= 0xe0000000;
  13.316 +                p_inf->avt -= 0xe0000000;
  13.317 +            }
  13.318 +        } 
  13.319 +        write_unlock_irqrestore(&tasklist_lock, t_flags); 
  13.320 +        CPU_SVT(cpu) -= 0xe0000000;
  13.321 +    }
  13.322 +
  13.323 +    /* work out time for next run through scheduler */
  13.324 +    if ( is_idle_task(next) ) 
  13.325 +    {
  13.326 +        r_time = ctx_allow;
  13.327 +        goto sched_done;
  13.328 +    }
  13.329 +
  13.330 +    if ( (next_prime == NULL) || is_idle_task(next_prime) )
  13.331 +    {
  13.332 +        /* We have only one runnable task besides the idle task. */
  13.333 +        r_time = 10 * ctx_allow;     /* RN: random constant */
  13.334 +        goto sched_done;
  13.335 +    }
  13.336 +
  13.337 +    next_prime_inf = DOM_INF(next_prime);
  13.338 +    next_inf       = DOM_INF(next);
  13.339 +
  13.340 +    /*
  13.341 +     * If we are here then we have two runnable tasks.
  13.342 +     * Work out how long 'next' can run till its evt is greater than
  13.343 +     * 'next_prime's evt. Take context switch allowance into account.
  13.344 +     */
  13.345 +    ASSERT(next_prime_inf->evt >= next_inf->evt);
  13.346 +    
  13.347 +    r_time = ((next_prime_inf->evt - next_inf->evt)/next_inf->mcu_advance)
  13.348 +        + ctx_allow;
  13.349 +
  13.350 +    ASSERT(r_time >= ctx_allow);
  13.351 +
  13.352 + sched_done:
  13.353 +    next->min_slice = ctx_allow;
  13.354 +    ret.task = next;
  13.355 +    ret.time = r_time;
  13.356 +
  13.357 +    return ret;
  13.358 +}
  13.359 +
  13.360 +
  13.361 +static void bvt_dump_runq_el(struct task_struct *p)
  13.362 +{
  13.363 +    struct bvt_dom_info *inf = DOM_INF(p);
  13.364 +    
  13.365 +    printk("mcua=0x%04lX ev=0x%08X av=0x%08X ",
  13.366 +           inf->mcu_advance, inf->evt, inf->avt);
  13.367 +}
  13.368 +
  13.369 +static void bvt_dump_settings(void)
  13.370 +{
  13.371 +    printk("BVT: mcu=0x%08Xns ctx_allow=0x%08Xns ", (u32)MCU, (s32)ctx_allow );
  13.372 +}
  13.373 +
  13.374 +static void bvt_dump_cpu_state(int i)
  13.375 +{
  13.376 +    printk("svt=0x%08lX ", CPU_SVT(i));
  13.377 +}
  13.378 +
  13.379 +
  13.380 +/* Initialise the data structures. */
  13.381 +int bvt_init_scheduler()
  13.382 +{
  13.383 +    int i;
  13.384 +
  13.385 +    for ( i = 0; i < NR_CPUS; i++ )
  13.386 +    {
  13.387 +        CPU_INF(i) = kmalloc(sizeof(struct bvt_cpu_info), GFP_KERNEL);
  13.388 +
  13.389 +        if ( CPU_INF(i) == NULL )
  13.390 +        {
  13.391 +            printk("Failed to allocate BVT scheduler private per-CPU memory!\n");
  13.392 +            return -1;
  13.393 +        }
  13.394 +
  13.395 +        CPU_SVT(i) = 0; /* XXX do I really need to do this? */
  13.396 +    }
  13.397 +
  13.398 +    dom_info_cache = kmem_cache_create("BVT dom info",
  13.399 +                                       sizeof(struct bvt_dom_info),
  13.400 +                                       0, 0, NULL, NULL);
  13.401 +
  13.402 +    if ( dom_info_cache == NULL )
  13.403 +    {
  13.404 +        printk("BVT: Failed to allocate domain info SLAB cache");
  13.405 +        return -1;
  13.406 +    }
  13.407 +
  13.408 +    return 0;
  13.409 +}
  13.410 +
  13.411 +
  13.412 +struct scheduler sched_bvt_def = {
  13.413 +    .name     = "Borrowed Virtual Time",
  13.414 +    .opt_name = "bvt",
  13.415 +    .sched_id = SCHED_BVT,
  13.416 +    
  13.417 +    .init_scheduler = bvt_init_scheduler,
  13.418 +    .alloc_task     = bvt_alloc_task,
  13.419 +    .add_task       = bvt_add_task,
  13.420 +    .free_task      = bvt_free_task,
  13.421 +    .wake_up        = bvt_wake_up,
  13.422 +    .do_block       = bvt_do_block,
  13.423 +    .do_schedule    = bvt_do_schedule,
  13.424 +    .control        = bvt_ctl,
  13.425 +    .adjdom         = bvt_adjdom,
  13.426 +    .dump_settings  = bvt_dump_settings,
  13.427 +    .dump_cpu_state = bvt_dump_cpu_state,
  13.428 +    .dump_runq_el   = bvt_dump_runq_el,
  13.429 +};
  13.430 +
    14.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.2 +++ b/xen/common/sched_rrobin.c	Wed Mar 17 18:31:06 2004 +0000
    14.3 @@ -0,0 +1,56 @@
    14.4 +/****************************************************************************
    14.5 + * Very stupid Round Robin Scheduler for Xen
    14.6 + *
    14.7 + * by Mark Williamson (C) 2004 Intel Research Cambridge
    14.8 + */
    14.9 +
   14.10 +#include <xeno/sched.h>
   14.11 +#include <xeno/sched-if.h>
   14.12 +#include <hypervisor-ifs/sched-ctl.h>
   14.13 +#include <xeno/ac_timer.h>
   14.14 +#include <xeno/time.h>
   14.15 +
   14.16 +static s_time_t rr_slice = MILLISECS(10);
   14.17 +
   14.18 +static task_slice_t rr_do_schedule(s_time_t now)
   14.19 +{
   14.20 +    struct task_struct *prev = current;
   14.21 +    int cpu = current->processor;
   14.22 +    task_slice_t ret;
   14.23 + 
   14.24 +    __del_from_runqueue(prev);
   14.25 +    
   14.26 +    if ( prev->state == TASK_RUNNING )
   14.27 +      __add_to_runqueue_tail(prev);
   14.28 +    
   14.29 +    ret.task = list_entry(schedule_data[cpu].runqueue.next,
   14.30 +                    struct task_struct, run_list);
   14.31 +
   14.32 +    ret.time = rr_slice;
   14.33 +
   14.34 +    return ret;
   14.35 +}
   14.36 +
   14.37 +static int rr_ctl(struct sched_ctl_cmd *cmd)
   14.38 +{
   14.39 +    rr_slice = cmd->u.rrobin.slice;
   14.40 +    return 0;
   14.41 +}
   14.42 +
   14.43 +static void rr_dump_settings()
   14.44 +{
   14.45 +    printk("rr_slice = %llu ", rr_slice);
   14.46 +}
   14.47 +
   14.48 +struct scheduler sched_rrobin_def = {
   14.49 +    .name     = "Stupid Round Robin Scheduler",
   14.50 +    .opt_name = "rrobin",
   14.51 +    .sched_id = SCHED_RROBIN,
   14.52 +
   14.53 +    .wake_up        = __add_to_runqueue_head,
   14.54 +    .do_schedule    = rr_do_schedule,
   14.55 +    .control        = rr_ctl,
   14.56 +    .dump_settings  = rr_dump_settings,
   14.57 +};
   14.58 +
   14.59 +
    15.1 --- a/xen/common/schedule.c	Wed Mar 17 17:13:18 2004 +0000
    15.2 +++ b/xen/common/schedule.c	Wed Mar 17 18:31:06 2004 +0000
    15.3 @@ -2,14 +2,16 @@
    15.4   ****************************************************************************
    15.5   * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
    15.6   * (C) 2002-2003 University of Cambridge
    15.7 + * (C) 2004      - Mark Williamson - Intel Research Cambridge
    15.8   ****************************************************************************
    15.9   *
   15.10   *        File: common/schedule.c
   15.11   *      Author: Rolf Neugebauer & Keir Fraser
   15.12 + *              Updated for generic API by Mark Williamson
   15.13   * 
   15.14 - * Description: CPU scheduling
   15.15 - *              implements A Borrowed Virtual Time scheduler.
   15.16 - *              (see Duda & Cheriton SOSP'99)
   15.17 + * Description: Generic CPU scheduling code
   15.18 + *              implements support functionality for the Xen scheduler API.
   15.19 + *
   15.20   */
   15.21  
   15.22  #include <xeno/config.h>
   15.23 @@ -23,6 +25,9 @@
   15.24  #include <xeno/interrupt.h>
   15.25  #include <xeno/timer.h>
   15.26  #include <xeno/perfc.h>
   15.27 +#include <xeno/sched-if.h>
   15.28 +#include <hypervisor-ifs/sched-ctl.h>
   15.29 +#include <xeno/trace.h>
   15.30  
   15.31  /*#define WAKEUP_HISTO*/
   15.32  /*#define BLOCKTIME_HISTO*/
   15.33 @@ -33,23 +38,54 @@
   15.34  #define BUCKETS 200
   15.35  #endif
   15.36  
   15.37 -#define MCU            (s32)MICROSECS(100)    /* Minimum unit */
   15.38 -#define MCU_ADVANCE    10                     /* default weight */
   15.39  #define TIME_SLOP      (s32)MICROSECS(50)     /* allow time to slip a bit */
   15.40 -static s32 ctx_allow = (s32)MILLISECS(5);     /* context switch allowance */
   15.41 +
   15.42 +/* XXX MAW pull trace-related #defines out of here and into an auto-generated
   15.43 + * header file later on! */
   15.44 +#define TRC_SCHED_DOM_ADD             0x00010000
   15.45 +#define TRC_SCHED_DOM_REM             0x00010001
   15.46 +#define TRC_SCHED_WAKE                0x00010002
   15.47 +#define TRC_SCHED_BLOCK               0x00010003
   15.48 +#define TRC_SCHED_YIELD               0x00010004
   15.49 +#define TRC_SCHED_SET_TIMER           0x00010005
   15.50 +#define TRC_SCHED_CTL                 0x00010006
   15.51 +#define TRC_SCHED_ADJDOM              0x00010007
   15.52 +#define TRC_SCHED_RESCHED             0x00010008
   15.53 +#define TRC_SCHED_SWITCH              0x00010009
   15.54 +#define TRC_SCHED_S_TIMER_FN          0x0001000A
   15.55 +#define TRC_SCHED_T_TIMER_FN          0x0001000B
   15.56 +#define TRC_SCHED_DOM_TIMER_FN        0x0001000C
   15.57 +#define TRC_SCHED_FALLBACK_TIMER_FN   0x0001000D
   15.58 +
   15.59 +#define _HIGH32(_x) (_x >> 32)
   15.60 +#define _LOW32(_x)  ((u32)_x )
   15.61  
   15.62 -typedef struct schedule_data_st
   15.63 -{
   15.64 -    struct list_head    runqueue;       /* runqueue */
   15.65 -    struct task_struct *curr;           /* current task */
   15.66 -    struct task_struct *idle;           /* idle task for this cpu */
   15.67 -    u32                 svt;            /* system virtual time. per CPU??? */
   15.68 -    struct ac_timer     s_timer;        /* scheduling timer  */
   15.69 -#ifdef BUCKETS
   15.70 -    u32                 hist[BUCKETS];  /* for scheduler latency histogram */
   15.71 -#endif
   15.72 -} __cacheline_aligned schedule_data_t;
   15.73 -static schedule_data_t schedule_data[NR_CPUS];
   15.74 +/* Various timer handlers. */
   15.75 +static void s_timer_fn(unsigned long unused);
   15.76 +static void t_timer_fn(unsigned long unused);
   15.77 +static void dom_timer_fn(unsigned long data);
   15.78 +static void fallback_timer_fn(unsigned long unused);
   15.79 +
   15.80 +/* this is global for now so that private implementations can reach it */
   15.81 +schedule_data_t schedule_data[NR_CPUS];
   15.82 +
   15.83 +/* XXX would be nice if the schedulers array could get populated
   15.84 + * automagically without having to hack the code in here         */
   15.85 +extern struct scheduler sched_bvt_def, sched_rrobin_def;
   15.86 +static struct scheduler *schedulers[] = { &sched_bvt_def,
   15.87 +                                          &sched_rrobin_def,
   15.88 +                                          NULL};
   15.89 +
   15.90 +/* scheduler ops for the current scheduler */
   15.91 +static struct scheduler ops;
   15.92 +
   15.93 +/* for scheduler functions that return void             */
   15.94 +#define SCHED_FN_VOID(fn, ...) do { if ( ops.fn ) ops.fn(__VA_ARGS__); } \
   15.95 +                               while (0)
   15.96 +
   15.97 +/* for scheduler functions that return a numeric value  */
   15.98 +#define SCHED_FN_RET(fn, ...)                             \
   15.99 +         (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ ) : 0 )
  15.100  
  15.101  spinlock_t schedule_lock[NR_CPUS] __cacheline_aligned;
  15.102  
  15.103 @@ -62,110 +98,78 @@ static struct ac_timer t_timer[NR_CPUS];
  15.104   */
  15.105  static struct ac_timer fallback_timer[NR_CPUS];
  15.106  
  15.107 -/* Various timer handlers. */
  15.108 -static void s_timer_fn(unsigned long unused);
  15.109 -static void t_timer_fn(unsigned long unused);
  15.110 -static void dom_timer_fn(unsigned long data);
  15.111 -static void fallback_timer_fn(unsigned long unused);
  15.112 -
  15.113 -/*
  15.114 - * Wrappers for run-queue management. Must be called with the schedule_lock
  15.115 - * held.
  15.116 - */
  15.117 -static inline void __add_to_runqueue_head(struct task_struct * p)
  15.118 -{    
  15.119 -    list_add(&p->run_list, &schedule_data[p->processor].runqueue);
  15.120 -}
  15.121 +extern kmem_cache_t *task_struct_cachep;
  15.122  
  15.123 -static inline void __add_to_runqueue_tail(struct task_struct * p)
  15.124 -{
  15.125 -    list_add_tail(&p->run_list, &schedule_data[p->processor].runqueue);
  15.126 -}
  15.127 -
  15.128 -static inline void __del_from_runqueue(struct task_struct * p)
  15.129 +void free_task_struct(struct task_struct *p)
  15.130  {
  15.131 -    list_del(&p->run_list);
  15.132 -    p->run_list.next = NULL;
  15.133 -}
  15.134 -
  15.135 -static inline int __task_on_runqueue(struct task_struct *p)
  15.136 -{
  15.137 -    return p->run_list.next != NULL;
  15.138 +    SCHED_FN_VOID(free_task, p);
  15.139 +    kmem_cache_free(task_struct_cachep, p);
  15.140  }
  15.141  
  15.142 -#define next_domain(p) \\
  15.143 -        list_entry((p)->run_list.next, struct task_struct, run_list)
  15.144 +/**
  15.145 + * alloc_task_struct - allocate a new task_struct and sched private structures
  15.146 + */
  15.147 +struct task_struct *alloc_task_struct(void)
  15.148 +{
  15.149 +    struct task_struct *p;
  15.150  
  15.151 -/*
  15.152 - * Calculate the effective virtual time for a domain. Take into account 
  15.153 - * warping limits
  15.154 - */
  15.155 -static void __calc_evt(struct task_struct *p)
  15.156 -{
  15.157 -    s_time_t now = NOW();
  15.158 -    if ( p->warpback ) 
  15.159 +    p=((struct task_struct *)kmem_cache_alloc(task_struct_cachep,GFP_KERNEL));
  15.160 +
  15.161 +    if ( p == NULL )
  15.162 +        return NULL;
  15.163 +
  15.164 +    memset(p, 0, sizeof(*p));    
  15.165 +
  15.166 +    if ( SCHED_FN_RET(alloc_task, p) < 0)
  15.167      {
  15.168 -        if ( ((now - p->warped) < p->warpl) &&
  15.169 -             ((now - p->uwarped) > p->warpu) )
  15.170 -        {
  15.171 -            /* allowed to warp */
  15.172 -            p->evt = p->avt - p->warp;
  15.173 -        } 
  15.174 -        else 
  15.175 -        {
  15.176 -            /* warped for too long -> unwarp */
  15.177 -            p->evt      = p->avt;
  15.178 -            p->uwarped  = now;
  15.179 -            p->warpback = 0;
  15.180 -        }
  15.181 -    } 
  15.182 -    else 
  15.183 -    {
  15.184 -        p->evt = p->avt;
  15.185 +        kmem_cache_free(task_struct_cachep, p);
  15.186 +        return NULL;
  15.187      }
  15.188 +    
  15.189 +    return p;
  15.190  }
  15.191  
  15.192 -
  15.193  /*
  15.194   * Add and remove a domain
  15.195   */
  15.196  void sched_add_domain(struct task_struct *p) 
  15.197  {
  15.198      p->state       = TASK_STOPPED;
  15.199 -    p->mcu_advance = MCU_ADVANCE;
  15.200  
  15.201 -    if ( p->domain == IDLE_DOMAIN_ID )
  15.202 -    {
  15.203 -        p->avt = p->evt = ~0U;
  15.204 -        schedule_data[p->processor].idle = p;
  15.205 -    } 
  15.206 -    else 
  15.207 +    if( p->domain != IDLE_DOMAIN_ID )
  15.208      {
  15.209 -        /* Set avt end evt to system virtual time. */
  15.210 -        p->avt         = schedule_data[p->processor].svt;
  15.211 -        p->evt         = schedule_data[p->processor].svt;
  15.212 -        /* Set some default values here. */
  15.213 -        p->warpback    = 0;
  15.214 -        p->warp        = 0;
  15.215 -        p->warpl       = 0;
  15.216 -        p->warpu       = 0;
  15.217 -
  15.218          /* Initialise the per-domain timer. */
  15.219          init_ac_timer(&p->timer);
  15.220          p->timer.cpu      =  p->processor;
  15.221          p->timer.data     = (unsigned long)p;
  15.222          p->timer.function = &dom_timer_fn;
  15.223 +    }
  15.224 +    else
  15.225 +    {
  15.226 +        schedule_data[p->processor].idle = p;
  15.227 +    }
  15.228  
  15.229 -    }
  15.230 +    SCHED_FN_VOID(add_task, p);
  15.231 +
  15.232 +    TRACE_3D(TRC_SCHED_DOM_ADD, _HIGH32(p->domain), _LOW32(p->domain), p);
  15.233  }
  15.234  
  15.235 +/* XXX race condition here?   we could both add and remove a domain at once, in
  15.236 + * theory.  ick! */
  15.237 +/* XXX is the task already removed from the runlist at this point? */
  15.238  int sched_rem_domain(struct task_struct *p) 
  15.239  {
  15.240      int x, y = p->state;
  15.241      do {
  15.242          if ( (x = y) == TASK_DYING ) return 0;
  15.243      } while ( (y = cmpxchg(&p->state, x, TASK_DYING)) != x );
  15.244 +
  15.245      rem_ac_timer(&p->timer);
  15.246 +
  15.247 +    SCHED_FN_VOID(rem_task, p);
  15.248 +
  15.249 +    TRACE_3D(TRC_SCHED_DOM_REM, _HIGH32(p->domain), _LOW32(p->domain), p);
  15.250 +
  15.251      return 1;
  15.252  }
  15.253  
  15.254 @@ -174,6 +178,11 @@ void init_idle_task(void)
  15.255  {
  15.256      unsigned long flags;
  15.257      struct task_struct *p = current;
  15.258 +
  15.259 +    if ( SCHED_FN_RET (alloc_task, p) < 0)
  15.260 +		panic("Failed to allocate scheduler private data for idle task");
  15.261 +    SCHED_FN_VOID(add_task, p);
  15.262 +
  15.263      spin_lock_irqsave(&schedule_lock[p->processor], flags);
  15.264      p->has_cpu = 1;
  15.265      p->state = TASK_RUNNING;
  15.266 @@ -182,31 +191,25 @@ void init_idle_task(void)
  15.267      spin_unlock_irqrestore(&schedule_lock[p->processor], flags);
  15.268  }
  15.269  
  15.270 -
  15.271  void __wake_up(struct task_struct *p)
  15.272  {
  15.273 +    TRACE_3D(TRC_SCHED_WAKE, _HIGH32(p->domain), _LOW32(p->domain), p);
  15.274 +
  15.275      ASSERT(p->state != TASK_DYING);
  15.276  
  15.277      if ( unlikely(__task_on_runqueue(p)) )
  15.278          return;
  15.279  
  15.280      p->state = TASK_RUNNING;
  15.281 -    __add_to_runqueue_head(p);
  15.282  
  15.283 -    /* set the BVT parameters */
  15.284 -    if (p->avt < schedule_data[p->processor].svt)
  15.285 -        p->avt = schedule_data[p->processor].svt;
  15.286 -
  15.287 -    /* deal with warping here */
  15.288 -    p->warpback  = 1;
  15.289 -    p->warped    = NOW();
  15.290 -    __calc_evt(p);
  15.291 +    SCHED_FN_VOID(wake_up, p);
  15.292  
  15.293  #ifdef WAKEUP_HISTO
  15.294      p->wokenup = NOW();
  15.295  #endif
  15.296  }
  15.297  
  15.298 +
  15.299  void wake_up(struct task_struct *p)
  15.300  {
  15.301      unsigned long flags;
  15.302 @@ -220,9 +223,10 @@ void wake_up(struct task_struct *p)
  15.303   */
  15.304  static long do_block(void)
  15.305  {
  15.306 +    ASSERT(current->domain != IDLE_DOMAIN_ID);
  15.307      set_bit(EVENTS_MASTER_ENABLE_BIT, &current->shared_info->events_mask);
  15.308      current->state = TASK_INTERRUPTIBLE;
  15.309 -    current->warpback = 0; 
  15.310 +    TRACE_2D(TRC_SCHED_BLOCK, current->domain, current);
  15.311      __enter_scheduler();
  15.312      return 0;
  15.313  }
  15.314 @@ -232,6 +236,7 @@ static long do_block(void)
  15.315   */
  15.316  static long do_yield(void)
  15.317  {
  15.318 +    TRACE_2D(TRC_SCHED_YIELD, current->domain, current);
  15.319      __enter_scheduler();
  15.320      return 0;
  15.321  }
  15.322 @@ -295,37 +300,56 @@ long do_set_timer_op(unsigned long timeo
  15.323          add_ac_timer(&p->timer);
  15.324      }
  15.325  
  15.326 +    TRACE_5D(TRC_SCHED_SET_TIMER, _HIGH32(p->domain), _LOW32(p->domain),
  15.327 +             p, timeout_hi, timeout_lo);
  15.328 +
  15.329      return 0;
  15.330  }
  15.331  
  15.332  
  15.333 -/* Control the scheduler. */
  15.334 -long sched_bvtctl(unsigned long c_allow)
  15.335 +/**
  15.336 + * sched_ctl - dispatch a scheduler control operation
  15.337 + * @cmd:       the command passed in the dom0 op
  15.338 + *
  15.339 + * Given a generic scheduler control operation, call the control function for
  15.340 + * the scheduler in use, passing the appropriate control information from the
  15.341 + * union supplied.
  15.342 + */
  15.343 +long sched_ctl(struct sched_ctl_cmd *cmd)
  15.344  {
  15.345 -    ctx_allow = c_allow;
  15.346 -    return 0;
  15.347 +    TRACE_0D(TRC_SCHED_CTL);
  15.348 +
  15.349 +    if ( cmd->if_ver != SCHED_CTL_IF_VER )
  15.350 +        return -EACCES;
  15.351 +
  15.352 +    if ( cmd->sched_id != ops.sched_id )
  15.353 +        return -EINVAL;
  15.354 +
  15.355 +    return SCHED_FN_RET(control, cmd);
  15.356  }
  15.357  
  15.358 +
  15.359  /* Adjust scheduling parameter for a given domain. */
  15.360 -long sched_adjdom(domid_t dom, unsigned long mcu_adv, unsigned long warp, 
  15.361 -                 unsigned long warpl, unsigned long warpu)
  15.362 +long sched_adjdom(struct sched_adjdom_cmd *cmd)
  15.363  {
  15.364 -    struct task_struct *p;
  15.365 +    struct task_struct *p;    
  15.366 +    
  15.367 +    if ( cmd->if_ver != SCHED_CTL_IF_VER )
  15.368 +        return -EACCES;
  15.369  
  15.370 -    /* Sanity -- this can avoid divide-by-zero. */
  15.371 -    if ( mcu_adv == 0 )
  15.372 +    if ( cmd->sched_id != ops.sched_id )
  15.373          return -EINVAL;
  15.374  
  15.375 -    p = find_domain_by_id(dom);
  15.376 -    if ( p == NULL ) 
  15.377 +    p = find_domain_by_id(cmd->domain);
  15.378 +
  15.379 +    if( p == NULL )
  15.380          return -ESRCH;
  15.381  
  15.382 -    spin_lock_irq(&schedule_lock[p->processor]);   
  15.383 -    p->mcu_advance = mcu_adv;
  15.384 -    spin_unlock_irq(&schedule_lock[p->processor]); 
  15.385 +    TRACE_2D(TRC_SCHED_ADJDOM, _HIGH32(p->domain), _LOW32(p->domain));
  15.386  
  15.387 -    put_task_struct(p);
  15.388 +    SCHED_FN_VOID(adjdom, p, cmd);
  15.389  
  15.390 +    put_task_struct(p); 
  15.391      return 0;
  15.392  }
  15.393  
  15.394 @@ -339,17 +363,19 @@ long sched_adjdom(domid_t dom, unsigned 
  15.395   */
  15.396  unsigned long __reschedule(struct task_struct *p)
  15.397  {
  15.398 -    int cpu = p->processor;
  15.399 +       int cpu = p->processor;
  15.400      struct task_struct *curr;
  15.401      s_time_t now, min_time;
  15.402  
  15.403 +    TRACE_3D(TRC_SCHED_RESCHED, _HIGH32(p->domain), _LOW32(p->domain), p);
  15.404 +
  15.405      if ( unlikely(p->has_cpu || !__task_on_runqueue(p)) )
  15.406          return 0;
  15.407  
  15.408      now = NOW();
  15.409      curr = schedule_data[cpu].curr;
  15.410      /* domain should run at least for ctx_allow */
  15.411 -    min_time = curr->lastschd + ctx_allow;
  15.412 +    min_time = curr->lastschd + curr->min_slice;
  15.413  
  15.414      if ( is_idle_task(curr) || (min_time <= now) )
  15.415      {
  15.416 @@ -362,161 +388,67 @@ unsigned long __reschedule(struct task_s
  15.417      if ( schedule_data[cpu].s_timer.expires > min_time + TIME_SLOP )
  15.418          mod_ac_timer(&schedule_data[cpu].s_timer, min_time);
  15.419  
  15.420 -    return 0;
  15.421 +    return SCHED_FN_RET(reschedule, p);
  15.422  }
  15.423  
  15.424 -
  15.425  void reschedule(struct task_struct *p)
  15.426  {
  15.427      unsigned long flags, cpu_mask;
  15.428 +
  15.429      spin_lock_irqsave(&schedule_lock[p->processor], flags);
  15.430      cpu_mask = __reschedule(p);
  15.431 +
  15.432      spin_unlock_irqrestore(&schedule_lock[p->processor], flags);
  15.433      hyp_event_notify(cpu_mask);
  15.434  }
  15.435  
  15.436 -
  15.437  /* 
  15.438   * The main function
  15.439 - * - deschedule the current domain.
  15.440 - * - pick a new domain.
  15.441 - *   i.e., the domain with lowest EVT.
  15.442 - *   The runqueue should be ordered by EVT so that is easy.
  15.443 + * - deschedule the current domain (scheduler independent).
  15.444 + * - pick a new domain (scheduler dependent).
  15.445   */
  15.446  asmlinkage void __enter_scheduler(void)
  15.447  {
  15.448 -    struct task_struct *prev = current, *next = NULL, *next_prime, *p;
  15.449 -    struct list_head   *tmp;
  15.450 +    struct task_struct *prev = current, *next = NULL;
  15.451      int                 cpu = prev->processor;
  15.452      s_time_t            now;
  15.453 +    task_slice_t        next_slice;
  15.454      s32                 r_time;     /* time for new dom to run */
  15.455 -    s32                 ranfor;     /* assume we never run longer than 2.1s! */
  15.456 -    s32                 mcus;
  15.457 -    u32                 next_evt, next_prime_evt, min_avt;
  15.458  
  15.459      perfc_incrc(sched_run);
  15.460  
  15.461 +    clear_bit(_HYP_EVENT_NEED_RESCHED, &prev->hyp_events);
  15.462 +
  15.463      spin_lock_irq(&schedule_lock[cpu]);
  15.464  
  15.465      now = NOW();
  15.466  
  15.467      rem_ac_timer(&schedule_data[cpu].s_timer);
  15.468 -
  15.469 +    
  15.470      ASSERT(!in_interrupt());
  15.471      ASSERT(__task_on_runqueue(prev));
  15.472      ASSERT(prev->state != TASK_UNINTERRUPTIBLE);
  15.473 +    ASSERT(prev != NULL);
  15.474  
  15.475 -    if ( likely(!is_idle_task(prev)) ) 
  15.476 +    if ( prev->state == TASK_INTERRUPTIBLE )
  15.477      {
  15.478 -        ranfor = (s32)(now - prev->lastschd);
  15.479 -        prev->cpu_time += ranfor;
  15.480 -    
  15.481 -        /* Calculate mcu and update avt. */
  15.482 -        mcus = (ranfor + MCU - 1) / MCU;
  15.483 -        prev->avt += mcus * prev->mcu_advance;
  15.484 -        
  15.485 -        __calc_evt(prev);
  15.486 -        
  15.487 -        __del_from_runqueue(prev);
  15.488 -        
  15.489 -        if ( likely(prev->state == TASK_RUNNING) ||
  15.490 -             unlikely((prev->state == TASK_INTERRUPTIBLE) && 
  15.491 -                      signal_pending(prev)) )
  15.492 -        {
  15.493 +        /* this check is needed to avoid a race condition */
  15.494 +        if ( signal_pending(prev) )
  15.495              prev->state = TASK_RUNNING;
  15.496 -            __add_to_runqueue_tail(prev);
  15.497 -        }
  15.498 +        else
  15.499 +            SCHED_FN_VOID(do_block, prev);
  15.500      }
  15.501  
  15.502 -    clear_bit(_HYP_EVENT_NEED_RESCHED, &prev->hyp_events);
  15.503 -
  15.504 -    /* We should at least have the idle task */
  15.505 -    ASSERT(!list_empty(&schedule_data[cpu].runqueue));
  15.506 -
  15.507 -    /*
  15.508 -     * scan through the run queue and pick the task with the lowest evt
  15.509 -     * *and* the task the second lowest evt.
  15.510 -     * this code is O(n) but we expect n to be small.
  15.511 -     */
  15.512 -    next       = schedule_data[cpu].idle;
  15.513 -    next_prime = NULL;
  15.514 -
  15.515 -    next_evt       = ~0U;
  15.516 -    next_prime_evt = ~0U;
  15.517 -    min_avt        = ~0U;
  15.518 -
  15.519 -    list_for_each ( tmp, &schedule_data[cpu].runqueue )
  15.520 -    {
  15.521 -        p = list_entry(tmp, struct task_struct, run_list);
  15.522 -        if ( p->evt < next_evt )
  15.523 -        {
  15.524 -            next_prime     = next;
  15.525 -            next_prime_evt = next_evt;
  15.526 -            next = p;
  15.527 -            next_evt = p->evt;
  15.528 -        } 
  15.529 -        else if ( next_prime_evt == ~0U )
  15.530 -        {
  15.531 -            next_prime_evt = p->evt;
  15.532 -            next_prime     = p;
  15.533 -        } 
  15.534 -        else if ( p->evt < next_prime_evt )
  15.535 -        {
  15.536 -            next_prime_evt = p->evt;
  15.537 -            next_prime     = p;
  15.538 -        }
  15.539 -
  15.540 -        /* Determine system virtual time. */
  15.541 -        if ( p->avt < min_avt )
  15.542 -            min_avt = p->avt;
  15.543 -    }
  15.544 +    /* get policy-specific decision on scheduling... */
  15.545 +    next_slice = ops.do_schedule(now);
  15.546  
  15.547 -    /* Update system virtual time. */
  15.548 -    if ( min_avt != ~0U )
  15.549 -        schedule_data[cpu].svt = min_avt;
  15.550 -
  15.551 -    /* check for virtual time overrun on this cpu */
  15.552 -    if ( schedule_data[cpu].svt >= 0xf0000000 )
  15.553 -    {
  15.554 -        u_long t_flags; 
  15.555 -        write_lock_irqsave(&tasklist_lock, t_flags); 
  15.556 -        for_each_domain ( p )
  15.557 -        {
  15.558 -            if ( p->processor == cpu )
  15.559 -            {
  15.560 -                p->evt -= 0xe0000000;
  15.561 -                p->avt -= 0xe0000000;
  15.562 -            }
  15.563 -        } 
  15.564 -        write_unlock_irqrestore(&tasklist_lock, t_flags); 
  15.565 -        schedule_data[cpu].svt -= 0xe0000000;
  15.566 -    }
  15.567 +    r_time = next_slice.time;
  15.568 +    next   = next_slice.task;
  15.569  
  15.570 -    /* work out time for next run through scheduler */
  15.571 -    if ( is_idle_task(next) ) 
  15.572 -    {
  15.573 -        r_time = ctx_allow;
  15.574 -        goto sched_done;
  15.575 -    }
  15.576 +    if ( likely(!is_idle_task(prev)) ) 
  15.577 +        prev->cpu_time += (now - prev->lastschd);
  15.578  
  15.579 -    if ( (next_prime == NULL) || is_idle_task(next_prime) )
  15.580 -    {
  15.581 -        /* We have only one runnable task besides the idle task. */
  15.582 -        r_time = 10 * ctx_allow;     /* RN: random constant */
  15.583 -        goto sched_done;
  15.584 -    }
  15.585 -
  15.586 -    /*
  15.587 -     * If we are here then we have two runnable tasks.
  15.588 -     * Work out how long 'next' can run till its evt is greater than
  15.589 -     * 'next_prime's evt. Take context switch allowance into account.
  15.590 -     */
  15.591 -    ASSERT(next_prime->evt >= next->evt);
  15.592 -    
  15.593 -    r_time = ((next_prime->evt - next->evt)/next->mcu_advance) + ctx_allow;
  15.594 -
  15.595 - sched_done:
  15.596 -    ASSERT(r_time >= ctx_allow);
  15.597 +    /* now, switch to the new task... */
  15.598  
  15.599      prev->has_cpu = 0;
  15.600      next->has_cpu = 1;
  15.601 @@ -537,7 +469,7 @@ asmlinkage void __enter_scheduler(void)
  15.602  
  15.603      if ( unlikely(prev == next) )
  15.604          return;
  15.605 -
  15.606 +    
  15.607      perfc_incrc(sched_ctx);
  15.608  
  15.609  #if defined(WAKEUP_HISTO)
  15.610 @@ -558,6 +490,10 @@ asmlinkage void __enter_scheduler(void)
  15.611      }
  15.612  #endif
  15.613  
  15.614 +    TRACE_2D(TRC_SCHED_SWITCH, next->domain, next);
  15.615 +
  15.616 +    ASSERT(next->processor == current->processor);
  15.617 +
  15.618      switch_to(prev, next);
  15.619      
  15.620      if ( unlikely(prev->state == TASK_DYING) ) 
  15.621 @@ -591,6 +527,8 @@ int idle_cpu(int cpu)
  15.622  /* The scheduler timer: force a run through the scheduler*/
  15.623  static void s_timer_fn(unsigned long unused)
  15.624  {
  15.625 +    TRACE_0D(TRC_SCHED_S_TIMER_FN);
  15.626 +    
  15.627      set_bit(_HYP_EVENT_NEED_RESCHED, &current->hyp_events);
  15.628      perfc_incrc(sched_irq);
  15.629  }
  15.630 @@ -600,6 +538,8 @@ static void t_timer_fn(unsigned long unu
  15.631  {
  15.632      struct task_struct *p = current;
  15.633  
  15.634 +    TRACE_0D(TRC_SCHED_T_TIMER_FN);
  15.635 +
  15.636      if ( !is_idle_task(p) ) 
  15.637          set_bit(_EVENT_TIMER, &p->shared_info->events);
  15.638  
  15.639 @@ -613,6 +553,8 @@ static void dom_timer_fn(unsigned long d
  15.640      unsigned long cpu_mask = 0;
  15.641      struct task_struct *p = (struct task_struct *)data;
  15.642  
  15.643 +    TRACE_0D(TRC_SCHED_DOM_TIMER_FN);
  15.644 +
  15.645      cpu_mask |= mark_guest_event(p, _EVENT_TIMER);
  15.646      guest_event_notify(cpu_mask);
  15.647  }
  15.648 @@ -623,6 +565,8 @@ static void fallback_timer_fn(unsigned l
  15.649  {
  15.650      struct task_struct *p = current;
  15.651  
  15.652 +    TRACE_0D(TRC_SCHED_FALLBACK_TIMER_FN);
  15.653 +
  15.654      if ( !is_idle_task(p) )
  15.655          update_dom_time(p->shared_info);
  15.656  
  15.657 @@ -660,6 +604,29 @@ void __init scheduler_init(void)
  15.658      }
  15.659  
  15.660      schedule_data[0].idle = &idle0_task;
  15.661 +
  15.662 +    extern char opt_sched[];
  15.663 +
  15.664 +    for ( i = 0; schedulers[i] != NULL; i++ )
  15.665 +    {
  15.666 +        ops = *schedulers[i]; /* fetch operations structure */
  15.667 +
  15.668 +        if(strcmp(ops.opt_name, opt_sched) == 0)
  15.669 +            break;
  15.670 +    }
  15.671 +    
  15.672 +    if ( schedulers[i] == NULL )
  15.673 +        printk("Could not find scheduler: %s\n", opt_sched);
  15.674 +
  15.675 +    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
  15.676 +    
  15.677 +    if ( ops.do_schedule == NULL)
  15.678 +        panic("Chosen scheduler has NULL do_schedule!");
  15.679 +
  15.680 +    if ( SCHED_FN_RET(init_scheduler) < 0 )
  15.681 +        panic("Initialising scheduler failed!");
  15.682 +
  15.683 +    SCHED_FN_VOID(add_task, &idle0_task);
  15.684  }
  15.685  
  15.686  /*
  15.687 @@ -698,12 +665,9 @@ static void dump_rqueue(struct list_head
  15.688              (unsigned long) queue->next, (unsigned long) queue->prev);
  15.689      list_for_each (list, queue) {
  15.690          p = list_entry(list, struct task_struct, run_list);
  15.691 -        printk("%3d: %llu has=%c mcua=0x%04lX"
  15.692 -               " ev=0x%08X av=0x%08X c=0x%X%08X\n",
  15.693 -               loop++, p->domain,
  15.694 -               p->has_cpu ? 'T':'F',
  15.695 -               p->mcu_advance, p->evt, p->avt,
  15.696 -               (u32)(p->cpu_time>>32), (u32)p->cpu_time);
  15.697 +        printk("%3d: %llu has=%c ", loop++, p->domain, p->has_cpu ? 'T':'F');
  15.698 +        SCHED_FN_VOID(dump_runq_el, p);
  15.699 +        printk("c=0x%X%08X\n", (u32)(p->cpu_time>>32), (u32)p->cpu_time);
  15.700          printk("         l: %lx n: %lx  p: %lx\n",
  15.701                 (unsigned long)list, (unsigned long)list->next,
  15.702                 (unsigned long)list->prev);
  15.703 @@ -717,11 +681,13 @@ void dump_runq(u_char key, void *dev_id,
  15.704      s_time_t now = NOW();
  15.705      int i;
  15.706  
  15.707 -    printk("BVT: mcu=0x%08Xns ctx_allow=0x%08Xns NOW=0x%08X%08X\n",
  15.708 -           (u32)MCU, (u32)ctx_allow, (u32)(now>>32), (u32)now); 
  15.709 +	printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
  15.710 +    SCHED_FN_VOID(dump_settings);
  15.711 +    printk("NOW=0x%08X%08X\n",  (u32)(now>>32), (u32)now); 
  15.712      for (i = 0; i < smp_num_cpus; i++) {
  15.713          spin_lock_irqsave(&schedule_lock[i], flags);
  15.714 -        printk("CPU[%02d] svt=0x%08X ", i, (s32)schedule_data[i].svt);
  15.715 +        printk("CPU[%02d] ", i);
  15.716 +        SCHED_FN_VOID(dump_cpu_state,i);
  15.717          dump_rqueue(&schedule_data[i].runqueue, "rq"); 
  15.718          spin_unlock_irqrestore(&schedule_lock[i], flags);
  15.719      }
    16.1 --- a/xen/include/hypervisor-ifs/dom0_ops.h	Wed Mar 17 17:13:18 2004 +0000
    16.2 +++ b/xen/include/hypervisor-ifs/dom0_ops.h	Wed Mar 17 18:31:06 2004 +0000
    16.3 @@ -11,13 +11,14 @@
    16.4  #define __DOM0_OPS_H__
    16.5  
    16.6  #include "hypervisor-if.h"
    16.7 +#include "sched-ctl.h"
    16.8  
    16.9  /*
   16.10   * Make sure you increment the interface version whenever you modify this file!
   16.11   * This makes sure that old versions of dom0 tools will stop working in a
   16.12   * well-defined way (rather than crashing the machine, for instance).
   16.13   */
   16.14 -#define DOM0_INTERFACE_VERSION   0xAAAA0008
   16.15 +#define DOM0_INTERFACE_VERSION   0xAAAA0009
   16.16  
   16.17  #define MAX_CMD_LEN       256
   16.18  #define MAX_DOMAIN_NAME    16
   16.19 @@ -74,23 +75,13 @@ typedef struct dom0_builddomain_st
   16.20      full_execution_context_t ctxt;
   16.21  } dom0_builddomain_t;
   16.22  
   16.23 -#define DOM0_BVTCTL            6
   16.24 -typedef struct dom0_bvtctl_st
   16.25 -{
   16.26 -    /* IN variables. */
   16.27 -    unsigned long ctx_allow;  /* context switch allowance */
   16.28 -} dom0_bvtctl_t;
   16.29 +#define DOM0_SCHEDCTL            6
   16.30 + /* struct sched_ctl_cmd is from sched-ctl.h   */
   16.31 +typedef struct sched_ctl_cmd dom0_schedctl_t;
   16.32  
   16.33  #define DOM0_ADJUSTDOM         7
   16.34 -typedef struct dom0_adjustdom_st
   16.35 -{
   16.36 -    /* IN variables. */
   16.37 -    domid_t       domain;     /* domain id */
   16.38 -    unsigned long mcu_adv;    /* mcu advance: inverse of weight */
   16.39 -    unsigned long warp;       /* time warp */
   16.40 -    unsigned long warpl;      /* warp limit */
   16.41 -    unsigned long warpu;      /* unwarp time requirement */
   16.42 -} dom0_adjustdom_t;
   16.43 +/* struct sched_adjdom_cmd is from sched-ctl.h */
   16.44 +typedef struct sched_adjdom_cmd dom0_adjustdom_t;
   16.45  
   16.46  #define DOM0_GETDOMAININFO    12
   16.47  typedef struct dom0_getdomaininfo_st
   16.48 @@ -234,7 +225,7 @@ typedef struct dom0_op_st
   16.49          dom0_stopdomain_t       stopdomain;
   16.50          dom0_destroydomain_t    destroydomain;
   16.51          dom0_getmemlist_t       getmemlist;
   16.52 -        dom0_bvtctl_t           bvtctl;
   16.53 +        dom0_schedctl_t         schedctl;
   16.54          dom0_adjustdom_t        adjustdom;
   16.55          dom0_builddomain_t      builddomain;
   16.56          dom0_getdomaininfo_t    getdomaininfo;
    17.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    17.2 +++ b/xen/include/hypervisor-ifs/sched-ctl.h	Wed Mar 17 18:31:06 2004 +0000
    17.3 @@ -0,0 +1,68 @@
    17.4 +/**
    17.5 + * Generic scheduler control interface.
    17.6 + *
    17.7 + * Mark Williamson, (C) 2004 Intel Research Cambridge
    17.8 + */
    17.9 +
   17.10 +#ifndef _SCHED_CTL_H_
   17.11 +#define _SCHED_CTL_H_
   17.12 +
   17.13 +/**
   17.14 + * When this file is changed, increment the version number.  This ensures that
   17.15 + * tools will refuse to work (rather than causing a crash) when they're
   17.16 + * out-of-sync with the Xen version number.
   17.17 + */
   17.18 +#define SCHED_CTL_IF_VER 0x0001
   17.19 +
   17.20 +/* scheduler types */
   17.21 +#define SCHED_BVT      0
   17.22 +#define SCHED_ATROPOS  1
   17.23 +#define SCHED_RROBIN   2
   17.24 +
   17.25 +/* generic scheduler control command - union of all scheduler control
   17.26 + * command structures */
   17.27 +struct sched_ctl_cmd
   17.28 +{
   17.29 +    unsigned int if_ver;
   17.30 +    unsigned int sched_id;
   17.31 +    
   17.32 +    union
   17.33 +    {
   17.34 +        struct bvt_ctl
   17.35 +        {
   17.36 +            /* IN variables. */
   17.37 +            unsigned long ctx_allow;  /* context switch allowance */
   17.38 +        } bvt;
   17.39 +
   17.40 +        struct rrobin_ctl
   17.41 +        {
   17.42 +            /* IN variables */
   17.43 +            u64 slice;                /* round robin time slice */
   17.44 +        } rrobin;
   17.45 +    } u;
   17.46 +};
   17.47 +
   17.48 +struct sched_adjdom_cmd
   17.49 +{
   17.50 +    unsigned int if_ver;
   17.51 +    unsigned int sched_id;
   17.52 +    domid_t domain;
   17.53 +    
   17.54 +    union
   17.55 +    {
   17.56 +        struct bvt_adjdom
   17.57 +        {
   17.58 +            unsigned long mcu_adv;    /* mcu advance: inverse of weight */
   17.59 +            unsigned long warp;       /* time warp */
   17.60 +            unsigned long warpl;      /* warp limit */
   17.61 +            unsigned long warpu;      /* unwarp time requirement */
   17.62 +        } bvt;
   17.63 +
   17.64 +        struct atropos_adjdom
   17.65 +        {
   17.66 +            int xtratime;
   17.67 +        } atropos;
   17.68 +    } u;
   17.69 +};
   17.70 +
   17.71 +#endif /* _SCHED_CTL_H_ */
    18.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    18.2 +++ b/xen/include/xeno/sched-if.h	Wed Mar 17 18:31:06 2004 +0000
    18.3 @@ -0,0 +1,90 @@
    18.4 +#include <asm/types.h>
    18.5 +
    18.6 +/*
    18.7 + * Additional declarations for the generic scheduler interface.  This should
    18.8 + * only be included by files that implement conforming schedulers.
    18.9 + *
   18.10 + * Portions by Mark Williamson are (C) 2004 Intel Research Cambridge
   18.11 + */
   18.12 +
   18.13 +#define BUCKETS 10
   18.14 +
   18.15 +typedef struct schedule_data_st
   18.16 +{
   18.17 +    struct list_head    runqueue;       /* runqueue */
   18.18 +    struct task_struct *curr;           /* current task */
   18.19 +    struct task_struct *idle;           /* idle task for this cpu */
   18.20 +    void *              sched_priv;
   18.21 +    struct ac_timer     s_timer;        /* scheduling timer  */
   18.22 +#ifdef BUCKETS
   18.23 +    u32                 hist[BUCKETS];  /* for scheduler latency histogram */
   18.24 +#endif
   18.25 +} __cacheline_aligned schedule_data_t;
   18.26 +
   18.27 +
   18.28 +typedef struct task_slice_st
   18.29 +{
   18.30 +    struct task_struct *task;
   18.31 +    s_time_t            time;
   18.32 +} task_slice_t;
   18.33 +
   18.34 +struct scheduler
   18.35 +{
   18.36 +    char *name;             /* full name for this scheduler      */
   18.37 +    char *opt_name;         /* option name for this scheduler    */
   18.38 +    unsigned int sched_id;  /* ID for this scheduler             */
   18.39 +
   18.40 +    int          (*init_scheduler) ();
   18.41 +    int          (*alloc_task)     (struct task_struct *);
   18.42 +    void         (*add_task)       (struct task_struct *);
   18.43 +    void         (*free_task)      (struct task_struct *);
   18.44 +    void         (*rem_task)       (struct task_struct *);
   18.45 +    void         (*wake_up)        (struct task_struct *);
   18.46 +    /* XXX why does do_block need to return anything at all? */
   18.47 +    long         (*do_block)       (struct task_struct *);
   18.48 +    task_slice_t (*do_schedule)    (s_time_t);
   18.49 +    int          (*control)        (struct sched_ctl_cmd *);
   18.50 +    int          (*adjdom)         (struct task_struct *,
   18.51 +                                    struct sched_adjdom_cmd *);
   18.52 +    s32          (*reschedule)     (struct task_struct *);
   18.53 +    void         (*dump_settings)  (void);
   18.54 +    void         (*dump_cpu_state) (int);
   18.55 +    void         (*dump_runq_el)   (struct task_struct *);
   18.56 +};
   18.57 +
   18.58 +/* per CPU scheduler information */
   18.59 +extern schedule_data_t schedule_data[];
   18.60 +
   18.61 +/*
   18.62 + * Wrappers for run-queue management. Must be called with the schedule_lock
   18.63 + * held.
   18.64 + */
   18.65 +static inline void __add_to_runqueue_head(struct task_struct * p)
   18.66 +{    
   18.67 +    list_add(&p->run_list, &schedule_data[p->processor].runqueue);
   18.68 +}
   18.69 +
   18.70 +static inline void __add_to_runqueue_tail(struct task_struct * p)
   18.71 +{
   18.72 +    list_add_tail(&p->run_list, &schedule_data[p->processor].runqueue);
   18.73 +}
   18.74 +
   18.75 +static inline void __del_from_runqueue(struct task_struct * p)
   18.76 +{
   18.77 +    list_del(&p->run_list);
   18.78 +    p->run_list.next = NULL;
   18.79 +}
   18.80 +
   18.81 +static inline int __task_on_runqueue(struct task_struct *p)
   18.82 +{
   18.83 +    return p->run_list.next != NULL;
   18.84 +}
   18.85 +
    18.86 +#define next_domain(p) \
   18.87 +        list_entry((p)->run_list.next, struct task_struct, run_list)
   18.88 +
   18.89 +
   18.90 +static inline int __runqueue_empty(int cpu)
   18.91 +{
   18.92 +    return list_empty(&schedule_data[cpu].runqueue);
   18.93 +}
    19.1 --- a/xen/include/xeno/sched.h	Wed Mar 17 17:13:18 2004 +0000
    19.2 +++ b/xen/include/xeno/sched.h	Wed Mar 17 18:31:06 2004 +0000
    19.3 @@ -114,16 +114,9 @@ struct task_struct
    19.4      s_time_t         wokenup;       /* time domain got woken up */
    19.5      struct ac_timer  timer;         /* one-shot timer for timeout values */
    19.6  
    19.7 -    /* BVT scheduler specific. */
    19.8 -    unsigned long mcu_advance;      /* inverse of weight */
    19.9 -    u32           avt;              /* actual virtual time */
   19.10 -    u32           evt;              /* effective virtual time */
   19.11 -    int           warpback;         /* warp?  */
   19.12 -    long          warp;             /* virtual time warp */
   19.13 -    long          warpl;            /* warp limit */
   19.14 -    long          warpu;            /* unwarp time requirement */
   19.15 -    s_time_t      warped;           /* time it ran warped last time */
   19.16 -    s_time_t      uwarped;          /* time it ran unwarped last time */
   19.17 +    s_time_t         min_slice;     /* minimum time before reschedule */
   19.18 +
   19.19 +    void *sched_priv;               /* scheduler-specific data */
   19.20  
   19.21      /* Network I/O */
   19.22      net_vif_t *net_vif_list[MAX_DOMAIN_VIFS];
   19.23 @@ -177,6 +170,7 @@ struct task_struct
   19.24  #define TASK_UNINTERRUPTIBLE     2
   19.25  #define TASK_STOPPED             4
   19.26  #define TASK_DYING               8
   19.27 +#define TASK_SCHED_PRIV          16
   19.28  
   19.29  #include <asm/uaccess.h> /* for KERNEL_DS */
   19.30  
   19.31 @@ -186,8 +180,6 @@ struct task_struct
   19.32      domain:      IDLE_DOMAIN_ID, \
   19.33      state:       TASK_RUNNING,   \
   19.34      has_cpu:     0,              \
   19.35 -    evt:         0xffffffff,     \
   19.36 -    avt:         0xffffffff,     \
   19.37      mm:          IDLE0_MM,       \
   19.38      addr_limit:  KERNEL_DS,      \
   19.39      thread:      INIT_THREAD,    \
   19.40 @@ -202,9 +194,9 @@ extern struct task_struct *idle_task[NR_
   19.41  
   19.42  #include <xeno/slab.h>
   19.43  
   19.44 -extern kmem_cache_t *task_struct_cachep;
   19.45 -#define alloc_task_struct()  \
   19.46 -  ((struct task_struct *)kmem_cache_alloc(task_struct_cachep,GFP_KERNEL))
   19.47 +void free_task_struct(struct task_struct *p);
   19.48 +struct task_struct *alloc_task_struct();
   19.49 +
   19.50  #define put_task_struct(_p) \
   19.51    if ( atomic_dec_and_test(&(_p)->refcnt) ) release_task(_p)
   19.52  #define get_task_struct(_p)  \
   19.53 @@ -251,15 +243,14 @@ extern spinlock_t schedule_lock[NR_CPUS]
   19.54  void scheduler_init(void);
   19.55  void schedulers_start(void);
   19.56  void sched_add_domain(struct task_struct *p);
   19.57 -int sched_rem_domain(struct task_struct *p);
   19.58 -long sched_bvtctl(unsigned long ctx_allow);
   19.59 -long sched_adjdom(domid_t dom, unsigned long mcu_adv, unsigned long warp, 
   19.60 -                  unsigned long warpl, unsigned long warpu);
   19.61 +int  sched_rem_domain(struct task_struct *p);
   19.62 +long sched_ctl(struct sched_ctl_cmd *);
   19.63 +long sched_adjdom(struct sched_adjdom_cmd *);
   19.64  void init_idle_task(void);
   19.65  void __wake_up(struct task_struct *p);
   19.66  void wake_up(struct task_struct *p);
   19.67 +void reschedule(struct task_struct *p);
   19.68  unsigned long __reschedule(struct task_struct *p);
   19.69 -void reschedule(struct task_struct *p);
   19.70  
   19.71  /* NB. Limited entry in Xen. Not for arbitrary use! */
   19.72  asmlinkage void __enter_scheduler(void);
   19.73 @@ -302,4 +293,4 @@ extern struct task_struct *task_list;
   19.74  
   19.75  extern void update_process_times(int user);
   19.76  
   19.77 -#endif
   19.78 +#endif /*_LINUX_SCHED_H */