ia64/xen-unstable

changeset 1308:0eac2a407546

bitkeeper revision 1.868 (4083e83c-_BTzCUkGyO_-jRGzSENLg)

Merge ssh://srg//auto/groups/xeno/BK/xeno.bk
into equilibrium.research.intel-research.net:/home/irchomes/mwilli2/src/xeno.bk
author mwilli2@equilibrium.research.intel-research.net
date Mon Apr 19 14:54:52 2004 +0000 (2004-04-19)
parents ee3ca1b3f62a 8fd9232c2133
children 2d0465936335
files .rootkeys docs/Sched-HOWTO.txt docs/Xen-HOWTO.txt docs/interface.tex tools/examples/README tools/examples/xc_dom_control.py tools/xc/lib/xc_atropos.c tools/xc/py/Xc.c xen/common/sched_atropos.c xen/include/hypervisor-ifs/dom0_ops.h xen/include/hypervisor-ifs/sched_ctl.h
line diff
     1.1 --- a/.rootkeys	Mon Apr 19 14:03:03 2004 +0000
     1.2 +++ b/.rootkeys	Mon Apr 19 14:54:52 2004 +0000
     1.3 @@ -8,6 +8,7 @@ 3f5ef5a2l4kfBYSQTUaOyyD76WROZQ README.CD
     1.4  3f69d8abYB1vMyD_QVDvzxy5Zscf1A TODO
     1.5  405ef604hIZH5pGi2uwlrlSvUMrutw docs/Console-HOWTO.txt
     1.6  3f9e7d53iC47UnlfORp9iC1vai6kWw docs/Makefile
     1.7 +4083e798FbE1MIsQaIYvjnx1uvFhBg docs/Sched-HOWTO.txt
     1.8  40083bb4LVQzRqA3ABz0__pPhGNwtA docs/VBD-HOWTO.txt
     1.9  4021053fmeFrEyPHcT8JFiDpLNgtHQ docs/Xen-HOWTO.txt
    1.10  3f9e7d60PWZJeVh5xdnk0nLUdxlqEA docs/eps/xenlogo.eps
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/docs/Sched-HOWTO.txt	Mon Apr 19 14:54:52 2004 +0000
     2.3 @@ -0,0 +1,135 @@
     2.4 +Xen Scheduler HOWTO
     2.5 +===================
     2.6 +
     2.7 +by Mark Williamson
     2.8 +(c) 2004 Intel Research Cambridge
     2.9 +
    2.10 +
    2.11 +Introduction
    2.12 +------------
    2.13 +
    2.14 +Xen offers a choice of CPU schedulers.  All available schedulers are
    2.15 +included in Xen at compile time and the administrator may select a
    2.16 +particular scheduler using a boot-time parameter to Xen.  It is
    2.17 +expected that administrators will choose the scheduler most
    2.18 +appropriate to their application and configure the machine to boot
    2.19 +with that scheduler.
    2.20 +
    2.21 +Note: the default scheduler is the Borrowed Virtual Time (BVT)
    2.22 +scheduler which was also used in previous releases of Xen.  No
    2.23 +configuration changes are required to keep using this scheduler.
    2.24 +
    2.25 +This file provides a brief description of the CPU schedulers available
    2.26 +in Xen, what they are useful for and the parameters that are used to
    2.27 +configure them.  This information is necessarily fairly technical at
    2.28 +the moment.  The recommended way to fully understand the scheduling
    2.29 +algorithms is to read the relevant research papers.
    2.30 +
    2.31 +The interface to the schedulers is basically "raw" at the moment,
    2.32 +without sanity checking - administrators should be careful when
    2.33 +setting the parameters since it is possible for a mistake to hang
    2.34 +domains, or the entire system (in particular, double check parameters
    2.35 +for sanity and make sure that DOM0 will get enough CPU time to remain
    2.36 +usable).  Note that xc_dom_control.py takes time values in
    2.37 +nanoseconds.
    2.38 +
    2.39 +Future tools will implement friendlier control interfaces.
    2.40 +
    2.41 +
    2.42 +Borrowed Virtual Time (BVT)
    2.43 +---------------------------
    2.44 +
    2.45 +All releases of Xen have featured the BVT scheduler, which is used to
    2.46 +provide proportional fair shares of the CPU based on weights assigned
    2.47 +to domains.  BVT is "work conserving" - the CPU will never be left
    2.48 +idle if there are runnable tasks.
    2.49 +
    2.50 +BVT uses "virtual time" to make decisions on which domain should be
    2.51 +scheduled on the processor.  Each time a scheduling decision is
    2.52 +required, BVT evaluates the "Effective Virtual Time" of all domains
    2.53 +and then schedules the domain with the least EVT.  Domains are allowed
    2.54 +to "borrow" virtual time by "time warping", which reduces their EVT by
    2.55 +a certain amount, so that they may be scheduled sooner.  In order to
    2.56 +maintain long term fairness, there are limits on when a domain can
    2.57 +time warp and for how long.  [ For more details read the SOSP'99 paper
    2.58 +by Duda and Cheriton ]
    2.59 +
    2.60 +In the Xen implementation, domains time warp when they unblock, so
    2.61 +that domain wakeup latencies are reduced.
    2.62 +
    2.63 +The BVT algorithm uses the following per-domain parameters (set using
    2.64 +xc_dom_control.py cpu_bvtset):
    2.65 +
    2.66 +* mcuadv - the MCU (Minimum Charging Unit) advance determines the
    2.67 +           proportional share of the CPU that a domain receives.  It
    2.68 +           is set inversely proportionally to a domain's sharing weight.
    2.69 +* warp   - the amount of "virtual time" the domain is allowed to warp
    2.70 +           backwards
    2.71 +* warpl  - the warp limit is the maximum time a domain can run warped for
    2.72 +* warpu  - the unwarp requirement is the minimum time a domain must
    2.73 +           run unwarped for before it can warp again
    2.74 +
    2.75 +BVT also has the following global parameter (set using
    2.76 +xc_dom_control.py cpu_bvtslice):
    2.77 +
    2.78 +* ctx_allow - the context switch allowance is similar to the "quantum"
    2.79 +              in traditional schedulers.  It is the minimum time that
     2.80 +              a scheduled domain will be allowed to run before being
     2.81 +              pre-empted.  This prevents thrashing of the CPU.
    2.82 +
    2.83 +BVT can now be selected by passing the 'sched=bvt' argument to Xen at
    2.84 +boot-time and is the default scheduler if no 'sched' argument is
    2.85 +supplied.
    2.86 +
    2.87 +Atropos
    2.88 +-------
    2.89 +
    2.90 +Atropos is a scheduler originally developed for the Nemesis multimedia
    2.91 +operating system.  Atropos can be used to reserve absolute shares of
    2.92 +the CPU.  It also includes some features to improve the efficiency of
    2.93 +domains that block for I/O and to allow spare CPU time to be shared
    2.94 +out.
    2.95 +
    2.96 +The Atropos algorithm has the following parameters for each domain
    2.97 +(set using xc_dom_control.py cpu_atropos_set):
    2.98 +
    2.99 + * slice    - The length of time per period that a domain is guaranteed.
   2.100 + * period   - The period over which a domain is guaranteed to receive
   2.101 +              its slice of CPU time.
   2.102 + * latency  - The latency hint is used to control how soon after
   2.103 +              waking up a domain should be scheduled.
   2.104 + * xtratime - This is a true (1) / false (0) flag that specifies whether
    2.105 +              a domain should be allowed a share of the system slack time.
   2.106 +
   2.107 +Every domain has an associated period and slice.  The domain should
   2.108 +receive 'slice' nanoseconds every 'period' nanoseconds.  This allows
   2.109 +the administrator to configure both the absolute share of the CPU a
   2.110 +domain receives and the frequency with which it is scheduled.  When
   2.111 +domains unblock, their period is reduced to the value of the latency
   2.112 +hint (the slice is scaled accordingly so that they still get the same
   2.113 +proportion of the CPU).  For each subsequent period, the slice and
   2.114 +period times are doubled until they reach their original values.
   2.115 +
   2.116 +Atropos is selected by adding 'sched=atropos' to Xen's boot-time
   2.117 +arguments.
   2.118 +
   2.119 +Note: don't overcommit the CPU when using Atropos (i.e. don't reserve
   2.120 +more CPU than is available - the utilisation should be kept to
   2.121 +slightly less than 100% in order to ensure predictable behaviour).
   2.122 +
   2.123 +Round-Robin
   2.124 +-----------
   2.125 +
   2.126 +The Round-Robin scheduler is provided as a simple example of Xen's
   2.127 +internal scheduler API.  For production systems, one of the other
   2.128 +schedulers should be used, since they are more flexible and more
   2.129 +efficient.
   2.130 +
   2.131 +The Round-robin scheduler has one global parameter (set using
   2.132 +xc_dom_control.py cpu_rrobin_slice):
   2.133 +
   2.134 + * rr_slice - The time for which each domain runs before the next
   2.135 +              scheduling decision is made.
   2.136 +
   2.137 +The Round-Robin scheduler can be selected by adding 'sched=rrobin' to
   2.138 +Xen's boot-time arguments.
     3.1 --- a/docs/Xen-HOWTO.txt	Mon Apr 19 14:03:03 2004 +0000
     3.2 +++ b/docs/Xen-HOWTO.txt	Mon Apr 19 14:54:52 2004 +0000
     3.3 @@ -231,6 +231,11 @@ The following is a list of command line 
     3.4                    enabled in debug builds.  Most users can ignore
     3.5                    this feature completely.
     3.6  
     3.7 + sched=xxx        Select the CPU scheduler Xen should use.  The current
     3.8 +                  possibilities are 'bvt', 'atropos' and 'rrobin'.  The
     3.9 +                  default is 'bvt'.  For more information see
    3.10 +                  Sched-HOWTO.txt.
    3.11 +
    3.12  Boot into Domain 0
    3.13  ==============================
    3.14  
     4.1 --- a/docs/interface.tex	Mon Apr 19 14:03:03 2004 +0000
     4.2 +++ b/docs/interface.tex	Mon Apr 19 14:54:52 2004 +0000
     4.3 @@ -392,26 +392,14 @@ assigned domains should be run there.
     4.4  
     4.5  \section{Standard Schedulers}
     4.6  
     4.7 -These BVT and Round Robin schedulers are part of the normal Xen
     4.8 -distribution.  A port of the Atropos scheduler from the Nemesis
     4.9 -operating system is almost complete and will be added shortly.
    4.10 -
    4.11 -\subsection{Borrowed Virtual Time (BVT)}
    4.12 +These BVT, Atropos and Round Robin schedulers are part of the normal
     4.13 +Xen distribution.  BVT provides proportional fair shares of the CPU to
    4.14 +the running domains.  Atropos can be used to reserve absolute shares
    4.15 +of the CPU for each domain.  Round-robin is provided as an example of
    4.16 +Xen's internal scheduler API.
    4.17  
    4.18 -This was the original Xen scheduler.  BVT is designed for general-purpose
    4.19 -environments but also provides support for latency-sensitive threads.  It
    4.20 -provides long-term weighted sharing but allows tasks a limited ability to
    4.21 -``warp back'' in virtual time so that they are dispatched earlier.
    4.22 -
    4.23 -BVT can be activated by specifying {\tt sched=bvt} as a boot argument to Xen.
    4.24 -
    4.25 -\subsection{Round Robin}
    4.26 -
    4.27 -The round robin scheduler is a very simple example of some of the basic parts
    4.28 -of the scheduler API.
    4.29 -
    4.30 -Round robin can be activated by specifying {\tt sched=rrobin} as a boot
    4.31 -argument to Xen.
    4.32 +More information on the characteristics and use of these schedulers is
     4.33 +available in {\tt Sched-HOWTO.txt}.
    4.34  
    4.35  \section{Scheduling API}
    4.36  
    4.37 @@ -419,9 +407,6 @@ The scheduling API is used by both the s
    4.38  also be used by any new schedulers.  It provides a generic interface and also
    4.39  implements much of the ``boilerplate'' code.
    4.40  
    4.41 -\paragraph*{Note:} the scheduler API is currently undergoing active development,
    4.42 -so there may be some changes to this API, although they are expected to be small.
    4.43 -
    4.44  Schedulers conforming to this API are described by the following
    4.45  structure:
    4.46  
    4.47 @@ -438,7 +423,7 @@ struct scheduler
    4.48      void         (*free_task)      (struct task_struct *);
    4.49      void         (*rem_task)       (struct task_struct *);
    4.50      void         (*wake_up)        (struct task_struct *);
    4.51 -    long         (*do_block)       (struct task_struct *);
    4.52 +    void         (*do_block)       (struct task_struct *);
    4.53      task_slice_t (*do_schedule)    (s_time_t);
    4.54      int          (*control)        (struct sched_ctl_cmd *);
    4.55      int          (*adjdom)         (struct task_struct *,
    4.56 @@ -458,7 +443,7 @@ The fields of the above structure are de
    4.57  
    4.58  \subsubsection{name}
    4.59  
    4.60 -The name field is an arbitrary descriptive ASCII string.
    4.61 +The name field should point to a descriptive ASCII string.
    4.62  
    4.63  \subsubsection{opt\_name}
    4.64  
    4.65 @@ -486,22 +471,22 @@ selected.
    4.66  
    4.67  \paragraph*{Return values}
    4.68  
    4.69 -This should return negative on failure --- failure to initialise the scheduler
    4.70 -will cause an immediate panic.
    4.71 +This should return negative on failure --- this will cause an
    4.72 +immediate panic and the system will fail to boot.
    4.73  
    4.74  \subsubsection{alloc\_task}
    4.75  
    4.76  \paragraph*{Purpose}
    4.77 -This is called when a {\tt task\_struct} is allocated by the generic scheduler
    4.78 -layer.  A particular scheduler implementation may use this method to allocate
    4.79 -per-task data for this task.  It may use the {\tt sched\_priv} pointer in the
    4.80 -{\tt task\_struct} to point to this data.
    4.81 +Called when a {\tt task\_struct} is allocated by the generic scheduler
    4.82 +layer.  A particular scheduler implementation may use this method to
    4.83 +allocate per-task data for this task.  It may use the {\tt
    4.84 +sched\_priv} pointer in the {\tt task\_struct} to point to this data.
    4.85  
    4.86  \paragraph*{Call environment}
    4.87  The generic layer guarantees that the {\tt sched\_priv} field will
    4.88  remain intact from the time this method is called until the task is
    4.89  deallocated (so long as the scheduler implementation does not change
    4.90 -it!).
    4.91 +it explicitly!).
    4.92  
    4.93  \paragraph*{Return values}
    4.94  Negative on failure.
    4.95 @@ -536,7 +521,8 @@ this method is called.
    4.96  
    4.97  \paragraph*{Purpose}
    4.98  
    4.99 -This is called when a task is being removed from scheduling.
   4.100 +This is called when a task is being removed from scheduling (but is
   4.101 +not yet being freed).
   4.102  
   4.103  \subsubsection{wake\_up}
   4.104  
   4.105 @@ -547,8 +533,7 @@ Called when a task is woken up, this met
   4.106  
   4.107  \paragraph*{Call environment}
   4.108  
   4.109 -The generic layer guarantees that the task is already in state
   4.110 -RUNNING.
   4.111 +The task is already set to state RUNNING.
   4.112  
   4.113  \subsubsection{do\_block}
   4.114  
   4.115 @@ -560,7 +545,9 @@ not remove the task from the runqueue.
   4.116  \paragraph*{Call environment}
   4.117  
   4.118  The EVENTS\_MASTER\_ENABLE\_BIT is already set and the task state changed to
   4.119 -TASK\_INTERRUPTIBLE on entry to this method.
   4.120 +TASK\_INTERRUPTIBLE on entry to this method.  A call to the {\tt
   4.121 +  do\_schedule} method will be made after this method returns, in
   4.122 +order to select the next task to run.
   4.123  
   4.124  \subsubsection{do\_schedule}
   4.125  
   4.126 @@ -570,7 +557,7 @@ This method must be implemented.
   4.127  
   4.128  The method is called each time a new task must be chosen for scheduling on the
   4.129  current CPU.  The current time as passed as the single argument (the current
   4.130 -task can be found using the {\tt current} variable).
   4.131 +task can be found using the {\tt current} macro).
   4.132  
   4.133  This method should select the next task to run on this CPU and set it's minimum
   4.134  time to run as well as returning the data described below.
   4.135 @@ -585,7 +572,7 @@ which also performs all Xen-specific tas
   4.136  (unless the previous task has been chosen again).
   4.137  
   4.138  This method is called with the {\tt schedule\_lock} held for the current CPU
   4.139 -and with interrupts disabled.
    4.140 +and local interrupts disabled.
   4.141  
   4.142  \paragraph*{Return values}
   4.143  
   4.144 @@ -597,15 +584,16 @@ for (at maximum).
   4.145  \paragraph*{Purpose}
   4.146  
   4.147  This method is called for global scheduler control operations.  It takes a
   4.148 -pointer to a {\tt struct sched\_ctl\_cmd}, from which it should select the
   4.149 -appropriate command data.
   4.150 +pointer to a {\tt struct sched\_ctl\_cmd}, which it should either
   4.151 +source data from or populate with data, depending on the value of the
   4.152 +{\tt direction} field.
   4.153  
   4.154  \paragraph*{Call environment}
   4.155  
   4.156 -The generic layer guarantees that when this method is called, the caller was
   4.157 -using the same control interface version and that the caller selected the
   4.158 -correct scheduler ID, hence the scheduler's implementation does not need to
   4.159 -sanity-check these parts of the call.
   4.160 +The generic layer guarantees that when this method is called, the
     4.161 +caller has selected the correct scheduler ID, hence
   4.162 +the scheduler's implementation does not need to sanity-check these
   4.163 +parts of the call.
   4.164  
   4.165  \paragraph*{Return values}
   4.166  
   4.167 @@ -617,7 +605,9 @@ should either be 0 or an appropriate err
   4.168  \paragraph*{Purpose}
   4.169  
   4.170  This method is called to adjust the scheduling parameters of a particular
   4.171 -domain.
   4.172 +domain, or to query their current values.  The function should check
   4.173 +the {\tt direction} field of the {\tt sched\_adjdom\_cmd} it receives in
   4.174 +order to determine which of these operations is being performed.
   4.175  
   4.176  \paragraph*{Call environment}
   4.177  
   4.178 @@ -681,6 +671,7 @@ This method should dump any private sett
   4.179  This function is called with interrupts disabled and the {\tt schedule\_lock}
   4.180  for the task's CPU held.
   4.181  
   4.182 +
   4.183  \chapter{Debugging}
   4.184  
   4.185  Xen provides tools for debugging both Xen and guest OSes.  Currently, the
     5.1 --- a/tools/examples/README	Mon Apr 19 14:03:03 2004 +0000
     5.2 +++ b/tools/examples/README	Mon Apr 19 14:54:52 2004 +0000
     5.3 @@ -16,27 +16,31 @@ send it (preferably with a little summar
     5.4  xc_dom_control.py
     5.5   - general tool for controling running domains
     5.6    Usage: xc_dom_control.py [command] <params>
     5.7 +
     5.8    stop      [dom]        -- pause a domain
     5.9    start     [dom]        -- un-pause a domain
    5.10 -  shutdown  [dom]        -- request a domain to shutdown (can specify 'all')
    5.11 +  shutdown  [dom] [[-w]] -- request a domain to shutdown (can specify 'all')
    5.12                              (optionally wait for complete shutdown)
    5.13    destroy   [dom]        -- immediately terminate a domain
    5.14    pincpu    [dom] [cpu]  -- pin a domain to the specified CPU
    5.15    suspend   [dom] [file] -- write domain's memory to a file and terminate
    5.16  			    (resume by re-running xc_dom_create with -L option)
    5.17 -  restore   [file]       -- resume a domain from a file
    5.18 +  unwatch   [dom]        -- kill the auto-restart daemon for a domain
    5.19    list                   -- print info about all domains
    5.20    listvbds               -- print info about all virtual block devs
    5.21    cpu_bvtset [dom] [mcuadv] [warp] [warpl] [warpu]
    5.22 -                         -- set scheduling parameters for domain
    5.23 -  cpu_bvtslice [slice]   -- default scheduler slice
    5.24 +                         -- set BVT scheduling parameters for domain
    5.25 +  cpu_bvtslice [slice]   -- set default BVT scheduler slice
    5.26 +  cpu_atropos_set [dom] [period] [slice] [latency] [xtratime]
    5.27 +                         -- set Atropos scheduling parameters for domain
    5.28 +  cpu_rrobin_slice [slice] -- set Round Robin scheduler slice
    5.29    vif_stats [dom] [vif]  -- get stats for a given network vif
    5.30    vif_addip [dom] [vif] [ip]  -- add an IP address to a given vif
    5.31    vif_setsched [dom] [vif] [bytes] [usecs] -- rate limit vif bandwidth
    5.32    vif_getsched [dom] [vif] -- print vif's scheduling parameters
    5.33    vbd_add [dom] [uname] [dev] [mode] -- make disk/partition uname available to 
    5.34                              domain as dev e.g. 'vbd_add 2 phy:sda3 hda1 w'
    5.35 -  vbd_remove [dom] [dev] -- remove disk or partition attached as 'dev'
    5.36 +  vbd_remove [dom] [dev] -- remove disk or partition attached as 'dev' 
    5.37  
    5.38  
    5.39  xc_dom_create.py
    5.40 @@ -89,7 +93,7 @@ Args to override the kernel command line
    5.41  
    5.42  
    5.43  
    5.44 -xc_vd_tool
    5.45 +xc_vd_tool.py
    5.46   - tool for manipulating virtual disks
    5.47     Usage: xc_vd_tool command <params>
    5.48  
    5.49 @@ -126,3 +130,9 @@ This is a Sys-V init script for RedHat s
    5.50  On a RedHat system it should be possible to issue commands to this
    5.51  script using the "service" command and to configure if / when it is
    5.52  run automatically, using the "chkconfig" command.
    5.53 +
    5.54 +xend
    5.55 +This is a Sys-V init script for RedHat systems, which can be used to
    5.56 +start the Xen Daemon (xend) at boot time.
    5.57 +
    5.58 + - Usage: xend {start|stop|status|restart|reload}
     6.1 --- a/tools/examples/xc_dom_control.py	Mon Apr 19 14:03:03 2004 +0000
     6.2 +++ b/tools/examples/xc_dom_control.py	Mon Apr 19 14:54:52 2004 +0000
     6.3 @@ -21,8 +21,11 @@ Usage: %s [command] <params>
     6.4    list                   -- print info about all domains
     6.5    listvbds               -- print info about all virtual block devs
     6.6    cpu_bvtset [dom] [mcuadv] [warp] [warpl] [warpu]
     6.7 -                         -- set scheduling parameters for domain
     6.8 -  cpu_bvtslice [slice]   -- default scheduler slice
     6.9 +                         -- set BVT scheduling parameters for domain
    6.10 +  cpu_bvtslice [slice]   -- set default BVT scheduler slice
    6.11 +  cpu_atropos_set [dom] [period] [slice] [latency] [xtratime]
    6.12 +                         -- set Atropos scheduling parameters for domain
    6.13 +  cpu_rrobin_slice [slice] -- set Round Robin scheduler slice
    6.14    vif_stats [dom] [vif]  -- get stats for a given network vif
    6.15    vif_addip [dom] [vif] [ip]  -- add an IP address to a given vif
    6.16    vif_setsched [dom] [vif] [bytes] [usecs] -- rate limit vif bandwidth
    6.17 @@ -255,6 +258,17 @@ elif cmd == 'vbd_remove':
    6.18  	print "Failed"
    6.19  	sys.exit(1)
    6.20  
    6.21 +elif cmd == 'cpu_atropos_set': # args: dom period slice latency xtratime
    6.22 +    if len(sys.argv) < 6:
    6.23 +        usage()
    6.24 +        sys.exit(1)
    6.25 +
    6.26 +    (period, slice, latency, xtratime) = map(lambda x: int(x), sys.argv[3:7])
    6.27 +    
    6.28 +    rc = xc.atropos_domain_set(dom, period, slice, latency, xtratime)
    6.29 +
    6.30 +elif cmd == 'cpu_rrobin_slice':
    6.31 +    rc = xc.rrobin_global_set(slice=int(sys.argv[2]))
    6.32  
    6.33  else:
    6.34      usage()
     7.1 --- a/tools/xc/lib/xc_atropos.c	Mon Apr 19 14:03:03 2004 +0000
     7.2 +++ b/tools/xc/lib/xc_atropos.c	Mon Apr 19 14:54:52 2004 +0000
     7.3 @@ -20,8 +20,8 @@ int xc_atropos_domain_set(int xc_handle,
     7.4      op.u.adjustdom.sched_id = SCHED_ATROPOS;
     7.5      op.u.adjustdom.direction = SCHED_INFO_PUT;
     7.6  
     7.7 -    p->period   = period;
     7.8 -    p->slice    = slice;
     7.9 +    p->nat_period   = period;
    7.10 +    p->nat_slice    = slice;
    7.11      p->latency  = latency;
    7.12      p->xtratime = xtratime;
    7.13  
    7.14 @@ -42,8 +42,8 @@ int xc_atropos_domain_get(int xc_handle,
    7.15  
    7.16      ret = do_dom0_op(xc_handle, &op);
    7.17  
    7.18 -    *period   = p->period;
    7.19 -    *slice    = p->slice;
    7.20 +    *period   = p->nat_period;
    7.21 +    *slice    = p->nat_slice;
    7.22      *latency  = p->latency;
    7.23      *xtratime = p->xtratime;
    7.24  
     8.1 --- a/tools/xc/py/Xc.c	Mon Apr 19 14:03:03 2004 +0000
     8.2 +++ b/tools/xc/py/Xc.c	Mon Apr 19 14:54:52 2004 +0000
     8.3 @@ -1029,7 +1029,7 @@ static PyObject *pyxc_rrobin_global_get(
     8.4      if ( xc_rrobin_global_get(xc->xc_handle, &slice) != 0 )
     8.5          return PyErr_SetFromErrno(xc_error);
     8.6      
     8.7 -    return Py_BuildValue("s:L", "slice", slice);
     8.8 +    return Py_BuildValue("{s:L}", "slice", slice);
     8.9  }
    8.10  
    8.11  
    8.12 @@ -1160,7 +1160,7 @@ static PyMethodDef pyxc_methods[] = {
    8.13        " domain [long]: Domain ID.\n"
    8.14        " mcuadv [long]: MCU Advance.\n"
    8.15        " warp   [long]: Warp.\n"
    8.16 -      " warpu  [long]:\n"
    8.17 +      " warpu  [long]: Unwarp requirement.\n"
    8.18        " warpl  [long]: Warp limit,\n"
    8.19      },
    8.20  
    8.21 @@ -1402,7 +1402,7 @@ static PyObject *PyXc_new(PyObject *self
    8.22      if ( (xc->xc_handle = xc_interface_open()) == -1 )
    8.23      {
    8.24          PyObject_Del((PyObject *)xc);
    8.25 -        return NULL;
    8.26 +	return PyErr_SetFromErrno(xc_error);
    8.27      }
    8.28  
    8.29      return (PyObject *)xc;
     9.1 --- a/xen/common/sched_atropos.c	Mon Apr 19 14:03:03 2004 +0000
     9.2 +++ b/xen/common/sched_atropos.c	Mon Apr 19 14:54:52 2004 +0000
     9.3 @@ -30,7 +30,7 @@
     9.4  #define Activation_Reason_Preempted 2
     9.5  #define Activation_Reason_Extra     3
     9.6  
     9.7 -/* The following will be used for atropos-specific per-domain data fields */
     9.8 +/* Atropos-specific per-domain data */
     9.9  struct at_dom_info
    9.10  {
    9.11      /* MAW Xen additions */
    9.12 @@ -40,18 +40,20 @@ struct at_dom_info
    9.13  
    9.14      /* (what remains of) the original fields */
    9.15  
    9.16 -    s_time_t     deadline;       /* Next deadline                */
    9.17 -    s_time_t     prevddln;       /* Previous deadline            */
    9.18 +    s_time_t     deadline;       /* Next deadline                        */
    9.19 +    s_time_t     prevddln;       /* Previous deadline                    */
    9.20      
    9.21 -    s_time_t     remain;         /* Time remaining this period   */
    9.22 -    s_time_t     period;         /* Period of time allocation    */
    9.23 -    s_time_t     slice;          /* Length of allocation         */
    9.24 -    s_time_t     latency;        /* Unblocking latency           */
    9.25 +    s_time_t     remain;         /* Time remaining this period           */
    9.26 +    s_time_t     period;         /* Current period of time allocation    */
    9.27 +    s_time_t     nat_period;     /* Natural period                       */
    9.28 +    s_time_t     slice;          /* Current length of allocation         */
    9.29 +    s_time_t     nat_slice;      /* Natural length of allocation         */
    9.30 +    s_time_t     latency;        /* Unblocking latency                   */
    9.31  
    9.32 -    int          xtratime;       /* Prepared to accept extra?    */
    9.33 +    int          xtratime;       /* Prepared to accept extra time?       */
    9.34  };
    9.35  
    9.36 -
    9.37 +/* Atropos-specific per-CPU data */
    9.38  struct at_cpu_info
    9.39  {
    9.40      struct list_head waitq; /* wait queue*/
    9.41 @@ -65,9 +67,11 @@ struct at_cpu_info
    9.42  
    9.43  #define BESTEFFORT_QUANTUM MILLISECS(5)
    9.44  
    9.45 +
    9.46  /* SLAB cache for struct at_dom_info objects */
    9.47  static kmem_cache_t *dom_info_cache;
    9.48  
    9.49 +
    9.50  /** calculate the length of a linked list */
    9.51  static int q_len(struct list_head *q) 
    9.52  {
    9.53 @@ -167,17 +171,15 @@ static void at_add_task(struct task_stru
    9.54      DOM_INFO(p)->owner = p;
    9.55      p->lastschd = now;
    9.56   
    9.57 -    if(is_idle_task(p))
    9.58 -      DOM_INFO(p)->slice = MILLISECS(5);
    9.59 -
    9.60 -    /* DOM 0's scheduling parameters must be set here in order for it to boot
    9.61 -     * the system! */
    9.62 +    /* DOM 0's parameters must be set here for it to boot the system! */
    9.63      if(p->domain == 0)
    9.64      {
    9.65          DOM_INFO(p)->remain = MILLISECS(15);
    9.66 -        DOM_INFO(p)->period = MILLISECS(20);
    9.67 -        DOM_INFO(p)->slice  = MILLISECS(15);
    9.68 -        DOM_INFO(p)->latency = MILLISECS(10);
    9.69 +        DOM_INFO(p)->nat_period =
    9.70 +            DOM_INFO(p)->period = MILLISECS(20);
    9.71 +        DOM_INFO(p)->nat_slice =
    9.72 +            DOM_INFO(p)->slice = MILLISECS(15);
    9.73 +        DOM_INFO(p)->latency = MILLISECS(5);
    9.74          DOM_INFO(p)->xtratime = 1;
    9.75          DOM_INFO(p)->deadline = now;
    9.76          DOM_INFO(p)->prevddln = now;
    9.77 @@ -185,11 +187,13 @@ static void at_add_task(struct task_stru
    9.78      else /* other domains run basically best effort unless otherwise set */
    9.79      {
    9.80          DOM_INFO(p)->remain = 0;
    9.81 -        DOM_INFO(p)->period = MILLISECS(10000);
    9.82 -        DOM_INFO(p)->slice  = MILLISECS(10);
    9.83 -        DOM_INFO(p)->latency = MILLISECS(10000);
    9.84 +        DOM_INFO(p)->nat_period =
    9.85 +            DOM_INFO(p)->period = SECONDS(10);
    9.86 +        DOM_INFO(p)->nat_slice =
    9.87 +            DOM_INFO(p)->slice  = MILLISECS(10);
    9.88 +        DOM_INFO(p)->latency = SECONDS(10);
    9.89          DOM_INFO(p)->xtratime = 1;
    9.90 -        DOM_INFO(p)->deadline = now + MILLISECS(10000);
    9.91 +        DOM_INFO(p)->deadline = now + SECONDS(10);
    9.92          DOM_INFO(p)->prevddln = 0;
    9.93      }
    9.94  
    9.95 @@ -226,10 +230,19 @@ static void dequeue(struct task_struct *
    9.96   * This function deals with updating the sdom for a domain
    9.97   * which has just been unblocked.  
    9.98   *
    9.99 - * ASSERT: On entry, the sdom has already been removed from the block
   9.100 - * queue (it can be done more efficiently if we know that it
   9.101 - * is on the head of the queue) but its deadline field has not been
   9.102 - * restored yet.
   9.103 + * Xen's Atropos treats unblocking slightly differently to Nemesis:
   9.104 + *
   9.105 + * - "Short blocking" domains (i.e. that unblock before their deadline has
   9.106 + *  expired) are treated the same as in nemesis (put on the wait queue and
   9.107 + *  given preferential treatment in selecting domains for extra time).
   9.108 + *
   9.109 + * - "Long blocking" domains do not simply have their period truncated to their
   9.110 + *  unblocking latency as before but also have their slice recomputed to be the
   9.111 + *  same fraction of their new period.  Each time the domain is scheduled, the
   9.112 + *  period and slice are doubled until they reach their original ("natural")
   9.113 + *  values, as set by the user (and stored in nat_period and nat_slice).  The
   9.114 + *  idea is to give better response times to unblocking whilst preserving QoS
   9.115 + *  guarantees to other domains.
   9.116   */
   9.117  static void unblock(struct task_struct *sdom)
   9.118  {
   9.119 @@ -239,18 +252,27 @@ static void unblock(struct task_struct *
   9.120      dequeue(sdom);
   9.121  
   9.122      /* We distinguish two cases... short and long blocks */
   9.123 -    if ( inf->deadline < time ) {
   9.124 +
   9.125 +    if ( inf->deadline < time )
   9.126 +    {
   9.127 +        /* Long blocking case */
   9.128 +
   9.129  	/* The sdom has passed its deadline since it was blocked. 
   9.130  	   Give it its new deadline based on the latency value. */
   9.131 -	inf->prevddln = time; 
   9.132 +	inf->prevddln = time;
   9.133 +
   9.134 +        /* Scale the scheduling parameters as requested by the latency hint. */
   9.135  	inf->deadline = time + inf->latency;
   9.136 -	inf->remain   = inf->slice;
   9.137 -        if(inf->remain > 0)
   9.138 -            sdom->state = TASK_RUNNING;
   9.139 -        else
   9.140 -            sdom->state = ATROPOS_TASK_WAIT;
   9.141 -        
   9.142 -    } else {
   9.143 +        inf->slice = inf->nat_slice / ( inf->nat_period / inf->latency );
   9.144 +        inf->period = inf->latency;
   9.145 +	inf->remain = inf->slice;
   9.146 +
   9.147 +        sdom->state = TASK_RUNNING;
   9.148 +    }
   9.149 +    else
   9.150 +    {
   9.151 +        /* Short blocking case */
   9.152 +
   9.153  	/* We leave REMAIN intact, but put this domain on the WAIT
   9.154  	   queue marked as recently unblocked.  It will be given
   9.155  	   priority over other domains on the wait queue until while
   9.156 @@ -288,9 +310,8 @@ task_slice_t ksched_scheduler(s_time_t t
   9.157  
   9.158      /* If we were spinning in the idle loop, there is no current
   9.159       * domain to deschedule. */
   9.160 -    if (is_idle_task(cur_sdom)) {
   9.161 +    if (is_idle_task(cur_sdom))
   9.162  	goto deschedule_done;
   9.163 -    }
   9.164  
   9.165      /*****************************
   9.166       * 
   9.167 @@ -308,7 +329,8 @@ task_slice_t ksched_scheduler(s_time_t t
   9.168      dequeue(cur_sdom);
   9.169  
   9.170      if ((cur_sdom->state == TASK_RUNNING) ||
   9.171 -        (cur_sdom->state == ATROPOS_TASK_UNBLOCKED)) {
   9.172 +        (cur_sdom->state == ATROPOS_TASK_UNBLOCKED))
   9.173 +    {
   9.174  
   9.175  	/* In this block, we are doing accounting for an sdom which has 
   9.176  	   been running in contracted time.  Note that this could now happen
   9.177 @@ -318,10 +340,11 @@ task_slice_t ksched_scheduler(s_time_t t
   9.178  	cur_info->remain  -= ranfor;
   9.179  
   9.180  	/* If guaranteed time has run out... */
   9.181 -	if ( cur_info->remain <= 0 ) {
   9.182 +	if ( cur_info->remain <= 0 )
   9.183 +        {
   9.184  	    /* Move domain to correct position in WAIT queue */
   9.185              /* XXX sdom_unblocked doesn't need this since it is 
   9.186 -	     already in the correct place. */
   9.187 +               already in the correct place. */
   9.188  	    cur_sdom->state = ATROPOS_TASK_WAIT;
   9.189  	}
   9.190      }
   9.191 @@ -351,6 +374,20 @@ task_slice_t ksched_scheduler(s_time_t t
   9.192  
   9.193          dequeue(sdom);
   9.194  
   9.195 +        if ( inf->period != inf->nat_period )
   9.196 +        {
   9.197 +            /* This domain has had its parameters adjusted as a result of
   9.198 +             * unblocking and they need to be adjusted before requeuing it */
   9.199 +            inf->slice  *= 2;
   9.200 +            inf->period *= 2;
   9.201 +            
   9.202 +            if ( inf->period > inf->nat_period )
   9.203 +            {
   9.204 +                inf->period = inf->nat_period;
   9.205 +                inf->slice  = inf->nat_slice;
   9.206 +            }
   9.207 +        }
   9.208 +
   9.209  	/* Domain begins a new period and receives a slice of CPU 
   9.210  	 * If this domain has been blocking then throw away the
   9.211  	 * rest of it's remain - it can't be trusted */
   9.212 @@ -358,8 +395,10 @@ task_slice_t ksched_scheduler(s_time_t t
   9.213  	    inf->remain = inf->slice;
   9.214      	else 
   9.215  	    inf->remain += inf->slice;
   9.216 +
   9.217  	inf->prevddln = inf->deadline;
   9.218  	inf->deadline += inf->period;
   9.219 +
   9.220          if(inf->remain > 0)
   9.221              sdom->state = TASK_RUNNING;
   9.222          else
   9.223 @@ -391,8 +430,8 @@ task_slice_t ksched_scheduler(s_time_t t
   9.224      /* MAW - the idle domain is always on the run queue.  We run from the
   9.225       * runqueue if it's NOT the idle domain or if there's nothing on the wait
   9.226       * queue */
   9.227 -    if (cur_sdom->domain == IDLE_DOMAIN_ID && !list_empty(WAITQ(cpu))) {
   9.228 -
   9.229 +    if (cur_sdom->domain == IDLE_DOMAIN_ID && !list_empty(WAITQ(cpu)))
   9.230 +    {
   9.231          struct list_head *item;
   9.232  
   9.233  	/* Try running a domain on the WAIT queue - this part of the
   9.234 @@ -426,24 +465,23 @@ task_slice_t ksched_scheduler(s_time_t t
   9.235  	   flag set.  The NEXT_OPTM field is used to cheaply achieve
   9.236  	   an approximation of round-robin order */
   9.237          list_for_each(item, WAITQ(cpu))
   9.238 -            {
   9.239 -                struct at_dom_info *inf =
   9.240 -                    list_entry(item, struct at_dom_info, waitq);
   9.241 -                
   9.242 -                sdom = inf->owner;
   9.243 -
   9.244 -                if (inf->xtratime && i >= waitq_rrobin) {
   9.245 -                    cur_sdom = sdom;
   9.246 -                    cur_info  = inf;
   9.247 -                    newtime = time + BESTEFFORT_QUANTUM;
   9.248 -                    reason  = Activation_Reason_Extra;
   9.249 -                    waitq_rrobin = i + 1; /* set this value ready for next */
   9.250 -                    goto found;
   9.251 -                }
   9.252 -
   9.253 -                i++;
   9.254 +        {
   9.255 +            struct at_dom_info *inf =
   9.256 +                list_entry(item, struct at_dom_info, waitq);
   9.257 +            
   9.258 +            sdom = inf->owner;
   9.259 +            
   9.260 +            if (inf->xtratime && i >= waitq_rrobin) {
   9.261 +                cur_sdom = sdom;
   9.262 +                cur_info  = inf;
   9.263 +                newtime = time + BESTEFFORT_QUANTUM;
   9.264 +                reason  = Activation_Reason_Extra;
   9.265 +                waitq_rrobin = i + 1; /* set this value ready for next */
   9.266 +                goto found;
   9.267              }
   9.268 -
   9.269 +            
   9.270 +            i++;
   9.271 +        }
   9.272      }
   9.273  
   9.274      found:
   9.275 @@ -523,15 +561,21 @@ static int at_adjdom(struct task_struct 
   9.276  {
   9.277      if ( cmd->direction == SCHED_INFO_PUT )
   9.278      {
   9.279 -        DOM_INFO(p)->period   = cmd->u.atropos.period;
   9.280 -        DOM_INFO(p)->slice    = cmd->u.atropos.slice;
   9.281 +        /* sanity checking! */
   9.282 +        if( cmd->u.atropos.latency > cmd->u.atropos.nat_period
   9.283 +            || cmd->u.atropos.latency == 0
   9.284 +            || cmd->u.atropos.nat_slice > cmd->u.atropos.nat_period )
   9.285 +            return -EINVAL;
   9.286 +
   9.287 +        DOM_INFO(p)->nat_period   = cmd->u.atropos.nat_period;
   9.288 +        DOM_INFO(p)->nat_slice    = cmd->u.atropos.nat_slice;
   9.289          DOM_INFO(p)->latency  = cmd->u.atropos.latency;
   9.290          DOM_INFO(p)->xtratime = !!cmd->u.atropos.xtratime;
   9.291      }
   9.292      else if ( cmd->direction == SCHED_INFO_GET )
   9.293      {
   9.294 -        cmd->u.atropos.period   = DOM_INFO(p)->period;
   9.295 -        cmd->u.atropos.slice    = DOM_INFO(p)->slice;
   9.296 +        cmd->u.atropos.nat_period   = DOM_INFO(p)->nat_period;
   9.297 +        cmd->u.atropos.nat_slice    = DOM_INFO(p)->nat_slice;
   9.298          cmd->u.atropos.latency  = DOM_INFO(p)->latency;
   9.299          cmd->u.atropos.xtratime = DOM_INFO(p)->xtratime;
   9.300      }
   9.301 @@ -548,9 +592,6 @@ static int at_alloc_task(struct task_str
   9.302      if( (DOM_INFO(p) = kmem_cache_alloc(dom_info_cache, GFP_KERNEL)) == NULL )
   9.303          return -1;
   9.304  
   9.305 -    if(p->domain == IDLE_DOMAIN_ID)
   9.306 -      printk("ALLOC IDLE ON CPU %d\n", p->processor);
   9.307 -
   9.308      memset(DOM_INFO(p), 0, sizeof(struct at_dom_info));
   9.309  
   9.310      return 0;
   9.311 @@ -563,6 +604,7 @@ static void at_free_task(struct task_str
   9.312      kmem_cache_free( dom_info_cache, DOM_INFO(p) );
   9.313  }
   9.314  
   9.315 +
   9.316  /* print decoded domain private state value (if known) */
   9.317  static int at_prn_state(int state)
   9.318  {
    11.1 --- a/xen/include/hypervisor-ifs/sched_ctl.h	Mon Apr 19 14:03:03 2004 +0000
    11.2 +++ b/xen/include/hypervisor-ifs/sched_ctl.h	Mon Apr 19 14:54:52 2004 +0000
    11.3 @@ -60,8 +60,8 @@ struct sched_adjdom_cmd
    11.4  
    11.5          struct atropos_adjdom
    11.6          {
    11.7 -            u64 period;
    11.8 -            u64 slice;
    11.9 +            u64 nat_period;
   11.10 +            u64 nat_slice;
   11.11              u64 latency;
   11.12              int xtratime;
   11.13          } atropos;