ia64/xen-unstable

changeset 2674:1c21b245b050

bitkeeper revision 1.1159.1.248 (41770cd3xdqfaxecPtOsPCrCe_NtiA)

Merge ssh://srg//auto/groups/xeno/BK/xeno-unstable.bk
into equilibrium.research:/export/scratch/xeno-balloon.bk
author mwilli2@equilibrium.research
date Thu Oct 21 01:11:47 2004 +0000 (2004-10-21)
parents 081dd58e4d58 1cec0bdb4c6f
children 15749c9317e9
files docs/interface.tex docs/user.tex xen/arch/x86/memory.c xen/common/schedule.c
line diff
     1.1 --- a/docs/interface.tex	Thu Oct 21 01:11:34 2004 +0000
     1.2 +++ b/docs/interface.tex	Thu Oct 21 01:11:47 2004 +0000
     1.3 @@ -51,32 +51,33 @@ operating system images to be run simult
     1.4  
     1.5  Virtualizing the machine in this manner provides flexibility allowing
     1.6  different users to choose their preferred operating system (Windows,
     1.7 -Linux, FreeBSD, or a custom operating system). Furthermore, Xen provides
     1.8 +Linux, NetBSD, or a custom operating system).  Furthermore, Xen provides
     1.9  secure partitioning between these 'domains', and enables better resource
    1.10  accounting and QoS isolation than can be achieved with a conventional
    1.11  operating system.
    1.12  
    1.13  The hypervisor runs directly on server hardware and dynamically partitions
    1.14  it between a number of {\it domains}, each of which hosts an instance
    1.15 -of a {\it guest operating system}. The hypervisor provides just enough
    1.16 +of a {\it guest operating system}.  The hypervisor provides just enough
    1.17  abstraction of the machine to allow effective isolation and resource 
    1.18  management between these domains.
    1.19  
    1.20 -Xen essentially takes a virtual machine approach as pioneered by IBM VM/370.
    1.21 -However, unlike VM/370 or more recent efforts such as VMWare and Virtual PC,
    1.22 -Xen doesn not attempt to completely virtualize the underlying hardware. Instead
    1.23 -parts of the hosted guest operating systems to work with the hypervisor; the
    1.24 -operating system is effectively ported to a new target architecture, typically
    1.25 -requiring changes in just the machine-dependent code. The user-level API is
    1.26 -unchanged, thus existing binaries and operating system distributions can work
    1.27 -unmodified.
    1.28 +Xen essentially takes a virtual machine approach as pioneered by IBM
    1.29 +VM/370.  However, unlike VM/370 or more recent efforts such as VMWare
     1.30 +and Virtual PC, Xen does not attempt to completely virtualize the
     1.31 +underlying hardware.  Instead, parts of the hosted guest operating
    1.32 +systems are modified to work with the hypervisor; the operating system
    1.33 +is effectively ported to a new target architecture, typically
    1.34 +requiring changes in just the machine-dependent code.  The user-level
     1.35 +API is unchanged; thus existing binaries and operating system
    1.36 +distributions can work unmodified.
    1.37  
    1.38  In addition to exporting virtualized instances of CPU, memory, network and
     1.39  block devices, Xen exposes a control interface to set how these resources
    1.40 -are shared between the running domains. The control interface is privileged
    1.41 +are shared between the running domains.  The control interface is privileged
    1.42  and may only be accessed by one particular virtual machine: {\it domain0}.
     1.43  This domain is a required part of any Xen-based server and runs the application
    1.44 -software that manages the control-plane aspects of the platform. Running the
    1.45 +software that manages the control-plane aspects of the platform.  Running the
    1.46  control software in {\it domain0}, distinct from the hypervisor itself, allows
    1.47  the Xen framework to separate the notions of {\it mechanism} and {\it policy}
    1.48  within the system.
    1.49 @@ -84,58 +85,59 @@ within the system.
    1.50  
    1.51  \chapter{CPU state}
    1.52  
    1.53 -All privileged state must be handled by Xen. The guest OS has no direct access
    1.54 -to CR3 and is not permitted to update privileged bits in EFLAGS.
    1.55 +All privileged state must be handled by Xen.  The guest OS has no
    1.56 +direct access to CR3 and is not permitted to update privileged bits in
    1.57 +EFLAGS.
    1.58  
    1.59  \chapter{Exceptions}
    1.60  The IDT is virtualised by submitting a virtual 'trap
    1.61 -table' to Xen. Most trap handlers are identical to native x86
    1.62 -handlers. The page-fault handler is a noteable exception.
    1.63 +table' to Xen.  Most trap handlers are identical to native x86
     1.64 +handlers.  The page-fault handler is a notable exception.
    1.65  
    1.66  \chapter{Interrupts and events}
    1.67  Interrupts are virtualized by mapping them to events, which are delivered 
    1.68 -asynchronously to the target domain. A guest OS can map these events onto
    1.69 +asynchronously to the target domain.  A guest OS can map these events onto
    1.70  its standard interrupt dispatch mechanisms, such as a simple vectoring 
    1.71 -scheme. Each physical interrupt source controlled by the hypervisor, including
    1.72 +scheme.  Each physical interrupt source controlled by the hypervisor, including
    1.73  network devices, disks, or the timer subsystem, is responsible for identifying
    1.74  the target for an incoming interrupt and sending an event to that domain.
    1.75  
    1.76  This demultiplexing mechanism also provides a device-specific mechanism for 
    1.77 -event coalescing or hold-off. For example, a guest OS may request to only 
    1.78 +event coalescing or hold-off.  For example, a guest OS may request to only 
    1.79  actually receive an event after {\it n} packets are queued ready for delivery
     1.80  to it, or {\it t} nanoseconds after the first packet arrived (whichever is true
    1.81 -first). This allows latency and throughput requirements to be addressed on a
    1.82 +first).  This allows latency and throughput requirements to be addressed on a
    1.83  domain-specific basis.
    1.84  
    1.85  \chapter{Time}
    1.86  Guest operating systems need to be aware of the passage of real time and their
    1.87 -own ``virtual time'', i.e. the time they have been executing. Furthermore, a
    1.88 +own ``virtual time'', i.e. the time they have been executing.  Furthermore, a
    1.89  notion of time is required in the hypervisor itself for scheduling and the
    1.90 -activities that relate to it. To this end the hypervisor provides for notions
    1.91 -of time: cycle counter time, system time, wall clock time, domain virtual 
     1.92 +activities that relate to it.  To this end the hypervisor provides four
     1.93 +notions of time: cycle counter time, system time, wall clock time, and domain virtual 
    1.94  time.
    1.95  
    1.96  
    1.97  \section{Cycle counter time}
    1.98  This provides the finest-grained, free-running time reference, with the
    1.99 -approximate frequency being publicly accessible. The cycle counter time is
   1.100 -used to accurately extrapolate the other time references. On SMP machines
   1.101 +approximate frequency being publicly accessible.  The cycle counter time is
   1.102 +used to accurately extrapolate the other time references.  On SMP machines
   1.103  it is currently assumed that the cycle counter time is synchronised between
   1.104 -CPUs. The current x86-based implementation achieves this within inter-CPU
   1.105 +CPUs.  The current x86-based implementation achieves this within inter-CPU
   1.106  communication latencies.
   1.107  
   1.108  \section{System time}
   1.109  This is a 64-bit value containing the nanoseconds elapsed since boot
   1.110 -time. Unlike cycle counter time, system time accurately reflects the
   1.111 +time.  Unlike cycle counter time, system time accurately reflects the
   1.112  passage of real time, i.e.  it is adjusted several times a second for timer
   1.113 -drift. This is done by running an NTP client in {\it domain0} on behalf of
   1.114 -the machine, feeding updates to the hypervisor. Intermediate values can be
   1.115 +drift.  This is done by running an NTP client in {\it domain0} on behalf of
   1.116 +the machine, feeding updates to the hypervisor.  Intermediate values can be
   1.117  extrapolated using the cycle counter.
   1.118  
   1.119  \section{Wall clock time}
   1.120  This is the actual ``time of day'' Unix style struct timeval (i.e. seconds and
   1.121 -microseconds since 1 January 1970, adjusted by leap seconds etc.). Again, an 
   1.122 -NTP client hosted by {\it domain0} can help maintain this value. To guest 
   1.123 +microseconds since 1 January 1970, adjusted by leap seconds etc.).  Again, an 
   1.124 +NTP client hosted by {\it domain0} can help maintain this value.  To guest 
   1.125  operating systems this value will be reported instead of the hardware RTC
   1.126  clock value and they can use the system time and cycle counter times to start
   1.127  and remain perfectly in time.
   1.128 @@ -143,118 +145,136 @@ and remain perfectly in time.
   1.129  
   1.130  \section{Domain virtual time}
   1.131  This progresses at the same pace as cycle counter time, but only while a
   1.132 -domain is executing. It stops while a domain is de-scheduled. Therefore the
   1.133 +domain is executing.  It stops while a domain is de-scheduled.  Therefore the
   1.134  share of the CPU that a domain receives is indicated by the rate at which
   1.135  its domain virtual time increases, relative to the rate at which cycle
   1.136  counter time does so.
   1.137  
   1.138  \section{Time interface}
   1.139  Xen exports some timestamps to guest operating systems through their shared
   1.140 -info page. Timestamps are provided for system time and wall-clock time. Xen
   1.141 +info page.  Timestamps are provided for system time and wall-clock time.  Xen
   1.142  also provides the cycle counter values at the time of the last update
    1.144 +allowing guests to calculate the current values.  The CPU frequency and a
   1.144 +allowing guests to calculate the current values.  The cpu frequency and a
   1.145  scaling factor are provided for guests to convert cycle counter values to
   1.146 -real time. Since all time stamps need to be updated and read
    1.147 +real time.  Since all timestamps need to be updated and read
    1.148  \emph{atomically}, two version numbers are also stored in the shared info
   1.149  page.
   1.150  
   1.151  Xen will ensure that the time stamps are updated frequently enough to avoid
   1.152 -an overflow of the cycle counter values. Guest can check if its notion of
   1.153 +an overflow of the cycle counter values.  A guest can check if its notion of
   1.154  time is up-to-date by comparing the version numbers.
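As a rough illustration of the version-number protocol described above, the
sketch below shows how a guest might take a consistent snapshot of the shared
time fields.  The structure layout and field names are purely illustrative
(the real layout is defined by Xen's shared info page); only the
retry-on-mismatch idea is taken from the text.
\begin{verbatim}
/* Illustrative only: field names are hypothetical; the real layout is
 * defined by Xen's shared info page. */
struct time_snapshot {
    unsigned int       version1;   /* bumped by Xen before an update  */
    unsigned int       version2;   /* bumped by Xen after the update  */
    unsigned long long tsc_stamp;  /* cycle counter at last update    */
    unsigned long long sys_time;   /* system time (ns) at last update */
};

/* Retry until both version numbers match, i.e. Xen did not update the
 * fields while we were reading them. */
static struct time_snapshot
read_time(volatile struct time_snapshot *shared)
{
    struct time_snapshot s;
    do {
        s.version1  = shared->version1;
        s.tsc_stamp = shared->tsc_stamp;
        s.sys_time  = shared->sys_time;
        s.version2  = shared->version2;
    } while (s.version1 != s.version2);
    return s;
}
\end{verbatim}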
   1.155  
   1.156  \section{Timer events}
   1.157  
   1.158  Xen maintains a periodic timer (currently with a 10ms period) which sends a
   1.159 -timer event to the currently executing domain. This allows Guest OSes to
   1.160 -keep track of the passing of time when executing. The scheduler also
   1.161 +timer event to the currently executing domain.  This allows Guest OSes to
   1.162 +keep track of the passing of time when executing.  The scheduler also
   1.163  arranges for a newly activated domain to receive a timer event when
   1.164  scheduled so that the Guest OS can adjust to the passage of time while it
   1.165  has been inactive.
   1.166  
   1.167  In addition, Xen exports a hypercall interface to each domain which allows
   1.168 -them to request a timer event send to them at the specified system
   1.169 -time. Guest OSes may use this timer to implemented timeout values when they
   1.170 +them to request a timer event sent to them at the specified system
   1.171 +time.  Guest OSes may use this timer to implement timeout values when they
   1.172  block.
   1.173  
   1.174  \chapter{Memory}
   1.175  
   1.176 -The hypervisor is responsible for providing memory to each of the domains running 
   1.177 -over it. However, the Xen hypervisor's duty is restricted to managing physical
   1.178 -memory and to policing page table updates. All other memory management functions
   1.179 -are handly externally. Start-of-day issues such as building initial page tables
   1.180 -for a domain, loading its kernel image and so on are done by the {\it domain builder}
   1.181 -running in user-space with {\it domain0}. Paging to disk and swapping is handled
   1.182 -by the guest operating systems themselves, if they need it.
   1.183 +The hypervisor is responsible for providing memory to each of the
   1.184 +domains running over it.  However, the Xen hypervisor's duty is
    1.185 +restricted to managing physical memory and to policing page table
   1.186 +updates.  All other memory management functions are handled
   1.187 +externally.  Start-of-day issues such as building initial page tables
   1.188 +for a domain, loading its kernel image and so on are done by the {\it
   1.189 +domain builder} running in user-space in {\it domain0}.  Paging to
    1.190 +disk and swapping are handled by the guest operating systems
   1.191 +themselves, if they need it.
   1.192  
   1.193 -On a Xen-based system, the hypervisor itself runs in {\it ring 0}. It has full
   1.194 -access to the physical memory available in the system and is responsible for 
   1.195 -allocating portions of it to the domains. Guest operating systems run in and use
   1.196 -{\it rings 1}, {\it 2} and {\it 3} as they see fit, aside from the fact that
   1.197 -segmentation is used to prevent the guest OS from accessing a portion of the 
   1.198 -linear address space that is reserved for use by the hypervisor. This approach
   1.199 -allows transitions between the guest OS and hypervisor without flushing the TLB.
   1.200 -We expect most guest operating systems will use ring 1 for their own operation
   1.201 -and place applications (if they support such a notion) in ring 3.
   1.202 +On a Xen-based system, the hypervisor itself runs in {\it ring 0}.  It
   1.203 +has full access to the physical memory available in the system and is
   1.204 +responsible for allocating portions of it to the domains.  Guest
   1.205 +operating systems run in and use {\it rings 1}, {\it 2} and {\it 3} as
   1.206 +they see fit, aside from the fact that segmentation is used to prevent
   1.207 +the guest OS from accessing a portion of the linear address space that
   1.208 +is reserved for use by the hypervisor.  This approach allows
   1.209 +transitions between the guest OS and hypervisor without flushing the
   1.210 +TLB.  We expect most guest operating systems will use ring 1 for their
   1.211 +own operation and place applications (if they support such a notion)
   1.212 +in ring 3.
   1.213  
   1.214  \section{Physical Memory Allocation}
   1.215 -The hypervisor reserves a small fixed portion of physical memory at system boot
   1.216 -time. This special memory region is located at the beginning of physical memory
   1.217 -and is mapped at the very top of every virtual address space. 
   1.218 +The hypervisor reserves a small fixed portion of physical memory at
   1.219 +system boot time.  This special memory region is located at the
   1.220 +beginning of physical memory and is mapped at the very top of every
   1.221 +virtual address space.
   1.222  
   1.223  Any physical memory that is not used directly by the hypervisor is divided into
   1.224 -pages and is available for allocation to domains. The hypervisor tracks which
   1.225 -pages are free and which pages have been allocated to each domain. When a new
   1.226 +pages and is available for allocation to domains.  The hypervisor tracks which
   1.227 +pages are free and which pages have been allocated to each domain.  When a new
   1.228  domain is initialized, the hypervisor allocates it pages drawn from the free 
   1.229 -list. The amount of memory required by the domain is passed to the hypervisor
   1.230 +list.  The amount of memory required by the domain is passed to the hypervisor
   1.231  as one of the parameters for new domain initialization by the domain builder.
   1.232  
   1.233 -Domains can never be allocated further memory beyond that which was requested
   1.234 -for them on initialization. However, a domain can return pages to the hypervisor
   1.235 -if it discovers that its memory requirements have diminished.
   1.236 +Domains can never be allocated further memory beyond that which was
   1.237 +requested for them on initialization.  However, a domain can return
   1.238 +pages to the hypervisor if it discovers that its memory requirements
   1.239 +have diminished.
   1.240  
   1.241  % put reasons for why pages might be returned here.
   1.242  \section{Page Table Updates}
   1.243  In addition to managing physical memory allocation, the hypervisor is also in
   1.244 -charge of performing page table updates on behalf of the domains. This is 
   1.245 +charge of performing page table updates on behalf of the domains.  This is 
    1.246  necessary to prevent domains from adding arbitrary mappings to their page
    1.247  tables or introducing mappings to others' page tables.
   1.248  
    1.249 +\section{Writable Page Tables}
   1.250 +A domain can also request write access to its page tables.  In this
   1.251 +mode, Xen notes write attempts to page table pages and makes the page
    1.252 +temporarily writable.  In-use page table pages are also disconnected
   1.253 +from the page directory.  The domain can now update entries in these
   1.254 +page table pages without the assistance of Xen.  As soon as the
    1.255 +writable page table pages are used as page table pages again, Xen makes
    1.256 +the pages read-only and revalidates the entries in the pages.
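The mode described above is enabled through the {\tt vm\_assist} hypercall
added to the hypercall list later in this document.  The fragment below is a
hedged sketch of how a guest kernel might opt in; the
{\tt HYPERVISOR\_vm\_assist} wrapper and the {\tt VMASST\_*} constants follow
the XenLinux naming convention, and the numeric values are placeholders to be
taken from the public headers.
\begin{verbatim}
/* Hedged sketch -- wrapper and constant names are assumed, not part of
 * the text above; check the Xen public headers for the real ones. */
#define VMASST_CMD_enable                0  /* placeholder values; use */
#define VMASST_TYPE_writable_pagetables  2  /* the header definitions  */

extern int HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type);

static void enable_writable_pagetables(void)
{
    /* After this, the guest may write PTEs directly; Xen unhooks the
     * page table page on the first write and revalidates its entries
     * when it is used as a page table again. */
    HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
}
\end{verbatim}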
   1.257 +
   1.258  \section{Segment Descriptor Tables}
   1.259  
   1.260  On boot a guest is supplied with a default GDT, which is {\em not}
   1.261 -taken from its own memory allocation. If the guest wishes to use other
   1.262 +taken from its own memory allocation.  If the guest wishes to use other
   1.263  than the default `flat' ring-1 and ring-3 segments that this default
   1.264  table provides, it must register a custom GDT and/or LDT with Xen,
   1.265  allocated from its own memory.
   1.266  
   1.267  int {\bf set\_gdt}(unsigned long *{\em frame\_list}, int {\em entries})
   1.268  
   1.269 -{\em frame\_list}: An array of up to 16 page frames within which the GDT
   1.270 -resides. Any frame registered as a GDT frame may only be mapped
   1.271 -read-only within the guest's address space (e.g., no writeable
   1.272 +{\em frame\_list}: An array of up to 16 page frames within which the
   1.273 +GDT resides.  Any frame registered as a GDT frame may only be mapped
   1.274 +read-only within the guest's address space (e.g., no writable
   1.275  mappings, no use as a page-table page, and so on).
   1.276  
   1.277 -{\em entries}: The number of descriptor-entry slots in the GDT. Note that
   1.278 -the table must be large enough to contain Xen's reserved entries; thus
   1.279 -we must have '{\em entries $>$ LAST\_RESERVED\_GDT\_ENTRY}'. Note also that,
   1.280 -after registering the GDT, slots {\em FIRST\_} through
   1.281 -{\em LAST\_RESERVED\_GDT\_ENTRY} are no longer usable by the guest and may be
   1.282 -overwritten by Xen.
   1.283 +{\em entries}: The number of descriptor-entry slots in the GDT.  Note
   1.284 +that the table must be large enough to contain Xen's reserved entries;
   1.285 +thus we must have '{\em entries $>$ LAST\_RESERVED\_GDT\_ENTRY}'.
   1.286 +Note also that, after registering the GDT, slots {\em FIRST\_} through
   1.287 +{\em LAST\_RESERVED\_GDT\_ENTRY} are no longer usable by the guest and
   1.288 +may be overwritten by Xen.
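As a hedged sketch of how a guest might use this call, the fragment below
builds a frame list for a page-aligned, guest-allocated GDT.
{\tt virt\_to\_machine()} stands in for whatever virtual-to-machine
translation the guest uses, and {\tt HYPERVISOR\_set\_gdt} follows the
XenLinux wrapper naming; both are assumptions rather than part of the
interface described here.
\begin{verbatim}
/* Sketch only: virt_to_machine() and HYPERVISOR_set_gdt() are assumed
 * helpers in the style of the XenLinux port. */
#define GDT_ENTRIES 256
#define PAGE_SIZE   4096
#define PAGE_SHIFT  12

extern unsigned long virt_to_machine(void *va);
extern int HYPERVISOR_set_gdt(unsigned long *frame_list, int entries);

static int register_gdt(void *gdt /* page-aligned, mapped read-only */)
{
    unsigned long frames[16];
    int i, pages = (GDT_ENTRIES * 8 + PAGE_SIZE - 1) / PAGE_SIZE;

    for (i = 0; i < pages; i++)
        frames[i] = virt_to_machine((char *)gdt + i * PAGE_SIZE)
                    >> PAGE_SHIFT;

    /* Fails if the frames are still writable in the guest, or if
     * 'entries' does not leave room for Xen's reserved descriptors. */
    return HYPERVISOR_set_gdt(frames, GDT_ENTRIES);
}
\end{verbatim}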
   1.289  
   1.290  \section{Pseudo-Physical Memory}
   1.291 -The usual problem of external fragmentation means that a domain is unlikely to
   1.292 -receive a contiguous stretch of physical memory. However, most guest operating
   1.293 -systems do not have built-in support for operating in a fragmented physical
   1.294 -address space e.g. Linux has to have a one-to-one mapping for it physical
   1.295 -memory. There a notion of {\it pseudo physical memory} is introdouced. 
   1.296 -Once a domain is allocated a number of pages, at its start of the day, one of
   1.297 -the first things it needs to do is build its own {\it real physical} to 
   1.298 -{\it pseudo physical} mapping. From that moment onwards {\it pseudo physical}
   1.299 -address are used instead of discontiguous {\it real physical} addresses. Thus,
   1.300 -the rest of the guest OS code has an impression of operating in a contiguous
   1.301 -address space. Guest OS page tables contain real physical addresses. Mapping
   1.302 -{\it pseudo physical} to {\it real physical} addresses is need on page
   1.303 -table updates and also on remapping memory regions with the guest OS.
   1.304 +The usual problem of external fragmentation means that a domain is
   1.305 +unlikely to receive a contiguous stretch of physical memory.  However,
   1.306 +most guest operating systems do not have built-in support for
    1.307 +operating in a fragmented physical address space, e.g. Linux has to
    1.308 +have a one-to-one mapping for its physical memory.  Therefore a notion
    1.309 +of {\it pseudo physical memory} is introduced.  Xen maintains a {\it
   1.310 +real physical} to {\it pseudo physical} mapping which can be consulted
   1.311 +by every domain.  Additionally, at its start of day, a domain is
   1.312 +supplied a {\it pseudo physical} to {\it real physical} mapping which
   1.313 +it needs to keep updated itself.  From that moment onwards {\it pseudo
   1.314 +physical} addresses are used instead of discontiguous {\it real
   1.315 +physical} addresses.  Thus, the rest of the guest OS code has an
   1.316 +impression of operating in a contiguous address space.  Guest OS page
   1.317 +tables contain real physical addresses.  Mapping {\it pseudo physical}
   1.318 +to {\it real physical} addresses is needed on page table updates and
   1.319 +also on remapping memory regions with the guest OS.
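A minimal sketch of the translation described above is given below.  The
table names are hypothetical stand-ins for the two mappings the text
mentions: the Xen-maintained real-to-pseudo table and the per-domain
pseudo-to-real table that the guest keeps up to date.
\begin{verbatim}
/* Hypothetical table names; the text only guarantees that the two
 * mappings exist, not how a guest names or stores them. */
extern unsigned long *pseudo_to_real;  /* kept up to date by the guest */
extern unsigned long *real_to_pseudo;  /* maintained by Xen            */

#define PAGE_SHIFT 12
#define PAGE_MASK  (~((1UL << PAGE_SHIFT) - 1))

/* Page tables hold real physical addresses, so a PTE built from a
 * pseudo-physical frame must be translated before a page table update. */
static unsigned long pte_pseudo_to_real(unsigned long pte)
{
    unsigned long pfn = pte >> PAGE_SHIFT;    /* pseudo-physical frame */
    unsigned long mfn = pseudo_to_real[pfn];  /* real physical frame   */
    return (mfn << PAGE_SHIFT) | (pte & ~PAGE_MASK);
}
\end{verbatim}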
   1.320  
   1.321  
   1.322  
   1.323 @@ -272,11 +292,11 @@ In terms of networking this means packet
   1.324  
   1.325  On the transmission side, the backend needs to perform two key actions:
   1.326  \begin{itemize}
   1.327 -\item {\tt Validation:} A domain is only allowed to emit packets
   1.328 +\item {\tt Validation:} A domain may only be allowed to emit packets
   1.329  matching a certain specification; for example, ones in which the
   1.330  source IP address matches one assigned to the virtual interface over
   1.331 -which it is sent. The backend is responsible for ensuring any such
   1.332 -requirements are met, either by checking or by stamping outgoing
   1.333 +which it is sent.  The backend would be responsible for ensuring any
   1.334 +such requirements are met, either by checking or by stamping outgoing
   1.335  packets with prescribed values for certain fields.
   1.336  
   1.337  Validation functions can be configured using standard firewall rules
   1.338 @@ -284,13 +304,13 @@ Validation functions can be configured u
   1.339  
   1.340  \item {\tt Scheduling:} Since a number of domains can share a single
   1.341  ``real'' network interface, the hypervisor must mediate access when
   1.342 -several domains each have packets queued for transmission. Of course,
   1.343 +several domains each have packets queued for transmission.  Of course,
   1.344  this general scheduling function subsumes basic shaping or
   1.345  rate-limiting schemes.
   1.346  
   1.347  \item {\tt Logging and Accounting:} The hypervisor can be configured
   1.348  with classifier rules that control how packets are accounted or
   1.349 -logged. For example, {\it domain0} could request that it receives a
   1.350 +logged.  For example, {\it domain0} could request that it receives a
   1.351  log message or copy of the packet whenever another domain attempts to
    1.352  send a TCP packet containing a SYN.
   1.353  \end{itemize}
   1.354 @@ -303,8 +323,8 @@ to which it must be delivered and delive
   1.355  \section{Data Transfer}
   1.356  
   1.357  Each virtual interface uses two ``descriptor rings'', one for transmit,
   1.358 -the other for receive. Each descriptor identifies a block of contiguous
   1.359 -physical memory allocated to the domain. There are four cases:
   1.360 +the other for receive.  Each descriptor identifies a block of contiguous
   1.361 +physical memory allocated to the domain.  There are four cases:
   1.362  
   1.363  \begin{itemize}
   1.364  
   1.365 @@ -326,15 +346,15 @@ Real physical addresses are used through
   1.366  translation from pseudo-physical addresses if that is necessary.
   1.367  
   1.368  If a domain does not keep its receive ring stocked with empty buffers then 
   1.369 -packets destined to it may be dropped. This provides some defense against 
   1.370 +packets destined to it may be dropped.  This provides some defense against 
    1.371  receiver-livelock problems because an overloaded domain will cease to receive
   1.372 -further data. Similarly, on the transmit path, it provides the application
   1.373 +further data.  Similarly, on the transmit path, it provides the application
   1.374  with feedback on the rate at which packets are able to leave the system.
   1.375  
   1.376  Synchronization between the hypervisor and the domain is achieved using 
   1.377 -counters held in shared memory that is accessible to both. Each ring has
   1.378 +counters held in shared memory that is accessible to both.  Each ring has
   1.379  associated producer and consumer indices indicating the area in the ring
   1.380 -that holds descriptors that contain data. After receiving {\it n} packets
   1.381 +that holds descriptors that contain data.  After receiving {\it n} packets
    1.382  or {\it t} nanoseconds after receiving the first packet, the hypervisor sends
   1.383  an event to the domain. 
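The sketch below illustrates the producer/consumer arrangement described
above with an intentionally simplified ring; the real transmit and receive
ring layouts are defined in Xen's public I/O headers, so the structure and
field names here are illustrative only.
\begin{verbatim}
/* Simplified illustration of a shared descriptor ring; not the real
 * Xen ring layout. */
#define RING_SIZE 256                   /* assumed to be a power of two */

struct descriptor {
    unsigned long  addr;                /* real physical buffer address */
    unsigned short len;
    unsigned short status;
};

struct desc_ring {
    volatile unsigned int prod;         /* advanced by the producer     */
    volatile unsigned int cons;         /* advanced by the consumer     */
    struct descriptor desc[RING_SIZE];
};

/* Consumer side: process everything published since the last check.
 * The hypervisor's event is only a hint that 'prod' has moved. */
static void drain_ring(struct desc_ring *r,
                       void (*handle)(struct descriptor *))
{
    while (r->cons != r->prod) {
        handle(&r->desc[r->cons & (RING_SIZE - 1)]);
        r->cons++;
    }
}
\end{verbatim}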
   1.384  
   1.385 @@ -342,7 +362,7 @@ an event to the domain.
   1.386  
   1.387  \section{Virtual Block Devices (VBDs)}
   1.388  
   1.389 -All guest OS disk access goes through the VBD interface. The VBD
   1.390 +All guest OS disk access goes through the VBD interface.  The VBD
   1.391  interface provides the administrator with the ability to selectively
   1.392  grant domains access to portions of block storage devices visible to
    1.393  the block backend device (usually domain 0).
   1.394 @@ -360,7 +380,7 @@ Domains which have been granted access t
   1.395  to read and write it by shared memory communications with the backend domain. 
   1.396  
   1.397  In overview, the same style of descriptor-ring that is used for
   1.398 -network packets is used here. Each domain has one ring that carries
   1.399 +network packets is used here.  Each domain has one ring that carries
   1.400  operation requests to the hypervisor and carries the results back
   1.401  again.
   1.402  
   1.403 @@ -390,7 +410,7 @@ assigned domains should be run there.
   1.404  \section{Standard Schedulers}
   1.405  
    1.406  The BVT, Atropos and Round Robin schedulers are part of the normal
   1.407 -Xen distribution.  BVT provides porportional fair shares of the CPU to
   1.408 +Xen distribution.  BVT provides proportional fair shares of the CPU to
   1.409  the running domains.  Atropos can be used to reserve absolute shares
   1.410  of the CPU for each domain.  Round-robin is provided as an example of
   1.411  Xen's internal scheduler API.
   1.412 @@ -569,7 +589,7 @@ which also performs all Xen-specific tas
   1.413  (unless the previous task has been chosen again).
   1.414  
   1.415  This method is called with the {\tt schedule\_lock} held for the current CPU
   1.416 -and local interrupts interrupts disabled.
   1.417 +and local interrupts disabled.
   1.418  
   1.419  \paragraph*{Return values}
   1.420  
   1.421 @@ -588,9 +608,8 @@ source data from or populate with data, 
   1.422  \paragraph*{Call environment}
   1.423  
   1.424  The generic layer guarantees that when this method is called, the
   1.425 -caller was using the caller selected the correct scheduler ID, hence
   1.426 -the scheduler's implementation does not need to sanity-check these
   1.427 -parts of the call.
   1.428 +caller selected the correct scheduler ID, hence the scheduler's
   1.429 +implementation does not need to sanity-check these parts of the call.
   1.430  
   1.431  \paragraph*{Return values}
   1.432  
   1.433 @@ -739,21 +758,17 @@ xentrace\_format} and {\tt xentrace\_cpu
   1.434  
   1.435  Install trap handler table.
   1.436  
   1.437 -\section{ mmu\_update(mmu\_update\_t *req, int count)} 
    1.438 +\section{ mmu\_update(mmu\_update\_t *req, int count, int *success\_count)} 
   1.439  Update the page table for the domain. Updates can be batched.
   1.440 -The update types are: 
    1.441 +success\_count will be updated to report the number of successful
   1.442 +updates.  The update types are:
   1.443  
   1.444  {\it MMU\_NORMAL\_PT\_UPDATE}:
   1.445  
   1.446 -{\it MMU\_UNCHECKED\_PT\_UPDATE}:
   1.447 -
   1.448  {\it MMU\_MACHPHYS\_UPDATE}:
   1.449  
   1.450  {\it MMU\_EXTENDED\_COMMAND}:
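As a hedged sketch, a batched call to the {\tt mmu\_update} hypercall above
might look as follows.  The {\tt HYPERVISOR\_mmu\_update} wrapper name
follows the XenLinux convention, and the encoding of the update type in the
low bits of {\tt ptr} is an assumption to be checked against the interface
headers, as is the illustrative {\tt mmu\_update\_t} definition.
\begin{verbatim}
/* Hedged sketch: wrapper name, struct layout and the type-in-low-bits
 * encoding are assumptions; consult the interface headers for the ABI. */
typedef struct { unsigned long ptr, val; } mmu_update_t;  /* illustrative */

#define PAGE_SHIFT 12

extern int HYPERVISOR_mmu_update(mmu_update_t *req, int count,
                                 int *success_count);

static int remap_one_page(unsigned long pte_ma, unsigned long new_pte,
                          unsigned long mfn, unsigned long pfn)
{
    mmu_update_t req[2];
    int done = 0;

    /* Write new_pte into the page-table entry at machine address pte_ma. */
    req[0].ptr = pte_ma | MMU_NORMAL_PT_UPDATE;
    req[0].val = new_pte;

    /* Keep the machine-to-pseudo-physical table in step with the above. */
    req[1].ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
    req[1].val = pfn;

    /* 'done' reports how many of the batched updates succeeded. */
    if (HYPERVISOR_mmu_update(req, 2, &done) != 0 || done != 2)
        return -1;
    return 0;
}
\end{verbatim}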
   1.451  
   1.452 -\section{ console\_write(const char *str, int count)}
   1.453 -Output buffer str to the console.
   1.454 -
   1.455  \section{ set\_gdt(unsigned long *frame\_list, int entries)} 
   1.456  Set the global descriptor table - virtualization for lgdt.
   1.457  
   1.458 @@ -761,28 +776,24 @@ Set the global descriptor table - virtua
   1.459  Request context switch from hypervisor.
   1.460  
   1.461  \section{ set\_callbacks(unsigned long event\_selector, unsigned long event\_address,
   1.462 -                        unsigned long failsafe\_selector, unsigned long failsafe\_address) } 
   1.463 - Register OS event processing routine. In Linux both the event\_selector and 
   1.464 -failsafe\_selector are the kernel's CS. The value event\_address specifies the address for
   1.465 -an interrupt handler dispatch routine and failsafe\_address specifies a handler for 
   1.466 -application faults.
   1.467 -
   1.468 -\section{ net\_io\_op(netop\_t *op)}  
   1.469 -Notify hypervisor of updates to transmit and/or receive descriptor rings.
    1.470 +                        unsigned long failsafe\_selector,
    1.471 +                        unsigned long failsafe\_address) }
    1.472 +Register OS event processing routine.  In Linux both the
    1.473 +event\_selector and failsafe\_selector are the kernel's CS.  The value
    1.474 +event\_address specifies the address for an interrupt handler dispatch
    1.475 +routine and failsafe\_address specifies a handler for application faults.
   1.476  
   1.477  \section{ fpu\_taskswitch(void)} 
    1.478  Notify hypervisor that FPU registers need to be saved on context switch.
   1.479  
   1.480  \section{ sched\_op(unsigned long op)} 
   1.481 -Request scheduling operation from hypervisor. The options are: {\it yield},
   1.482 -{\it block}, {\it stop}, and {\it exit}. {\it yield} keeps the calling
   1.483 -domain run-able but may cause a reschedule if other domains are
   1.484 -run-able. {\it block} removes the calling domain from the run queue and the
   1.485 -domains sleeps until an event is delivered to it. {\it stop} and {\it exit}
   1.486 -should be self-explanatory.
   1.487 -
   1.488 -\section{ set\_dom\_timer(dom\_timer\_arg\_t *timer\_arg)} 
   1.489 -Request a timer event to be sent at the specified system time.
   1.490 +Request scheduling operation from hypervisor. The options are: {\it
   1.491 +yield}, {\it block}, and {\it shutdown}.  {\it yield} keeps the
   1.492 +calling domain run-able but may cause a reschedule if other domains
   1.493 +are run-able.  {\it block} removes the calling domain from the run
    1.494 +queue and the domain sleeps until an event is delivered to it.  {\it
    1.495 +shutdown} is used to end the domain's execution and allows the caller to
    1.496 +specify whether the domain should reboot, halt or suspend.
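A hedged sketch of the common pattern of blocking with a timeout is shown
below, combining {\tt sched\_op} with the {\tt set\_timer\_op} call described
later.  The {\tt HYPERVISOR\_*} wrapper names and the {\tt SCHEDOP\_block}
constant follow the XenLinux convention (the numeric value is a placeholder),
and {\tt read\_system\_time()} is a hypothetical helper that reads the
shared-info system time.
\begin{verbatim}
/* Hedged sketch: wrapper and constant names are assumed, and
 * read_system_time() is a hypothetical helper. */
#define SCHEDOP_block 1   /* placeholder; use the value from the headers */

extern long HYPERVISOR_sched_op(unsigned long op);
extern long HYPERVISOR_set_timer_op(unsigned long long timeout);
extern unsigned long long read_system_time(void);  /* shared info page */

static void block_with_timeout(unsigned long long timeout_ns)
{
    /* Ask Xen for a timer event at now + timeout_ns ... */
    HYPERVISOR_set_timer_op(read_system_time() + timeout_ns);

    /* ... then leave the run queue until any event (the timer event or
     * otherwise) is delivered to this domain. */
    HYPERVISOR_sched_op(SCHEDOP_block);
}
\end{verbatim}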
   1.497  
   1.498  \section{ dom0\_op(dom0\_op\_t *op)} 
   1.499  Administrative domain operations for domain management. The options are:
   1.500 @@ -790,26 +801,30 @@ Administrative domain operations for dom
   1.501  {\it DOM0\_CREATEDOMAIN}: create new domain, specifying the name and memory usage
   1.502  in kilobytes.
   1.503  
   1.504 -{\it DOM0\_STARTDOMAIN}: make domain schedulable
   1.505 +{\it DOM0\_CREATEDOMAIN}: create domain
   1.506  
   1.507 -{\it DOM0\_STOPDOMAIN}: mark domain as unschedulable
   1.508 +{\it DOM0\_PAUSEDOMAIN}: mark domain as unschedulable
   1.509 +
   1.510 +{\it DOM0\_UNPAUSEDOMAIN}: mark domain as schedulable
   1.511  
   1.512  {\it DOM0\_DESTROYDOMAIN}: deallocate resources associated with the domain
   1.513  
   1.514  {\it DOM0\_GETMEMLIST}: get list of pages used by the domain
   1.515  
   1.516 -{\it DOM0\_BUILDDOMAIN}: do final guest OS setup for domain
   1.517 -
   1.518 -{\it DOM0\_BVTCTL}: adjust scheduler context switch time
   1.519 +{\it DOM0\_SCHEDCTL}:
   1.520  
   1.521  {\it DOM0\_ADJUSTDOM}: adjust scheduling priorities for domain
   1.522  
   1.523 +{\it DOM0\_BUILDDOMAIN}: do final guest OS setup for domain
   1.524 +
   1.525  {\it DOM0\_GETDOMAINFO}: get statistics about the domain
   1.526  
   1.527  {\it DOM0\_GETPAGEFRAMEINFO}:
   1.528  
   1.529  {\it DOM0\_IOPL}: set IO privilege level
   1.530  
   1.531 +{\it DOM0\_MSR}:
   1.532 +
   1.533  {\it DOM0\_DEBUG}: interactively call pervasive debugger
   1.534  
   1.535  {\it DOM0\_SETTIME}: set system time
   1.536 @@ -827,34 +842,60 @@ in kilobytes.
   1.537  
   1.538  {\it DOM0\_SCHED\_ID}: get the ID of the current Xen scheduler
   1.539  
   1.540 +{\it DOM0\_SHADOW\_CONTROL}:
   1.541 +
   1.542  {\it DOM0\_SETDOMAINNAME}: set the name of a domain
   1.543  
   1.544  {\it DOM0\_SETDOMAININITIALMEM}: set initial memory allocation of a domain
   1.545  
   1.546 +{\it DOM0\_SETDOMAINMAXMEM}: set maximum memory allocation of a domain
   1.547 +
   1.548  {\it DOM0\_GETPAGEFRAMEINFO2}:
   1.549  
   1.550 +{\it DOM0\_SETDOMAINVMASSIST}: set domain VM assist options
   1.551 +
   1.552 +
   1.553  \section{ set\_debugreg(int reg, unsigned long value)}
    1.554  Set debug register reg to value.
   1.555  
   1.556  \section{ get\_debugreg(int reg)}
    1.557   Get the debug register reg.
   1.558  
   1.559 -\section{ update\_descriptor(unsigned long pa, unsigned long word1, unsigned long word2)} 
   1.560 +\section{ update\_descriptor(unsigned long ma, unsigned long word1, unsigned long word2)} 
   1.561  
   1.562  \section{ set\_fast\_trap(int idx)}
   1.563   install traps to allow guest OS to bypass hypervisor
   1.564  
   1.565 -\section{ dom\_mem\_op(unsigned int op, void *pages, unsigned long nr\_pages)}
   1.566 - increase or decrease memory reservations for guest OS
    1.567 +\section{ dom\_mem\_op(unsigned int op, unsigned long *extent\_list, unsigned long nr\_extents, unsigned int extent\_order)}
    1.568 +Increase or decrease memory reservations for the guest OS.
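As a hedged sketch of how a guest might shrink its reservation (for example,
a balloon driver returning pages to Xen), the fragment below uses the
signature above.  The {\tt HYPERVISOR\_dom\_mem\_op} wrapper and the
{\tt MEMOP\_decrease\_reservation} operation name follow the XenLinux
convention and are assumptions to be verified against the headers.
\begin{verbatim}
/* Hedged sketch: wrapper and operation names follow the XenLinux
 * convention and are assumptions, not part of the text above. */
#define MEMOP_decrease_reservation 1   /* placeholder; see the headers */
#define BATCH 64

extern long HYPERVISOR_dom_mem_op(unsigned int op,
                                  unsigned long *extent_list,
                                  unsigned long nr_extents,
                                  unsigned int extent_order);

/* Hand a batch of 4kB pages (extent_order 0) back to Xen.  'mfns' holds
 * the machine frame numbers of pages the guest no longer needs; the
 * return value is the number of extents actually released. */
static long balloon_out(unsigned long mfns[BATCH])
{
    return HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation,
                                 mfns, BATCH, 0);
}
\end{verbatim}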
   1.569 +
   1.570 +\section{ multicall(void *call\_list, int nr\_calls)}
    1.571 +Execute a series of hypervisor calls.
   1.572  
   1.573 -\section{ multicall(multicall\_entry\_t *call\_list, int nr\_calls)}
   1.574 - execute a series of hypervisor calls
   1.575 +\section{ update\_va\_mapping(unsigned long page\_nr, unsigned long val, unsigned long flags)}
   1.576 +
    1.577 +\section{ set\_timer\_op(uint64\_t timeout)} 
   1.578 +Request a timer event to be sent at the specified system time.
   1.579 +
   1.580 +\section{ event\_channel\_op(void *op)} 
    1.581 +Inter-domain event-channel management.
   1.582  
   1.583 -\section{ kbd\_op(unsigned char op, unsigned char val)}
   1.584 +\section{ xen\_version(int cmd)}
   1.585 +Request Xen version number.
   1.586 +
   1.587 +\section{ console\_io(int cmd, int count, char *str)}
    1.588 +Interact with the console.  The operations are:
   1.589 +
   1.590 +{\it CONSOLEIO\_write}: Output count characters from buffer str.
   1.591  
   1.592 -\section{update\_va\_mapping(unsigned long page\_nr, unsigned long val, unsigned long flags)}
   1.593 +{\it CONSOLEIO\_read}: Input at most count characters into buffer str.
   1.594 +
   1.595 +\section{ physdev\_op(void *physdev\_op)}
   1.596  
   1.597 -\section{ event\_channel\_op(unsigned int cmd, unsigned int id)} 
   1.598 -inter-domain event-channel management, options are: open, close, send, and status.
   1.599 +\section{ grant\_table\_op(unsigned int cmd, void *uop, unsigned int count)}
   1.600 +
   1.601 +\section{ vm\_assist(unsigned int cmd, unsigned int type)}
   1.602 +
    1.603 +\section{ update\_va\_mapping\_otherdomain(unsigned long page\_nr, unsigned long val, unsigned long flags, uint16\_t domid)}
   1.604  
   1.605  \end{document}
     2.1 --- a/docs/user.tex	Thu Oct 21 01:11:34 2004 +0000
     2.2 +++ b/docs/user.tex	Thu Oct 21 01:11:47 2004 +0000
     2.3 @@ -76,29 +76,28 @@ Xen support is available for increasingl
     2.4  following OSs have either been ported already or a port is in
     2.5  progress:
     2.6  \begin{itemize}
     2.7 -\item Dragonfly BSD
     2.8 -\item FreeBSD 5.3
     2.9  \item Linux 2.4
    2.10  \item Linux 2.6
    2.11  \item NetBSD 2.0
    2.12 +\item Dragonfly BSD
    2.13 +\item FreeBSD 5.3
    2.14  \item Plan 9
    2.15 -\item Windows XP
    2.16 +% \item Windows XP
    2.17  \end{itemize}
    2.18  
    2.19 -Right now, Linux 2.4 and 2.6 are available for Xen 2.0.  NetBSD
    2.20 -port will be updated to run on Xen 2.0, hopefully in time for the NetBSD
    2.21 -2.0 release.  It is intended that Xen support be integrated into the
    2.22 -official releases of Linux 2.6, NetBSD 2.0, FreeBSD and Dragonfly BSD.
    2.23 +Right now, Linux 2.4, Linux 2.6 and NetBSD are available for Xen 2.0.
    2.24 +It is intended that Xen support be integrated into the official
    2.25 +releases of Linux 2.6, NetBSD 2.0, FreeBSD and Dragonfly BSD.
    2.26  
    2.27  Even running multiple copies of Linux can be very useful, providing a
    2.28  means of containing faults to one OS image, providing performance
    2.29  isolation between the various OS instances and trying out multiple
    2.30  distros.
    2.31  
    2.32 -The Windows XP port is only available to those who have signed the
    2.33 -Microsoft Academic Source License.  Publically available XP support
    2.34 -will not be available for the foreseeable future (this may change when
    2.35 -Intel's Vanderpool Technology becomes available).
    2.36 +% The Windows XP port is only available to those who have signed the
    2.37 +% Microsoft Academic Source License.  Publically available XP support
    2.38 +% will not be available for the foreseeable future (this may change when
    2.39 +% Intel's Vanderpool Technology becomes available).
    2.40  
    2.41  Possible usage scenarios for Xen include:
    2.42  \begin{description}
    2.43 @@ -170,19 +169,12 @@ at:\\
    2.44  {\tt http://www.cl.cam.ac.uk/netos/papers/2003-xensosp.pdf}\\
    2.45  Work to port Xen to x86\_64 and IA64 is currently underway.
    2.46  
    2.47 -Xen is targeted at server-class machines, and the current list of
    2.48 -supported hardware very much reflects this, avoiding the need for us
    2.49 -to write drivers for "legacy" hardware. It is likely that some desktop
    2.50 -chipsets will fail to work properly with the default Xen
    2.51 -configuration: specifying {\tt noacpi} or {\tt ignorebiostables} when
    2.52 -booting Xen may help in these cases.
    2.53 -
    2.54  Xen requires a ``P6'' or newer processor (e.g. Pentium Pro, Celeron,
    2.55  Pentium II, Pentium III, Pentium IV, Xeon, AMD Athlon, AMD Duron).
    2.56  Multiprocessor machines are supported, and we also have basic support
    2.57  for HyperThreading (SMT), although this remains a topic for ongoing
    2.58 -research. We're also working on an x86\_64 port (though Xen should
    2.59 -already run on these systems just fine in 32-bit mode).
    2.60 +research.  We're also working on an x86\_64 port (though Xen already
    2.61 +runs on these systems just fine in 32-bit mode).
    2.62  
    2.63  Xen can currently use up to 4GB of memory.  It is possible for x86
    2.64  machines to address up to 64GB of physical memory but (unless an
    2.65 @@ -190,13 +182,16 @@ external developer volunteers) there are
    2.66  systems.  The x86\_64 port is the planned route to supporting more
    2.67  than 4GB of memory.
    2.68  
    2.69 -In contrast to previous Xen versions, in Xen 2.0 device drivers run
    2.70 -within a privileged guest OS rather than within Xen itself. This means
    2.71 -that we should be compatible with the majority of device hardware
    2.72 -supported by Linux.  The default XenLinux build contains support for
    2.73 -relatively modern server-class network and disk hardware, but you can
    2.74 -add support for other hardware by configuring your XenLinux kernel in
    2.75 -the normal way (e.g. \verb_# make ARCH=xen xconfig_).
    2.76 +Xen offloads most of the hardware support issues to the guest OS
    2.77 +running in Domain 0.  Xen itself only contains code to detect and
    2.78 +start additional processors, setup interrupt routing and perform PCI
    2.79 +bus enumeration.  Device drivers run within a privileged guest OS
    2.80 +rather than within Xen itself.  This means that we should be
    2.81 +compatible with the majority of device hardware supported by Linux.
    2.82 +The default XenLinux build contains support for relatively modern
    2.83 +server-class network and disk hardware, but you can add support for
    2.84 +other hardware by configuring your XenLinux kernel in the normal way
    2.85 +(e.g. \verb_# make ARCH=xen menuconfig_).
    2.86  
    2.87  \section{History}
    2.88  
    2.89 @@ -218,9 +213,9 @@ Xen has since grown into a project in it
    2.90  investigate interesting research issues regarding the best techniques
    2.91  for virtualizing resources such as the CPU, memory, disk and network.
    2.92  The project has been bolstered by support from Intel Research
    2.93 -Cambridge, and HP Labs, who are now working closely with us. We're
    2.94 -also in receipt of support from Microsoft Research Cambridge to port
    2.95 -Windows XP to run on Xen.
    2.96 +Cambridge, and HP Labs, who are now working closely with us.
    2.97 +% We're also in receipt of support from Microsoft Research Cambridge to
    2.98 +% port Windows XP to run on Xen.
    2.99  
   2.100  Xen was first described in the 2003 paper at SOSP \\
   2.101  ({\tt http://www.cl.cam.ac.uk/netos/papers/2003-xensosp.pdf}).
   2.102 @@ -241,7 +236,7 @@ will work to conclusively achieve that p
   2.103  
   2.104  \chapter{Installation}
   2.105  
   2.106 -The Xen distribution includes three main components: Xen itself,
   2.107 +The Xen distribution includes three main components:  Xen itself,
   2.108  utilities to convert a standard Linux tree to run on Xen and the
   2.109  userspace tools required to operate a Xen-based system.
   2.110  
   2.111 @@ -259,14 +254,14 @@ http://www.twistedmatrix.com}).  There m
   2.112  your distribution; alternatively it can be installed by running {\tt \#
   2.113  make install-twisted} in the root of the Xen source tree.
   2.114  \item The Linux bridge control tools (see {\tt
   2.115 -http://bridge.sourceforge.net}).  There may be a packages of these
   2.116 -tools available for your distribution.
   2.117 +http://bridge.sourceforge.net}).  There may be packages of these tools
   2.118 +available for your distribution.
   2.119  \item Linux IP Routing Tools
   2.120  \item make
   2.121 +\item gcc
   2.122 +\item libcurl
   2.123 +\item zlib-dev
   2.124  \item python-dev
   2.125 -\item gcc
   2.126 -\item zlib-dev
   2.127 -\item libcurl
   2.128  \item python2.3-pycurl
   2.129  \item python2.3-twisted
   2.130  \end{itemize}
   2.131 @@ -302,11 +297,11 @@ run:
   2.132  # bk clone bk://xen.bkbits.net/xen-2.0.bk
   2.133  \end{verbatim}
   2.134  
   2.135 -Under your current directory, a new directory named `xen-2.0.bk'
   2.136 -has been created, which contains all the source code for the Xen
   2.137 -hypervisor and the Xen tools.  The directory also contains `sparse'
   2.138 -Linux source trees, containing only the files that differ between
   2.139 -XenLinux and standard Linux.
   2.140 +Under your current directory, a new directory named `xen-2.0.bk' has
   2.141 +been created, which contains all the source code for the Xen
   2.142 +hypervisor and the Xen tools.  The directory also contains `sparse' OS
   2.143 +source trees, containing only the files that require changes to allow
   2.144 +the OS to run on Xen.
   2.145  
   2.146  Once you have cloned the repository, you can update to the newest
   2.147  changes to the repository by running:
   2.148 @@ -330,8 +325,9 @@ The Xen source code repository is struct
   2.149  \item[\path{tools/}] Xen node controller daemon (Xend), command line tools, 
   2.150    control libraries
   2.151  \item[\path{xen/}] The Xen hypervisor itself.
   2.152 -\item[\path{linux-2.4.27-xen/}] Linux 2.4 support for Xen
   2.153 -\item[\path{linux-2.6.8.1-xen/}] Linux 2.6 support for Xen
   2.154 +\item[\path{linux-2.4.27-xen/}] Xen support for Linux 2.4
   2.155 +\item[\path{linux-2.6.8.1-xen/}] Xen support for Linux 2.6
   2.156 +\item[\path{netbsd-2.0-xen-sparse/}] Xen support for NetBSD 2.0
   2.157  \item[\path{docs/}] various documentation files for users and developers
   2.158  \item[\path{extras/}] currently this contains the Mini OS, aimed at developers
   2.159  \end{description}
   2.160 @@ -351,37 +347,46 @@ following:
   2.161        unprivileged virtual machines.
   2.162  \end{itemize}
   2.163  
   2.164 -Inspect the Makefile if you want to see what goes on during a
   2.165 -build. Building Xen and the tools is straightforward, but XenLinux is
   2.166 -more complicated. The makefile needs a `pristine' linux kernel tree
   2.167 -which it will then add the Xen architecture files to. You can tell the
   2.168 +Inspect the Makefile if you want to see what goes on during a build.
   2.169 +Building Xen and the tools is straightforward, but XenLinux is more
   2.170 +complicated.  The makefile needs a `pristine' linux kernel tree which
   2.171 +it will then add the Xen architecture files to.  You can tell the
   2.172  makefile the location of the appropriate linux compressed tar file by
   2.173  setting the LINUX\_SRC environment variable, e.g. \\
   2.174  \verb!# LINUX_SRC=/tmp/linux-2.6.8.1.tar.bz2 make world! \\ or by
   2.175 -placing the tar file somewhere in the search path of {\tt LINUX\_SRC\_PATH}
   2.176 -which defaults to ``{\tt .:..}". If the makefile can't find a suitable
   2.177 -kernel tar file it attempts to download it from kernel.org (this won't
   2.178 -work if you're behind a firewall).
   2.179 +placing the tar file somewhere in the search path of {\tt
    2.180 +LINUX\_SRC\_PATH} which defaults to ``{\tt .:..}''.  If the makefile
   2.181 +can't find a suitable kernel tar file it attempts to download it from
   2.182 +kernel.org (this won't work if you're behind a firewall).
   2.183  
   2.184  After untaring the pristine kernel tree, the makefile uses the {\tt
   2.185 -mkbuildtree} script to add the Xen patches the kernel. It then builds
   2.186 -two different XenLinux images, one with a ``-xen0'' extension which
   2.187 -contains hardware device drivers and drivers for Xen's virtual devices,
   2.188 -and one with a ``-xenU'' extension that just contains the virtual ones.
   2.189 +mkbuildtree} script to add the Xen patches to the kernel.  It then
   2.190 +builds two different XenLinux images, one with a ``-xen0'' extension
   2.191 +which contains hardware device drivers and drivers for Xen's virtual
   2.192 +devices, and one with a ``-xenU'' extension that just contains the
   2.193 +virtual ones.
   2.194  
    2.195  The procedure to build the Linux 2.4 port is similar: \\
   2.196  \verb!# LINUX_SRC=/path/to/linux2.4/source make linux24!
   2.197  
   2.198 -In both cases, if you have an SMP machine you may wish to give the
   2.199 -{\tt '-j4'} argument to make to get a parallel build.
   2.200 +The NetBSD port can be built using: \\ \verb!# make netbsd! \\ The
   2.201 +NetBSD port is built using a snapshot of the netbsd-2-0 cvs branch.
   2.202 +The snapshot is downloaded as part of the build process, if it is not
    2.203 +yet present in the {\tt NETBSD\_SRC\_PATH} search path.  The build
   2.204 +process also downloads a toolchain which includes all the tools
   2.205 +necessary to build the NetBSD kernel under Linux.
   2.206 +
   2.207 +If you have an SMP machine you may wish to give the {\tt '-j4'}
   2.208 +argument to make to get a parallel build.
   2.209  
   2.210  XXX Insert details on customising the kernel to be built.
   2.211  i.e. merging config files
   2.212  
   2.213 -If you have an existing kernel configuration that you would like to
   2.214 -use for domain 0, you should copy it to
   2.215 +If you have an existing Linux kernel configuration that you would like
   2.216 +to use for domain 0, you should copy it to
   2.217  install/boot/config-2.6.8.1-xen0.  During the first build, you may be
   2.218 -asked about some Xen-specific options.
    2.219 +asked about some Xen-specific options.  We advise accepting the
   2.220 +defaults for these options.
   2.221  
   2.222  \framebox{\parbox{5in}{
   2.223  {\bf Distro specific:} \\
   2.224 @@ -395,10 +400,10 @@ locations, do: \\
   2.225  \verb_# make install_
   2.226  
   2.227  Alternatively, users with special installation requirements may wish
   2.228 -to install them manually by copying file to their appropriate
   2.229 +to install them manually by copying the files to their appropriate
   2.230  destinations.
   2.231  
   2.232 -Take a look at the files in \path{install/boot/}:
   2.233 +Files in \path{install/boot/} include:
   2.234  \begin{itemize}
   2.235  \item \path{install/boot/xen.gz} The Xen 'kernel'
   2.236  \item \path{install/boot/vmlinuz-2.6.8.1-xen0}  Domain 0 XenLinux kernel
   2.237 @@ -406,7 +411,7 @@ Take a look at the files in \path{instal
   2.238  \end{itemize}
   2.239  
   2.240  The difference between the two Linux kernels that are built is due to
   2.241 -the configuration file used for each. The "U" suffixed unprivileged
   2.242 +the configuration file used for each.  The "U" suffixed unprivileged
   2.243  version doesn't contain any of the physical hardware device drivers
   2.244  --- it is 30\% smaller and hence may be preferred for your
   2.245  non-privileged domains.  The ``0'' suffixed privileged version can be
   2.246 @@ -472,7 +477,7 @@ serial console.  Add the line:
   2.247  Users of the XenLinux 2.6 kernel should disable Thread Local Storage
   2.248  (e.g. by doing a {\tt mv /lib/tls /lib/tls.disabled}) before
   2.249  attempting to run with a XenLinux kernel.  You can always reenable it
   2.250 -my restoring the directory to its original location (i.e. {\tt mv
   2.251 +by restoring the directory to its original location (i.e. {\tt mv
   2.252    /lib/tls.disabled /lib/tls}).
   2.253  
   2.254  The TLS implementation uses segmentation in a way that is not
   2.255 @@ -504,7 +509,8 @@ should still be able to reboot with your
   2.256  
   2.257  The first step in creating a new domain is to prepare a root
   2.258  filesystem for it to boot off.  Typically, this might be stored in a
   2.259 -normal partition, a disk file and LVM volume, or on an NFS server.
   2.260 +normal partition, an LVM or other volume manager partition, a disk
    2.261 +file, or on an NFS server.
   2.262  
   2.263  A simple way to do this is simply to boot from your standard OS
   2.264  install CD and install the distribution into another partition on your
   2.265 @@ -513,7 +519,7 @@ hard drive.
    2.266  {\em N.B.} you can boot with Xen and XenLinux without installing any
   2.267  special userspace tools but will need to have the prerequisites
   2.268  described in Section~\ref{sec:prerequisites} and the Xen control tools
   2.269 -are installed before you proceed.
   2.270 +installed before you proceed.
   2.271  
   2.272  \section{From the web interface}
   2.273  
   2.274 @@ -535,10 +541,10 @@ require a more complex setup, you will w
   2.275  configuration file --- details of the configuration file formats are
   2.276  included in Chapter~\ref{cha:config}.
   2.277  
   2.278 -The \path{xmdefconfig1} file is a simple template configuration file
   2.279 +The \path{xmexample1} file is a simple template configuration file
   2.280  for describing a single VM.
   2.281  
   2.282 -The \path{xmdefconfig2} file is a template description that is intended
   2.283 +The \path{xmexample2} file is a template description that is intended
   2.284  to be reused for multiple virtual machines.  Setting the value of the
   2.285  {\tt vmid} variable on the {\tt xm} command line
   2.286  fills in parts of this template.
   2.287 @@ -600,8 +606,8 @@ kernel = "/boot/vmlinuz-2.6.8.1-xenU" # 
   2.288  memory = 64
   2.289  name = "ttylinux"
   2.290  cpu = -1 # leave to Xen to pick
   2.291 -nics=1
   2.292 -ip="1.2.3.4"
   2.293 +nics = 1
   2.294 +ip = "1.2.3.4"
   2.295  disk = ['file:/path/to/ttylinux-disk,sda1,w']
   2.296  root = "/dev/sda1 ro"
   2.297  \end{verbatim}
   2.298 @@ -667,11 +673,11 @@ e.g. \verb_# xm console 1_ (open console
   2.299  \subsection{\tt xm list}
   2.300  
   2.301  The output of {\tt xm list} is in rows of the following format:\\
   2.302 -\verb_domid name memory cpu state cputime_
   2.303 +\verb_name domid memory cpu state cputime console_
   2.304  
   2.305  \begin{description}
   2.306 +\item[name]  The descriptive name of the virtual machine.
    2.307  \item[domid] The domain ID this virtual machine is running in.
   2.308 -\item[name]  The descriptive name of the virtual machine.
   2.309  \item[memory] Memory size in megabytes.
   2.310  \item[cpu]   The CPU this domain is running on.
   2.311  \item[state] Domain state consists of 5 fields:
   2.312 @@ -683,6 +689,7 @@ The output of {\tt xm list} is in rows o
   2.313    \item[c] crashed
   2.314    \end{description}
   2.315  \item[cputime] How much CPU time (in seconds) the domain has used so far.
   2.316 +\item[console] TCP port accepting connections to the domain's console.
   2.317  \end{description}
   2.318  
   2.319  The {\tt xm list} command also supports a long output format when the
   2.320 @@ -702,8 +709,8 @@ It is possible to use a file in Domain 0
   2.321  virtual machine.  As well as being convenient, this also has the
   2.322  advantage that the virtual block device will be {\em sparse} --- space
   2.323  will only really be allocated as parts of the file are used.  So if a
   2.324 -virtual machine uses only half its disk space then the file really
   2.325 -takes up a half of the size allocated.
   2.326 +virtual machine uses only half of its disk space then the file really
   2.327 +takes up half of the size allocated.
   2.328  
   2.329  For example, to create a 2GB sparse file-backed virtual block device
   2.330  (actually only consumes 1KB of disk):
   2.331 @@ -717,7 +724,7 @@ Make a file system in the disk file: \\
   2.332  
   2.333  Populate the file system e.g. by copying from the current root:
   2.334  \begin{verbatim}
   2.335 -# mount vm1disk /mnt -o loop
   2.336 +# mount -o loop vm1disk /mnt
   2.337  # cp -ax / /mnt
   2.338  \end{verbatim}
   2.339  Tailor the file system by editing \path{/etc/fstab},
   2.340 @@ -729,8 +736,6 @@ this example put \path{/dev/sda1} to roo
   2.341  Now unmount (this is important!):\\
   2.342  \verb_# umount /mnt_
   2.343  
   2.344 -And detach the file from its loop device:
   2.345 -
   2.346  In the configuration file set:\\
   2.347  \verb_disk = ['file:/full/path/to/vm1disk,sda1,w']_
   2.348  
   2.349 @@ -784,7 +789,7 @@ order to handle the {\tt vmid} variable.
   2.350  
   2.351  
   2.352  \chapter{Xend (Node control daemon)}
   2.353 -\label{cha:xensv}
   2.354 +\label{cha:xend}
   2.355  
   2.356  The Xen Daemon (Xend) performs system management functions related to
   2.357  virtual machines.  It forms a central point of control for a machine
   2.358 @@ -801,11 +806,10 @@ Xend command line:
   2.359  \verb_# xend restart_ & restart Xend if running, otherwise start it \\
   2.360  \end{tabular}
   2.361  
   2.362 -An SysV init script called {\tt xend} is provided to start Xend at
   2.363 -boot time.  The {\tt make install} will install this script in
   2.364 -{\path{/etc/init.d} automatically.  To enable it, you can make
   2.365 -symbolic links in the appropriate runlevel directories or use the {\tt
   2.366 -chkconfig} tool, where available.
   2.367 +A SysV init script called {\tt xend} is provided to start Xend at boot
   2.368 +time.  {\tt make install} installs this script in \path{/etc/init.d}.
   2.369 +To enable it, you have to make symbolic links in the appropriate
   2.370 +runlevel directories or use the {\tt chkconfig} tool, where available.
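         +For example, on a Red Hat-style system you might run (illustrative
         +only; the exact command depends on your distribution):
         +\begin{verbatim}
         +# chkconfig --add xend
         +# chkconfig xend on
         +\end{verbatim}
         +On other systems an equivalent such as {\tt update-rc.d xend defaults}
         +or a manual symbolic link in \path{/etc/rc3.d} can be used.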
   2.371  
   2.372  Once Xend is running, more sophisticated administration can be done
   2.373  using the Xensv web interface (see Chapter~\ref{cha:xensv}).
   2.374 @@ -822,7 +826,7 @@ It will automatically start Xend if it i
   2.375  
   2.376  By default, Xensv will serve out the web interface on port 8080.  This
   2.377  can be changed by editing {\tt
   2.378 -/usr/lib/python2.2/site-packages/xen/sv/params.py}.
   2.379 +/usr/lib/python2.3/site-packages/xen/sv/params.py}.
   2.380  
   2.381  Once Xensv is running, the web interface can be used to manage running
   2.382  domains and provides a user friendly domain creation wizard.
   2.383 @@ -839,13 +843,13 @@ The general format of an xm command line
   2.384  # xm command [switches] [arguments] [variables]
   2.385  \end{verbatim}
   2.386  
   2.387 -The available {\em switches } and {\em arguments}are dependent on the
   2.388 +The available {\em switches} and {\em arguments} are dependent on the
   2.389  {\em command} chosen.  The {\em variables} may be set using
   2.390 -declarations of the form {\tt variable=value} and may be used to set /
   2.391 -override any of the values in the configuration file being used,
   2.392 -including the standard variables described above and any custom
   2.393 -variables (for instance, the \path{xmdefconfig} file uses a {\tt vmid}
   2.394 -variable).
   2.395 +declarations of the form {\tt variable=value}; command line
   2.396 +declarations override any of the values in the configuration file
   2.397 +being used, including the standard variables described above and any
   2.398 +custom variables (for instance, the \path{xmdefconfig} file uses a
   2.399 +{\tt vmid} variable).
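         +For instance (the values are purely illustrative), with a
         +configuration file that defines {\tt vmid}, as \path{xmdefconfig}
         +does, you could override it and the memory allocation on the
         +command line:
         +\begin{verbatim}
         +# xm create vmid=3 memory=64
         +\end{verbatim}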
   2.400  
   2.401  The available commands are as follows:
   2.402  
   2.403 @@ -1050,7 +1054,7 @@ domains), so the FBVT derivative has bee
   2.404  \item[ctx\_allow]
   2.405    the context switch allowance is similar to the "quantum"
   2.406    in traditional schedulers.  It is the minimum time that
   2.407 -  a scheduled domain will be allowed to run before be
   2.408 +  a scheduled domain will be allowed to run before being
   2.409    pre-empted.  This prevents thrashing of the CPU.
   2.410  \end{description}
   2.411  
   2.412 @@ -1124,13 +1128,13 @@ higher throughput.
   2.413  
   2.414  \chapter{Privileged domains}
   2.415  
   2.416 -There are two possible types of privileges: IO privileges and
   2.417 +There are two possible types of privileges:  IO privileges and
   2.418  administration privileges.
   2.419  
   2.420  \section{Driver domains (IO Privileges)}
   2.421  
   2.422  IO privileges can be assigned to allow a domain to drive PCI devices
   2.423 -itself.  This is used for to support driver domains.
   2.424 +itself.  This is used to support driver domains.
   2.425  
   2.426  Setting backend privileges is currently only supported in SXP format
   2.427  config files (??? is this true - there's nothing in xmdefconfig,
   2.428 @@ -1201,8 +1205,11 @@ make option1=y option2=y
   2.429  
   2.430  \section{List of options}
   2.431  
   2.432 +{\bf verbose=y }\\
   2.433 +Enable debugging messages when Xen detects an unexpected condition.
   2.434 +Also enables console output from all domains. \\
   2.435  {\bf debug=y }\\
   2.436 -Enable debug assertions and console output.
   2.437 +Enable debug assertions.  Implies {\bf verbose=y}.
   2.438  (Primarily useful for tracing bugs in Xen).        \\
   2.439  {\bf debugger=y }\\
   2.440  Enable the in-Xen pervasive debugger (PDB).
   2.441 @@ -1253,12 +1260,12 @@ editing \path{grub.conf}.
   2.442   Disable Hyperthreading. \\
   2.443  
   2.444  {\bf badpage=$<$page number$>$[,$<$page number$>$] } \\
   2.445 -                  Specify a list of pages not to be allocated for use 
   2.446 -                  because they contain bad bytes. For example, if your
   2.447 -                  memory tester says that byte 0x12345678 is bad, you would
   2.448 -                  place 'badpage=0x12345' on Xen's command line (i.e., the
   2.449 -                  last three digits of the byte address are not
   2.450 -                  included!). \\
   2.451 + Specify a list of pages not to be allocated for use 
   2.452 + because they contain bad bytes. For example, if your
   2.453 + memory tester says that byte 0x12345678 is bad, you would
   2.454 + place 'badpage=0x12345' on Xen's command line (i.e., the
   2.455 + last three digits of the byte address are not
   2.456 + included!). \\
   2.457  
   2.458  {\bf com1=$<$baud$>$,DPS[,$<$io\_base$>$,$<$irq$>$] \\
   2.459   com2=$<$baud$>$,DPS[,$<$io\_base$>$,$<$irq$>$] } \\
   2.460 @@ -1294,7 +1301,7 @@ editing \path{grub.conf}.
   2.461   pressed three times. Specifying '`' disables switching.
   2.462   The <auto-switch-char> specifies whether Xen should
   2.463   auto-switch input to DOM0 when it boots -- if it is 'x'
   2.464 - then auto-switching is disabled. Any other value, or
   2.465 + then auto-switching is disabled.  Any other value, or
   2.466   omitting the character, enables auto-switching.
   2.467   [NB. Default for this option is 'a'] \\
   2.468  
   2.469 @@ -1398,17 +1405,19 @@ Xen domain:
   2.470  \item Set up Xen 2.0 and test that it's working, as described earlier in
   2.471        this manual.
   2.472  
   2.473 -\item Create disk images for root-fs and swap (alternatively, you might create
   2.474 -      dedicated partitions, LVM logical volumes, etc. if that suits your setup).
   2.475 +\item Create disk images for root-fs and swap (alternatively, you
   2.476 +      might create dedicated partitions, LVM logical volumes, etc. if
   2.477 +      that suits your setup).
   2.478  \begin{verbatim}  
   2.479  dd if=/dev/zero of=/path/diskimage bs=1024k count=size_in_mbytes
   2.480  dd if=/dev/zero of=/path/swapimage bs=1024k count=size_in_mbytes
   2.481  \end{verbatim}
   2.482 -      If you're going to use this filesystem / diskimage only as a `template' for
   2.483 -      other vm diskimages, something like 300 MB should be enough.. (of course it 
   2.484 -      depends what kind of packages you are planning to install to the template)
   2.485 +      If you're going to use this filesystem / diskimage only as a
   2.486 +      `template' for other vm diskimages, something like 300 MB should
   2.487 +      be enough (of course, it depends on what kind of packages you
   2.488 +      are planning to install to the template).
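         +      For instance, for a 300 MB template image and a 128 MB swap
         +      image (the sizes are only an example):
         +\begin{verbatim}
         +dd if=/dev/zero of=/path/diskimage bs=1024k count=300
         +dd if=/dev/zero of=/path/swapimage bs=1024k count=128
         +\end{verbatim}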
   2.489  
   2.490 -\item Create filesystem and swap to the images
   2.491 +\item Create the filesystem and initialise the swap image
   2.492  \begin{verbatim}
   2.493  mkfs.ext3 /path/diskimage
   2.494  mkswap /path/swapimage
   2.495 @@ -1421,7 +1430,7 @@ mount -o loop /path/diskimage /mnt/disk
   2.496  
   2.497  \item Install {\tt debootstrap}
   2.498  
   2.499 -Make sure you have debootstrap installed on the host. If you are
   2.500 +Make sure you have debootstrap installed on the host.  If you are
   2.501  running Debian sarge (3.1 / testing) or unstable you can install it by
   2.502  running {\tt apt-get install debootstrap}.  Otherwise, it can be
   2.503  downloaded from the Debian project website.
   2.504 @@ -1439,7 +1448,7 @@ You can use any other Debian http/ftp mi
   2.505  chroot /mnt/disk /bin/bash
   2.506  \end{verbatim}
   2.507  
   2.508 -Edit the following files using vi or nano and make needed settings:
   2.509 +Edit the following files using vi or nano and make needed changes:
   2.510  \begin{verbatim}
   2.511  /etc/hostname
   2.512  /etc/hosts
   2.513 @@ -1460,7 +1469,7 @@ Add Debian mirror to:
   2.514  /etc/apt/sources.list
   2.515  \end{verbatim}
   2.516  
   2.517 -And create fstab like this:
   2.518 +Create fstab like this:
   2.519  \begin{verbatim}
   2.520  /dev/sda1       /       ext3    errors=remount-ro       0       1
   2.521  /dev/sda2       none    swap    sw                      0       0
   2.522 @@ -1474,8 +1483,8 @@ Logout
   2.523  umount /mnt/disk
   2.524  \end{verbatim}
   2.525  
   2.526 -\item      Create Xen 2.0 configuration file for the new domain. You can use the
   2.527 -        example-configurations coming with xen as a template.
   2.528 +\item Create a Xen 2.0 configuration file for the new domain.  You can
   2.529 +        use the example configurations that come with Xen as a template.
   2.530  
   2.531          Make sure you have the following set up:
   2.532  \begin{verbatim}
   2.533 @@ -1500,9 +1509,9 @@ xm list
   2.534  Started domain testdomain2, console on port 9626
   2.535  \end{verbatim}
   2.536          
   2.537 -        There you can see the ID of the console: 26. You can also
   2.538 -        list the consoles with {\tt xm consoles"}. (ID is the last two digits of
   2.539 -        the portnumber.)
   2.540 +        There you can see the ID of the console: 26. You can also list
   2.541 +        the consoles with {\tt xm consoles}. (The ID is the last two
   2.542 +        digits of the port number.)
   2.543  
   2.544          Attach to the console:
   2.545  
   2.546 @@ -1510,14 +1519,15 @@ Started domain testdomain2, console on p
   2.547  xm console 26
   2.548  \end{verbatim}
   2.549  
   2.550 -        or by telnetting to the port 9626 of localhost (the xm console progam works better).
   2.551 +        or by telnetting to port 9626 on localhost (the xm console
   2.552 +        program works better).
   2.553  
   2.554  \item   Log in and run base-config
   2.555  
   2.556          As a default there's no password for the root.
   2.557  
   2.558          Check that everything looks OK, and the system started without
   2.559 -        errors. Check that the swap is active, and the network settings are
   2.560 +        errors.  Check that the swap is active, and the network settings are
   2.561          correct.
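         +        A couple of illustrative checks (any equivalent commands
         +        will do):
         +\begin{verbatim}
         +swapon -s
         +ifconfig -a
         +\end{verbatim}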
   2.562  
   2.563          Run {\tt /usr/sbin/base-config} to set up the Debian settings.
   2.564 @@ -1535,4 +1545,3 @@ simply copying the image file.  Once thi
   2.565  image-specific settings (hostname, network settings, etc).
   2.566  
   2.567  \end{document}
   2.568 -
     3.1 --- a/xen/arch/x86/memory.c	Thu Oct 21 01:11:34 2004 +0000
     3.2 +++ b/xen/arch/x86/memory.c	Thu Oct 21 01:11:47 2004 +0000
     3.3 @@ -1604,7 +1604,11 @@ void ptwr_flush(const int which)
     3.4      if ( unlikely(__get_user(pte, ptep)) )
     3.5      {
     3.6          MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
     3.7 -        domain_crash();
     3.8 +        /*
     3.9 +         * Really a bug. We could read this PTE during the initial fault,
    3.10 +         * and pagetables can't have changed meantime. XXX Multi-proc guests?
    3.11 +         */
    3.12 +        BUG();
    3.13      }
    3.14      PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
    3.15                  PTWR_PRINT_WHICH, ptep, pte);
    3.16 @@ -1627,7 +1631,11 @@ void ptwr_flush(const int which)
    3.17      if ( unlikely(__put_user(pte, ptep)) )
    3.18      {
    3.19          MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
    3.20 -        domain_crash();
    3.21 +        /*
    3.22 +         * Really a bug. We could write this PTE during the initial fault,
    3.23 +         * and pagetables can't have changed meantime. XXX Multi-proc guests?
    3.24 +         */
    3.25 +        BUG();
    3.26      }
    3.27  
    3.28      /* Ensure that there are no stale writable mappings in any TLB. */
    3.29 @@ -1668,6 +1676,14 @@ void ptwr_flush(const int which)
    3.30          if ( unlikely(!get_page_from_l1e(nl1e, d)) )
    3.31          {
    3.32              MEM_LOG("ptwr: Could not re-validate l1 page\n");
    3.33 +            /*
    3.34 +             * Make the remaining p.t's consistent before crashing, so the
    3.35 +             * reference counts are correct.
    3.36 +             */
    3.37 +            memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
    3.38 +                   (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
    3.39 +            unmap_domain_mem(pl1e);
    3.40 +            ptwr_info[cpu].ptinfo[which].l1va = 0;
    3.41              domain_crash();
    3.42          }
    3.43          
    3.44 @@ -1781,6 +1797,9 @@ int ptwr_do_page_fault(unsigned long add
    3.45      {
    3.46          MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
    3.47                  &linear_pg_table[addr>>PAGE_SHIFT]);
    3.48 +        /* Toss the writable pagetable state and crash. */
    3.49 +        unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
    3.50 +        ptwr_info[cpu].ptinfo[which].l1va = 0;
    3.51          domain_crash();
    3.52      }
    3.53      
     4.1 --- a/xen/common/schedule.c	Thu Oct 21 01:11:34 2004 +0000
     4.2 +++ b/xen/common/schedule.c	Thu Oct 21 01:11:47 2004 +0000
     4.3 @@ -326,6 +326,9 @@ void __enter_scheduler(void)
     4.4      task_slice_t        next_slice;
     4.5      s32                 r_time;     /* time for new dom to run */
     4.6  
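         +    /*
         +     * Flush writable pagetable state for 'prev' up front, before the
         +     * scheduler is consulted and before the early return taken when
         +     * the same domain is selected again (previously this was done
         +     * only once an actual context switch had been decided).
         +     */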
     4.7 +    cleanup_writable_pagetable(
     4.8 +        prev, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
     4.9 +
    4.10      perfc_incrc(sched_run);
    4.11      
    4.12      spin_lock_irq(&schedule_data[cpu].schedule_lock);
    4.13 @@ -373,9 +376,6 @@ void __enter_scheduler(void)
    4.14      if ( unlikely(prev == next) )
    4.15          return;
    4.16      
    4.17 -    cleanup_writable_pagetable(
    4.18 -        prev, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
    4.19 -
    4.20      perfc_incrc(sched_ctx);
    4.21  
    4.22  #if defined(WAKE_HISTO)