ia64/xen-unstable
changeset 6979:f8e7af29daa1
merge?
--- a/Makefile	Tue Sep 20 09:43:29 2005 +0000
+++ b/Makefile	Tue Sep 20 09:43:46 2005 +0000
@@ -166,27 +166,25 @@ uninstall: D=$(DESTDIR)
 uninstall:
 	[ -d $(D)/etc/xen ] && mv -f $(D)/etc/xen $(D)/etc/xen.old-`date +%s`
 	rm -rf $(D)/etc/init.d/xend*
-	rm -rf $(D)/usr/$(LIBDIR)/libxc* $(D)/usr/$(LIBDIR)/libxutil*
-	rm -rf $(D)/usr/$(LIBDIR)/python/xen $(D)/usr/include/xen
-	rm -rf $(D)/usr/$(LIBDIR)/share/xen $(D)/usr/$(LIBDIR)/libxenstore*
+	rm -rf $(D)/etc/hotplug/xen-backend.agent
 	rm -rf $(D)/var/run/xen* $(D)/var/lib/xen*
-	rm -rf $(D)/usr/include/xcs_proto.h $(D)/usr/include/xc.h
-	rm -rf $(D)/usr/include/xs_lib.h $(D)/usr/include/xs.h
-	rm -rf $(D)/usr/sbin/xcs $(D)/usr/sbin/xcsdump $(D)/usr/sbin/xen*
-	rm -rf $(D)/usr/sbin/netfix
-	rm -rf $(D)/usr/sbin/xfrd $(D)/usr/sbin/xm
-	rm -rf $(D)/usr/share/doc/xen $(D)/usr/man/man*/xentrace*
-	rm -rf $(D)/usr/bin/xen* $(D)/usr/bin/miniterm
 	rm -rf $(D)/boot/*xen*
 	rm -rf $(D)/lib/modules/*xen*
+	rm -rf $(D)/usr/bin/xen* $(D)/usr/bin/lomount
 	rm -rf $(D)/usr/bin/cpuperf-perfcntr $(D)/usr/bin/cpuperf-xen
 	rm -rf $(D)/usr/bin/xc_shadow
-	rm -rf $(D)/usr/share/xen $(D)/usr/libexec/xen
+	rm -rf $(D)/usr/include/xenctrl.h
+	rm -rf $(D)/usr/include/xs_lib.h $(D)/usr/include/xs.h
+	rm -rf $(D)/usr/include/xen
+	rm -rf $(D)/usr/$(LIBDIR)/libxenctrl* $(D)/usr/$(LIBDIR)/libxenguest*
+	rm -rf $(D)/usr/$(LIBDIR)/libxenstore*
+	rm -rf $(D)/usr/$(LIBDIR)/python/xen $(D)/usr/$(LIBDIR)/xen
+	rm -rf $(D)/usr/libexec/xen
+	rm -rf $(D)/usr/sbin/xen* $(D)/usr/sbin/netfix $(D)/usr/sbin/xm
+	rm -rf $(D)/usr/share/doc/xen
+	rm -rf $(D)/usr/share/xen
 	rm -rf $(D)/usr/share/man/man1/xen*
 	rm -rf $(D)/usr/share/man/man8/xen*
-	rm -rf $(D)/usr/lib/xen
-	rm -rf $(D)/etc/hotplug.d/xen-backend
-	rm -rf $(D)/etc/hotplug/xen-backend.agent
 
 # Legacy targets for compatibility
 linux24:
--- a/docs/Makefile	Tue Sep 20 09:43:29 2005 +0000
+++ b/docs/Makefile	Tue Sep 20 09:43:46 2005 +0000
@@ -12,7 +12,7 @@ DOXYGEN := doxygen
 
 pkgdocdir := /usr/share/doc/xen
 
-DOC_TEX  := $(wildcard src/*.tex)
+DOC_TEX  := src/user.tex src/interface.tex
 DOC_PS   := $(patsubst src/%.tex,ps/%.ps,$(DOC_TEX))
 DOC_PDF  := $(patsubst src/%.tex,pdf/%.pdf,$(DOC_TEX))
 DOC_HTML := $(patsubst src/%.tex,html/%/index.html,$(DOC_TEX))
--- a/docs/src/interface.tex	Tue Sep 20 09:43:29 2005 +0000
+++ b/docs/src/interface.tex	Tue Sep 20 09:43:46 2005 +0000
@@ -87,1084 +87,23 @@ itself, allows the Xen framework to sepa
 mechanism and policy within the system.
 
 
-
-\chapter{Virtual Architecture}
-
-On a Xen-based system, the hypervisor itself runs in {\it ring 0}. It
-has full access to the physical memory available in the system and is
-responsible for allocating portions of it to the domains. Guest
-operating systems run in and use {\it rings 1}, {\it 2} and {\it 3} as
-they see fit. Segmentation is used to prevent the guest OS from
-accessing the portion of the address space that is reserved for
-Xen. We expect most guest operating systems will use ring 1 for their
-own operation and place applications in ring 3.
-
-In this chapter we consider the basic virtual architecture provided
-by Xen: the basic CPU state, exception and interrupt handling, and
-time. Other aspects such as memory and device access are discussed
-in later chapters.
-
-\section{CPU state}
-
-All privileged state must be handled by Xen. The guest OS has no
-direct access to CR3 and is not permitted to update privileged bits in
-EFLAGS. Guest OSes use \emph{hypercalls} to invoke operations in Xen;
-these are analogous to system calls but occur from ring 1 to ring 0.
-
-A list of all hypercalls is given in Appendix~\ref{a:hypercalls}.
-
-
-
-\section{Exceptions}
-
-A virtual IDT is provided --- a domain can submit a table of trap
-handlers to Xen via the {\tt set\_trap\_table()} hypercall. Most trap
-handlers are identical to native x86 handlers, although the page-fault
-handler is somewhat different.
-
-
-\section{Interrupts and events}
-
-Interrupts are virtualized by mapping them to \emph{events}, which are
-delivered asynchronously to the target domain using a callback
-supplied via the {\tt set\_callbacks()} hypercall. A guest OS can map
-these events onto its standard interrupt dispatch mechanisms. Xen is
-responsible for determining the target domain that will handle each
-physical interrupt source. For more details on the binding of event
-sources to events, see Chapter~\ref{c:devices}.
-
-
-
-\section{Time}
-
-Guest operating systems need to be aware of the passage of both real
-(or wallclock) time and their own `virtual time' (the time for
-which they have been executing). Furthermore, Xen has a notion of
-time which is used for scheduling. The following notions of
-time are provided:
-
-\begin{description}
-\item[Cycle counter time.]
-
-This provides a fine-grained time reference. The cycle counter time is
-used to accurately extrapolate the other time references. On SMP machines
-it is currently assumed that the cycle counter time is synchronized between
-CPUs. The current x86-based implementation achieves this within inter-CPU
-communication latencies.
-
-\item[System time.]
-
-This is a 64-bit counter which holds the number of nanoseconds that
-have elapsed since system boot.
-
-
-\item[Wall clock time.]
-
-This is the time of day in a Unix-style {\tt struct timeval} (seconds
-and microseconds since 1 January 1970, adjusted by leap seconds). An
-NTP client hosted by {\it domain 0} can keep this value accurate.
-
-
-\item[Domain virtual time.]
-
-This progresses at the same pace as system time, but only while a
-domain is executing --- it stops while a domain is de-scheduled.
-Therefore the share of the CPU that a domain receives is indicated by
-the rate at which its virtual time increases.
-
-\end{description}
-
-
-Xen exports timestamps for system time and wall-clock time to guest
-operating systems through a shared page of memory. Xen also provides
-the cycle counter time at the instant the timestamps were calculated,
-and the CPU frequency in Hertz. This allows the guest to extrapolate
-system and wall-clock times accurately based on the current cycle
-counter time.
-
-Since all time stamps need to be updated and read \emph{atomically}
-two version numbers are also stored in the shared info page. The
-first is incremented prior to an update, while the second is only
-incremented afterwards. Thus a guest can be sure that it read a consistent
-state by checking the two version numbers are equal.
-
-Xen includes a periodic ticker which sends a timer event to the
-currently executing domain every 10ms. The Xen scheduler also sends a
-timer event whenever a domain is scheduled; this allows the guest OS
-to adjust for the time that has passed while it has been inactive. In
-addition, Xen allows each domain to request that it receive a timer
-event sent at a specified system time by using the {\tt
-set\_timer\_op()} hypercall. Guest OSes may use this timer to
-implement timeout values when they block.
-
-
-
-%% % akw: demoting this to a section -- not sure if there is any point
-%% % though, maybe just remove it.
-
-\section{Xen CPU Scheduling}
-
-Xen offers a uniform API for CPU schedulers. It is possible to choose
-from a number of schedulers at boot and it should be easy to add more.
-The BVT, Atropos and Round Robin schedulers are part of the normal
-Xen distribution. BVT provides proportional fair shares of the CPU to
-the running domains. Atropos can be used to reserve absolute shares
-of the CPU for each domain. Round-robin is provided as an example of
-Xen's internal scheduler API.
-
-\paragraph*{Note: SMP host support}
-Xen has always supported SMP host systems. Domains are statically assigned to
-CPUs, either at creation time or when manually pinning to a particular CPU.
-The current schedulers then run locally on each CPU to decide which of the
-assigned domains should be run there. The user-level control software
-can be used to perform coarse-grain load-balancing between CPUs.
+%% chapter Virtual Architecture moved to architecture.tex
+\include{src/interface/architecture}
 
-
-%% More information on the characteristics and use of these schedulers is
-%% available in {\tt Sched-HOWTO.txt}.
-
-
-\section{Privileged operations}
-
-Xen exports an extended interface to privileged domains (viz.\ {\it
-  Domain 0}). This allows such domains to build and boot other domains
-on the server, and provides control interfaces for managing
-scheduling, memory, networking, and block devices.
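The two version numbers described in the deleted text above implement a seqlock-style consistent read: sample the post-update counter, copy the fields, then confirm the pre-update counter still matches. A minimal sketch in C; the structure layout and field names are hypothetical stand-ins for the real shared info page, and production code would also need memory barriers:

#include <stdint.h>

/* Hypothetical layout of the time fields on the shared info page. */
struct shared_time {
    volatile uint32_t version_pre;   /* incremented before an update */
    volatile uint32_t version_post;  /* incremented after an update  */
    uint64_t system_time_ns;         /* ns since system boot         */
    uint64_t tsc_at_update;          /* cycle counter at calculation */
};

static void read_time_snapshot(const struct shared_time *st,
                               uint64_t *sys_ns, uint64_t *tsc)
{
    uint32_t v;
    do {
        v       = st->version_post;  /* counter bumped after updates  */
        *sys_ns = st->system_time_ns;
        *tsc    = st->tsc_at_update;
    } while (v != st->version_pre);  /* mismatch => an update raced us */
}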
-
-
-\chapter{Memory}
-\label{c:memory}
-
-Xen is responsible for managing the allocation of physical memory to
-domains, and for ensuring safe use of the paging and segmentation
-hardware.
-
-
-\section{Memory Allocation}
-
-
-Xen resides within a small fixed portion of physical memory; it also
-reserves the top 64MB of every virtual address space. The remaining
-physical memory is available for allocation to domains at a page
-granularity. Xen tracks the ownership and use of each page, which
-allows it to enforce secure partitioning between domains.
-
-Each domain has a maximum and current physical memory allocation.
-A guest OS may run a `balloon driver' to dynamically adjust its
-current memory allocation up to its limit.
-
-
-%% XXX SMH: I use machine and physical in the next section (which
-%% is kinda required for consistency with code); wonder if this
-%% section should use same terms?
-%%
-%% Probably.
-%%
-%% Merging this and below section at some point prob makes sense.
-
-\section{Pseudo-Physical Memory}
-
-Since physical memory is allocated and freed on a page granularity,
-there is no guarantee that a domain will receive a contiguous stretch
-of physical memory. However most operating systems do not have good
-support for operating in a fragmented physical address space. To aid
-porting such operating systems to run on top of Xen, we make a
-distinction between \emph{machine memory} and \emph{pseudo-physical
-memory}.
-
-Put simply, machine memory refers to the entire amount of memory
-installed in the machine, including that reserved by Xen, in use by
-various domains, or currently unallocated. We consider machine memory
-to comprise a set of 4K \emph{machine page frames} numbered
-consecutively starting from 0. Machine frame numbers mean the same
-within Xen or any domain.
-
-Pseudo-physical memory, on the other hand, is a per-domain
-abstraction. It allows a guest operating system to consider its memory
-allocation to consist of a contiguous range of physical page frames
-starting at physical frame 0, despite the fact that the underlying
-machine page frames may be sparsely allocated and in any order.
-
-To achieve this, Xen maintains a globally readable {\it
-machine-to-physical} table which records the mapping from machine page
-frames to pseudo-physical ones. In addition, each domain is supplied
-with a {\it physical-to-machine} table which performs the inverse
-mapping. Clearly the machine-to-physical table has size proportional
-to the amount of RAM installed in the machine, while each
-physical-to-machine table has size proportional to the memory
-allocation of the given domain.
-
-Architecture dependent code in guest operating systems can then use
-the two tables to provide the abstraction of pseudo-physical
-memory. In general, only certain specialized parts of the operating
-system (such as page table management) need to understand the
-difference between machine and pseudo-physical addresses.
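Concretely, the deleted section above describes two arrays: a global machine-to-physical table and a per-domain physical-to-machine table. A small C illustration of the translation a guest's page-table code performs; the 4K frame size follows the text, but the declarations and names are hypothetical:

#include <stdint.h>

typedef uint32_t pfn_t;  /* pseudo-physical frame number (per domain) */
typedef uint32_t mfn_t;  /* machine frame number (global)             */

extern mfn_t *phys_to_machine;   /* per-domain table, pfn -> mfn        */
extern pfn_t *machine_to_phys;   /* global table, read-only to guests   */

/* Convert a guest "physical" address to the machine address that
 * must actually be installed in a page-table entry. */
static uint64_t phys_to_machine_addr(uint64_t paddr)
{
    mfn_t mfn = phys_to_machine[paddr >> 12];   /* 4K frames */
    return ((uint64_t)mfn << 12) | (paddr & 0xfff);
}

/* The inverse mapping, via the globally readable table. */
static uint64_t machine_to_phys_addr(uint64_t maddr)
{
    pfn_t pfn = machine_to_phys[maddr >> 12];
    return ((uint64_t)pfn << 12) | (maddr & 0xfff);
}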
-
-\section{Page Table Updates}
-
-In the default mode of operation, Xen enforces read-only access to
-page tables and requires guest operating systems to explicitly request
-any modifications. Xen validates all such requests and only applies
-updates that it deems safe. This is necessary to prevent domains from
-adding arbitrary mappings to their page tables.
-
-To aid validation, Xen associates a type and reference count with each
-memory page. A page has one of the following
-mutually-exclusive types at any point in time: page directory ({\sf
-PD}), page table ({\sf PT}), local descriptor table ({\sf LDT}),
-global descriptor table ({\sf GDT}), or writable ({\sf RW}). Note that
-a guest OS may always create readable mappings of its own memory
-regardless of its current type.
-%%% XXX: possibly explain more about ref count 'lifecycle' here?
-This mechanism is used to
-maintain the invariants required for safety; for example, a domain
-cannot have a writable mapping to any part of a page table as this
-would require the page concerned to simultaneously be of types {\sf
-  PT} and {\sf RW}.
-
-
-%\section{Writable Page Tables}
-
-Xen also provides an alternative mode of operation in which guests
-have the illusion that their page tables are directly writable. Of
-course this is not really the case, since Xen must still validate
-modifications to ensure secure partitioning. To this end, Xen traps
-any write attempt to a memory page of type {\sf PT} (i.e., that is
-currently part of a page table). If such an access occurs, Xen
-temporarily allows write access to that page while at the same time
-{\em disconnecting} it from the page table that is currently in
-use. This allows the guest to safely make updates to the page because
-the newly-updated entries cannot be used by the MMU until Xen
-revalidates and reconnects the page.
-Reconnection occurs automatically in a number of situations: for
-example, when the guest modifies a different page-table page, when the
-domain is preempted, or whenever the guest uses Xen's explicit
-page-table update interfaces.
-
-Finally, Xen also supports a form of \emph{shadow page tables} in
-which the guest OS uses an independent copy of page tables which are
-unknown to the hardware (i.e.\ which are never pointed to by {\tt
-cr3}). Instead Xen propagates changes made to the guest's tables to the
-real ones, and vice versa. This is useful for logging page writes
-(e.g.\ for live migration or checkpoint). A full version of the shadow
-page tables also allows guest OS porting with less effort.
-
-\section{Segment Descriptor Tables}
+%% chapter Memory moved to memory.tex
+\include{src/interface/memory}
 
-On boot a guest is supplied with a default GDT, which does not reside
-within its own memory allocation. If the guest wishes to use other
-than the default `flat' ring-1 and ring-3 segments that this GDT
-provides, it must register a custom GDT and/or LDT with Xen,
-allocated from its own memory. Note that a number of GDT
-entries are reserved by Xen -- any custom GDT must also include
-sufficient space for these entries.
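The default mode described above — read-only page tables plus explicit, validated update requests — is exercised through the mmu_update() hypercall detailed in this document's appendix. A hedged sketch of batching PTE writes that way; the struct layout and the MMU_NORMAL_PT_UPDATE encoding are assumed for illustration (the real definitions live in xen/include/public/xen.h):

#include <stdint.h>

typedef struct {
    uint64_t ptr;   /* machine address of the entry; low 2 bits = type */
    uint64_t val;   /* desired contents of the entry                   */
} mmu_update_t;

#define MMU_NORMAL_PT_UPDATE 0   /* assumed encoding, for illustration */

extern int mmu_update(mmu_update_t *req, int count, int *success_count);

/* Batch several PTE writes into one hypercall: Xen validates each
 * one and applies only those it deems safe. */
static int set_ptes(const uint64_t *pte_machine_addrs,
                    const uint64_t *new_vals, int n)
{
    mmu_update_t req[16];
    int done = 0;

    if (n > 16)
        return -1;                       /* keep the sketch simple */
    for (int i = 0; i < n; i++) {
        req[i].ptr = pte_machine_addrs[i] | MMU_NORMAL_PT_UPDATE;
        req[i].val = new_vals[i];
    }
    mmu_update(req, n, &done);
    return (done == n) ? 0 : -1;         /* some update was rejected */
}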
-
-For example, the following hypercall is used to specify a new GDT:
-
-\begin{quote}
-int {\bf set\_gdt}(unsigned long *{\em frame\_list}, int {\em entries})
-
-{\em frame\_list}: An array of up to 16 machine page frames within
-which the GDT resides. Any frame registered as a GDT frame may only
-be mapped read-only within the guest's address space (e.g., no
-writable mappings, no use as a page-table page, and so on).
-
-{\em entries}: The number of descriptor-entry slots in the GDT. Note
-that the table must be large enough to contain Xen's reserved entries;
-thus we must have `{\em entries $>$ LAST\_RESERVED\_GDT\_ENTRY}\ '.
-Note also that, after registering the GDT, slots {\em FIRST\_} through
-{\em LAST\_RESERVED\_GDT\_ENTRY} are no longer usable by the guest and
-may be overwritten by Xen.
-\end{quote}
-
-The LDT is updated via the generic MMU update mechanism (i.e., via
-the {\tt mmu\_update()} hypercall).
-
-\section{Start of Day}
-
-The start-of-day environment for guest operating systems is rather
-different to that provided by the underlying hardware. In particular,
-the processor is already executing in protected mode with paging
-enabled.
-
-{\it Domain 0} is created and booted by Xen itself. For all subsequent
-domains, the analogue of the boot-loader is the {\it domain builder},
-user-space software running in {\it domain 0}. The domain builder
-is responsible for building the initial page tables for a domain
-and loading its kernel image at the appropriate virtual address.
-
-
-
-\chapter{Devices}
-\label{c:devices}
-
-Devices such as network and disk are exported to guests using a
-split device driver. The device driver domain, which accesses the
-physical device directly, also runs a {\em backend} driver, serving
-requests to that device from guests. Each guest will use a simple
-{\em frontend} driver, to access the backend. Communication between these
-domains is composed of two parts: First, data is placed onto a shared
-memory page between the domains. Second, an event channel between the
-two domains is used to pass notification that data is outstanding.
-This separation of notification from data transfer allows message
-batching, and results in very efficient device access.
-
-Event channels are used extensively in device virtualization; each
-domain has a number of end-points or \emph{ports} each of which
-may be bound to one of the following \emph{event sources}:
-\begin{itemize}
- \item a physical interrupt from a real device,
- \item a virtual interrupt (callback) from Xen, or
- \item a signal from another domain
-\end{itemize}
-
-Events are lightweight and do not carry much information beyond
-the source of the notification. Hence when performing bulk data
-transfer, events are typically used as synchronization primitives
-over a shared memory transport. Event channels are managed via
-the {\tt event\_channel\_op()} hypercall; for more details see
-Section~\ref{s:idc}.
-
-This chapter focuses on some individual device interfaces
-available to Xen guests.
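Returning briefly to the set_gdt() interface quoted above: a sketch of registering a custom GDT under the constraints just listed (at most 16 frames, entries > LAST_RESERVED_GDT_ENTRY, frames mapped read-only beforehand). The prototype follows the text; the reserved-entry value is a placeholder:

extern int set_gdt(unsigned long *frame_list, int entries);

#define LAST_RESERVED_GDT_ENTRY 255   /* assumed value, for illustration */

static int install_custom_gdt(unsigned long gdt_mfns[], int nr_frames,
                              int nr_entries)
{
    /* The table must leave room for Xen's reserved descriptors... */
    if (nr_entries <= LAST_RESERVED_GDT_ENTRY || nr_frames > 16)
        return -1;
    /* ...and every frame must already be mapped read-only in the
     * guest's address space, or Xen will refuse to register it. */
    return set_gdt(gdt_mfns, nr_entries);
}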
-
-\section{Network I/O}
-
-Virtual network device services are provided by shared memory
-communication with a backend domain. From the point of view of
-other domains, the backend may be viewed as a virtual ethernet switch
-element with each domain having one or more virtual network interfaces
-connected to it.
-
-\subsection{Backend Packet Handling}
-
-The backend driver is responsible for a variety of actions relating to
-the transmission and reception of packets from the physical device.
-With regard to transmission, the backend performs these key actions:
-
-\begin{itemize}
-\item {\bf Validation:} To ensure that domains do not attempt to
-  generate invalid (e.g. spoofed) traffic, the backend driver may
-  validate headers ensuring that source MAC and IP addresses match the
-  interface that they have been sent from.
-
-  Validation functions can be configured using standard firewall rules
-  ({\small{\tt iptables}} in the case of Linux).
-
-\item {\bf Scheduling:} Since a number of domains can share a single
-  physical network interface, the backend must mediate access when
-  several domains each have packets queued for transmission. This
-  general scheduling function subsumes basic shaping or rate-limiting
-  schemes.
-
-\item {\bf Logging and Accounting:} The backend domain can be
-  configured with classifier rules that control how packets are
-  accounted or logged. For example, log messages might be generated
-  whenever a domain attempts to send a TCP packet containing a SYN.
-\end{itemize}
-
-On receipt of incoming packets, the backend acts as a simple
-demultiplexer: Packets are passed to the appropriate virtual
-interface after any necessary logging and accounting have been carried
-out.
-
-\subsection{Data Transfer}
-
-Each virtual interface uses two ``descriptor rings'', one for transmit,
-the other for receive. Each descriptor identifies a block of contiguous
-physical memory allocated to the domain.
-
-The transmit ring carries packets to transmit from the guest to the
-backend domain. The return path of the transmit ring carries messages
-indicating that the contents have been physically transmitted and the
-backend no longer requires the associated pages of memory.
+%% chapter Devices moved to devices.tex
+\include{src/interface/devices}
 
-To receive packets, the guest places descriptors of unused pages on
-the receive ring. The backend will return received packets by
-exchanging these pages in the domain's memory with new pages
-containing the received data, and passing back descriptors regarding
-the new packets on the ring. This zero-copy approach allows the
-backend to maintain a pool of free pages to receive packets into, and
-then deliver them to appropriate domains after examining their
-headers.
-
-%
-%Real physical addresses are used throughout, with the domain performing
-%translation from pseudo-physical addresses if that is necessary.
-
-If a domain does not keep its receive ring stocked with empty buffers then
-packets destined to it may be dropped. This provides some defence against
-receive livelock problems because an overloaded domain will cease to receive
-further data. Similarly, on the transmit path, it provides the application
-with feedback on the rate at which packets are able to leave the system.
-
-
-Flow control on rings is achieved by including a pair of producer
-indexes on the shared ring page. Each side will maintain a private
-consumer index indicating the next outstanding message. In this
-manner, the domains cooperate to divide the ring into two message
-lists, one in each direction. Notification is decoupled from the
-immediate placement of new messages on the ring; the event channel
-will be used to generate notification when {\em either} a certain
-number of outstanding messages are queued, {\em or} a specified number
-of nanoseconds have elapsed since the oldest message was placed on the
-ring.
-
-% Not sure if my version is any better -- here is what was here before:
-%% Synchronization between the backend domain and the guest is achieved using
-%% counters held in shared memory that is accessible to both. Each ring has
-%% associated producer and consumer indices indicating the area in the ring
-%% that holds descriptors that contain data. After receiving {\it n} packets
-%% or {\t nanoseconds} after receiving the first packet, the hypervisor sends
-%% an event to the domain.
-
-\section{Block I/O}
-
-All guest OS disk access goes through the virtual block device VBD
-interface. This interface allows domains access to portions of block
-storage devices visible to the block backend device. The VBD
-interface is a split driver, similar to the network interface
-described above. A single shared memory ring is used between the
-frontend and backend drivers, across which read and write messages are
-sent.
-
-Any block device accessible to the backend domain, including
-network-based block (iSCSI, *NBD, etc), loopback and LVM/MD devices,
-can be exported as a VBD. Each VBD is mapped to a device node in the
-guest, specified in the guest's startup configuration.
-
-Old (Xen 1.2) virtual disks are not supported under Xen 2.0, since
-similar functionality can be achieved using the more complete LVM
-system, which is already in widespread use.
-
-\subsection{Data Transfer}
-
-The single ring between the guest and the block backend supports three
-messages:
-
-\begin{description}
-\item [{\small {\tt PROBE}}:] Return a list of the VBDs available to this guest
-  from the backend. The request includes a descriptor of a free page
-  into which the reply will be written by the backend.
-
-\item [{\small {\tt READ}}:] Read data from the specified block device. The
-  front end identifies the device and location to read from and
-  attaches pages for the data to be copied to (typically via DMA from
-  the device). The backend acknowledges completed read requests as
-  they finish.
-
-\item [{\small {\tt WRITE}}:] Write data to the specified block device. This
-  functions essentially as {\small {\tt READ}}, except that the data moves to
-  the device instead of from it.
-\end{description}
-
-% um... some old text
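The flow-control scheme deleted above — producer indices on the shared ring page, a private consumer index on each side — can be sketched in C as follows. Field names and the ring size are hypothetical; a real ring also carries the descriptor slots themselves:

#include <stdint.h>

#define RING_SIZE 256   /* entries; a power of two simplifies masking */

/* Hypothetical shared ring page header, visible to both domains. */
struct shared_ring {
    volatile uint32_t req_prod;    /* advanced by the frontend */
    volatile uint32_t resp_prod;   /* advanced by the backend  */
    /* ... descriptor slots follow ... */
};

/* Frontend side: private consumer index for responses. */
static uint32_t resp_cons;

static int ring_full(const struct shared_ring *r)
{
    /* Requests still outstanding = produced minus responded-to;
     * in this simplified scheme responses consume request slots. */
    return (r->req_prod - resp_cons) >= RING_SIZE;
}

static int have_responses(const struct shared_ring *r)
{
    return r->resp_prod != resp_cons;   /* unconsumed messages exist */
}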
-%% In overview, the same style of descriptor-ring that is used for
-%% network packets is used here. Each domain has one ring that carries
-%% operation requests to the hypervisor and carries the results back
-%% again.
-
-%% Rather than copying data, the backend simply maps the domain's buffers
-%% in order to enable direct DMA to them. The act of mapping the buffers
-%% also increases the reference counts of the underlying pages, so that
-%% the unprivileged domain cannot try to return them to the hypervisor,
-%% install them as page tables, or any other unsafe behaviour.
-%% %block API here
-
-
-\chapter{Further Information}
-
-
-If you have questions that are not answered by this manual, the
-sources of information listed below may be of interest to you. Note
-that bug reports, suggestions and contributions related to the
-software (or the documentation) should be sent to the Xen developers'
-mailing list (address below).
-
-\section{Other documentation}
-
-If you are mainly interested in using (rather than developing for)
-Xen, the {\em Xen Users' Manual} is distributed in the {\tt docs/}
-directory of the Xen source distribution.
-
-% Various HOWTOs are also available in {\tt docs/HOWTOS}.
-
-\section{Online references}
-
-The official Xen web site is found at:
-\begin{quote}
-{\tt http://www.cl.cam.ac.uk/Research/SRG/netos/xen/}
-\end{quote}
-
-This contains links to the latest versions of all on-line
-documentation.
-
-\section{Mailing lists}
-
-There are currently four official Xen mailing lists:
-
-\begin{description}
-\item[xen-devel@lists.xensource.com] Used for development
-discussions and bug reports. Subscribe at: \\
-{\small {\tt http://lists.xensource.com/xen-devel}}
-\item[xen-users@lists.xensource.com] Used for installation and usage
-discussions and requests for help. Subscribe at: \\
-{\small {\tt http://lists.xensource.com/xen-users}}
-\item[xen-announce@lists.xensource.com] Used for announcements only.
-Subscribe at: \\
-{\small {\tt http://lists.xensource.com/xen-announce}}
-\item[xen-changelog@lists.xensource.com] Changelog feed
-from the unstable and 2.0 trees - developer oriented. Subscribe at: \\
-{\small {\tt http://lists.xensource.com/xen-changelog}}
-\end{description}
-
-Of these, xen-devel is the most active.
-
-
+%% chapter Further Information moved to further_info.tex
+\include{src/interface/further_info}
 
 
 \appendix
 
-%\newcommand{\hypercall}[1]{\vspace{5mm}{\large\sf #1}}
-
-
-
-
-
-\newcommand{\hypercall}[1]{\vspace{2mm}{\sf #1}}
-
-
-
-
-
-
-
-\chapter{Xen Hypercalls}
-\label{a:hypercalls}
-
-Hypercalls represent the procedural interface to Xen; this appendix
-categorizes and describes the current set of hypercalls.
-
-\section{Invoking Hypercalls}
-
-Hypercalls are invoked in a manner analogous to system calls in a
-conventional operating system; a software interrupt is issued which
-vectors to an entry point within Xen. On x86\_32 machines the
-instruction required is {\tt int \$0x82}; the (real) IDT is set up so
-that this may only be issued from within ring 1. The particular
-hypercall to be invoked is contained in {\tt EAX} --- a list
-mapping these values to symbolic hypercall names can be found
-in {\tt xen/include/public/xen.h}.
-
-On some occasions a set of hypercalls will be required to carry
-out a higher-level function; a good example is when a guest
-operating system wishes to context switch to a new process which
-requires updating various privileged CPU state. As an optimization
-for these cases, there is a generic mechanism to issue a set of
-hypercalls as a batch:
-
-\begin{quote}
-\hypercall{multicall(void *call\_list, int nr\_calls)}
-
-Execute a series of hypervisor calls; {\tt nr\_calls} is the length of
-the array of {\tt multicall\_entry\_t} structures pointed to by {\tt
-call\_list}. Each entry contains the hypercall operation code followed
-by up to 7 word-sized arguments.
-\end{quote}
-
-Note that multicalls are provided purely as an optimization; there is
-no requirement to use them when first porting a guest operating
-system.
-
-
-\section{Virtual CPU Setup}
-
-At start of day, a guest operating system needs to set up the virtual
-CPU it is executing on. This includes installing vectors for the
-virtual IDT so that the guest OS can handle interrupts, page faults,
-etc. However the very first thing a guest OS must set up is a pair
-of hypervisor callbacks: these are the entry points which Xen will
-use when it wishes to notify the guest OS of an occurrence.
-
-\begin{quote}
-\hypercall{set\_callbacks(unsigned long event\_selector, unsigned long
-  event\_address, unsigned long failsafe\_selector, unsigned long
-  failsafe\_address) }
-
-Register the normal (``event'') and failsafe callbacks for
-event processing. In each case the code segment selector and
-address within that segment are provided. The selectors must
-have RPL 1; in XenLinux we simply use the kernel's CS for both
-{\tt event\_selector} and {\tt failsafe\_selector}.
-
-The value {\tt event\_address} specifies the address of the guest OS's
-event handling and dispatch routine; the {\tt failsafe\_address}
-specifies a separate entry point which is used only if a fault occurs
-when Xen attempts to use the normal callback.
-\end{quote}
-
-
-After installing the hypervisor callbacks, the guest OS can
-install a `virtual IDT' by using the following hypercall:
-
-\begin{quote}
-\hypercall{set\_trap\_table(trap\_info\_t *table)}
-
-Install one or more entries into the per-domain
-trap handler table (essentially a software version of the IDT).
-Each entry in the array pointed to by {\tt table} includes the
-exception vector number with the corresponding segment selector
-and entry point. Most guest OSes can use the same handlers on
-Xen as when running on the real hardware; an exception is the
-page fault handler (exception vector 14) where a modified
-stack-frame layout is used.
-
-
-\end{quote}
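Taken together, the two hypercalls above define a guest's start-of-day virtual CPU setup: callbacks first, then the virtual IDT. A sketch under stated assumptions — trap_info_t is abbreviated, and the terminator convention, selector value and handler symbols are placeholders:

/* Abbreviated, hypothetical layout of a trap-table entry. */
typedef struct {
    unsigned char  vector;    /* exception vector number       */
    unsigned short cs;        /* code segment selector (RPL 1) */
    unsigned long  address;   /* handler entry point           */
} trap_info_t;

extern int set_callbacks(unsigned long event_selector,
                         unsigned long event_address,
                         unsigned long failsafe_selector,
                         unsigned long failsafe_address);
extern int set_trap_table(trap_info_t *table);

extern void event_entry(void);      /* guest's event dispatcher       */
extern void failsafe_entry(void);   /* used only on a callback fault  */
extern void page_fault_entry(void); /* vector 14: Xen stack frame!    */

#define KERNEL_CS 0x61   /* placeholder ring-1 selector */

static void vcpu_setup(void)
{
    static trap_info_t traps[] = {
        { 14, KERNEL_CS, (unsigned long)page_fault_entry },
        { 0, 0, 0 },    /* zeroed terminator (assumed convention) */
    };

    /* Callbacks must exist before any event can be delivered. */
    set_callbacks(KERNEL_CS, (unsigned long)event_entry,
                  KERNEL_CS, (unsigned long)failsafe_entry);
    set_trap_table(traps);
}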
-
-
-
-\section{Scheduling and Timer}
-
-Domains are preemptively scheduled by Xen according to the
-parameters installed by domain 0 (see Section~\ref{s:dom0ops}).
-In addition, however, a domain may choose to explicitly
-control certain behavior with the following hypercall:
-
-\begin{quote}
-\hypercall{sched\_op(unsigned long op)}
-
-Request scheduling operation from hypervisor. The options are: {\it
-yield}, {\it block}, and {\it shutdown}. {\it yield} keeps the
-calling domain runnable but may cause a reschedule if other domains
-are runnable. {\it block} removes the calling domain from the run
-queue and causes it to sleep until an event is delivered to it. {\it
-shutdown} is used to end the domain's execution; the caller can
-additionally specify whether the domain should reboot, halt or
-suspend.
-\end{quote}
-
-To aid the implementation of a process scheduler within a guest OS,
-Xen provides a virtual programmable timer:
-
-\begin{quote}
-\hypercall{set\_timer\_op(uint64\_t timeout)}
-
-Request a timer event to be sent at the specified system time (time
-in nanoseconds since system boot). The hypercall actually passes the
-64-bit timeout value as a pair of 32-bit values.
-
-\end{quote}
-
-Note that calling {\tt set\_timer\_op()} prior to {\tt sched\_op}
-allows block-with-timeout semantics.
-
-
-\section{Page Table Management}
-
-Since guest operating systems have read-only access to their page
-tables, Xen must be involved when making any changes. The following
-multi-purpose hypercall can be used to modify page-table entries,
-update the machine-to-physical mapping table, flush the TLB, install
-a new page-table base pointer, and more.
-
-\begin{quote}
-\hypercall{mmu\_update(mmu\_update\_t *req, int count, int *success\_count)}
-
-Update the page table for the domain; a set of {\tt count} updates are
-submitted for processing in a batch, with {\tt success\_count} being
-updated to report the number of successful updates.
-
-Each element of {\tt req[]} contains a pointer (address) and value;
-the least significant 2-bits of the pointer are used to distinguish
-the type of update requested as follows:
-\begin{description}
-
-\item[\it MMU\_NORMAL\_PT\_UPDATE:] update a page directory entry or
-page table entry to the associated value; Xen will check that the
-update is safe, as described in Chapter~\ref{c:memory}.
-
-\item[\it MMU\_MACHPHYS\_UPDATE:] update an entry in the
-  machine-to-physical table. The calling domain must own the machine
-  page in question (or be privileged).
-
-\item[\it MMU\_EXTENDED\_COMMAND:] perform additional MMU operations.
-The set of additional MMU operations is considerable, and includes
-updating {\tt cr3} (or just re-installing it for a TLB flush),
-flushing the cache, installing a new LDT, or pinning \& unpinning
-page-table pages (to ensure their reference count doesn't drop to zero
-which would require a revalidation of all entries).
-
-Further extended commands are used to deal with granting and
-acquiring page ownership; see Section~\ref{s:idc}.
-
-
-\end{description}
-
-More details on the precise format of all commands can be
-found in {\tt xen/include/public/xen.h}.
-
-
-\end{quote}
-
-Explicitly updating batches of page table entries is extremely
-efficient, but can require a number of alterations to the guest
-OS. Using the writable page table mode (Chapter~\ref{c:memory}) is
-recommended for new OS ports.
-
-Regardless of which page table update mode is being used, however,
-there are some occasions (notably handling a demand page fault) where
-a guest OS will wish to modify exactly one PTE rather than a
-batch. This is catered for by the following:
-
-\begin{quote}
-\hypercall{update\_va\_mapping(unsigned long page\_nr, unsigned long
-val, \\ unsigned long flags)}
-
-Update the currently installed PTE for the page {\tt page\_nr} to
-{\tt val}. As with {\tt mmu\_update()}, Xen checks the modification
-is safe before applying it. The {\tt flags} determine which kind
-of TLB flush, if any, should follow the update.
-
-\end{quote}
-
-Finally, sufficiently privileged domains may occasionally wish to manipulate
-the pages of others:
-\begin{quote}
-
-\hypercall{update\_va\_mapping\_otherdomain(unsigned long page\_nr,
-unsigned long val, unsigned long flags, uint16\_t domid)}
-
-Identical to {\tt update\_va\_mapping()} save that the pages being
-mapped must belong to the domain {\tt domid}.
-
-\end{quote}
-
-This privileged operation is currently used by backend virtual device
-drivers to safely map pages containing I/O data.
-
-
-
-\section{Segmentation Support}
-
-Xen allows guest OSes to install a custom GDT if they require it;
-this is context switched transparently whenever a domain is
-[de]scheduled. The following hypercall is effectively a
-`safe' version of {\tt lgdt}:
-
-\begin{quote}
-\hypercall{set\_gdt(unsigned long *frame\_list, int entries)}
-
-Install a global descriptor table for a domain; {\tt frame\_list} is
-an array of up to 16 machine page frames within which the GDT resides,
-with {\tt entries} being the actual number of descriptor-entry
-slots. All page frames must be mapped read-only within the guest's
-address space, and the table must be large enough to contain Xen's
-reserved entries (see {\tt xen/include/public/arch-x86\_32.h}).
-
-\end{quote}
-
-Many guest OSes will also wish to install LDTs; this is achieved by
-using {\tt mmu\_update()} with an extended command, passing the
-linear address of the LDT base along with the number of entries. No
-special safety checks are required; Xen needs to perform this task
-simply since {\tt lldt} requires CPL 0.
-
-
-Xen also allows guest operating systems to update just an
-individual segment descriptor in the GDT or LDT:
-
-\begin{quote}
-\hypercall{update\_descriptor(unsigned long ma, unsigned long word1,
-unsigned long word2)}
-
-Update the GDT/LDT entry at machine address {\tt ma}; the new
-8-byte descriptor is stored in {\tt word1} and {\tt word2}.
-Xen performs a number of checks to ensure the descriptor is
-valid.
-
-\end{quote}
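For the single-PTE case described above (notably demand page faults), update_va_mapping() replaces a full mmu_update() batch. A minimal sketch; the flag value is a placeholder for the per-entry TLB-flush option the text mentions:

extern int update_va_mapping(unsigned long page_nr, unsigned long val,
                             unsigned long flags);

#define UVMF_FLUSH_ONE_ASSUMED 2   /* hypothetical: flush one TLB entry */

static int map_on_demand_fault(unsigned long faulting_page,
                               unsigned long new_pte)
{
    /* Xen validates the new entry before installing it. */
    return update_va_mapping(faulting_page, new_pte,
                             UVMF_FLUSH_ONE_ASSUMED);
}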
-
-Guest OSes can use the above in place of context switching entire
-LDTs (or the GDT) when the number of changing descriptors is small.
-
-\section{Context Switching}
-
-When a guest OS wishes to context switch between two processes,
-it can use the page table and segmentation hypercalls described
-above to perform the bulk of the privileged work. In addition,
-however, it will need to invoke Xen to switch the kernel (ring 1)
-stack pointer:
-
-\begin{quote}
-\hypercall{stack\_switch(unsigned long ss, unsigned long esp)}
-
-Request kernel stack switch from hypervisor; {\tt ss} is the new
-stack segment, and {\tt esp} is the new stack pointer.
-
-\end{quote}
-
-A final useful hypercall for context switching allows ``lazy''
-save and restore of floating point state:
-
-\begin{quote}
-\hypercall{fpu\_taskswitch(void)}
-
-This call instructs Xen to set the {\tt TS} bit in the {\tt cr0}
-control register; this means that the next attempt to use floating
-point will cause a trap which the guest OS can catch. Typically it will
-then save/restore the FP state, and clear the {\tt TS} bit.
-\end{quote}
-
-This is provided as an optimization only; guest OSes can also choose
-to save and restore FP state on all context switches for simplicity.
-
-
-\section{Physical Memory Management}
-
-As mentioned previously, each domain has a maximum and current
-memory allocation. The maximum allocation, set at domain creation
-time, cannot be modified. However a domain can choose to reduce
-and subsequently grow its current allocation by using the
-following call:
-
-\begin{quote}
-\hypercall{dom\_mem\_op(unsigned int op, unsigned long *extent\_list,
-  unsigned long nr\_extents, unsigned int extent\_order)}
-
-Increase or decrease current memory allocation (as determined by
-the value of {\tt op}). Each invocation provides a list of
-extents each of which is $2^s$ pages in size,
-where $s$ is the value of {\tt extent\_order}.
-
-\end{quote}
-
-In addition to simply reducing or increasing the current memory
-allocation via a `balloon driver', this call is also useful for
-obtaining contiguous regions of machine memory when required (e.g.
-for certain PCI devices, or if using superpages).
-
-
-\section{Inter-Domain Communication}
-\label{s:idc}
-
-Xen provides a simple asynchronous notification mechanism via
-\emph{event channels}. Each domain has a set of end-points (or
-\emph{ports}) which may be bound to an event source (e.g. a physical
-IRQ, a virtual IRQ, or a port in another domain). When a pair of
-end-points in two different domains are bound together, then a `send'
-operation on one will cause an event to be received by the destination
-domain.
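Looking back at dom_mem_op() above: a balloon driver drives exactly that call in both directions. A sketch with hypothetical op encodings (the real constants live in the public headers):

extern long dom_mem_op(unsigned int op, unsigned long *extent_list,
                       unsigned long nr_extents, unsigned int extent_order);

#define MEMOP_DECREASE_ASSUMED 1   /* hypothetical encodings, */
#define MEMOP_INCREASE_ASSUMED 0   /* for illustration only   */

/* Give pages back to Xen; extent_order 0 means each extent is a
 * single 4K page (2^0 pages). */
static long balloon_out(unsigned long *pfns, unsigned long n)
{
    return dom_mem_op(MEMOP_DECREASE_ASSUMED, pfns, n, 0);
}

/* Reclaim pages later, up to the domain's maximum allocation. */
static long balloon_in(unsigned long *pfns, unsigned long n)
{
    return dom_mem_op(MEMOP_INCREASE_ASSUMED, pfns, n, 0);
}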
-
-The control and use of event channels involves the following hypercall:
-
-\begin{quote}
-\hypercall{event\_channel\_op(evtchn\_op\_t *op)}
-
-Inter-domain event-channel management; {\tt op} is a discriminated
-union which allows the following 7 operations:
-
-\begin{description}
-
-\item[\it alloc\_unbound:] allocate a free (unbound) local
-  port and prepare for connection from a specified domain.
-\item[\it bind\_virq:] bind a local port to a virtual
-IRQ; any particular VIRQ can be bound to at most one port per domain.
-\item[\it bind\_pirq:] bind a local port to a physical IRQ;
-once more, a given pIRQ can be bound to at most one port per
-domain. Furthermore the calling domain must be sufficiently
-privileged.
-\item[\it bind\_interdomain:] construct an interdomain event
-channel; in general, the target domain must have previously allocated
-an unbound port for this channel, although this can be bypassed by
-privileged domains during domain setup.
-\item[\it close:] close an interdomain event channel.
-\item[\it send:] send an event to the remote end of an
-interdomain event channel.
-\item[\it status:] determine the current status of a local port.
-\end{description}
-
-For more details see
-{\tt xen/include/public/event\_channel.h}.
-
-\end{quote}
-
-Event channels are the fundamental communication primitive between
-Xen domains and seamlessly support SMP. However they provide little
-bandwidth for communication {\sl per se}, and hence are typically
-married with a piece of shared memory to produce effective and
-high-performance inter-domain communication.
-
-Safe sharing of memory pages between guest OSes is carried out by
-granting access on a per page basis to individual domains. This is
-achieved by using the {\tt grant\_table\_op()} hypercall.
-
-\begin{quote}
-\hypercall{grant\_table\_op(unsigned int cmd, void *uop, unsigned int count)}
-
-Grant or remove access to a particular page to a particular domain.
-
-\end{quote}
-
-This is not currently widely in use by guest operating systems, but
-we intend to integrate support more fully in the near future.
-
-\section{PCI Configuration}
-
-Domains with physical device access (i.e.\ driver domains) receive
-limited access to certain PCI devices (bus address space and
-interrupts). However many guest operating systems attempt to
-determine the PCI configuration by directly accessing the PCI BIOS,
-which cannot be allowed for safety.
-
-Instead, Xen provides the following hypercall:
-
-\begin{quote}
-\hypercall{physdev\_op(void *physdev\_op)}
-
-Perform a PCI configuration operation; depending on the value
-of {\tt physdev\_op} this can be a PCI config read, a PCI config
-write, or a small number of other queries.
-
-\end{quote}
-
-
-For examples of using {\tt physdev\_op()}, see the
-Xen-specific PCI code in the Linux sparse tree.
-
-\section{Administrative Operations}
-\label{s:dom0ops}
-
-A large number of control operations are available to a sufficiently
-privileged domain (typically domain 0). These allow the creation and
-management of new domains, for example. A complete list is given
-below; for more details on any or all of these, please see
-{\tt xen/include/public/dom0\_ops.h}
-
-
-\begin{quote}
-\hypercall{dom0\_op(dom0\_op\_t *op)}
-
-Administrative domain operations for domain management. The options are:
-
-\begin{description}
-\item [\it DOM0\_CREATEDOMAIN:] create a new domain
-
-\item [\it DOM0\_PAUSEDOMAIN:] remove a domain from the scheduler run
-queue.
-
-\item [\it DOM0\_UNPAUSEDOMAIN:] mark a paused domain as schedulable
-  once again.
-
-\item [\it DOM0\_DESTROYDOMAIN:] deallocate all resources associated
-with a domain
-
-\item [\it DOM0\_GETMEMLIST:] get list of pages used by the domain
-
-\item [\it DOM0\_SCHEDCTL:]
-
-\item [\it DOM0\_ADJUSTDOM:] adjust scheduling priorities for domain
-
-\item [\it DOM0\_BUILDDOMAIN:] do final guest OS setup for domain
-
-\item [\it DOM0\_GETDOMAINFO:] get statistics about the domain
-
-\item [\it DOM0\_GETPAGEFRAMEINFO:]
-
-\item [\it DOM0\_GETPAGEFRAMEINFO2:]
-
-\item [\it DOM0\_IOPL:] set I/O privilege level
-
-\item [\it DOM0\_MSR:] read or write model specific registers
-
-\item [\it DOM0\_DEBUG:] interactively invoke the debugger
-
-\item [\it DOM0\_SETTIME:] set system time
-
-\item [\it DOM0\_READCONSOLE:] read console content from hypervisor buffer ring
-
-\item [\it DOM0\_PINCPUDOMAIN:] pin domain to a particular CPU
-
-\item [\it DOM0\_GETTBUFS:] get information about the size and location of
-  the trace buffers (only on trace-buffer enabled builds)
-
-\item [\it DOM0\_PHYSINFO:] get information about the host machine
-
-\item [\it DOM0\_PCIDEV\_ACCESS:] modify PCI device access permissions
-
-\item [\it DOM0\_SCHED\_ID:] get the ID of the current Xen scheduler
-
-\item [\it DOM0\_SHADOW\_CONTROL:] switch between shadow page-table modes
-
-\item [\it DOM0\_SETDOMAININITIALMEM:] set initial memory allocation of a domain
-
-\item [\it DOM0\_SETDOMAINMAXMEM:] set maximum memory allocation of a domain
-
-\item [\it DOM0\_SETDOMAINVMASSIST:] set domain VM assist options
-\end{description}
-\end{quote}
-
-Most of the above are best understood by looking at the code
-implementing them (in {\tt xen/common/dom0\_ops.c}) and in
-the user-space tools that use them (mostly in {\tt tools/libxc}).
-
-\section{Debugging Hypercalls}
-
-A few additional hypercalls are mainly useful for debugging:
-
-\begin{quote}
-\hypercall{console\_io(int cmd, int count, char *str)}
-
-Use Xen to interact with the console; operations are:
-
-{\it CONSOLEIO\_write}: Output count characters from buffer str.
-
-{\it CONSOLEIO\_read}: Input at most count characters into buffer str.
-\end{quote}
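A quick illustration of console_io() for early guest debugging, before the guest has any console driver of its own; the CONSOLEIO_write encoding is assumed:

extern int console_io(int cmd, int count, char *str);

#define CONSOLEIO_write 0   /* assumed encoding, for illustration */

static void debug_puts(char *s)
{
    int len = 0;
    while (s[len] != '\0')
        len++;                        /* no libc this early in boot */
    console_io(CONSOLEIO_write, len, s);
}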
-
-A pair of hypercalls allows access to the underlying debug registers:
-\begin{quote}
-\hypercall{set\_debugreg(int reg, unsigned long value)}
-
-Set debug register {\tt reg} to {\tt value}
-
-\hypercall{get\_debugreg(int reg)}
-
-Return the contents of the debug register {\tt reg}
-\end{quote}
-
-And finally:
-\begin{quote}
-\hypercall{xen\_version(int cmd)}
-
-Request Xen version number.
-\end{quote}
-
-This is useful to ensure that user-space tools are in sync
-with the underlying hypervisor.
-
-\section{Deprecated Hypercalls}
-
-Xen is under constant development and refinement; as such there
-are plans to improve the way in which various pieces of functionality
-are exposed to guest OSes.
-
-\begin{quote}
-\hypercall{vm\_assist(unsigned int cmd, unsigned int type)}
-
-Toggle various memory management modes (in particular writable page
-tables and superpage support).
-
-\end{quote}
-
-This is likely to be replaced with mode values in the shared
-information page since this is more resilient for resumption
-after migration or checkpoint.
-
-
-
-
-
-
+%% chapter hypercalls moved to hypercalls.tex
+\include{src/interface/hypercalls}
 
 
 %%
@@ -1173,279 +112,9 @@ after migration or checkpoint.
 %% new scheduler... not clear how many of them there are...
 %%
 
-\begin{comment}
-
-\chapter{Scheduling API}
-
-The scheduling API is used by both the schedulers described above and should
-also be used by any new schedulers. It provides a generic interface and also
-implements much of the ``boilerplate'' code.
-
-Schedulers conforming to this API are described by the following
-structure:
-
-\begin{verbatim}
-struct scheduler
-{
-    char *name;             /* full name for this scheduler   */
-    char *opt_name;         /* option name for this scheduler */
-    unsigned int sched_id;  /* ID for this scheduler          */
-
-    int          (*init_scheduler) ();
-    int          (*alloc_task)     (struct task_struct *);
-    void         (*add_task)       (struct task_struct *);
-    void         (*free_task)      (struct task_struct *);
-    void         (*rem_task)       (struct task_struct *);
-    void         (*wake_up)        (struct task_struct *);
-    void         (*do_block)       (struct task_struct *);
-    task_slice_t (*do_schedule)    (s_time_t);
-    int          (*control)        (struct sched_ctl_cmd *);
-    int          (*adjdom)         (struct task_struct *,
-                                    struct sched_adjdom_cmd *);
-    s32          (*reschedule)     (struct task_struct *);
-    void         (*dump_settings)  (void);
-    void         (*dump_cpu_state) (int);
-    void         (*dump_runq_el)   (struct task_struct *);
-};
-\end{verbatim}
-
-The only method that {\em must} be implemented is
-{\tt do\_schedule()}. However, if there is not some implementation for the
-{\tt wake\_up()} method then waking tasks will not get put on the runqueue!
-
-The fields of the above structure are described in more detail below.
-
-\subsubsection{name}
-
-The name field should point to a descriptive ASCII string.
-
-\subsubsection{opt\_name}
-
-This field is the value of the {\tt sched=} boot-time option that will select
-this scheduler.
-
-\subsubsection{sched\_id}
-
-This is an integer that uniquely identifies this scheduler. There should be a
-macro corresponding to this scheduler ID in {\tt <xen/sched-if.h>}.
-
-\subsubsection{init\_scheduler}
-
-\paragraph*{Purpose}
-
-This is a function for performing any scheduler-specific initialisation. For
-instance, it might allocate memory for per-CPU scheduler data and initialise it
-appropriately.
-
-\paragraph*{Call environment}
-
-This function is called after the initialisation performed by the generic
-layer. The function is called exactly once, for the scheduler that has been
-selected.
-
-\paragraph*{Return values}
-
-This should return negative on failure --- this will cause an
-immediate panic and the system will fail to boot.
-
-\subsubsection{alloc\_task}
-
-\paragraph*{Purpose}
-Called when a {\tt task\_struct} is allocated by the generic scheduler
-layer. A particular scheduler implementation may use this method to
-allocate per-task data for this task. It may use the {\tt
-sched\_priv} pointer in the {\tt task\_struct} to point to this data.
-
-\paragraph*{Call environment}
-The generic layer guarantees that the {\tt sched\_priv} field will
-remain intact from the time this method is called until the task is
-deallocated (so long as the scheduler implementation does not change
-it explicitly!).
-
-\paragraph*{Return values}
-Negative on failure.
-
-\subsubsection{add\_task}
-
-\paragraph*{Purpose}
-
-Called when a task is initially added by the generic layer.
-
-\paragraph*{Call environment}
-
-The fields in the {\tt task\_struct} are now filled out and available for use.
-Schedulers should implement appropriate initialisation of any per-task private
-information in this method.
-
-\subsubsection{free\_task}
-
-\paragraph*{Purpose}
-
-Schedulers should free the space used by any associated private data
-structures.
-
-\paragraph*{Call environment}
-
-This is called when a {\tt task\_struct} is about to be deallocated.
-The generic layer will have done generic task removal operations and
-(if implemented) called the scheduler's {\tt rem\_task} method before
-this method is called.
-
-\subsubsection{rem\_task}
-
-\paragraph*{Purpose}
-
-This is called when a task is being removed from scheduling (but is
-not yet being freed).
-
-\subsubsection{wake\_up}
-
-\paragraph*{Purpose}
-
-Called when a task is woken up, this method should put the task on the runqueue
-(or do the scheduler-specific equivalent action).
-
-\paragraph*{Call environment}
-
-The task is already set to state RUNNING.
-
-\subsubsection{do\_block}
-
-\paragraph*{Purpose}
-
-This function is called when a task is blocked. This function should
-not remove the task from the runqueue.
3.1244 - 3.1245 -\paragraph*{Call environment} 3.1246 - 3.1247 -The EVENTS\_MASTER\_ENABLE\_BIT is already set and the task state changed to 3.1248 -TASK\_INTERRUPTIBLE on entry to this method. A call to the {\tt 3.1249 - do\_schedule} method will be made after this method returns, in 3.1250 -order to select the next task to run. 3.1251 - 3.1252 -\subsubsection{do\_schedule} 3.1253 - 3.1254 -This method must be implemented. 3.1255 - 3.1256 -\paragraph*{Purpose} 3.1257 - 3.1258 -The method is called each time a new task must be chosen for scheduling on the 3.1259 -current CPU. The current time as passed as the single argument (the current 3.1260 -task can be found using the {\tt current} macro). 3.1261 - 3.1262 -This method should select the next task to run on this CPU and set it's minimum 3.1263 -time to run as well as returning the data described below. 3.1264 - 3.1265 -This method should also take the appropriate action if the previous 3.1266 -task has blocked, e.g. removing it from the runqueue. 3.1267 - 3.1268 -\paragraph*{Call environment} 3.1269 - 3.1270 -The other fields in the {\tt task\_struct} are updated by the generic layer, 3.1271 -which also performs all Xen-specific tasks and performs the actual task switch 3.1272 -(unless the previous task has been chosen again). 3.1273 - 3.1274 -This method is called with the {\tt schedule\_lock} held for the current CPU 3.1275 -and local interrupts disabled. 3.1276 - 3.1277 -\paragraph*{Return values} 3.1278 - 3.1279 -Must return a {\tt struct task\_slice} describing what task to run and how long 3.1280 -for (at maximum). 3.1281 - 3.1282 -\subsubsection{control} 3.1283 - 3.1284 -\paragraph*{Purpose} 3.1285 - 3.1286 -This method is called for global scheduler control operations. It takes a 3.1287 -pointer to a {\tt struct sched\_ctl\_cmd}, which it should either 3.1288 -source data from or populate with data, depending on the value of the 3.1289 -{\tt direction} field. 3.1290 - 3.1291 -\paragraph*{Call environment} 3.1292 - 3.1293 -The generic layer guarantees that when this method is called, the 3.1294 -caller selected the correct scheduler ID, hence the scheduler's 3.1295 -implementation does not need to sanity-check these parts of the call. 3.1296 - 3.1297 -\paragraph*{Return values} 3.1298 - 3.1299 -This function should return the value to be passed back to user space, hence it 3.1300 -should either be 0 or an appropriate errno value. 3.1301 - 3.1302 -\subsubsection{sched\_adjdom} 3.1303 - 3.1304 -\paragraph*{Purpose} 3.1305 - 3.1306 -This method is called to adjust the scheduling parameters of a particular 3.1307 -domain, or to query their current values. The function should check 3.1308 -the {\tt direction} field of the {\tt sched\_adjdom\_cmd} it receives in 3.1309 -order to determine which of these operations is being performed. 3.1310 - 3.1311 -\paragraph*{Call environment} 3.1312 - 3.1313 -The generic layer guarantees that the caller has specified the correct 3.1314 -control interface version and scheduler ID and that the supplied {\tt 3.1315 -task\_struct} will not be deallocated during the call (hence it is not 3.1316 -necessary to {\tt get\_task\_struct}). 3.1317 - 3.1318 -\paragraph*{Return values} 3.1319 - 3.1320 -This function should return the value to be passed back to user space, hence it 3.1321 -should either be 0 or an appropriate errno value. 
3.1322 - 3.1323 -\subsubsection{reschedule} 3.1324 - 3.1325 -\paragraph*{Purpose} 3.1326 - 3.1327 -This method is called to determine if a reschedule is required as a result of a 3.1328 -particular task. 3.1329 - 3.1330 -\paragraph*{Call environment} 3.1331 -The generic layer will cause a reschedule if the current domain is the idle 3.1332 -task or it has exceeded its minimum time slice before a reschedule. The 3.1333 -generic layer guarantees that the task passed is not currently running but is 3.1334 -on the runqueue. 3.1335 - 3.1336 -\paragraph*{Return values} 3.1337 - 3.1338 -Should return a mask of CPUs to cause a reschedule on. 3.1339 - 3.1340 -\subsubsection{dump\_settings} 3.1341 - 3.1342 -\paragraph*{Purpose} 3.1343 - 3.1344 -If implemented, this should dump any private global settings for this 3.1345 -scheduler to the console. 3.1346 - 3.1347 -\paragraph*{Call environment} 3.1348 - 3.1349 -This function is called with interrupts enabled. 3.1350 - 3.1351 -\subsubsection{dump\_cpu\_state} 3.1352 - 3.1353 -\paragraph*{Purpose} 3.1354 - 3.1355 -This method should dump any private settings for the specified CPU. 3.1356 - 3.1357 -\paragraph*{Call environment} 3.1358 - 3.1359 -This function is called with interrupts disabled and the {\tt schedule\_lock} 3.1360 -for the specified CPU held. 3.1361 - 3.1362 -\subsubsection{dump\_runq\_el} 3.1363 - 3.1364 -\paragraph*{Purpose} 3.1365 - 3.1366 -This method should dump any private settings for the specified task. 3.1367 - 3.1368 -\paragraph*{Call environment} 3.1369 - 3.1370 -This function is called with interrupts disabled and the {\tt schedule\_lock} 3.1371 -for the task's CPU held. 3.1372 - 3.1373 -\end{comment} 3.1374 - 3.1375 +%% \include{src/interface/scheduling} 3.1376 +%% scheduling information moved to scheduling.tex 3.1377 +%% still commented out 3.1378 3.1379 3.1380 3.1381 @@ -1457,74 +126,9 @@ for the task's CPU held. 3.1382 %% (and/or kip's stuff?) and write about that instead? 3.1383 %% 3.1384 3.1385 -\begin{comment} 3.1386 - 3.1387 -\chapter{Debugging} 3.1388 - 3.1389 -Xen provides tools for debugging both Xen and guest OSes. Currently, the 3.1390 -Pervasive Debugger provides a GDB stub, which provides facilities for symbolic 3.1391 -debugging of Xen itself and of OS kernels running on top of Xen. The Trace 3.1392 -Buffer provides a lightweight means to log data about Xen's internal state and 3.1393 -behaviour at runtime, for later analysis. 3.1394 - 3.1395 -\section{Pervasive Debugger} 3.1396 - 3.1397 -Information on using the pervasive debugger is available in pdb.txt. 3.1398 - 3.1399 - 3.1400 -\section{Trace Buffer} 3.1401 - 3.1402 -The trace buffer provides a means to observe Xen's operation from domain 0. 3.1403 -Trace events, inserted at key points in Xen's code, record data that can be 3.1404 -read by the {\tt xentrace} tool. Recording these events has a low overhead 3.1405 -and hence the trace buffer may be useful for debugging timing-sensitive 3.1406 -behaviours. 3.1407 - 3.1408 -\subsection{Internal API} 3.1409 - 3.1410 -To use the trace buffer functionality from within Xen, you must {\tt \#include 3.1411 -<xen/trace.h>}, which contains definitions related to the trace buffer. Trace 3.1412 -events are inserted into the buffer using the {\tt TRACE\_xD} ({\tt x} = 0, 1, 3.1413 -2, 3, 4 or 5) macros. These all take an event number, plus {\tt x} additional 3.1414 -(32-bit) data as their arguments. 
For trace buffer-enabled builds of Xen these 3.1415 -will insert the event ID and data into the trace buffer, along with the current 3.1416 -value of the CPU cycle-counter. For builds without the trace buffer enabled, 3.1417 -the macros expand to no-ops and thus can be left in place without incurring 3.1418 -overheads. 3.1419 - 3.1420 -\subsection{Trace-enabled builds} 3.1421 - 3.1422 -By default, the trace buffer is enabled only in debug builds (i.e. {\tt NDEBUG} 3.1423 -is not defined). It can be enabled separately by defining {\tt TRACE\_BUFFER}, 3.1424 -either in {\tt <xen/config.h>} or on the gcc command line. 3.1425 - 3.1426 -The size (in pages) of the per-CPU trace buffers can be specified using the 3.1427 -{\tt tbuf\_size=n } boot parameter to Xen. If the size is set to 0, the trace 3.1428 -buffers will be disabled. 3.1429 - 3.1430 -\subsection{Dumping trace data} 3.1431 - 3.1432 -When running a trace buffer build of Xen, trace data are written continuously 3.1433 -into the buffer data areas, with newer data overwriting older data. This data 3.1434 -can be captured using the {\tt xentrace} program in domain 0. 3.1435 - 3.1436 -The {\tt xentrace} tool uses {\tt /dev/mem} in domain 0 to map the trace 3.1437 -buffers into its address space. It then periodically polls all the buffers for 3.1438 -new data, dumping out any new records from each buffer in turn. As a result, 3.1439 -for machines with multiple (logical) CPUs, the trace buffer output will not be 3.1440 -in overall chronological order. 3.1441 - 3.1442 -The output from {\tt xentrace} can be post-processed using {\tt 3.1443 -xentrace\_cpusplit} (used to split trace data out into per-cpu log files) and 3.1444 -{\tt xentrace\_format} (used to pretty-print trace data). For the predefined 3.1445 -trace points, there is an example format file in {\tt tools/xentrace/formats }. 3.1446 - 3.1447 -For more information, see the manual pages for {\tt xentrace}, {\tt 3.1448 -xentrace\_format} and {\tt xentrace\_cpusplit}. 3.1449 - 3.1450 -\end{comment} 3.1451 - 3.1452 - 3.1453 +%% \include{src/interface/debugging} 3.1454 +%% debugging information moved to debugging.tex 3.1455 +%% still commented out 3.1456 3.1457 3.1458 \end{document}
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 4.2 +++ b/docs/src/interface/architecture.tex Tue Sep 20 09:43:46 2005 +0000 4.3 @@ -0,0 +1,140 @@ 4.4 +\chapter{Virtual Architecture} 4.5 + 4.6 +On a Xen-based system, the hypervisor itself runs in {\it ring 0}. It 4.7 +has full access to the physical memory available in the system and is 4.8 +responsible for allocating portions of it to the domains. Guest 4.9 +operating systems run in and use {\it rings 1}, {\it 2} and {\it 3} as 4.10 +they see fit. Segmentation is used to prevent the guest OS from 4.11 +accessing the portion of the address space that is reserved for Xen. 4.12 +We expect most guest operating systems will use ring 1 for their own 4.13 +operation and place applications in ring 3. 4.14 + 4.15 +In this chapter we consider the basic virtual architecture provided by 4.16 +Xen: the basic CPU state, exception and interrupt handling, and time. 4.17 +Other aspects such as memory and device access are discussed in later 4.18 +chapters. 4.19 + 4.20 + 4.21 +\section{CPU state} 4.22 + 4.23 +All privileged state must be handled by Xen. The guest OS has no 4.24 +direct access to CR3 and is not permitted to update privileged bits in 4.25 +EFLAGS. Guest OSes use \emph{hypercalls} to invoke operations in Xen; 4.26 +these are analogous to system calls but occur from ring 1 to ring 0. 4.27 + 4.28 +A list of all hypercalls is given in Appendix~\ref{a:hypercalls}. 4.29 + 4.30 + 4.31 +\section{Exceptions} 4.32 + 4.33 +A virtual IDT is provided --- a domain can submit a table of trap 4.34 +handlers to Xen via the {\tt set\_trap\_table()} hypercall. Most trap 4.35 +handlers are identical to native x86 handlers, although the page-fault 4.36 +handler is somewhat different. 4.37 + 4.38 + 4.39 +\section{Interrupts and events} 4.40 + 4.41 +Interrupts are virtualized by mapping them to \emph{events}, which are 4.42 +delivered asynchronously to the target domain using a callback 4.43 +supplied via the {\tt set\_callbacks()} hypercall. A guest OS can map 4.44 +these events onto its standard interrupt dispatch mechanisms. Xen is 4.45 +responsible for determining the target domain that will handle each 4.46 +physical interrupt source. For more details on the binding of event 4.47 +sources to events, see Chapter~\ref{c:devices}. 4.48 + 4.49 + 4.50 +\section{Time} 4.51 + 4.52 +Guest operating systems need to be aware of the passage of both real 4.53 +(or wallclock) time and their own `virtual time' (the time for which 4.54 +they have been executing). Furthermore, Xen has a notion of time which 4.55 +is used for scheduling. The following notions of time are provided: 4.56 + 4.57 +\begin{description} 4.58 +\item[Cycle counter time.] 4.59 + 4.60 + This provides a fine-grained time reference. The cycle counter time 4.61 + is used to accurately extrapolate the other time references. On SMP 4.62 + machines it is currently assumed that the cycle counter time is 4.63 + synchronized between CPUs. The current x86-based implementation 4.64 + achieves this within inter-CPU communication latencies. 4.65 + 4.66 +\item[System time.] 4.67 + 4.68 + This is a 64-bit counter which holds the number of nanoseconds that 4.69 + have elapsed since system boot. 4.70 + 4.71 +\item[Wall clock time.] 4.72 + 4.73 + This is the time of day in a Unix-style {\tt struct timeval} 4.74 + (seconds and microseconds since 1 January 1970, adjusted by leap 4.75 + seconds). An NTP client hosted by {\it domain 0} can keep this 4.76 + value accurate. 4.77 + 4.78 +\item[Domain virtual time.] 
4.79 + 4.80 + This progresses at the same pace as system time, but only while a 4.81 + domain is executing --- it stops while a domain is de-scheduled. 4.82 + Therefore the share of the CPU that a domain receives is indicated 4.83 + by the rate at which its virtual time increases. 4.84 + 4.85 +\end{description} 4.86 + 4.87 + 4.88 +Xen exports timestamps for system time and wall-clock time to guest 4.89 +operating systems through a shared page of memory. Xen also provides 4.90 +the cycle counter time at the instant the timestamps were calculated, 4.91 +and the CPU frequency in Hertz. This allows the guest to extrapolate 4.92 +system and wall-clock times accurately based on the current cycle 4.93 +counter time. 4.94 + 4.95 +Since all time stamps need to be updated and read \emph{atomically}, 4.96 +two version numbers are also stored in the shared info page. The first 4.97 +is incremented prior to an update, while the second is only 4.98 +incremented afterwards. Thus a guest can be sure that it read a 4.99 +consistent state by checking that the two version numbers are equal. 4.100 + 4.101 +Xen includes a periodic ticker which sends a timer event to the 4.102 +currently executing domain every 10ms. The Xen scheduler also sends a 4.103 +timer event whenever a domain is scheduled; this allows the guest OS 4.104 +to adjust for the time that has passed while it has been inactive. In 4.105 +addition, Xen allows each domain to request that it receive a timer 4.106 +event sent at a specified system time by using the {\tt 4.107 + set\_timer\_op()} hypercall. Guest OSes may use this timer to 4.108 +implement timeout values when they block. 4.109 + 4.110 + 4.111 + 4.112 +%% % akw: demoting this to a section -- not sure if there is any point 4.113 +%% % though, maybe just remove it. 4.114 + 4.115 +\section{Xen CPU Scheduling} 4.116 + 4.117 +Xen offers a uniform API for CPU schedulers. It is possible to choose 4.118 +from a number of schedulers at boot and it should be easy to add more. 4.119 +The BVT, Atropos and Round Robin schedulers are part of the normal Xen 4.120 +distribution. BVT provides proportional fair shares of the CPU to the 4.121 +running domains. Atropos can be used to reserve absolute shares of 4.122 +the CPU for each domain. Round-robin is provided as an example of 4.123 +Xen's internal scheduler API. 4.124 + 4.125 +\paragraph*{Note: SMP host support} 4.126 +Xen has always supported SMP host systems. Domains are statically 4.127 +assigned to CPUs, either at creation time or when manually pinning to 4.128 +a particular CPU. The current schedulers then run locally on each CPU 4.129 +to decide which of the assigned domains should be run there. The 4.130 +user-level control software can be used to perform coarse-grain 4.131 +load-balancing between CPUs. 4.132 + 4.133 + 4.134 +%% More information on the characteristics and use of these schedulers 4.135 +%% is available in {\tt Sched-HOWTO.txt}. 4.136 + 4.137 + 4.138 +\section{Privileged operations} 4.139 + 4.140 +Xen exports an extended interface to privileged domains (viz.\ {\it 4.141 + Domain 0}). This allows such domains to build and boot other domains 4.142 +on the server, and provides control interfaces for managing 4.143 +scheduling, memory, networking, and block devices.
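To make the version-number protocol above concrete, the sketch below shows how a guest might take a consistent snapshot of the shared time information and extrapolate system time from the cycle counter. The structure layout and field names here are illustrative assumptions, not Xen's actual shared-info ABI; the public headers hold the real definitions.

\begin{verbatim}
#include <stdint.h>

struct time_info {                /* hypothetical layout */
    uint32_t version_pre;         /* bumped before an update      */
    uint32_t version_post;        /* bumped after an update       */
    uint64_t system_time_ns;      /* nanoseconds since boot       */
    uint64_t tsc_at_update;       /* cycle counter at update time */
    uint64_t cpu_freq_hz;         /* CPU frequency in Hertz       */
};

/* Retry until both version numbers agree: equal values mean no
 * update was in flight while the fields were being read. */
static uint64_t read_system_time(volatile struct time_info *t,
                                 uint64_t (*read_tsc)(void))
{
    uint32_t v1, v2;
    uint64_t base, tsc0, freq;

    do {
        v1   = t->version_pre;
        base = t->system_time_ns;
        tsc0 = t->tsc_at_update;
        freq = t->cpu_freq_hz;
        v2   = t->version_post;
    } while (v1 != v2);           /* unequal => torn read, retry */

    /* Extrapolate from the cycle counter, as described above. */
    return base + ((read_tsc() - tsc0) * 1000000000ULL) / freq;
}
\end{verbatim}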
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 5.2 +++ b/docs/src/interface/debugging.tex Tue Sep 20 09:43:46 2005 +0000 5.3 @@ -0,0 +1,62 @@ 5.4 +\chapter{Debugging} 5.5 + 5.6 +Xen provides tools for debugging both Xen and guest OSes. Currently, the 5.7 +Pervasive Debugger provides a GDB stub, which provides facilities for symbolic 5.8 +debugging of Xen itself and of OS kernels running on top of Xen. The Trace 5.9 +Buffer provides a lightweight means to log data about Xen's internal state and 5.10 +behaviour at runtime, for later analysis. 5.11 + 5.12 +\section{Pervasive Debugger} 5.13 + 5.14 +Information on using the pervasive debugger is available in pdb.txt. 5.15 + 5.16 + 5.17 +\section{Trace Buffer} 5.18 + 5.19 +The trace buffer provides a means to observe Xen's operation from domain 0. 5.20 +Trace events, inserted at key points in Xen's code, record data that can be 5.21 +read by the {\tt xentrace} tool. Recording these events has a low overhead 5.22 +and hence the trace buffer may be useful for debugging timing-sensitive 5.23 +behaviours. 5.24 + 5.25 +\subsection{Internal API} 5.26 + 5.27 +To use the trace buffer functionality from within Xen, you must {\tt \#include 5.28 +<xen/trace.h>}, which contains definitions related to the trace buffer. Trace 5.29 +events are inserted into the buffer using the {\tt TRACE\_xD} ({\tt x} = 0, 1, 5.30 +2, 3, 4 or 5) macros. These all take an event number, plus {\tt x} additional 5.31 +(32-bit) data as their arguments. For trace buffer-enabled builds of Xen these 5.32 +will insert the event ID and data into the trace buffer, along with the current 5.33 +value of the CPU cycle-counter. For builds without the trace buffer enabled, 5.34 +the macros expand to no-ops and thus can be left in place without incurring 5.35 +overheads. 5.36 + 5.37 +\subsection{Trace-enabled builds} 5.38 + 5.39 +By default, the trace buffer is enabled only in debug builds (i.e. {\tt NDEBUG} 5.40 +is not defined). It can be enabled separately by defining {\tt TRACE\_BUFFER}, 5.41 +either in {\tt <xen/config.h>} or on the gcc command line. 5.42 + 5.43 +The size (in pages) of the per-CPU trace buffers can be specified using the 5.44 +{\tt tbuf\_size=n } boot parameter to Xen. If the size is set to 0, the trace 5.45 +buffers will be disabled. 5.46 + 5.47 +\subsection{Dumping trace data} 5.48 + 5.49 +When running a trace buffer build of Xen, trace data are written continuously 5.50 +into the buffer data areas, with newer data overwriting older data. This data 5.51 +can be captured using the {\tt xentrace} program in domain 0. 5.52 + 5.53 +The {\tt xentrace} tool uses {\tt /dev/mem} in domain 0 to map the trace 5.54 +buffers into its address space. It then periodically polls all the buffers for 5.55 +new data, dumping out any new records from each buffer in turn. As a result, 5.56 +for machines with multiple (logical) CPUs, the trace buffer output will not be 5.57 +in overall chronological order. 5.58 + 5.59 +The output from {\tt xentrace} can be post-processed using {\tt 5.60 +xentrace\_cpusplit} (used to split trace data out into per-cpu log files) and 5.61 +{\tt xentrace\_format} (used to pretty-print trace data). For the predefined 5.62 +trace points, there is an example format file in {\tt tools/xentrace/formats }. 5.63 + 5.64 +For more information, see the manual pages for {\tt xentrace}, {\tt 5.65 +xentrace\_format} and {\tt xentrace\_cpusplit}.
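As a concrete illustration of the internal API just described, the fragment below adds a two-word trace event to a hypothetical piece of Xen code. The event number {\tt TRC\_MY\_OP} and the surrounding function are invented for the example; only the {\tt TRACE\_2D} usage pattern is taken from the text above.

\begin{verbatim}
#include <xen/trace.h>

#define TRC_MY_OP 0x0001   /* hypothetical event number */

void my_operation(unsigned int domid, unsigned long pfn)
{
    /* Records the event ID, two 32-bit data words and the current
     * cycle counter; compiles to a no-op in builds without the
     * trace buffer enabled. */
    TRACE_2D(TRC_MY_OP, domid, pfn);

    /* ... the operation being traced ... */
}
\end{verbatim}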
6.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 6.2 +++ b/docs/src/interface/devices.tex Tue Sep 20 09:43:46 2005 +0000 6.3 @@ -0,0 +1,178 @@ 6.4 +\chapter{Devices} 6.5 +\label{c:devices} 6.6 + 6.7 +Devices such as network and disk are exported to guests using a split 6.8 +device driver. The device driver domain, which accesses the physical 6.9 +device directly, also runs a \emph{backend} driver, serving requests to 6.10 +that device from guests. Each guest will use a simple \emph{frontend} 6.11 +driver to access the backend. Communication between these domains is 6.12 +composed of two parts: First, data is placed onto a shared memory page 6.13 +between the domains. Second, an event channel between the two domains 6.14 +is used to pass notification that data is outstanding. This 6.15 +separation of notification from data transfer allows message batching, 6.16 +and results in very efficient device access. 6.17 + 6.18 +Event channels are used extensively in device virtualization; each 6.19 +domain has a number of end-points or \emph{ports} each of which may be 6.20 +bound to one of the following \emph{event sources}: 6.21 +\begin{itemize} 6.22 + \item a physical interrupt from a real device, 6.23 + \item a virtual interrupt (callback) from Xen, or 6.24 + \item a signal from another domain 6.25 +\end{itemize} 6.26 + 6.27 +Events are lightweight and do not carry much information beyond the 6.28 +source of the notification. Hence when performing bulk data transfer, 6.29 +events are typically used as synchronization primitives over a shared 6.30 +memory transport. Event channels are managed via the {\tt 6.31 + event\_channel\_op()} hypercall; for more details see 6.32 +Section~\ref{s:idc}. 6.33 + 6.34 +This chapter focuses on some individual device interfaces available to 6.35 +Xen guests. 6.36 + 6.37 + 6.38 +\section{Network I/O} 6.39 + 6.40 +Virtual network device services are provided by shared memory 6.41 +communication with a backend domain. From the point of view of other 6.42 +domains, the backend may be viewed as a virtual ethernet switch 6.43 +element with each domain having one or more virtual network interfaces 6.44 +connected to it. 6.45 + 6.46 +\subsection{Backend Packet Handling} 6.47 + 6.48 +The backend driver is responsible for a variety of actions relating to 6.49 +the transmission and reception of packets from the physical device. 6.50 +With regard to transmission, the backend performs these key actions: 6.51 + 6.52 +\begin{itemize} 6.53 +\item {\bf Validation:} To ensure that domains do not attempt to 6.54 + generate invalid (e.g. spoofed) traffic, the backend driver may 6.55 + validate headers ensuring that source MAC and IP addresses match the 6.56 + interface that they have been sent from. 6.57 + 6.58 + Validation functions can be configured using standard firewall rules 6.59 + ({\small{\tt iptables}} in the case of Linux). 6.60 + 6.61 +\item {\bf Scheduling:} Since a number of domains can share a single 6.62 + physical network interface, the backend must mediate access when 6.63 + several domains each have packets queued for transmission. This 6.64 + general scheduling function subsumes basic shaping or rate-limiting 6.65 + schemes. 6.66 + 6.67 +\item {\bf Logging and Accounting:} The backend domain can be 6.68 + configured with classifier rules that control how packets are 6.69 + accounted or logged. For example, log messages might be generated 6.70 + whenever a domain attempts to send a TCP packet containing a SYN.
6.71 +\end{itemize} 6.72 + 6.73 +On receipt of incoming packets, the backend acts as a simple 6.74 +demultiplexer: Packets are passed to the appropriate virtual interface 6.75 +after any necessary logging and accounting have been carried out. 6.76 + 6.77 +\subsection{Data Transfer} 6.78 + 6.79 +Each virtual interface uses two ``descriptor rings'', one for 6.80 +transmit, the other for receive. Each descriptor identifies a block 6.81 +of contiguous physical memory allocated to the domain. 6.82 + 6.83 +The transmit ring carries packets to transmit from the guest to the 6.84 +backend domain. The return path of the transmit ring carries messages 6.85 +indicating that the contents have been physically transmitted and the 6.86 +backend no longer requires the associated pages of memory. 6.87 + 6.88 +To receive packets, the guest places descriptors of unused pages on 6.89 +the receive ring. The backend will return received packets by 6.90 +exchanging these pages in the domain's memory with new pages 6.91 +containing the received data, and passing back descriptors regarding 6.92 +the new packets on the ring. This zero-copy approach allows the 6.93 +backend to maintain a pool of free pages to receive packets into, and 6.94 +then deliver them to appropriate domains after examining their 6.95 +headers. 6.96 + 6.97 +% Real physical addresses are used throughout, with the domain 6.98 +% performing translation from pseudo-physical addresses if that is 6.99 +% necessary. 6.100 + 6.101 +If a domain does not keep its receive ring stocked with empty buffers 6.102 +then packets destined to it may be dropped. This provides some 6.103 +defence against receive livelock problems because an overloaded domain 6.104 +will cease to receive further data. Similarly, on the transmit path, 6.105 +it provides the application with feedback on the rate at which packets 6.106 +are able to leave the system. 6.107 + 6.108 +Flow control on rings is achieved by including a pair of producer 6.109 +indexes on the shared ring page. Each side will maintain a private 6.110 +consumer index indicating the next outstanding message. In this 6.111 +manner, the domains cooperate to divide the ring into two message 6.112 +lists, one in each direction. Notification is decoupled from the 6.113 +immediate placement of new messages on the ring; the event channel 6.114 +will be used to generate notification when {\em either} a certain 6.115 +number of outstanding messages are queued, {\em or} a specified number 6.116 +of nanoseconds have elapsed since the oldest message was placed on the 6.117 +ring. 6.118 + 6.119 +%% Not sure if my version is any better -- here is what was here 6.120 +%% before: Synchronization between the backend domain and the guest is 6.121 +%% achieved using counters held in shared memory that is accessible to 6.122 +%% both. Each ring has associated producer and consumer indices 6.123 +%% indicating the area in the ring that holds descriptors that contain 6.124 +%% data. After receiving {\it n} packets or {\t nanoseconds} after 6.125 +%% receiving the first packet, the hypervisor sends an event to the 6.126 +%% domain. 6.127 + 6.128 + 6.129 +\section{Block I/O} 6.130 + 6.131 +All guest OS disk access goes through the virtual block device VBD 6.132 +interface. This interface allows domains access to portions of block 6.133 +storage devices visible to the block backend device. The VBD 6.134 +interface is a split driver, similar to the network interface 6.135 +described above.
A single shared memory ring is used between the 6.136 +frontend and backend drivers, across which read and write messages are 6.137 +sent. 6.138 + 6.139 +Any block device accessible to the backend domain, including 6.140 +network-based block (iSCSI, *NBD, etc), loopback and LVM/MD devices, 6.141 +can be exported as a VBD. Each VBD is mapped to a device node in the 6.142 +guest, specified in the guest's startup configuration. 6.143 + 6.144 +Old (Xen 1.2) virtual disks are not supported under Xen 2.0, since 6.145 +similar functionality can be achieved using the more complete LVM 6.146 +system, which is already in widespread use. 6.147 + 6.148 +\subsection{Data Transfer} 6.149 + 6.150 +The single ring between the guest and the block backend supports three 6.151 +messages: 6.152 + 6.153 +\begin{description} 6.154 +\item [{\small {\tt PROBE}}:] Return a list of the VBDs available to 6.155 + this guest from the backend. The request includes a descriptor of a 6.156 + free page into which the reply will be written by the backend. 6.157 + 6.158 +\item [{\small {\tt READ}}:] Read data from the specified block 6.159 + device. The front end identifies the device and location to read 6.160 + from and attaches pages for the data to be copied to (typically via 6.161 + DMA from the device). The backend acknowledges completed read 6.162 + requests as they finish. 6.163 + 6.164 +\item [{\small {\tt WRITE}}:] Write data to the specified block 6.165 + device. This functions essentially as {\small {\tt READ}}, except 6.166 + that the data moves to the device instead of from it. 6.167 +\end{description} 6.168 + 6.169 +%% um... some old text: In overview, the same style of descriptor-ring 6.170 +%% that is used for network packets is used here. Each domain has one 6.171 +%% ring that carries operation requests to the hypervisor and carries 6.172 +%% the results back again. 6.173 + 6.174 +%% Rather than copying data, the backend simply maps the domain's 6.175 +%% buffers in order to enable direct DMA to them. The act of mapping 6.176 +%% the buffers also increases the reference counts of the underlying 6.177 +%% pages, so that the unprivileged domain cannot try to return them to 6.178 +%% the hypervisor, install them as page tables, or any other unsafe 6.179 +%% behaviour. 6.180 +%% 6.181 +%% % block API here
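The producer/consumer scheme used on these rings can be sketched as follows. The structure and helper below are illustrative assumptions (the real ring layouts live in Xen's public headers); the point is that each side publishes only a producer index on the shared page and keeps its consumer index private.

\begin{verbatim}
#include <stdint.h>

#define RING_SIZE 32              /* illustrative power of two */

struct shared_ring {              /* hypothetical layout */
    uint32_t req_prod;            /* written by the frontend */
    uint32_t rsp_prod;            /* written by the backend  */
    /* ... arrays of request/response descriptors ... */
};

/* Backend: consume any new requests. 'req_cons' is the backend's
 * private consumer index and never appears on the shared page. */
static void backend_drain(volatile struct shared_ring *ring,
                          uint32_t *req_cons)
{
    while (*req_cons != ring->req_prod) {
        unsigned int slot = *req_cons % RING_SIZE;
        /* ... process the request descriptor in 'slot' ... */
        (void)slot;
        (*req_cons)++;
    }
    /* An event-channel notification is raised separately, once
     * enough messages are queued or the timeout described above
     * has elapsed. */
}
\end{verbatim}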
7.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 7.2 +++ b/docs/src/interface/further_info.tex Tue Sep 20 09:43:46 2005 +0000 7.3 @@ -0,0 +1,49 @@ 7.4 +\chapter{Further Information} 7.5 + 7.6 +If you have questions that are not answered by this manual, the 7.7 +sources of information listed below may be of interest to you. Note 7.8 +that bug reports, suggestions and contributions related to the 7.9 +software (or the documentation) should be sent to the Xen developers' 7.10 +mailing list (address below). 7.11 + 7.12 + 7.13 +\section{Other documentation} 7.14 + 7.15 +If you are mainly interested in using (rather than developing for) 7.16 +Xen, the \emph{Xen Users' Manual} is distributed in the {\tt docs/} 7.17 +directory of the Xen source distribution. 7.18 + 7.19 +% Various HOWTOs are also available in {\tt docs/HOWTOS}. 7.20 + 7.21 + 7.22 +\section{Online references} 7.23 + 7.24 +The official Xen web site is found at: 7.25 +\begin{quote} 7.26 +{\tt http://www.cl.cam.ac.uk/Research/SRG/netos/xen/} 7.27 +\end{quote} 7.28 + 7.29 +This contains links to the latest versions of all on-line 7.30 +documentation. 7.31 + 7.32 + 7.33 +\section{Mailing lists} 7.34 + 7.35 +There are currently four official Xen mailing lists: 7.36 + 7.37 +\begin{description} 7.38 +\item[xen-devel@lists.xensource.com] Used for development 7.39 + discussions and bug reports. Subscribe at: \\ 7.40 + {\small {\tt http://lists.xensource.com/xen-devel}} 7.41 +\item[xen-users@lists.xensource.com] Used for installation and usage 7.42 + discussions and requests for help. Subscribe at: \\ 7.43 + {\small {\tt http://lists.xensource.com/xen-users}} 7.44 +\item[xen-announce@lists.xensource.com] Used for announcements only. 7.45 + Subscribe at: \\ 7.46 + {\small {\tt http://lists.xensource.com/xen-announce}} 7.47 +\item[xen-changelog@lists.xensource.com] Changelog feed 7.48 + from the unstable and 2.0 trees - developer oriented. Subscribe at: \\ 7.49 + {\small {\tt http://lists.xensource.com/xen-changelog}} 7.50 +\end{description} 7.51 + 7.52 +Of these, xen-devel is the most active.
8.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 8.2 +++ b/docs/src/interface/hypercalls.tex Tue Sep 20 09:43:46 2005 +0000 8.3 @@ -0,0 +1,524 @@ 8.4 + 8.5 +\newcommand{\hypercall}[1]{\vspace{2mm}{\sf #1}} 8.6 + 8.7 +\chapter{Xen Hypercalls} 8.8 +\label{a:hypercalls} 8.9 + 8.10 +Hypercalls represent the procedural interface to Xen; this appendix 8.11 +categorizes and describes the current set of hypercalls. 8.12 + 8.13 +\section{Invoking Hypercalls} 8.14 + 8.15 +Hypercalls are invoked in a manner analogous to system calls in a 8.16 +conventional operating system; a software interrupt is issued which 8.17 +vectors to an entry point within Xen. On x86\_32 machines the 8.18 +instruction required is {\tt int \$0x82}; the (real) IDT is set up so 8.19 +that this may only be issued from within ring 1. The particular 8.20 +hypercall to be invoked is contained in {\tt EAX} --- a list 8.21 +mapping these values to symbolic hypercall names can be found 8.22 +in {\tt xen/include/public/xen.h}. 8.23 + 8.24 +On some occasions a set of hypercalls will be required to carry 8.25 +out a higher-level function; a good example is when a guest 8.26 +operating system wishes to context switch to a new process which 8.27 +requires updating various privileged CPU state. As an optimization 8.28 +for these cases, there is a generic mechanism to issue a set of 8.29 +hypercalls as a batch: 8.30 + 8.31 +\begin{quote} 8.32 +\hypercall{multicall(void *call\_list, int nr\_calls)} 8.33 + 8.34 +Execute a series of hypervisor calls; {\tt nr\_calls} is the length of 8.35 +the array of {\tt multicall\_entry\_t} structures pointed to by {\tt 8.36 +call\_list}. Each entry contains the hypercall operation code followed 8.37 +by up to 7 word-sized arguments. 8.38 +\end{quote} 8.39 + 8.40 +Note that multicalls are provided purely as an optimization; there is 8.41 +no requirement to use them when first porting a guest operating 8.42 +system. 8.43 + 8.44 + 8.45 +\section{Virtual CPU Setup} 8.46 + 8.47 +At start of day, a guest operating system needs to set up the virtual 8.48 +CPU it is executing on. This includes installing vectors for the 8.49 +virtual IDT so that the guest OS can handle interrupts, page faults, 8.50 +etc. However, the very first thing a guest OS must set up is a pair 8.51 +of hypervisor callbacks: these are the entry points which Xen will 8.52 +use when it wishes to notify the guest OS of an occurrence. 8.53 + 8.54 +\begin{quote} 8.55 +\hypercall{set\_callbacks(unsigned long event\_selector, unsigned long 8.56 + event\_address, unsigned long failsafe\_selector, unsigned long 8.57 + failsafe\_address) } 8.58 + 8.59 +Register the normal (``event'') and failsafe callbacks for 8.60 +event processing. In each case the code segment selector and 8.61 +address within that segment are provided. The selectors must 8.62 +have RPL 1; in XenLinux we simply use the kernel's CS for both 8.63 +{\tt event\_selector} and {\tt failsafe\_selector}. 8.64 + 8.65 +The value {\tt event\_address} specifies the address of the guest OS's 8.66 +event handling and dispatch routine; the {\tt failsafe\_address} 8.67 +specifies a separate entry point which is used only if a fault occurs 8.68 +when Xen attempts to use the normal callback.
8.69 +\end{quote} 8.70 + 8.71 + 8.72 +After installing the hypervisor callbacks, the guest OS can 8.73 +install a `virtual IDT' by using the following hypercall: 8.74 + 8.75 +\begin{quote} 8.76 +\hypercall{set\_trap\_table(trap\_info\_t *table)} 8.77 + 8.78 +Install one or more entries into the per-domain 8.79 +trap handler table (essentially a software version of the IDT). 8.80 +Each entry in the array pointed to by {\tt table} includes the 8.81 +exception vector number with the corresponding segment selector 8.82 +and entry point. Most guest OSes can use the same handlers on 8.83 +Xen as when running on the real hardware; an exception is the 8.84 +page fault handler (exception vector 14) where a modified 8.85 +stack-frame layout is used. 8.86 + 8.87 + 8.88 +\end{quote} 8.89 + 8.90 + 8.91 + 8.92 +\section{Scheduling and Timer} 8.93 + 8.94 +Domains are preemptively scheduled by Xen according to the 8.95 +parameters installed by domain 0 (see Section~\ref{s:dom0ops}). 8.96 +In addition, however, a domain may choose to explicitly 8.97 +control certain behavior with the following hypercall: 8.98 + 8.99 +\begin{quote} 8.100 +\hypercall{sched\_op(unsigned long op)} 8.101 + 8.102 +Request scheduling operation from hypervisor. The options are: {\it 8.103 +yield}, {\it block}, and {\it shutdown}. {\it yield} keeps the 8.104 +calling domain runnable but may cause a reschedule if other domains 8.105 +are runnable. {\it block} removes the calling domain from the run 8.106 +queue and causes it to sleep until an event is delivered to it. {\it 8.107 +shutdown} is used to end the domain's execution; the caller can 8.108 +additionally specify whether the domain should reboot, halt or 8.109 +suspend. 8.110 +\end{quote} 8.111 + 8.112 +To aid the implementation of a process scheduler within a guest OS, 8.113 +Xen provides a virtual programmable timer: 8.114 + 8.115 +\begin{quote} 8.116 +\hypercall{set\_timer\_op(uint64\_t timeout)} 8.117 + 8.118 +Request a timer event to be sent at the specified system time (time 8.119 +in nanoseconds since system boot). The hypercall actually passes the 8.120 +64-bit timeout value as a pair of 32-bit values. 8.121 + 8.122 +\end{quote} 8.123 + 8.124 +Note that calling {\tt set\_timer\_op()} prior to {\tt sched\_op} 8.125 +allows block-with-timeout semantics. 8.126 + 8.127 + 8.128 +\section{Page Table Management} 8.129 + 8.130 +Since guest operating systems have read-only access to their page 8.131 +tables, Xen must be involved when making any changes. The following 8.132 +multi-purpose hypercall can be used to modify page-table entries, 8.133 +update the machine-to-physical mapping table, flush the TLB, install 8.134 +a new page-table base pointer, and more. 8.135 + 8.136 +\begin{quote} 8.137 +\hypercall{mmu\_update(mmu\_update\_t *req, int count, int *success\_count)} 8.138 + 8.139 +Update the page table for the domain; a set of {\tt count} updates are 8.140 +submitted for processing in a batch, with {\tt success\_count} being 8.141 +updated to report the number of successful updates. 8.142 + 8.143 +Each element of {\tt req[]} contains a pointer (address) and value; 8.144 +the least significant 2 bits of the pointer are used to distinguish 8.145 +the type of update requested as follows: 8.146 +\begin{description} 8.147 + 8.148 +\item[\it MMU\_NORMAL\_PT\_UPDATE:] update a page directory entry or 8.149 +page table entry to the associated value; Xen will check that the 8.150 +update is safe, as described in Chapter~\ref{c:memory}.
8.151 + 8.152 +\item[\it MMU\_MACHPHYS\_UPDATE:] update an entry in the 8.153 + machine-to-physical table. The calling domain must own the machine 8.154 + page in question (or be privileged). 8.155 + 8.156 +\item[\it MMU\_EXTENDED\_COMMAND:] perform additional MMU operations. 8.157 +The set of additional MMU operations is considerable, and includes 8.158 +updating {\tt cr3} (or just re-installing it for a TLB flush), 8.159 +flushing the cache, installing a new LDT, or pinning \& unpinning 8.160 +page-table pages (to ensure their reference count doesn't drop to zero 8.161 +which would require a revalidation of all entries). 8.162 + 8.163 +Further extended commands are used to deal with granting and 8.164 +acquiring page ownership; see Section~\ref{s:idc}. 8.165 + 8.166 + 8.167 +\end{description} 8.168 + 8.169 +More details on the precise format of all commands can be 8.170 +found in {\tt xen/include/public/xen.h}. 8.171 + 8.172 + 8.173 +\end{quote} 8.174 + 8.175 +Explicitly updating batches of page table entries is extremely 8.176 +efficient, but can require a number of alterations to the guest 8.177 +OS. Using the writable page table mode (Chapter~\ref{c:memory}) is 8.178 +recommended for new OS ports. 8.179 + 8.180 +Regardless of which page table update mode is being used, however, 8.181 +there are some occasions (notably handling a demand page fault) where 8.182 +a guest OS will wish to modify exactly one PTE rather than a 8.183 +batch. This is catered for by the following: 8.184 + 8.185 +\begin{quote} 8.186 +\hypercall{update\_va\_mapping(unsigned long page\_nr, unsigned long 8.187 +val, \\ unsigned long flags)} 8.188 + 8.189 +Update the currently installed PTE for the page {\tt page\_nr} to 8.190 +{\tt val}. As with {\tt mmu\_update()}, Xen checks the modification 8.191 +is safe before applying it. The {\tt flags} determine which kind 8.192 +of TLB flush, if any, should follow the update. 8.193 + 8.194 +\end{quote} 8.195 + 8.196 +Finally, sufficiently privileged domains may occasionally wish to manipulate 8.197 +the pages of others: 8.198 +\begin{quote} 8.199 + 8.200 +\hypercall{update\_va\_mapping\_otherdomain(unsigned long page\_nr, 8.201 +unsigned long val, unsigned long flags, uint16\_t domid)} 8.202 + 8.203 +Identical to {\tt update\_va\_mapping()} save that the pages being 8.204 +mapped must belong to the domain {\tt domid}. 8.205 + 8.206 +\end{quote} 8.207 + 8.208 +This privileged operation is currently used by backend virtual device 8.209 +drivers to safely map pages containing I/O data. 8.210 + 8.211 + 8.212 + 8.213 +\section{Segmentation Support} 8.214 + 8.215 +Xen allows guest OSes to install a custom GDT if they require it; 8.216 +this is context switched transparently whenever a domain is 8.217 +[de]scheduled. The following hypercall is effectively a 8.218 +`safe' version of {\tt lgdt}: 8.219 + 8.220 +\begin{quote} 8.221 +\hypercall{set\_gdt(unsigned long *frame\_list, int entries)} 8.222 + 8.223 +Install a global descriptor table for a domain; {\tt frame\_list} is 8.224 +an array of up to 16 machine page frames within which the GDT resides, 8.225 +with {\tt entries} being the actual number of descriptor-entry 8.226 +slots. All page frames must be mapped read-only within the guest's 8.227 +address space, and the table must be large enough to contain Xen's 8.228 +reserved entries (see {\tt xen/include/public/arch-x86\_32.h}). 
8.229 + 8.230 +\end{quote} 8.231 + 8.232 +Many guest OSes will also wish to install LDTs; this is achieved by 8.233 +using {\tt mmu\_update()} with an extended command, passing the 8.234 +linear address of the LDT base along with the number of entries. No 8.235 +special safety checks are required; Xen needs to perform this task 8.236 +simply since {\tt lldt} requires CPL 0. 8.237 + 8.238 + 8.239 +Xen also allows guest operating systems to update just an 8.240 +individual segment descriptor in the GDT or LDT: 8.241 + 8.242 +\begin{quote} 8.243 +\hypercall{update\_descriptor(unsigned long ma, unsigned long word1, 8.244 +unsigned long word2)} 8.245 + 8.246 +Update the GDT/LDT entry at machine address {\tt ma}; the new 8.247 +8-byte descriptor is stored in {\tt word1} and {\tt word2}. 8.248 +Xen performs a number of checks to ensure the descriptor is 8.249 +valid. 8.250 + 8.251 +\end{quote} 8.252 + 8.253 +Guest OSes can use the above in place of context switching entire 8.254 +LDTs (or the GDT) when the number of changing descriptors is small. 8.255 + 8.256 +\section{Context Switching} 8.257 + 8.258 +When a guest OS wishes to context switch between two processes, 8.259 +it can use the page table and segmentation hypercalls described 8.260 +above to perform the bulk of the privileged work. In addition, 8.261 +however, it will need to invoke Xen to switch the kernel (ring 1) 8.262 +stack pointer: 8.263 + 8.264 +\begin{quote} 8.265 +\hypercall{stack\_switch(unsigned long ss, unsigned long esp)} 8.266 + 8.267 +Request kernel stack switch from hypervisor; {\tt ss} is the new 8.268 +stack segment, while {\tt esp} is the new stack pointer. 8.269 + 8.270 +\end{quote} 8.271 + 8.272 +A final useful hypercall for context switching allows ``lazy'' 8.273 +save and restore of floating point state: 8.274 + 8.275 +\begin{quote} 8.276 +\hypercall{fpu\_taskswitch(void)} 8.277 + 8.278 +This call instructs Xen to set the {\tt TS} bit in the {\tt cr0} 8.279 +control register; this means that the next attempt to use floating 8.280 +point will cause a trap which the guest OS can catch. Typically it will 8.281 +then save/restore the FP state, and clear the {\tt TS} bit. 8.282 +\end{quote} 8.283 + 8.284 +This is provided as an optimization only; guest OSes can also choose 8.285 +to save and restore FP state on all context switches for simplicity. 8.286 + 8.287 + 8.288 +\section{Physical Memory Management} 8.289 + 8.290 +As mentioned previously, each domain has a maximum and current 8.291 +memory allocation. The maximum allocation, set at domain creation 8.292 +time, cannot be modified. However, a domain can choose to reduce 8.293 +and subsequently grow its current allocation by using the 8.294 +following call: 8.295 + 8.296 +\begin{quote} 8.297 +\hypercall{dom\_mem\_op(unsigned int op, unsigned long *extent\_list, 8.298 + unsigned long nr\_extents, unsigned int extent\_order)} 8.299 + 8.300 +Increase or decrease current memory allocation (as determined by 8.301 +the value of {\tt op}). Each invocation provides a list of 8.302 +extents each of which is $2^s$ pages in size, 8.303 +where $s$ is the value of {\tt extent\_order}. 8.304 + 8.305 +\end{quote} 8.306 + 8.307 +In addition to simply reducing or increasing the current memory 8.308 +allocation via a `balloon driver', this call is also useful for 8.309 +obtaining contiguous regions of machine memory when required (e.g. 8.310 +for certain PCI devices, or if using superpages).
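A balloon driver's use of this call might look like the sketch below. The {\tt HYPERVISOR\_dom\_mem\_op()} wrapper and the operation constant's value are assumptions made for the example (guest OSes conventionally wrap each hypercall); the real names and values live in the public headers.

\begin{verbatim}
/* Assumed wrapper around the dom_mem_op hypercall. */
extern long HYPERVISOR_dom_mem_op(unsigned int op,
                                  unsigned long *extent_list,
                                  unsigned long nr_extents,
                                  unsigned int extent_order);

#define MEMOP_decrease_reservation 1   /* assumed value */

/* Release 'n' single-page extents back to Xen
 * (extent_order == 0, so each extent is 2^0 = 1 page). */
static long balloon_give_back(unsigned long *pfns, unsigned long n)
{
    return HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation,
                                 pfns, n, 0);
}
\end{verbatim}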
8.311 + 8.312 + 8.313 +\section{Inter-Domain Communication} 8.314 +\label{s:idc} 8.315 + 8.316 +Xen provides a simple asynchronous notification mechanism via 8.317 +\emph{event channels}. Each domain has a set of end-points (or 8.318 +\emph{ports}) which may be bound to an event source (e.g. a physical 8.319 +IRQ, a virtual IRQ, or a port in another domain). When a pair of 8.320 +end-points in two different domains are bound together, then a `send' 8.321 +operation on one will cause an event to be received by the destination 8.322 +domain. 8.323 + 8.324 +The control and use of event channels involves the following hypercall: 8.325 + 8.326 +\begin{quote} 8.327 +\hypercall{event\_channel\_op(evtchn\_op\_t *op)} 8.328 + 8.329 +Inter-domain event-channel management; {\tt op} is a discriminated 8.330 +union which allows the following 7 operations: 8.331 + 8.332 +\begin{description} 8.333 + 8.334 +\item[\it alloc\_unbound:] allocate a free (unbound) local 8.335 + port and prepare for connection from a specified domain. 8.336 +\item[\it bind\_virq:] bind a local port to a virtual 8.337 +IRQ; any particular VIRQ can be bound to at most one port per domain. 8.338 +\item[\it bind\_pirq:] bind a local port to a physical IRQ; 8.339 +once more, a given pIRQ can be bound to at most one port per 8.340 +domain. Furthermore the calling domain must be sufficiently 8.341 +privileged. 8.342 +\item[\it bind\_interdomain:] construct an interdomain event 8.343 +channel; in general, the target domain must have previously allocated 8.344 +an unbound port for this channel, although this can be bypassed by 8.345 +privileged domains during domain setup. 8.346 +\item[\it close:] close an interdomain event channel. 8.347 +\item[\it send:] send an event to the remote end of an 8.348 +interdomain event channel. 8.349 +\item[\it status:] determine the current status of a local port. 8.350 +\end{description} 8.351 + 8.352 +For more details see 8.353 +{\tt xen/include/public/event\_channel.h}. 8.354 + 8.355 +\end{quote} 8.356 + 8.357 +Event channels are the fundamental communication primitive between 8.358 +Xen domains and seamlessly support SMP. However they provide little 8.359 +bandwidth for communication {\sl per se}, and hence are typically 8.360 +married with a piece of shared memory to produce effective and 8.361 +high-performance inter-domain communication. 8.362 + 8.363 +Safe sharing of memory pages between guest OSes is carried out by 8.364 +granting access on a per-page basis to individual domains. This is 8.365 +achieved by using the {\tt grant\_table\_op()} hypercall. 8.366 + 8.367 +\begin{quote} 8.368 +\hypercall{grant\_table\_op(unsigned int cmd, void *uop, unsigned int count)} 8.369 + 8.370 +Grant or remove access to a particular page to a particular domain. 8.371 + 8.372 +\end{quote} 8.373 + 8.374 +This is not currently in widespread use by guest operating systems, but 8.375 +we intend to integrate support more fully in the near future. 8.376 + 8.377 +\section{PCI Configuration} 8.378 + 8.379 +Domains with physical device access (i.e.\ driver domains) receive 8.380 +limited access to certain PCI devices (bus address space and 8.381 +interrupts). However many guest operating systems attempt to 8.382 +determine the PCI configuration by directly accessing the PCI BIOS, 8.383 +which cannot be allowed for safety.
8.384 + 8.385 +Instead, Xen provides the following hypercall: 8.386 + 8.387 +\begin{quote} 8.388 +\hypercall{physdev\_op(void *physdev\_op)} 8.389 + 8.390 +Perform a PCI configuration operation; depending on the value 8.391 +of {\tt physdev\_op} this can be a PCI config read, a PCI config 8.392 +write, or a small number of other queries. 8.393 + 8.394 +\end{quote} 8.395 + 8.396 + 8.397 +For examples of using {\tt physdev\_op()}, see the 8.398 +Xen-specific PCI code in the linux sparse tree. 8.399 + 8.400 +\section{Administrative Operations} 8.401 +\label{s:dom0ops} 8.402 + 8.403 +A large number of control operations are available to a sufficiently 8.404 +privileged domain (typically domain 0). These allow the creation and 8.405 +management of new domains, for example. A complete list is given 8.406 +below: for more details on any or all of these, please see 8.407 +{\tt xen/include/public/dom0\_ops.h}. 8.408 + 8.409 + 8.410 +\begin{quote} 8.411 +\hypercall{dom0\_op(dom0\_op\_t *op)} 8.412 + 8.413 +Administrative domain operations for domain management. The options are: 8.414 + 8.415 +\begin{description} 8.416 +\item [\it DOM0\_CREATEDOMAIN:] create a new domain 8.417 + 8.418 +\item [\it DOM0\_PAUSEDOMAIN:] remove a domain from the scheduler run 8.419 +queue. 8.420 + 8.421 +\item [\it DOM0\_UNPAUSEDOMAIN:] mark a paused domain as schedulable 8.422 + once again. 8.423 + 8.424 +\item [\it DOM0\_DESTROYDOMAIN:] deallocate all resources associated 8.425 +with a domain 8.426 + 8.427 +\item [\it DOM0\_GETMEMLIST:] get list of pages used by the domain 8.428 + 8.429 +\item [\it DOM0\_SCHEDCTL:] 8.430 + 8.431 +\item [\it DOM0\_ADJUSTDOM:] adjust scheduling priorities for domain 8.432 + 8.433 +\item [\it DOM0\_BUILDDOMAIN:] do final guest OS setup for domain 8.434 + 8.435 +\item [\it DOM0\_GETDOMAINFO:] get statistics about the domain 8.436 + 8.437 +\item [\it DOM0\_GETPAGEFRAMEINFO:] 8.438 + 8.439 +\item [\it DOM0\_GETPAGEFRAMEINFO2:] 8.440 + 8.441 +\item [\it DOM0\_IOPL:] set I/O privilege level 8.442 + 8.443 +\item [\it DOM0\_MSR:] read or write model specific registers 8.444 + 8.445 +\item [\it DOM0\_DEBUG:] interactively invoke the debugger 8.446 + 8.447 +\item [\it DOM0\_SETTIME:] set system time 8.448 + 8.449 +\item [\it DOM0\_READCONSOLE:] read console content from hypervisor buffer ring 8.450 + 8.451 +\item [\it DOM0\_PINCPUDOMAIN:] pin domain to a particular CPU 8.452 + 8.453 +\item [\it DOM0\_GETTBUFS:] get information about the size and location of 8.454 + the trace buffers (only on trace-buffer enabled builds) 8.455 + 8.456 +\item [\it DOM0\_PHYSINFO:] get information about the host machine 8.457 + 8.458 +\item [\it DOM0\_PCIDEV\_ACCESS:] modify PCI device access permissions 8.459 + 8.460 +\item [\it DOM0\_SCHED\_ID:] get the ID of the current Xen scheduler 8.461 + 8.462 +\item [\it DOM0\_SHADOW\_CONTROL:] switch between shadow page-table modes 8.463 + 8.464 +\item [\it DOM0\_SETDOMAININITIALMEM:] set initial memory allocation of a domain 8.465 + 8.466 +\item [\it DOM0\_SETDOMAINMAXMEM:] set maximum memory allocation of a domain 8.467 + 8.468 +\item [\it DOM0\_SETDOMAINVMASSIST:] set domain VM assist options 8.469 +\end{description} 8.470 +\end{quote} 8.471 + 8.472 +Most of the above are best understood by looking at the code 8.473 +implementing them (in {\tt xen/common/dom0\_ops.c}) and in 8.474 +the user-space tools that use them (mostly in {\tt tools/libxc}).
8.475 + 8.476 +\section{Debugging Hypercalls} 8.477 + 8.478 +A few additional hypercalls are mainly useful for debugging: 8.479 + 8.480 +\begin{quote} 8.481 +\hypercall{console\_io(int cmd, int count, char *str)} 8.482 + 8.483 +Use Xen to interact with the console; operations are: 8.484 + 8.485 +{\it CONSOLEIO\_write}: Output {\tt count} characters from buffer {\tt str}. 8.486 + 8.487 +{\it CONSOLEIO\_read}: Input at most {\tt count} characters into buffer {\tt str}. 8.488 +\end{quote} 8.489 + 8.490 +A pair of hypercalls allows access to the underlying debug registers: 8.491 +\begin{quote} 8.492 +\hypercall{set\_debugreg(int reg, unsigned long value)} 8.493 + 8.494 +Set debug register {\tt reg} to {\tt value}. 8.495 + 8.496 +\hypercall{get\_debugreg(int reg)} 8.497 + 8.498 +Return the contents of the debug register {\tt reg}. 8.499 +\end{quote} 8.500 + 8.501 +And finally: 8.502 +\begin{quote} 8.503 +\hypercall{xen\_version(int cmd)} 8.504 + 8.505 +Request Xen version number. 8.506 +\end{quote} 8.507 + 8.508 +This is useful to ensure that user-space tools are in sync 8.509 +with the underlying hypervisor. 8.510 + 8.511 +\section{Deprecated Hypercalls} 8.512 + 8.513 +Xen is under constant development and refinement; as such there 8.514 +are plans to improve the way in which various pieces of functionality 8.515 +are exposed to guest OSes. 8.516 + 8.517 +\begin{quote} 8.518 +\hypercall{vm\_assist(unsigned int cmd, unsigned int type)} 8.519 + 8.520 +Toggle various memory management modes (in particular writable page 8.521 +tables and superpage support). 8.522 + 8.523 +\end{quote} 8.524 + 8.525 +This is likely to be replaced with mode values in the shared 8.526 +information page since this is more resilient for resumption 8.527 +after migration or checkpoint.
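To illustrate the batching style encouraged by the page-table interface earlier in this chapter, the sketch below queues several PTE writes and submits them in one {\tt mmu\_update()} call. The structure layout and type encoding follow the description in the text; treat the details, and the hypercall wrapper, as assumptions for the sketch and defer to {\tt xen/include/public/xen.h} for the authoritative definitions.

\begin{verbatim}
typedef struct {
    unsigned long ptr;  /* PTE machine address; low 2 bits = type */
    unsigned long val;  /* new PTE contents */
} mmu_update_t;

#define MMU_NORMAL_PT_UPDATE 0   /* assumed encoding */

extern int HYPERVISOR_mmu_update(mmu_update_t *req, int count,
                                 int *success_count);

/* Apply up to 16 PTE updates in a single batched hypercall. */
static int set_ptes(const unsigned long *pte_maddrs,
                    const unsigned long *vals, int n)
{
    mmu_update_t req[16];
    int i, ok = 0;

    if (n > 16)
        return -1;                 /* keep the sketch simple */
    for (i = 0; i < n; i++) {
        req[i].ptr = pte_maddrs[i] | MMU_NORMAL_PT_UPDATE;
        req[i].val = vals[i];
    }
    HYPERVISOR_mmu_update(req, n, &ok);
    return (ok == n) ? 0 : -1;     /* did every update succeed? */
}
\end{verbatim}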
9.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 9.2 +++ b/docs/src/interface/memory.tex Tue Sep 20 09:43:46 2005 +0000 9.3 @@ -0,0 +1,162 @@ 9.4 +\chapter{Memory} 9.5 +\label{c:memory} 9.6 + 9.7 +Xen is responsible for managing the allocation of physical memory to 9.8 +domains, and for ensuring safe use of the paging and segmentation 9.9 +hardware. 9.10 + 9.11 + 9.12 +\section{Memory Allocation} 9.13 + 9.14 +Xen resides within a small fixed portion of physical memory; it also 9.15 +reserves the top 64MB of every virtual address space. The remaining 9.16 +physical memory is available for allocation to domains at a page 9.17 +granularity. Xen tracks the ownership and use of each page, which 9.18 +allows it to enforce secure partitioning between domains. 9.19 + 9.20 +Each domain has a maximum and current physical memory allocation. A 9.21 +guest OS may run a `balloon driver' to dynamically adjust its current 9.22 +memory allocation up to its limit. 9.23 + 9.24 + 9.25 +%% XXX SMH: I use machine and physical in the next section (which is 9.26 +%% kinda required for consistency with code); wonder if this section 9.27 +%% should use same terms? 9.28 +%% 9.29 +%% Probably. 9.30 +%% 9.31 +%% Merging this and below section at some point prob makes sense. 9.32 + 9.33 +\section{Pseudo-Physical Memory} 9.34 + 9.35 +Since physical memory is allocated and freed on a page granularity, 9.36 +there is no guarantee that a domain will receive a contiguous stretch 9.37 +of physical memory. However, most operating systems do not have good 9.38 +support for operating in a fragmented physical address space. To aid 9.39 +porting such operating systems to run on top of Xen, we make a 9.40 +distinction between \emph{machine memory} and \emph{pseudo-physical 9.41 + memory}. 9.42 + 9.43 +Put simply, machine memory refers to the entire amount of memory 9.44 +installed in the machine, including that reserved by Xen, in use by 9.45 +various domains, or currently unallocated. We consider machine memory 9.46 +to comprise a set of 4K \emph{machine page frames} numbered 9.47 +consecutively starting from 0. Machine frame numbers mean the same 9.48 +within Xen or any domain. 9.49 + 9.50 +Pseudo-physical memory, on the other hand, is a per-domain 9.51 +abstraction. It allows a guest operating system to consider its memory 9.52 +allocation to consist of a contiguous range of physical page frames 9.53 +starting at physical frame 0, despite the fact that the underlying 9.54 +machine page frames may be sparsely allocated and in any order. 9.55 + 9.56 +To achieve this, Xen maintains a globally readable {\it 9.57 + machine-to-physical} table which records the mapping from machine 9.58 +page frames to pseudo-physical ones. In addition, each domain is 9.59 +supplied with a {\it physical-to-machine} table which performs the 9.60 +inverse mapping. Clearly the machine-to-physical table has size 9.61 +proportional to the amount of RAM installed in the machine, while each 9.62 +physical-to-machine table has size proportional to the memory 9.63 +allocation of the given domain. 9.64 + 9.65 +Architecture-dependent code in guest operating systems can then use 9.66 +the two tables to provide the abstraction of pseudo-physical memory. 9.67 +In general, only certain specialized parts of the operating system 9.68 +(such as page table management) need to understand the difference 9.69 +between machine and pseudo-physical addresses.
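In code, the two tables give a pair of trivial translation helpers of the kind a guest's architecture-dependent layer might provide. The table symbols below are placeholders for wherever Xen and the domain builder make the tables visible to the guest:

\begin{verbatim}
/* Globally readable, maintained by Xen. */
extern unsigned long *machine_to_phys;
/* Per-domain, supplied to the guest. */
extern unsigned long *phys_to_machine;

static inline unsigned long pfn_to_mfn(unsigned long pfn)
{
    return phys_to_machine[pfn];   /* pseudo-physical -> machine */
}

static inline unsigned long mfn_to_pfn(unsigned long mfn)
{
    return machine_to_phys[mfn];   /* machine -> pseudo-physical */
}
\end{verbatim}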
9.70 + 9.71 + 9.72 +\section{Page Table Updates} 9.73 + 9.74 +In the default mode of operation, Xen enforces read-only access to 9.75 +page tables and requires guest operating systems to explicitly request 9.76 +any modifications. Xen validates all such requests and only applies 9.77 +updates that it deems safe. This is necessary to prevent domains from 9.78 +adding arbitrary mappings to their page tables. 9.79 + 9.80 +To aid validation, Xen associates a type and reference count with each 9.81 +memory page. A page has one of the following mutually-exclusive types 9.82 +at any point in time: page directory ({\sf PD}), page table ({\sf 9.83 + PT}), local descriptor table ({\sf LDT}), global descriptor table 9.84 +({\sf GDT}), or writable ({\sf RW}). Note that a guest OS may always 9.85 +create readable mappings of its own memory regardless of its current 9.86 +type. 9.87 + 9.88 +%%% XXX: possibly explain more about ref count 'lifecycle' here? 9.89 +This mechanism is used to maintain the invariants required for safety; 9.90 +for example, a domain cannot have a writable mapping to any part of a 9.91 +page table as this would require the page concerned to simultaneously 9.92 +be of types {\sf PT} and {\sf RW}. 9.93 + 9.94 + 9.95 +% \section{Writable Page Tables} 9.96 + 9.97 +Xen also provides an alternative mode of operation in which guests 9.98 +have the illusion that their page tables are directly writable. Of 9.99 +course this is not really the case, since Xen must still validate 9.100 +modifications to ensure secure partitioning. To this end, Xen traps 9.101 +any write attempt to a memory page of type {\sf PT} (i.e., that is 9.102 +currently part of a page table). If such an access occurs, Xen 9.103 +temporarily allows write access to that page while at the same time 9.104 +\emph{disconnecting} it from the page table that is currently in use. 9.105 +This allows the guest to safely make updates to the page because the 9.106 +newly-updated entries cannot be used by the MMU until Xen revalidates 9.107 +and reconnects the page. Reconnection occurs automatically in a 9.108 +number of situations: for example, when the guest modifies a different 9.109 +page-table page, when the domain is preempted, or whenever the guest 9.110 +uses Xen's explicit page-table update interfaces. 9.111 + 9.112 +Finally, Xen also supports a form of \emph{shadow page tables} in 9.113 +which the guest OS uses an independent copy of page tables which are 9.114 +unknown to the hardware (i.e.\ which are never pointed to by {\tt 9.115 + cr3}). Instead Xen propagates changes made to the guest's tables to 9.116 +the real ones, and vice versa. This is useful for logging page writes 9.117 +(e.g.\ for live migration or checkpoint). A full version of the shadow 9.118 +page tables also allows guest OS porting with less effort. 9.119 + 9.120 + 9.121 +\section{Segment Descriptor Tables} 9.122 + 9.123 +On boot a guest is supplied with a default GDT, which does not reside 9.124 +within its own memory allocation. If the guest wishes to use other 9.125 +than the default `flat' ring-1 and ring-3 segments that this GDT 9.126 +provides, it must register a custom GDT and/or LDT with Xen, allocated 9.127 +from its own memory. Note that a number of GDT entries are reserved by 9.128 +Xen -- any custom GDT must also include sufficient space for these 9.129 +entries.
9.130 + 9.131 +For example, the following hypercall is used to specify a new GDT: 9.132 + 9.133 +\begin{quote} 9.134 + int {\bf set\_gdt}(unsigned long *{\em frame\_list}, int {\em 9.135 + entries}) 9.136 + 9.137 + \emph{frame\_list}: An array of up to 16 machine page frames within 9.138 + which the GDT resides. Any frame registered as a GDT frame may only 9.139 + be mapped read-only within the guest's address space (e.g., no 9.140 + writable mappings, no use as a page-table page, and so on). 9.141 + 9.142 + \emph{entries}: The number of descriptor-entry slots in the GDT. 9.143 + Note that the table must be large enough to contain Xen's reserved 9.144 + entries; thus we must have `{\em entries $>$ 9.145 + LAST\_RESERVED\_GDT\_ENTRY}\ '. Note also that, after registering 9.146 + the GDT, slots \emph{FIRST\_} through 9.147 + \emph{LAST\_RESERVED\_GDT\_ENTRY} are no longer usable by the guest 9.148 + and may be overwritten by Xen. 9.149 +\end{quote} 9.150 + 9.151 +The LDT is updated via the generic MMU update mechanism (i.e., via the 9.152 +{\tt mmu\_update()} hypercall). 9.153 + 9.154 +\section{Start of Day} 9.155 + 9.156 +The start-of-day environment for guest operating systems is rather 9.157 +different to that provided by the underlying hardware. In particular, 9.158 +the processor is already executing in protected mode with paging 9.159 +enabled. 9.160 + 9.161 +{\it Domain 0} is created and booted by Xen itself. For all subsequent 9.162 +domains, the analogue of the boot-loader is the {\it domain builder}, 9.163 +user-space software running in {\it domain 0}. The domain builder is 9.164 +responsible for building the initial page tables for a domain and 9.165 +loading its kernel image at the appropriate virtual address.
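As a usage illustration of the interface above, the sketch below registers a single-page custom GDT. The {\tt HYPERVISOR\_set\_gdt()} wrapper and the {\tt virt\_to\_machine()} helper are assumed names for the purposes of the example; only the {\tt set\_gdt()} semantics are taken from the description above.

\begin{verbatim}
/* Sketch: registering a custom GDT from a guest.  The wrapper and
 * helper names below are assumptions; the constraints in the
 * comments come from the set_gdt() description above. */
#define PAGE_SHIFT 12

extern unsigned long virt_to_machine(void *va);            /* assumed */
extern int HYPERVISOR_set_gdt(unsigned long *frames, int entries);

int register_custom_gdt(void *gdt_page, int entries)
{
    /* Machine frame number of the page-aligned GDT page; an array
     * of up to 16 frames may be passed. */
    unsigned long frame_list[1];
    frame_list[0] = virt_to_machine(gdt_page) >> PAGE_SHIFT;

    /* Xen rejects the request unless entries > LAST_RESERVED_GDT_ENTRY
     * and the frames are only ever mapped read-only by the guest. */
    return HYPERVISOR_set_gdt(frame_list, entries);
}
\end{verbatim}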
10.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 10.2 +++ b/docs/src/interface/scheduling.tex Tue Sep 20 09:43:46 2005 +0000 10.3 @@ -0,0 +1,268 @@ 10.4 +\chapter{Scheduling API} 10.5 + 10.6 +The scheduling API is used by both the schedulers described above and should 10.7 +also be used by any new schedulers. It provides a generic interface and also 10.8 +implements much of the ``boilerplate'' code. 10.9 + 10.10 +Schedulers conforming to this API are described by the following 10.11 +structure: 10.12 + 10.13 +\begin{verbatim} 10.14 +struct scheduler 10.15 +{ 10.16 + char *name; /* full name for this scheduler */ 10.17 + char *opt_name; /* option name for this scheduler */ 10.18 + unsigned int sched_id; /* ID for this scheduler */ 10.19 + 10.20 + int (*init_scheduler) (); 10.21 + int (*alloc_task) (struct task_struct *); 10.22 + void (*add_task) (struct task_struct *); 10.23 + void (*free_task) (struct task_struct *); 10.24 + void (*rem_task) (struct task_struct *); 10.25 + void (*wake_up) (struct task_struct *); 10.26 + void (*do_block) (struct task_struct *); 10.27 + task_slice_t (*do_schedule) (s_time_t); 10.28 + int (*control) (struct sched_ctl_cmd *); 10.29 + int (*adjdom) (struct task_struct *, 10.30 + struct sched_adjdom_cmd *); 10.31 + s32 (*reschedule) (struct task_struct *); 10.32 + void (*dump_settings) (void); 10.33 + void (*dump_cpu_state) (int); 10.34 + void (*dump_runq_el) (struct task_struct *); 10.35 +}; 10.36 +\end{verbatim} 10.37 + 10.38 +The only method that {\em must} be implemented is 10.39 +{\tt do\_schedule()}. However, if the {\tt wake\_up()} method is not 10.40 +implemented then waking tasks will not be put on the runqueue! 10.41 + 10.42 +The fields of the above structure are described in more detail below. 10.43 + 10.44 +\subsubsection{name} 10.45 + 10.46 +The name field should point to a descriptive ASCII string. 10.47 + 10.48 +\subsubsection{opt\_name} 10.49 + 10.50 +This field is the value of the {\tt sched=} boot-time option that will select 10.51 +this scheduler. 10.52 + 10.53 +\subsubsection{sched\_id} 10.54 + 10.55 +This is an integer that uniquely identifies this scheduler. There should be a 10.56 +macro corresponding to this scheduler ID in {\tt <xen/sched-if.h>}. 10.57 + 10.58 +\subsubsection{init\_scheduler} 10.59 + 10.60 +\paragraph*{Purpose} 10.61 + 10.62 +This is a function for performing any scheduler-specific initialisation. For 10.63 +instance, it might allocate memory for per-CPU scheduler data and initialise it 10.64 +appropriately. 10.65 + 10.66 +\paragraph*{Call environment} 10.67 + 10.68 +This function is called after the initialisation performed by the generic 10.69 +layer. The function is called exactly once, for the scheduler that has been 10.70 +selected. 10.71 + 10.72 +\paragraph*{Return values} 10.73 + 10.74 +This should return a negative value on failure --- this will cause an 10.75 +immediate panic and the system will fail to boot. 10.76 + 10.77 +\subsubsection{alloc\_task} 10.78 + 10.79 +\paragraph*{Purpose} 10.80 +Called when a {\tt task\_struct} is allocated by the generic scheduler 10.81 +layer. A particular scheduler implementation may use this method to 10.82 +allocate per-task data for this task. It may use the {\tt 10.83 +sched\_priv} pointer in the {\tt task\_struct} to point to this data.
10.84 + 10.85 +\paragraph*{Call environment} 10.86 +The generic layer guarantees that the {\tt sched\_priv} field will 10.87 +remain intact from the time this method is called until the task is 10.88 +deallocated (so long as the scheduler implementation does not change 10.89 +it explicitly!). 10.90 + 10.91 +\paragraph*{Return values} 10.92 +Negative on failure. 10.93 + 10.94 +\subsubsection{add\_task} 10.95 + 10.96 +\paragraph*{Purpose} 10.97 + 10.98 +Called when a task is initially added by the generic layer. 10.99 + 10.100 +\paragraph*{Call environment} 10.101 + 10.102 +The fields in the {\tt task\_struct} are now filled out and available for use. 10.103 +Schedulers should implement appropriate initialisation of any per-task private 10.104 +information in this method. 10.105 + 10.106 +\subsubsection{free\_task} 10.107 + 10.108 +\paragraph*{Purpose} 10.109 + 10.110 +Schedulers should free the space used by any associated private data 10.111 +structures. 10.112 + 10.113 +\paragraph*{Call environment} 10.114 + 10.115 +This is called when a {\tt task\_struct} is about to be deallocated. 10.116 +The generic layer will have done generic task removal operations and 10.117 +(if implemented) called the scheduler's {\tt rem\_task} method before 10.118 +this method is called. 10.119 + 10.120 +\subsubsection{rem\_task} 10.121 + 10.122 +\paragraph*{Purpose} 10.123 + 10.124 +This is called when a task is being removed from scheduling (but is 10.125 +not yet being freed). 10.126 + 10.127 +\subsubsection{wake\_up} 10.128 + 10.129 +\paragraph*{Purpose} 10.130 + 10.131 +Called when a task is woken up, this method should put the task on the runqueue 10.132 +(or do the scheduler-specific equivalent action). 10.133 + 10.134 +\paragraph*{Call environment} 10.135 + 10.136 +The task is already set to state RUNNING. 10.137 + 10.138 +\subsubsection{do\_block} 10.139 + 10.140 +\paragraph*{Purpose} 10.141 + 10.142 +This function is called when a task is blocked. This function should 10.143 +not remove the task from the runqueue. 10.144 + 10.145 +\paragraph*{Call environment} 10.146 + 10.147 +The EVENTS\_MASTER\_ENABLE\_BIT is already set and the task state changed to 10.148 +TASK\_INTERRUPTIBLE on entry to this method. A call to the {\tt 10.149 + do\_schedule} method will be made after this method returns, in 10.150 +order to select the next task to run. 10.151 + 10.152 +\subsubsection{do\_schedule} 10.153 + 10.154 +This method must be implemented. 10.155 + 10.156 +\paragraph*{Purpose} 10.157 + 10.158 +The method is called each time a new task must be chosen for scheduling on the 10.159 +current CPU. The current time is passed as the single argument (the current 10.160 +task can be found using the {\tt current} macro). 10.161 + 10.162 +This method should select the next task to run on this CPU and set its minimum 10.163 +time to run as well as returning the data described below. 10.164 + 10.165 +This method should also take the appropriate action if the previous 10.166 +task has blocked, e.g. removing it from the runqueue. 10.167 + 10.168 +\paragraph*{Call environment} 10.169 + 10.170 +The other fields in the {\tt task\_struct} are updated by the generic layer, 10.171 +which also performs all Xen-specific tasks and the actual task switch 10.172 +(unless the previous task has been chosen again). 10.173 + 10.174 +This method is called with the {\tt schedule\_lock} held for the current CPU 10.175 +and local interrupts disabled.
10.176 + 10.177 +\paragraph*{Return values} 10.178 + 10.179 +Must return a {\tt struct task\_slice} describing which task to run and how long 10.180 +to run it for (at maximum). 10.181 + 10.182 +\subsubsection{control} 10.183 + 10.184 +\paragraph*{Purpose} 10.185 + 10.186 +This method is called for global scheduler control operations. It takes a 10.187 +pointer to a {\tt struct sched\_ctl\_cmd}, which it should either 10.188 +source data from or populate with data, depending on the value of the 10.189 +{\tt direction} field. 10.190 + 10.191 +\paragraph*{Call environment} 10.192 + 10.193 +The generic layer guarantees that when this method is called, the 10.194 +caller selected the correct scheduler ID, hence the scheduler's 10.195 +implementation does not need to sanity-check these parts of the call. 10.196 + 10.197 +\paragraph*{Return values} 10.198 + 10.199 +This function should return the value to be passed back to user space, hence it 10.200 +should either be 0 or an appropriate errno value. 10.201 + 10.202 +\subsubsection{sched\_adjdom} 10.203 + 10.204 +\paragraph*{Purpose} 10.205 + 10.206 +This method is called to adjust the scheduling parameters of a particular 10.207 +domain, or to query their current values. The function should check 10.208 +the {\tt direction} field of the {\tt sched\_adjdom\_cmd} it receives in 10.209 +order to determine which of these operations is being performed. 10.210 + 10.211 +\paragraph*{Call environment} 10.212 + 10.213 +The generic layer guarantees that the caller has specified the correct 10.214 +control interface version and scheduler ID and that the supplied {\tt 10.215 +task\_struct} will not be deallocated during the call (hence it is not 10.216 +necessary to {\tt get\_task\_struct}). 10.217 + 10.218 +\paragraph*{Return values} 10.219 + 10.220 +This function should return the value to be passed back to user space, hence it 10.221 +should either be 0 or an appropriate errno value. 10.222 + 10.223 +\subsubsection{reschedule} 10.224 + 10.225 +\paragraph*{Purpose} 10.226 + 10.227 +This method is called to determine if a reschedule is required as a result of a 10.228 +particular task. 10.229 + 10.230 +\paragraph*{Call environment} 10.231 +The generic layer will cause a reschedule if the current domain is the idle 10.232 +task or if it has already exceeded its minimum time slice. The 10.233 +generic layer guarantees that the task passed is not currently running but is 10.234 +on the runqueue. 10.235 + 10.236 +\paragraph*{Return values} 10.237 + 10.238 +Should return a mask of CPUs to cause a reschedule on. 10.239 + 10.240 +\subsubsection{dump\_settings} 10.241 + 10.242 +\paragraph*{Purpose} 10.243 + 10.244 +If implemented, this should dump any private global settings for this 10.245 +scheduler to the console. 10.246 + 10.247 +\paragraph*{Call environment} 10.248 + 10.249 +This function is called with interrupts enabled. 10.250 + 10.251 +\subsubsection{dump\_cpu\_state} 10.252 + 10.253 +\paragraph*{Purpose} 10.254 + 10.255 +This method should dump any private settings for the specified CPU. 10.256 + 10.257 +\paragraph*{Call environment} 10.258 + 10.259 +This function is called with interrupts disabled and the {\tt schedule\_lock} 10.260 +for the specified CPU held. 10.261 + 10.262 +\subsubsection{dump\_runq\_el} 10.263 + 10.264 +\paragraph*{Purpose} 10.265 + 10.266 +This method should dump any private settings for the specified task.
10.267 + 10.268 +\paragraph*{Call environment} 10.269 + 10.270 +This function is called with interrupts disabled and the {\tt schedule\_lock} 10.271 +for the task's CPU held.
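To show how the pieces fit together, the following skeleton sketches a trivial scheduler conforming to the structure above. Only the mandatory {\tt do\_schedule()} method and {\tt wake\_up()} (without which woken tasks never reach the runqueue) are fleshed out; the {\tt SCHED\_TRIVIAL} macro, the {\tt MILLISECS()} helper and the {\tt task\_slice\_t} field names are assumptions for illustration, not the real declarations in {\tt <xen/sched-if.h>}.

\begin{verbatim}
/* Sketch: skeleton of a trivial FIFO scheduler conforming to
 * struct scheduler.  SCHED_TRIVIAL, MILLISECS() and the
 * task_slice_t field names are assumed for illustration. */

static struct task_struct *runq[64];  /* toy fixed-size FIFO runqueue */
static int runq_first, runq_last;     /* indices, modulo 64           */

static void trivial_wake_up(struct task_struct *p)
{
    /* Without this method, woken tasks would never be enqueued. */
    runq[runq_last] = p;
    runq_last = (runq_last + 1) % 64;
}

static task_slice_t trivial_do_schedule(s_time_t now)
{
    task_slice_t ret;

    /* Run the oldest woken task, else keep running `current'.  A
     * real policy must also dequeue a blocked previous task and
     * use proper per-CPU queues; schedule_lock is already held. */
    if (runq_first != runq_last) {
        ret.task = runq[runq_first];         /* assumed field name */
        runq_first = (runq_first + 1) % 64;
    } else {
        ret.task = current;
    }
    ret.time = MILLISECS(10);                /* assumed 10ms slice */
    return ret;
}

struct scheduler sched_trivial_def = {
    .name        = "Trivial FIFO Scheduler", /* descriptive name     */
    .opt_name    = "trivial",                /* sched=trivial option */
    .sched_id    = SCHED_TRIVIAL,            /* assumed macro        */
    .wake_up     = trivial_wake_up,
    .do_schedule = trivial_do_schedule,
    /* remaining methods are optional and left NULL */
};
\end{verbatim}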
11.1 --- a/docs/src/user.tex Tue Sep 20 09:43:29 2005 +0000 11.2 +++ b/docs/src/user.tex Tue Sep 20 09:43:46 2005 +0000 11.3 @@ -59,1803 +59,36 @@ Contributions of material, suggestions a 11.4 \renewcommand{\floatpagefraction}{.8} 11.5 \setstretch{1.1} 11.6 11.7 + 11.8 \part{Introduction and Tutorial} 11.9 -\chapter{Introduction} 11.10 - 11.11 -Xen is a {\em paravirtualising} virtual machine monitor (VMM), or 11.12 -`hypervisor', for the x86 processor architecture. Xen can securely 11.13 -execute multiple virtual machines on a single physical system with 11.14 -close-to-native performance. The virtual machine technology 11.15 -facilitates enterprise-grade functionality, including: 11.16 - 11.17 -\begin{itemize} 11.18 -\item Virtual machines with performance close to native 11.19 - hardware. 11.20 -\item Live migration of running virtual machines between physical hosts. 11.21 -\item Excellent hardware support (supports most Linux device drivers). 11.22 -\item Sandboxed, restartable device drivers. 11.23 -\end{itemize} 11.24 - 11.25 -Paravirtualisation permits very high performance virtualisation, 11.26 -even on architectures like x86 that are traditionally 11.27 -very hard to virtualise. 11.28 -The drawback of this approach is that it requires operating systems to 11.29 -be {\em ported} to run on Xen. Porting an OS to run on Xen is similar 11.30 -to supporting a new hardware platform; however, the process 11.31 -is simplified because the paravirtual machine architecture is very 11.32 -similar to the underlying native hardware. Even though operating system 11.33 -kernels must explicitly support Xen, a key feature is that user space 11.34 -applications and libraries {\em do not} require modification. 11.35 - 11.36 -Xen support is available for increasingly many operating systems: 11.37 -right now, Linux 2.4, Linux 2.6 and NetBSD are available for Xen 2.0. 11.38 -A FreeBSD port is undergoing testing and will be incorporated into the 11.39 -release soon. Other OS ports, including Plan 9, are in progress. We 11.40 -hope that the arch-xen patches will be incorporated into the 11.41 -mainstream releases of these operating systems in due course (as has 11.42 -already happened for NetBSD). 11.43 - 11.44 -Possible usage scenarios for Xen include: 11.45 -\begin{description} 11.46 -\item [Kernel development.] Test and debug kernel modifications in a 11.47 - sandboxed virtual machine --- no need for a separate test 11.48 - machine. 11.49 -\item [Multiple OS configurations.] Run multiple operating systems 11.50 - simultaneously, for instance for compatibility or QA purposes. 11.51 -\item [Server consolidation.] Move multiple servers onto a single 11.52 - physical host with performance and fault isolation provided at 11.53 - virtual machine boundaries. 11.54 -\item [Cluster computing.] Management at VM granularity provides more 11.55 - flexibility than separately managing each physical host, but 11.56 - better control and isolation than single-system image solutions, 11.57 - particularly by using live migration for load balancing. 11.58 -\item [Hardware support for custom OSes.] Allow development of new OSes 11.59 - while benefiting from the wide-ranging hardware support of 11.60 - existing OSes such as Linux. 11.61 -\end{description} 11.62 - 11.63 -\section{Structure of a Xen-Based System} 11.64 - 11.65 -A Xen system has multiple layers, the lowest and most privileged of 11.66 -which is Xen itself.
11.67 -Xen in turn may host multiple {\em guest} operating systems, each of 11.68 -which is executed within a secure virtual machine (in Xen terminology, 11.69 -a {\em domain}). Domains are scheduled by Xen to make effective use of 11.70 -the available physical CPUs. Each guest OS manages its own 11.71 -applications, which includes responsibility for scheduling each 11.72 -application within the time allotted to the VM by Xen. 11.73 - 11.74 -The first domain, {\em domain 0}, is created automatically when the 11.75 -system boots and has special management privileges. Domain 0 builds 11.76 -other domains and manages their virtual devices. It also performs 11.77 -administrative tasks such as suspending, resuming and migrating other 11.78 -virtual machines. 11.79 - 11.80 -Within domain 0, a process called \emph{xend} runs to manage the system. 11.81 -\Xend is responsible for managing virtual machines and providing access 11.82 -to their consoles. Commands are issued to \xend over an HTTP 11.83 -interface, either from a command-line tool or from a web browser. 11.84 - 11.85 -\section{Hardware Support} 11.86 - 11.87 -Xen currently runs only on the x86 architecture, requiring a `P6' or 11.88 -newer processor (e.g. Pentium Pro, Celeron, Pentium II, Pentium III, 11.89 -Pentium IV, Xeon, AMD Athlon, AMD Duron). Multiprocessor machines are 11.90 -supported, and we also have basic support for HyperThreading (SMT), 11.91 -although this remains a topic for ongoing research. A port 11.92 -specifically for x86/64 is in progress, although Xen already runs on 11.93 -such systems in 32-bit legacy mode. In addition a port to the IA64 11.94 -architecture is approaching completion. We hope to add other 11.95 -architectures such as PPC and ARM in due course. 11.96 - 11.97 - 11.98 -Xen can currently use up to 4GB of memory. It is possible for x86 11.99 -machines to address up to 64GB of physical memory but there are no 11.100 -current plans to support these systems: The x86/64 port is the 11.101 -planned route to supporting larger memory sizes. 11.102 - 11.103 -Xen offloads most of the hardware support issues to the guest OS 11.104 -running in Domain~0. Xen itself contains only the code required to 11.105 -detect and start secondary processors, set up interrupt routing, and 11.106 -perform PCI bus enumeration. Device drivers run within a privileged 11.107 -guest OS rather than within Xen itself. This approach provides 11.108 -compatibility with the majority of device hardware supported by Linux. 11.109 -The default XenLinux build contains support for relatively modern 11.110 -server-class network and disk hardware, but you can add support for 11.111 -other hardware by configuring your XenLinux kernel in the normal way. 11.112 - 11.113 -\section{History} 11.114 - 11.115 -Xen was originally developed by the Systems Research Group at the 11.116 -University of Cambridge Computer Laboratory as part of the XenoServers 11.117 -project, funded by the UK-EPSRC. 11.118 -XenoServers aim to provide a `public infrastructure for 11.119 -global distributed computing', and Xen plays a key part in that, 11.120 -allowing us to efficiently partition a single machine to enable 11.121 -multiple independent clients to run their operating systems and 11.122 -applications in an environment providing protection, resource 11.123 -isolation and accounting. 
The project web page contains further 11.124 -information along with pointers to papers and technical reports: 11.125 -\path{http://www.cl.cam.ac.uk/xeno} 11.126 - 11.127 -Xen has since grown into a fully-fledged project in its own right, 11.128 -enabling us to investigate interesting research issues regarding the 11.129 -best techniques for virtualising resources such as the CPU, memory, 11.130 -disk and network. The project has been bolstered by support from 11.131 -Intel Research Cambridge, and HP Labs, who are now working closely 11.132 -with us. 11.133 - 11.134 -Xen was first described in a paper presented at SOSP in 11.135 -2003\footnote{\tt 11.136 -http://www.cl.cam.ac.uk/netos/papers/2003-xensosp.pdf}, and the first 11.137 -public release (1.0) was made that October. Since then, Xen has 11.138 -significantly matured and is now used in production scenarios on 11.139 -many sites. 11.140 - 11.141 -Xen 2.0 features greatly enhanced hardware support, configuration 11.142 -flexibility, usability and a larger complement of supported operating 11.143 -systems. This latest release takes Xen a step closer to becoming the 11.144 -definitive open source solution for virtualisation. 11.145 - 11.146 -\chapter{Installation} 11.147 - 11.148 -The Xen distribution includes three main components: Xen itself, ports 11.149 -of Linux 2.4 and 2.6 and NetBSD to run on Xen, and the user-space 11.150 -tools required to manage a Xen-based system. This chapter describes 11.151 -how to install the Xen 2.0 distribution from source. Alternatively, 11.152 -there may be pre-built packages available as part of your operating 11.153 -system distribution. 11.154 - 11.155 -\section{Prerequisites} 11.156 -\label{sec:prerequisites} 11.157 - 11.158 -The following is a full list of prerequisites. Items marked `$\dag$' 11.159 -are required by the \xend control tools, and hence required if you 11.160 -want to run more than one virtual machine; items marked `$*$' are only 11.161 -required if you wish to build from source. 11.162 -\begin{itemize} 11.163 -\item A working Linux distribution using the GRUB bootloader and 11.164 -running on a P6-class (or newer) CPU. 11.165 -\item [$\dag$] The \path{iproute2} package. 11.166 -\item [$\dag$] The Linux bridge-utils\footnote{Available from 11.167 -{\tt http://bridge.sourceforge.net}} (e.g., \path{/sbin/brctl}) 11.168 -\item [$\dag$] An installation of Twisted v1.3 or 11.169 -above\footnote{Available from {\tt 11.170 -http://www.twistedmatrix.com}}. There may be a binary package 11.171 -available for your distribution; alternatively it can be installed by 11.172 -running `{\sl make install-twisted}' in the root of the Xen source 11.173 -tree. 11.174 -\item [$*$] Build tools (gcc v3.2.x or v3.3.x, binutils, GNU make). 11.175 -\item [$*$] Development installation of libcurl (e.g., libcurl-devel) 11.176 -\item [$*$] Development installation of zlib (e.g., zlib-dev). 11.177 -\item [$*$] Development installation of Python v2.2 or later (e.g., python-dev). 11.178 -\item [$*$] \LaTeX and transfig are required to build the documentation. 11.179 -\end{itemize} 11.180 - 11.181 -Once you have satisfied the relevant prerequisites, you can 11.182 -now install either a binary or source distribution of Xen. 
11.183 - 11.184 -\section{Installing from Binary Tarball} 11.185 - 11.186 -Pre-built tarballs are available for download from the Xen 11.187 -download page 11.188 -\begin{quote} 11.189 -{\tt http://xen.sf.net} 11.190 -\end{quote} 11.191 - 11.192 -Once you've downloaded the tarball, simply unpack and install: 11.193 -\begin{verbatim} 11.194 -# tar zxvf xen-2.0-install.tgz 11.195 -# cd xen-2.0-install 11.196 -# sh ./install.sh 11.197 -\end{verbatim} 11.198 - 11.199 -Once you've installed the binaries you need to configure 11.200 -your system as described in Section~\ref{s:configure}. 11.201 - 11.202 -\section{Installing from Source} 11.203 - 11.204 -This section describes how to obtain, build, and install 11.205 -Xen from source. 11.206 - 11.207 -\subsection{Obtaining the Source} 11.208 - 11.209 -The Xen source tree is available as either a compressed source tar 11.210 -ball or as a clone of our master BitKeeper repository. 11.211 - 11.212 -\begin{description} 11.213 -\item[Obtaining the Source Tarball]\mbox{} \\ 11.214 -Stable versions (and daily snapshots) of the Xen source tree are 11.215 -available as compressed tarballs from the Xen download page 11.216 -\begin{quote} 11.217 -{\tt http://xen.sf.net} 11.218 -\end{quote} 11.219 - 11.220 -\item[Using BitKeeper]\mbox{} \\ 11.221 -If you wish to install Xen from a clone of our latest BitKeeper 11.222 -repository then you will need to install the BitKeeper tools. 11.223 -Download instructions for BitKeeper can be obtained by filling out the 11.224 -form at: 11.225 - 11.226 -\begin{quote} 11.227 -{\tt http://www.bitmover.com/cgi-bin/download.cgi} 11.228 -\end{quote} 11.229 -The public master BK repository for the 2.0 release lives at: 11.230 -\begin{quote} 11.231 -{\tt bk://xen.bkbits.net/xen-2.0.bk} 11.232 -\end{quote} 11.233 -You can use BitKeeper to 11.234 -download it and keep it updated with the latest features and fixes. 11.235 - 11.236 -Change to the directory in which you want to put the source code, then 11.237 -run: 11.238 -\begin{verbatim} 11.239 -# bk clone bk://xen.bkbits.net/xen-2.0.bk 11.240 -\end{verbatim} 11.241 - 11.242 -Under your current directory, a new directory named \path{xen-2.0.bk} 11.243 -has been created, which contains all the source code for Xen, the OS 11.244 -ports, and the control tools. You can update your repository with the 11.245 -latest changes at any time by running: 11.246 -\begin{verbatim} 11.247 -# cd xen-2.0.bk # to change into the local repository 11.248 -# bk pull # to update the repository 11.249 -\end{verbatim} 11.250 -\end{description} 11.251 - 11.252 -%\section{The distribution} 11.253 -% 11.254 -%The Xen source code repository is structured as follows: 11.255 -% 11.256 -%\begin{description} 11.257 -%\item[\path{tools/}] Xen node controller daemon (Xend), command line tools, 11.258 -% control libraries 11.259 -%\item[\path{xen/}] The Xen VMM. 11.260 -%\item[\path{linux-*-xen-sparse/}] Xen support for Linux. 11.261 -%\item[\path{linux-*-patches/}] Experimental patches for Linux. 11.262 -%\item[\path{netbsd-*-xen-sparse/}] Xen support for NetBSD. 11.263 -%\item[\path{docs/}] Various documentation files for users and developers. 11.264 -%\item[\path{extras/}] Bonus extras. 
11.265 -%\end{description} 11.266 - 11.267 -\subsection{Building from Source} 11.268 - 11.269 -The top-level Xen Makefile includes a target `world' that will do the 11.270 -following: 11.271 - 11.272 -\begin{itemize} 11.273 -\item Build Xen 11.274 -\item Build the control tools, including \xend 11.275 -\item Download (if necessary) and unpack the Linux 2.6 source code, 11.276 - and patch it for use with Xen 11.277 -\item Build a Linux kernel to use in domain 0 and a smaller 11.278 - unprivileged kernel, which can optionally be used for 11.279 - unprivileged virtual machines. 11.280 -\end{itemize} 11.281 - 11.282 - 11.283 -After the build has completed you should have a top-level 11.284 -directory called \path{dist/} in which all resulting targets 11.285 -will be placed; of particular interest are the two 11.286 -XenLinux kernel images, one with a `-xen0' extension 11.287 -which contains hardware device drivers and drivers for Xen's virtual 11.288 -devices, and one with a `-xenU' extension that just contains the 11.289 -virtual ones. These are found in \path{dist/install/boot/} along 11.290 -with the image for Xen itself and the configuration files used 11.291 -during the build. 11.292 11.293 -The NetBSD port can be built using: 11.294 -\begin{quote} 11.295 -\begin{verbatim} 11.296 -# make netbsd20 11.297 -\end{verbatim} 11.298 -\end{quote} 11.299 -The NetBSD port is built using a snapshot of the netbsd-2-0 cvs branch. 11.300 -The snapshot is downloaded as part of the build process, if it is not 11.301 -yet present in the \path{NETBSD\_SRC\_PATH} search path. The build 11.302 -process also downloads a toolchain which includes all the tools 11.303 -necessary to build the NetBSD kernel under Linux. 11.304 - 11.305 -To further customize the set of kernels built, you need to edit 11.306 -the top-level Makefile. Look for the line: 11.307 - 11.308 -\begin{quote} 11.309 -\begin{verbatim} 11.310 -KERNELS ?= mk.linux-2.6-xen0 mk.linux-2.6-xenU 11.311 -\end{verbatim} 11.312 -\end{quote} 11.313 - 11.314 -You can edit this line to include any set of operating system kernels 11.315 -which have configurations in the top-level \path{buildconfigs/} 11.316 -directory, for example \path{mk.linux-2.4-xenU} to build a Linux 2.4 11.317 -kernel containing only virtual device drivers. 11.318 - 11.319 -%% Inspect the Makefile if you want to see what goes on during a build. 11.320 -%% Building Xen and the tools is straightforward, but XenLinux is more 11.321 -%% complicated. The makefile needs a `pristine' Linux kernel tree to which 11.322 -%% it will then add the Xen architecture files. You can tell the 11.323 -%% makefile the location of the appropriate Linux compressed tar file by 11.324 -%% setting the LINUX\_SRC environment variable, e.g. \\ 11.325 -%% \verb!# LINUX_SRC=/tmp/linux-2.6.11.tar.bz2 make world! \\ or by 11.326 -%% placing the tar file somewhere in the search path of {\tt 11.327 -%% LINUX\_SRC\_PATH} which defaults to `{\tt .:..}'. If the makefile 11.328 -%% can't find a suitable kernel tar file it attempts to download it from 11.329 -%% kernel.org (this won't work if you're behind a firewall). 11.330 - 11.331 -%% After untaring the pristine kernel tree, the makefile uses the {\tt 11.332 -%% mkbuildtree} script to add the Xen patches to the kernel. 11.333 - 11.334 - 11.335 -%% The procedure is similar to build the Linux 2.4 port: \\ 11.336 -%% \verb!# LINUX_SRC=/path/to/linux2.4/source make linux24!
11.337 - 11.338 - 11.339 -%% \framebox{\parbox{5in}{ 11.340 -%% {\bf Distro specific:} \\ 11.341 -%% {\it Gentoo} --- if not using udev (most installations, currently), you'll need 11.342 -%% to enable devfs and devfs mount at boot time in the xen0 config. 11.343 -%% }} 11.344 - 11.345 -\subsection{Custom XenLinux Builds} 11.346 - 11.347 -% If you have an SMP machine you may wish to give the {\tt '-j4'} 11.348 -% argument to make to get a parallel build. 11.349 - 11.350 -If you wish to build a customized XenLinux kernel (e.g. to support 11.351 -additional devices or enable distribution-required features), you can 11.352 -use the standard Linux configuration mechanisms, specifying that the 11.353 -architecture being built for is \path{xen}, e.g.: 11.354 -\begin{quote} 11.355 -\begin{verbatim} 11.356 -# cd linux-2.6.11-xen0 11.357 -# make ARCH=xen xconfig 11.358 -# cd .. 11.359 -# make 11.360 -\end{verbatim} 11.361 -\end{quote} 11.362 - 11.363 -You can also copy an existing Linux configuration (\path{.config}) 11.364 -into \path{linux-2.6.11-xen0} and execute: 11.365 -\begin{quote} 11.366 -\begin{verbatim} 11.367 -# make ARCH=xen oldconfig 11.368 -\end{verbatim} 11.369 -\end{quote} 11.370 - 11.371 -You may be prompted with some Xen-specific options; we 11.372 -advise accepting the defaults for these options. 11.373 - 11.374 -Note that the only difference between the two types of Linux kernel 11.375 -that are built is the configuration file used for each. The `U' 11.376 -suffixed (unprivileged) versions don't contain any of the physical 11.377 -hardware device drivers, leading to a 30\% reduction in size; hence 11.378 -you may prefer these for your non-privileged domains. The `0' 11.379 -suffixed privileged versions can be used to boot the system, as well 11.380 -as in driver domains and unprivileged domains. 11.381 - 11.382 - 11.383 -\subsection{Installing the Binaries} 11.384 - 11.385 - 11.386 -The files produced by the build process are stored under the 11.387 -\path{dist/install/} directory. To install them in their default 11.388 -locations, do: 11.389 -\begin{quote} 11.390 -\begin{verbatim} 11.391 -# make install 11.392 -\end{verbatim} 11.393 -\end{quote} 11.394 - 11.395 - 11.396 -Alternatively, users with special installation requirements may wish 11.397 -to install them manually by copying the files to their appropriate 11.398 -destinations. 11.399 - 11.400 -%% Files in \path{install/boot/} include: 11.401 -%% \begin{itemize} 11.402 -%% \item \path{install/boot/xen-2.0.gz} Link to the Xen 'kernel' 11.403 -%% \item \path{install/boot/vmlinuz-2.6-xen0} Link to domain 0 XenLinux kernel 11.404 -%% \item \path{install/boot/vmlinuz-2.6-xenU} Link to unprivileged XenLinux kernel 11.405 -%% \end{itemize} 11.406 - 11.407 -The \path{dist/install/boot} directory will also contain the config files 11.408 -used for building the XenLinux kernels, and also versions of Xen and 11.409 -XenLinux kernels that contain debug symbols (\path{xen-syms-2.0.6} and 11.410 -\path{vmlinux-syms-2.6.11.11-xen0}) which are essential for interpreting crash 11.411 -dumps. Retain these files as the developers may wish to see them if 11.412 -you post on the mailing list. 11.413 - 11.414 - 11.415 - 11.416 - 11.417 - 11.418 -\section{Configuration} 11.419 -\label{s:configure} 11.420 -Once you have built and installed the Xen distribution, it is 11.421 -simple to prepare the machine for booting and running Xen.
11.422 - 11.423 -\subsection{GRUB Configuration} 11.424 - 11.425 -An entry should be added to \path{grub.conf} (often found under 11.426 -\path{/boot/} or \path{/boot/grub/}) to allow Xen / XenLinux to boot. 11.427 -This file is sometimes called \path{menu.lst}, depending on your 11.428 -distribution. The entry should look something like the following: 11.429 - 11.430 -{\small 11.431 -\begin{verbatim} 11.432 -title Xen 2.0 / XenLinux 2.6 11.433 - kernel /boot/xen-2.0.gz dom0_mem=131072 11.434 - module /boot/vmlinuz-2.6-xen0 root=/dev/sda4 ro console=tty0 11.435 -\end{verbatim} 11.436 -} 11.437 +%% Chapter Introduction moved to introduction.tex 11.438 +\include{src/user/introduction} 11.439 11.440 -The kernel line tells GRUB where to find Xen itself and what boot 11.441 -parameters should be passed to it (in this case, setting domain 0's 11.442 -memory allocation in kilobytes and the settings for the serial port). For more 11.443 -details on the various Xen boot parameters see Section~\ref{s:xboot}. 11.444 - 11.445 -The module line of the configuration describes the location of the 11.446 -XenLinux kernel that Xen should start and the parameters that should 11.447 -be passed to it (these are standard Linux parameters, identifying the 11.448 -root device and specifying it be initially mounted read only and 11.449 -instructing that console output be sent to the screen). Some 11.450 -distributions such as SuSE do not require the \path{ro} parameter. 11.451 - 11.452 -%% \framebox{\parbox{5in}{ 11.453 -%% {\bf Distro specific:} \\ 11.454 -%% {\it SuSE} --- Omit the {\tt ro} option from the XenLinux kernel 11.455 -%% command line, since the partition won't be remounted rw during boot. 11.456 -%% }} 11.457 - 11.458 - 11.459 -If you want to use an initrd, just add another \path{module} line to 11.460 -the configuration, as usual: 11.461 -{\small 11.462 -\begin{verbatim} 11.463 - module /boot/my_initrd.gz 11.464 -\end{verbatim} 11.465 -} 11.466 - 11.467 -As always when installing a new kernel, it is recommended that you do 11.468 -not delete existing menu options from \path{menu.lst} --- you may want 11.469 -to boot your old Linux kernel in future, particularly if you 11.470 -have problems. 11.471 - 11.472 - 11.473 -\subsection{Serial Console (optional)} 11.474 - 11.475 -%% kernel /boot/xen-2.0.gz dom0_mem=131072 com1=115200,8n1 11.476 -%% module /boot/vmlinuz-2.6-xen0 root=/dev/sda4 ro 11.477 - 11.478 - 11.479 -In order to configure Xen serial console output, it is necessary to add 11.480 -a boot option to your GRUB config; e.g. replace the above kernel line 11.481 -with: 11.482 -\begin{quote} 11.483 -{\small 11.484 -\begin{verbatim} 11.485 - kernel /boot/xen.gz dom0_mem=131072 com1=115200,8n1 11.486 -\end{verbatim}} 11.487 -\end{quote} 11.488 - 11.489 -This configures Xen to output on COM1 at 115,200 baud, 8 data bits, 11.490 -1 stop bit and no parity. Modify these parameters for your setup. 11.491 - 11.492 -One can also configure XenLinux to share the serial console; to 11.493 -achieve this append ``\path{console=ttyS0}'' to your 11.494 -module line. 11.495 - 11.496 - 11.497 -If you wish to be able to log in over the XenLinux serial console it 11.498 -is necessary to add a line into \path{/etc/inittab}, just as per 11.499 -regular Linux. Simply add the line: 11.500 -\begin{quote} 11.501 -{\small 11.502 -{\tt c:2345:respawn:/sbin/mingetty ttyS0} 11.503 -} 11.504 -\end{quote} 11.505 - 11.506 -and you should be able to log in.
Note that successfully logging in 11.507 -as root over the serial line requires adding \path{ttyS0} to 11.508 -\path{/etc/securetty} in most modern distributions. 11.509 - 11.510 -\subsection{TLS Libraries} 11.511 - 11.512 -Users of the XenLinux 2.6 kernel should disable Thread Local Storage 11.513 -(e.g.\ by doing a \path{mv /lib/tls /lib/tls.disabled}) before 11.514 -attempting to run with a XenLinux kernel\footnote{If you boot without first 11.515 -disabling TLS, you will get a warning message during the boot 11.516 -process. In this case, simply perform the rename after the machine is 11.517 -up and then run \texttt{/sbin/ldconfig} to make it take effect.}. You can 11.518 -always re-enable it by restoring the directory to its original location 11.519 -(i.e.\ \path{mv /lib/tls.disabled /lib/tls}). 11.520 - 11.521 -The reason for this is that the current TLS implementation uses 11.522 -segmentation in a way that is not permissible under Xen. If TLS is 11.523 -not disabled, an emulation mode is used within Xen which reduces 11.524 -performance substantially. 11.525 - 11.526 -We hope that this issue can be resolved by working with Linux 11.527 -distribution vendors to implement a minor backward-compatible change 11.528 -to the TLS library. 11.529 - 11.530 -\section{Booting Xen} 11.531 - 11.532 -It should now be possible to restart the system and use Xen. Reboot 11.533 -as usual but choose the new Xen option when the Grub screen appears. 11.534 - 11.535 -What follows should look much like a conventional Linux boot. The 11.536 -first portion of the output comes from Xen itself, supplying low level 11.537 -information about itself and the machine it is running on. The 11.538 -following portion of the output comes from XenLinux. 11.539 - 11.540 -You may see some errors during the XenLinux boot. These are not 11.541 -necessarily anything to worry about --- they may result from kernel 11.542 -configuration differences between your XenLinux kernel and the one you 11.543 -usually use. 11.544 - 11.545 -When the boot completes, you should be able to log into your system as 11.546 -usual. If you are unable to log in to your system running Xen, you 11.547 -should still be able to reboot with your normal Linux kernel. 11.548 - 11.549 - 11.550 -\chapter{Starting Additional Domains} 11.551 - 11.552 -The first step in creating a new domain is to prepare a root 11.553 -filesystem for it to boot from. Typically, this might be stored in a 11.554 -normal partition, an LVM or other volume manager partition, a disk 11.555 -file or on an NFS server. A simple way to do this is to boot 11.556 -from your standard OS install CD and install the distribution into 11.557 -another partition on your hard drive. 11.558 - 11.559 -To start the \xend control daemon, type 11.560 -\begin{quote} 11.561 -\verb!# xend start! 11.562 -\end{quote} 11.563 -If you 11.564 -wish the daemon to start automatically, see the instructions in 11.565 -Section~\ref{s:xend}. Once the daemon is running, you can use the 11.566 -\path{xm} tool to monitor and maintain the domains running on your 11.567 -system. This chapter provides only a brief tutorial: we provide full 11.568 -details of the \path{xm} tool in the next chapter. 11.569 - 11.570 -%\section{From the web interface} 11.571 -% 11.572 -%Boot the Xen machine and start Xensv (see Chapter~\ref{cha:xensv} for 11.573 -%more details) using the command: \\ 11.574 -%\verb_# xensv start_ \\ 11.575 -%This will also start Xend (see Chapter~\ref{cha:xend} for more information).
11.576 -% 11.577 -%The domain management interface will then be available at {\tt 11.578 -%http://your\_machine:8080/}. This provides a user friendly wizard for 11.579 -%starting domains and functions for managing running domains. 11.580 -% 11.581 -%\section{From the command line} 11.582 - 11.583 - 11.584 -\section{Creating a Domain Configuration File} 11.585 +%% Chapter Installation moved to installation.tex 11.586 +\include{src/user/installation} 11.587 11.588 -Before you can start an additional domain, you must create a 11.589 -configuration file. We provide two example files which you 11.590 -can use as a starting point: 11.591 -\begin{itemize} 11.592 - \item \path{/etc/xen/xmexample1} is a simple template configuration file 11.593 - for describing a single VM. 11.594 - 11.595 - \item \path{/etc/xen/xmexample2} is a template description that 11.596 - is intended to be reused for multiple virtual machines. Setting 11.597 - the value of the \path{vmid} variable on the \path{xm} command line 11.598 - fills in parts of this template. 11.599 -\end{itemize} 11.600 - 11.601 -Copy one of these files and edit it as appropriate. 11.602 -Typical values you may wish to edit include: 11.603 - 11.604 -\begin{quote} 11.605 -\begin{description} 11.606 -\item[kernel] Set this to the path of the kernel you compiled for use 11.607 - with Xen (e.g.\ \path{kernel = '/boot/vmlinuz-2.6-xenU'}) 11.608 -\item[memory] Set this to the size of the domain's memory in 11.609 -megabytes (e.g.\ \path{memory = 64}) 11.610 -\item[disk] Set the first entry in this list to calculate the offset 11.611 -of the domain's root partition, based on the domain ID. Set the 11.612 -second to the location of \path{/usr} if you are sharing it between 11.613 -domains (e.g.\ \path{disk = ['phy:your\_hard\_drive\%d,sda1,w' \% 11.614 -(base\_partition\_number + vmid), 'phy:your\_usr\_partition,sda6,r' ]}) 11.615 -\item[dhcp] Uncomment the dhcp variable, so that the domain will 11.616 -receive its IP address from a DHCP server (e.g.\ \path{dhcp='dhcp'}) 11.617 -\end{description} 11.618 -\end{quote} 11.619 - 11.620 -You may also want to edit the {\bf vif} variable in order to choose 11.621 -the MAC address of the virtual ethernet interface yourself. For 11.622 -example: 11.623 -\begin{quote} 11.624 -\verb_vif = ['mac=00:06:AA:F6:BB:B3']_ 11.625 -\end{quote} 11.626 -If you do not set this variable, \xend will automatically generate a 11.627 -random MAC address from an unused range. 11.628 - 11.629 - 11.630 -\section{Booting the Domain} 11.631 - 11.632 -The \path{xm} tool provides a variety of commands for managing domains. 11.633 -Use the \path{create} command to start new domains. Assuming you've 11.634 -created a configuration file \path{myvmconf} based around 11.635 -\path{/etc/xen/xmexample2}, to start a domain with virtual 11.636 -machine ID~1 you should type: 11.637 - 11.638 -\begin{quote} 11.639 -\begin{verbatim} 11.640 -# xm create -c myvmconf vmid=1 11.641 -\end{verbatim} 11.642 -\end{quote} 11.643 - 11.644 - 11.645 -The \path{-c} switch causes \path{xm} to turn into the domain's 11.646 -console after creation. The \path{vmid=1} sets the \path{vmid} 11.647 -variable used in the \path{myvmconf} file. 11.648 - 11.649 - 11.650 -You should see the console boot messages from the new domain 11.651 -appearing in the terminal in which you typed the command, 11.652 -culminating in a login prompt.
11.653 - 11.654 - 11.655 -\section{Example: ttylinux} 11.656 - 11.657 -Ttylinux is a very small Linux distribution, designed to require very 11.658 -few resources. We will use it as a concrete example of how to start a 11.659 -Xen domain. Most users will probably want to install a full-featured 11.660 -distribution once they have mastered the basics\footnote{ttylinux is 11.661 -maintained by Pascal Schmidt. You can download source packages from 11.662 -the distribution's home page: {\tt http://www.minimalinux.org/ttylinux/}}. 11.663 - 11.664 -\begin{enumerate} 11.665 -\item Download and extract the ttylinux disk image from the Files 11.666 -section of the project's SourceForge site (see 11.667 -\path{http://sf.net/projects/xen/}). 11.668 -\item Create a configuration file like the following: 11.669 -\begin{verbatim} 11.670 -kernel = "/boot/vmlinuz-2.6-xenU" 11.671 -memory = 64 11.672 -name = "ttylinux" 11.673 -nics = 1 11.674 -ip = "1.2.3.4" 11.675 -disk = ['file:/path/to/ttylinux/rootfs,sda1,w'] 11.676 -root = "/dev/sda1 ro" 11.677 -\end{verbatim} 11.678 -\item Now start the domain and connect to its console: 11.679 -\begin{verbatim} 11.680 -xm create configfile -c 11.681 -\end{verbatim} 11.682 -\item Log in as root, password root. 11.683 -\end{enumerate} 11.684 - 11.685 - 11.686 -\section{Starting / Stopping Domains Automatically} 11.687 - 11.688 -It is possible to have certain domains start automatically at boot 11.689 -time and to have dom0 wait for all running domains to shut down before 11.690 -it shuts down the system. 11.691 - 11.692 -To specify that a domain should start at boot time, place its 11.693 -configuration file (or a link to it) under \path{/etc/xen/auto/}. 11.694 - 11.695 -A Sys-V style init script for RedHat and LSB-compliant systems is 11.696 -provided and will be automatically copied to \path{/etc/init.d/} 11.697 -during install. You can then enable it in the appropriate way for 11.698 -your distribution. 11.699 - 11.700 -For instance, on RedHat: 11.701 - 11.702 -\begin{quote} 11.703 -\verb_# chkconfig --add xendomains_ 11.704 -\end{quote} 11.705 - 11.706 -By default, this will start the boot-time domains in runlevels 3, 4 11.707 -and 5. 11.708 - 11.709 -You can also use the \path{service} command to run this script 11.710 -manually, e.g.: 11.711 - 11.712 -\begin{quote} 11.713 -\verb_# service xendomains start_ 11.714 - 11.715 -Starts all the domains with config files under /etc/xen/auto/. 11.716 -\end{quote} 11.717 - 11.718 - 11.719 -\begin{quote} 11.720 -\verb_# service xendomains stop_ 11.721 - 11.722 -Shuts down ALL running Xen domains. 11.723 -\end{quote} 11.724 - 11.725 -\chapter{Domain Management Tools} 11.726 - 11.727 -The previous chapter described a simple example of how to configure 11.728 -and start a domain. This chapter summarises the tools available to 11.729 -manage running domains. 11.730 - 11.731 -\section{Command-line Management} 11.732 - 11.733 -Command line management tasks are also performed using the \path{xm} 11.734 -tool. For online help for the commands available, type: 11.735 -\begin{quote} 11.736 -\verb_# xm help_ 11.737 -\end{quote} 11.738 - 11.739 -You can also type \path{xm help $<$command$>$} for more information 11.740 -on a given command.
11.741 - 11.742 -\subsection{Basic Management Commands} 11.743 - 11.744 -The most important \path{xm} commands are: 11.745 -\begin{quote} 11.746 -\verb_# xm list_: Lists all domains running.\\ 11.747 -\verb_# xm consoles_: Gives information about the domain consoles.\\ 11.748 -\verb_# xm console_: Opens a console to a domain (e.g.\ 11.749 - \verb_# xm console myVM_) 11.750 -\end{quote} 11.751 - 11.752 -\subsection{\tt xm list} 11.753 - 11.754 -The output of \path{xm list} is in rows of the following format: 11.755 -\begin{center} 11.756 -{\tt name domid memory cpu state cputime console} 11.757 -\end{center} 11.758 - 11.759 -\begin{quote} 11.760 -\begin{description} 11.761 -\item[name] The descriptive name of the virtual machine. 11.762 -\item[domid] The number of the domain ID this virtual machine is running in. 11.763 -\item[memory] Memory size in megabytes. 11.764 -\item[cpu] The CPU this domain is running on. 11.765 -\item[state] Domain state consists of 5 fields: 11.766 - \begin{description} 11.767 - \item[r] running 11.768 - \item[b] blocked 11.769 - \item[p] paused 11.770 - \item[s] shutdown 11.771 - \item[c] crashed 11.772 - \end{description} 11.773 -\item[cputime] How much CPU time (in seconds) the domain has used so far. 11.774 -\item[console] TCP port accepting connections to the domain's console. 11.775 -\end{description} 11.776 -\end{quote} 11.777 - 11.778 -The \path{xm list} command also supports a long output format when the 11.779 -\path{-l} switch is used. This outputs the full details of the 11.780 -running domains in \xend's SXP configuration format. 11.781 - 11.782 -For example, suppose the system is running the ttylinux domain as 11.783 -described earlier. The list command should produce output somewhat 11.784 -like the following: 11.785 -\begin{verbatim} 11.786 -# xm list 11.787 -Name Id Mem(MB) CPU State Time(s) Console 11.788 -Domain-0 0 251 0 r---- 172.2 11.789 -ttylinux 5 63 0 -b--- 3.0 9605 11.790 -\end{verbatim} 11.791 - 11.792 -Here we can see the details for the ttylinux domain, as well as for 11.793 -domain 0 (which, of course, is always running). Note that the console 11.794 -port for the ttylinux domain is 9605. This can be connected to by TCP 11.795 -using a terminal program (e.g. \path{telnet} or, better, 11.796 -\path{xencons}). The simplest way to connect is to use the \path{xm console} 11.797 -command, specifying the domain name or ID. To connect to the console 11.798 -of the ttylinux domain, we could use any of the following: 11.799 -\begin{verbatim} 11.800 -# xm console ttylinux 11.801 -# xm console 5 11.802 -# xencons localhost 9605 11.803 -\end{verbatim} 11.804 - 11.805 -\section{Domain Save and Restore} 11.806 - 11.807 -The administrator of a Xen system may suspend a virtual machine's 11.808 -current state into a disk file in domain 0, allowing it to be resumed 11.809 -at a later time. 11.810 - 11.811 -The ttylinux domain described earlier can be suspended to disk using 11.812 -the command: 11.813 -\begin{verbatim} 11.814 -# xm save ttylinux ttylinux.xen 11.815 -\end{verbatim} 11.816 - 11.817 -This will stop the domain named `ttylinux' and save its current state 11.818 -into a file called \path{ttylinux.xen}. 11.819 - 11.820 -To resume execution of this domain, use the \path{xm restore} command: 11.821 -\begin{verbatim} 11.822 -# xm restore ttylinux.xen 11.823 -\end{verbatim} 11.824 - 11.825 -This will restore the state of the domain and restart it.
The domain 11.826 -will carry on as before and the console may be reconnected using the 11.827 -\path{xm console} command, as above. 11.828 - 11.829 -\section{Live Migration} 11.830 - 11.831 -Live migration is used to transfer a domain between physical hosts 11.832 -whilst that domain continues to perform its usual activities --- from 11.833 -the user's perspective, the migration should be imperceptible. 11.834 - 11.835 -To perform a live migration, both hosts must be running Xen / \xend and 11.836 -the destination host must have sufficient resources (e.g. memory 11.837 -capacity) to accommodate the domain after the move. Furthermore, we 11.838 -currently require both source and destination machines to be on the 11.839 -same L2 subnet. 11.840 - 11.841 -Currently, there is no support for providing automatic remote access 11.842 -to filesystems stored on local disk when a domain is migrated. 11.843 -Administrators should choose an appropriate storage solution 11.844 -(e.g. SAN, NAS, etc.) to ensure that domain filesystems are also 11.845 -available on their destination node. GNBD is a good method for 11.846 -exporting a volume from one machine to another. iSCSI can do a similar 11.847 -job, but is more complex to set up. 11.848 - 11.849 -When a domain migrates, its MAC and IP address move with it; thus it 11.850 -is only possible to migrate VMs within the same layer-2 network and IP 11.851 -subnet. If the destination node is on a different subnet, the 11.852 -administrator would need to manually configure a suitable etherip or 11.853 -IP tunnel in the domain 0 of the remote node. 11.854 - 11.855 -A domain may be migrated using the \path{xm migrate} command. To 11.856 -live migrate a domain to another machine, we would use 11.857 -the command: 11.858 - 11.859 -\begin{verbatim} 11.860 -# xm migrate --live mydomain destination.ournetwork.com 11.861 -\end{verbatim} 11.862 - 11.863 -Without the \path{--live} flag, \xend simply stops the domain and 11.864 -copies the memory image over to the new node and restarts it. Since 11.865 -domains can have large allocations this can be quite time-consuming, 11.866 -even on a Gigabit network. With the \path{--live} flag \xend attempts 11.867 -to keep the domain running while the migration is in progress, 11.868 -resulting in typical `downtimes' of just 60--300ms. 11.869 - 11.870 -For now it will be necessary to reconnect to the domain's console on 11.871 -the new machine using the \path{xm console} command. If a migrated 11.872 -domain has any open network connections then they will be preserved, 11.873 -so SSH connections do not have this limitation. 11.874 - 11.875 -\section{Managing Domain Memory} 11.876 - 11.877 -XenLinux domains have the ability to relinquish / reclaim machine 11.878 -memory at the request of the administrator or the user of the domain. 11.879 +%% Chapter Starting Additional Domains moved to start_addl_dom.tex 11.880 +\include{src/user/start_addl_dom} 11.881 11.882 -\subsection{Setting memory footprints from dom0} 11.883 - 11.884 -The machine administrator can request that a domain alter its memory 11.885 -footprint using the \path{xm set-mem} command. For instance, we can 11.886 -request that our example ttylinux domain reduce its memory footprint 11.887 -to 32 megabytes.
11.888 - 11.889 -\begin{verbatim} 11.890 -# xm set-mem ttylinux 32 11.891 -\end{verbatim} 11.892 - 11.893 -We can now see the result of this in the output of \path{xm list}: 11.894 - 11.895 -\begin{verbatim} 11.896 -# xm list 11.897 -Name Id Mem(MB) CPU State Time(s) Console 11.898 -Domain-0 0 251 0 r---- 172.2 11.899 -ttylinux 5 31 0 -b--- 4.3 9605 11.900 -\end{verbatim} 11.901 - 11.902 -The domain has responded to the request by returning memory to Xen. We 11.903 -can restore the domain to its original size using the command line: 11.904 - 11.905 -\begin{verbatim} 11.906 -# xm set-mem ttylinux 64 11.907 -\end{verbatim} 11.908 - 11.909 -\subsection{Setting memory footprints from within a domain} 11.910 - 11.911 -The virtual file \path{/proc/xen/balloon} allows the owner of a 11.912 -domain to adjust their own memory footprint. Reading the file 11.913 -(e.g. \path{cat /proc/xen/balloon}) prints out the current 11.914 -memory footprint of the domain. Writing the file 11.915 -(e.g. \path{echo new\_target > /proc/xen/balloon}) requests 11.916 -that the kernel adjust the domain's memory footprint to a new value. 11.917 - 11.918 -\subsection{Setting memory limits} 11.919 - 11.920 -Xen associates a memory size limit with each domain. By default, this 11.921 -is the amount of memory the domain is originally started with, 11.922 -preventing the domain from ever growing beyond this size. To permit a 11.923 -domain to grow beyond its original allocation or to prevent a domain 11.924 -you've shrunk from reclaiming the memory it relinquished, use the 11.925 -\path{xm maxmem} command. 11.926 - 11.927 -\chapter{Domain Filesystem Storage} 11.928 - 11.929 -It is possible to directly export any Linux block device in dom0 to 11.930 -another domain, or to export filesystems / devices to virtual machines 11.931 -using standard network protocols (e.g. NBD, iSCSI, NFS, etc). This 11.932 -chapter covers some of the possibilities. 11.933 - 11.934 - 11.935 -\section{Exporting Physical Devices as VBDs} 11.936 -\label{s:exporting-physical-devices-as-vbds} 11.937 - 11.938 -One of the simplest configurations is to directly export 11.939 -individual partitions from domain 0 to other domains. To 11.940 -achieve this use the \path{phy:} specifier in your domain 11.941 -configuration file. For example a line like 11.942 -\begin{quote} 11.943 -\verb_disk = ['phy:hda3,sda1,w']_ 11.944 -\end{quote} 11.945 -specifies that the partition \path{/dev/hda3} in domain 0 11.946 -should be exported read-write to the new domain as \path{/dev/sda1}; 11.947 -one could equally well export it as \path{/dev/hda} or 11.948 -\path{/dev/sdb5} should one wish. 11.949 - 11.950 -In addition to local disks and partitions, it is possible to export 11.951 -any device that Linux considers to be ``a disk'' in the same manner. 11.952 -For example, if you have iSCSI disks or GNBD volumes imported into 11.953 -domain 0 you can export these to other domains using the \path{phy:} 11.954 -disk syntax. 
E.g.: 11.955 -\begin{quote} 11.956 -\verb_disk = ['phy:vg/lvm1,sda2,w']_ 11.957 -\end{quote} 11.958 - 11.959 - 11.960 - 11.961 -\begin{center} 11.962 -\framebox{\bf Warning: Block device sharing} 11.963 -\end{center} 11.964 -\begin{quote} 11.965 -Block devices should typically only be shared between domains in a 11.966 -read-only fashion, otherwise the Linux kernel's file systems will get 11.967 -very confused as the file system structure may change underneath them 11.968 -(having the same ext3 partition mounted rw twice is a sure-fire way to 11.969 -cause irreparable damage)! \Xend will attempt to prevent you from 11.970 -doing this by checking that the device is not mounted read-write in 11.971 -domain 0, and hasn't already been exported read-write to another 11.972 -domain. 11.973 -If you want read-write sharing, export the directory to other domains 11.974 -via NFS from domain0 (or use a cluster file system such as GFS or 11.975 -ocfs2). 11.976 - 11.977 -\end{quote} 11.978 - 11.979 - 11.980 -\section{Using File-backed VBDs} 11.981 - 11.982 -It is also possible to use a file in Domain 0 as the primary storage 11.983 -for a virtual machine. As well as being convenient, this also has the 11.984 -advantage that the virtual block device will be {\em sparse} --- space 11.985 -will only really be allocated as parts of the file are used. So if a 11.986 -virtual machine uses only half of its disk space then the file really 11.987 -takes up half of the size allocated. 11.988 - 11.989 -For example, to create a 2GB sparse file-backed virtual block device 11.990 -(actually only consumes 1KB of disk): 11.991 -\begin{quote} 11.992 -\verb_# dd if=/dev/zero of=vm1disk bs=1k seek=2048k count=1_ 11.993 -\end{quote} 11.994 - 11.995 -Make a file system in the disk file: 11.996 -\begin{quote} 11.997 -\verb_# mkfs -t ext3 vm1disk_ 11.998 -\end{quote} 11.999 - 11.1000 -(when the tool asks for confirmation, answer `y') 11.1001 - 11.1002 -Populate the file system e.g. by copying from the current root: 11.1003 -\begin{quote} 11.1004 -\begin{verbatim} 11.1005 -# mount -o loop vm1disk /mnt 11.1006 -# cp -ax /{root,dev,var,etc,usr,bin,sbin,lib} /mnt 11.1007 -# mkdir /mnt/{proc,sys,home,tmp} 11.1008 -\end{verbatim} 11.1009 -\end{quote} 11.1010 - 11.1011 -Tailor the file system by editing \path{/etc/fstab}, 11.1012 -\path{/etc/hostname}, etc.\ (don't forget to edit the files in the 11.1013 -mounted file system, instead of your domain 0 filesystem, e.g. you 11.1014 -would edit \path{/mnt/etc/fstab} instead of \path{/etc/fstab}). For 11.1015 -this example, set the root device to \path{/dev/sda1} in fstab. 11.1016 - 11.1017 -Now unmount (this is important!): 11.1018 -\begin{quote} 11.1019 -\verb_# umount /mnt_ 11.1020 -\end{quote} 11.1021 - 11.1022 -In the configuration file set: 11.1023 -\begin{quote} 11.1024 -\verb_disk = ['file:/full/path/to/vm1disk,sda1,w']_ 11.1025 -\end{quote} 11.1026 +%% Chapter Domain Management Tools moved to domain_mgmt.tex 11.1027 +\include{src/user/domain_mgmt} 11.1028 11.1029 -As the virtual machine writes to its `disk', the sparse file will be 11.1030 -filled in and consume more space up to the original 2GB. 11.1031 - 11.1032 -{\bf Note that file-backed VBDs may not be appropriate for backing 11.1033 -I/O-intensive domains.} File-backed VBDs are known to experience 11.1034 -substantial slowdowns under heavy I/O workloads, due to the I/O handling 11.1035 -by the loopback block device used to support file-backed VBDs in dom0.
11.1036 -Better I/O performance can be achieved by using either LVM-backed VBDs 11.1037 -(Section~\ref{s:using-lvm-backed-vbds}) or physical devices as VBDs 11.1038 -(Section~\ref{s:exporting-physical-devices-as-vbds}). 11.1039 - 11.1040 -Linux supports a maximum of eight file-backed VBDs across all domains by 11.1041 -default. This limit can be statically increased by using the {\em 11.1042 -max\_loop} module parameter if CONFIG\_BLK\_DEV\_LOOP is compiled as a 11.1043 -module in the dom0 kernel, or by using the {\em max\_loop=n} boot option 11.1044 -if CONFIG\_BLK\_DEV\_LOOP is compiled directly into the dom0 kernel. 11.1045 - 11.1046 - 11.1047 -\section{Using LVM-backed VBDs} 11.1048 -\label{s:using-lvm-backed-vbds} 11.1049 - 11.1050 -A particularly appealing solution is to use LVM volumes 11.1051 -as backing for domain file-systems since this allows dynamic 11.1052 -growing/shrinking of volumes as well as snapshot and other 11.1053 -features. 11.1054 - 11.1055 -To initialise a partition to support LVM volumes: 11.1056 -\begin{quote} 11.1057 -\begin{verbatim} 11.1058 -# pvcreate /dev/sda10 11.1059 -\end{verbatim} 11.1060 -\end{quote} 11.1061 - 11.1062 -Create a volume group named `vg' on the physical partition: 11.1063 -\begin{quote} 11.1064 -\begin{verbatim} 11.1065 -# vgcreate vg /dev/sda10 11.1066 -\end{verbatim} 11.1067 -\end{quote} 11.1068 - 11.1069 -Create a logical volume of size 4GB named `myvmdisk1': 11.1070 -\begin{quote} 11.1071 -\begin{verbatim} 11.1072 -# lvcreate -L4096M -n myvmdisk1 vg 11.1073 -\end{verbatim} 11.1074 -\end{quote} 11.1075 - 11.1076 -You should now see that you have a \path{/dev/vg/myvmdisk1}. 11.1077 -Make a filesystem, mount it and populate it, e.g.: 11.1078 -\begin{quote} 11.1079 -\begin{verbatim} 11.1080 -# mkfs -t ext3 /dev/vg/myvmdisk1 11.1081 -# mount /dev/vg/myvmdisk1 /mnt 11.1082 -# cp -ax / /mnt 11.1083 -# umount /mnt 11.1084 -\end{verbatim} 11.1085 -\end{quote} 11.1086 - 11.1087 -Now configure your VM with the following disk configuration: 11.1088 -\begin{quote} 11.1089 -\begin{verbatim} 11.1090 - disk = [ 'phy:vg/myvmdisk1,sda1,w' ] 11.1091 -\end{verbatim} 11.1092 -\end{quote} 11.1093 - 11.1094 -LVM enables you to grow the size of logical volumes, but you'll need 11.1095 -to resize the corresponding file system to make use of the new 11.1096 -space. Some file systems (e.g. ext3) now support on-line resize. See 11.1097 -the LVM manuals for more details. 11.1098 +%% Chapter Domain Filesystem Storage moved to domain_filesystem.tex 11.1099 +\include{src/user/domain_filesystem} 11.1100 11.1101 -You can also use LVM for creating copy-on-write clones of LVM 11.1102 -volumes (known as writable persistent snapshots in LVM 11.1103 -terminology). This facility is new in Linux 2.6.8, so it isn't as 11.1104 -stable as one might hope. In particular, using lots of CoW LVM 11.1105 -disks consumes a lot of dom0 memory, and error conditions such as 11.1106 -running out of disk space are not handled well. Hopefully this 11.1107 -will improve in future. 11.1108 - 11.1109 -To create two copy-on-write clones of the above file system you 11.1110 -would use the following commands: 11.1111 - 11.1112 -\begin{quote} 11.1113 -\begin{verbatim} 11.1114 -# lvcreate -s -L1024M -n myclonedisk1 /dev/vg/myvmdisk1 11.1115 -# lvcreate -s -L1024M -n myclonedisk2 /dev/vg/myvmdisk1 11.1116 -\end{verbatim} 11.1117 -\end{quote} 11.1118 - 11.1119 -Each of these can grow to have 1GB of differences from the master 11.1120 -volume.
You can grow the amount of space for storing the 11.1121 -differences using the lvextend command, e.g.: 11.1122 -\begin{quote} 11.1123 -\begin{verbatim} 11.1124 -# lvextend -L+100M /dev/vg/myclonedisk1 11.1125 -\end{verbatim} 11.1126 -\end{quote} 11.1127 - 11.1128 -Don't let the `differences volume' ever fill up, otherwise LVM gets 11.1129 -rather confused. It may be possible to automate the growing 11.1130 -process by using \path{dmsetup wait} to spot the volume getting full 11.1131 -and then issuing an \path{lvextend}. 11.1132 - 11.1133 -In principle, it is possible to continue writing to the volume 11.1134 -that has been cloned (the changes will not be visible to the 11.1135 -clones), but we wouldn't recommend this: have the cloned volume 11.1136 -as a `pristine' file system install that isn't mounted directly 11.1137 -by any of the virtual machines. 11.1138 - 11.1139 - 11.1140 -\section{Using NFS Root} 11.1141 - 11.1142 -First, populate a root filesystem in a directory on the server 11.1143 -machine. This can be on a distinct physical machine, or simply 11.1144 -run within a virtual machine on the same node. 11.1145 - 11.1146 -Now configure the NFS server to export this filesystem over the 11.1147 -network by adding a line to \path{/etc/exports}, for instance: 11.1148 - 11.1149 -\begin{quote} 11.1150 -\begin{small} 11.1151 -\begin{verbatim} 11.1152 -/export/vm1root 1.2.3.4/24(rw,sync,no_root_squash) 11.1153 -\end{verbatim} 11.1154 -\end{small} 11.1155 -\end{quote} 11.1156 - 11.1157 -Finally, configure the domain to use NFS root. In addition to the 11.1158 -normal variables, you should make sure to set the following values in 11.1159 -the domain's configuration file: 11.1160 - 11.1161 -\begin{quote} 11.1162 -\begin{small} 11.1163 -\begin{verbatim} 11.1164 -root = '/dev/nfs' 11.1165 -nfs_server = '2.3.4.5' # substitute IP address of server 11.1166 -nfs_root = '/path/to/root' # path to root FS on the server 11.1167 -\end{verbatim} 11.1168 -\end{small} 11.1169 -\end{quote} 11.1170 - 11.1171 -The domain will need network access at boot time, so either statically 11.1172 -configure an IP address (using the config variables \path{ip}, 11.1173 -\path{netmask}, \path{gateway}, \path{hostname}) or enable DHCP 11.1174 -(\path{dhcp='dhcp'}). 11.1175 - 11.1176 -Note that the Linux NFS root implementation is known to have stability 11.1177 -problems under high load (this is not a Xen-specific problem), so this 11.1178 -configuration may not be appropriate for critical servers. 11.1179 11.1180 11.1181 \part{User Reference Documentation} 11.1182 11.1183 -\chapter{Control Software} 11.1184 - 11.1185 -The Xen control software includes the \xend node control daemon (which 11.1186 -must be running), the xm command line tools, and the prototype 11.1187 -xensv web interface. 11.1188 - 11.1189 -\section{\Xend (node control daemon)} 11.1190 -\label{s:xend} 11.1191 - 11.1192 -The Xen Daemon (\Xend) performs system management functions related to 11.1193 -virtual machines. It forms a central point of control for a machine 11.1194 -and can be controlled using an HTTP-based protocol. \Xend must be 11.1195 -running in order to start and manage virtual machines. 11.1196 - 11.1197 -\Xend must be run as root because it needs access to privileged system 11.1198 -management functions. A small set of commands may be issued on the 11.1199 -\xend command line: 11.1200 - 11.1201 -\begin{tabular}{ll} 11.1202 -\verb!# xend start! & start \xend, if not already running \\ 11.1203 -\verb!# xend stop!
& stop \xend if already running \\ 11.1204 -\verb!# xend restart! & restart \xend if running, otherwise start it \\ 11.1205 -% \verb!# xend trace_start! & start \xend, with very detailed debug logging \\ 11.1206 -\verb!# xend status! & indicates \xend status by its return code 11.1207 -\end{tabular} 11.1208 - 11.1209 -A SysV init script called {\tt xend} is provided to start \xend at boot 11.1210 -time. {\tt make install} installs this script in \path{/etc/init.d}. 11.1211 -To enable it, you have to make symbolic links in the appropriate 11.1212 -runlevel directories or use the {\tt chkconfig} tool, where available. 11.1213 - 11.1214 -Once \xend is running, more sophisticated administration can be done 11.1215 -using the xm tool (see Section~\ref{s:xm}) and the experimental 11.1216 -Xensv web interface (see Section~\ref{s:xensv}). 11.1217 - 11.1218 -As \xend runs, events will be logged to \path{/var/log/xend.log} and, 11.1219 -if the migration assistant daemon (\path{xfrd}) has been started, 11.1220 -\path{/var/log/xfrd.log}. These may be of use for troubleshooting 11.1221 -problems. 11.1222 - 11.1223 -\section{Xm (command line interface)} 11.1224 -\label{s:xm} 11.1225 - 11.1226 -The xm tool is the primary tool for managing Xen from the console. 11.1227 -The general format of an xm command line is: 11.1228 - 11.1229 -\begin{verbatim} 11.1230 -# xm command [switches] [arguments] [variables] 11.1231 -\end{verbatim} 11.1232 - 11.1233 -The available {\em switches} and {\em arguments} are dependent on the 11.1234 -{\em command} chosen. The {\em variables} may be set using 11.1235 -declarations of the form {\tt variable=value}, and command line 11.1236 -declarations override any of the values in the configuration file 11.1237 -being used, including the standard variables described above and any 11.1238 -custom variables (for instance, the \path{xmdefconfig} file uses a 11.1239 -{\tt vmid} variable). 11.1240 - 11.1241 -The available commands are as follows: 11.1242 - 11.1243 -\begin{description} 11.1244 -\item[set-mem] Request a domain to adjust its memory footprint. 11.1245 -\item[create] Create a new domain. 11.1246 -\item[destroy] Kill a domain immediately. 11.1247 -\item[list] List running domains. 11.1248 -\item[shutdown] Ask a domain to shutdown. 11.1249 -\item[dmesg] Fetch the Xen (not Linux!) boot output. 11.1250 -\item[consoles] Lists the available consoles. 11.1251 -\item[console] Connect to the console for a domain. 11.1252 -\item[help] Get help on xm commands. 11.1253 -\item[save] Suspend a domain to disk. 11.1254 -\item[restore] Restore a domain from disk. 11.1255 -\item[pause] Pause a domain's execution. 11.1256 -\item[unpause] Unpause a domain. 11.1257 -\item[pincpu] Pin a domain to a CPU. 11.1258 -\item[bvt] Set BVT scheduler parameters for a domain. 11.1259 -\item[bvt\_ctxallow] Set the BVT context switching allowance for the system. 11.1260 -\item[atropos] Set the atropos parameters for a domain. 11.1261 -\item[rrobin] Set the round robin time slice for the system. 11.1262 -\item[info] Get information about the Xen host. 11.1263 -\item[call] Call a \xend HTTP API function directly.
11.1264 -\end{description} 11.1265 - 11.1266 -For a detailed overview of switches, arguments and variables to each command 11.1267 -try 11.1268 -\begin{quote} 11.1269 -\begin{verbatim} 11.1270 -# xm help command 11.1271 -\end{verbatim} 11.1272 -\end{quote} 11.1273 - 11.1274 -\section{Xensv (web control interface)} 11.1275 -\label{s:xensv} 11.1276 - 11.1277 -Xensv is the experimental web control interface for managing a Xen 11.1278 -machine. It can be used to perform some (but not yet all) of the 11.1279 -management tasks that can be done using the xm tool. 11.1280 - 11.1281 -It can be started using: 11.1282 -\begin{quote} 11.1283 -\verb_# xensv start_ 11.1284 -\end{quote} 11.1285 -and stopped using: 11.1286 -\begin{quote} 11.1287 -\verb_# xensv stop_ 11.1288 -\end{quote} 11.1289 - 11.1290 -By default, Xensv will serve out the web interface on port 8080. This 11.1291 -can be changed by editing 11.1292 -\path{/usr/lib/python2.3/site-packages/xen/sv/params.py}. 11.1293 - 11.1294 -Once Xensv is running, the web interface can be used to create and 11.1295 -manage running domains. 11.1296 - 11.1297 - 11.1298 - 11.1299 - 11.1300 -\chapter{Domain Configuration} 11.1301 -\label{cha:config} 11.1302 - 11.1303 -The following contains the syntax of the domain configuration 11.1304 -files and description of how to further specify networking, 11.1305 -driver domain and general scheduling behaviour. 11.1306 - 11.1307 -\section{Configuration Files} 11.1308 -\label{s:cfiles} 11.1309 - 11.1310 -Xen configuration files contain the following standard variables. 11.1311 -Unless otherwise stated, configuration items should be enclosed in 11.1312 -quotes: see \path{/etc/xen/xmexample1} and \path{/etc/xen/xmexample2} 11.1313 -for concrete examples of the syntax. 11.1314 - 11.1315 -\begin{description} 11.1316 -\item[kernel] Path to the kernel image 11.1317 -\item[ramdisk] Path to a ramdisk image (optional). 11.1318 -% \item[builder] The name of the domain build function (e.g. {\tt'linux'} or {\tt'netbsd'}. 11.1319 -\item[memory] Memory size in megabytes. 11.1320 -\item[cpu] CPU to run this domain on, or {\tt -1} for 11.1321 - auto-allocation. 11.1322 -\item[console] Port to export the domain console on (default 9600 + domain ID). 11.1323 -\item[nics] Number of virtual network interfaces. 11.1324 -\item[vif] List of MAC addresses (random addresses are assigned if not 11.1325 - given) and bridges to use for the domain's network interfaces, e.g. 11.1326 -\begin{verbatim} 11.1327 -vif = [ 'mac=aa:00:00:00:00:11, bridge=xen-br0', 11.1328 - 'bridge=xen-br1' ] 11.1329 -\end{verbatim} 11.1330 - to assign a MAC address and bridge to the first interface and assign 11.1331 - a different bridge to the second interface, leaving \xend to choose 11.1332 - the MAC address. 11.1333 -\item[disk] List of block devices to export to the domain, e.g. \\ 11.1334 - \verb_disk = [ 'phy:hda1,sda1,r' ]_ \\ 11.1335 - exports physical device \path{/dev/hda1} to the domain 11.1336 - as \path{/dev/sda1} with read-only access. Exporting a disk read-write 11.1337 - which is currently mounted is dangerous -- if you are \emph{certain} 11.1338 - you wish to do this, you can specify \path{w!} as the mode. 11.1339 -\item[dhcp] Set to {\tt 'dhcp'} if you want to use DHCP to configure 11.1340 - networking. 11.1341 -\item[netmask] Manually configured IP netmask. 11.1342 -\item[gateway] Manually configured IP gateway. 11.1343 -\item[hostname] Set the hostname for the virtual machine. 
11.1344 -\item[root] Specify the root device parameter on the kernel command 11.1345 - line. 11.1346 -\item[nfs\_server] IP address for the NFS server (if any). 11.1347 -\item[nfs\_root] Path of the root filesystem on the NFS server (if any). 11.1348 -\item[extra] Extra string to append to the kernel command line (if 11.1349 - any) 11.1350 -\item[restart] Three possible options: 11.1351 - \begin{description} 11.1352 - \item[always] Always restart the domain, no matter what 11.1353 - its exit code is. 11.1354 - \item[never] Never restart the domain. 11.1355 - \item[onreboot] Restart the domain iff it requests reboot. 11.1356 - \end{description} 11.1357 -\end{description} 11.1358 - 11.1359 -For additional flexibility, it is also possible to include Python 11.1360 -scripting commands in configuration files. An example of this is the 11.1361 -\path{xmexample2} file, which uses Python code to handle the 11.1362 -\path{vmid} variable. 11.1363 - 11.1364 - 11.1365 -%\part{Advanced Topics} 11.1366 - 11.1367 -\section{Network Configuration} 11.1368 - 11.1369 -For many users, the default installation should work `out of the box'. 11.1370 -More complicated network setups, for instance with multiple ethernet 11.1371 -interfaces and/or existing bridging setups will require some 11.1372 -special configuration. 11.1373 - 11.1374 -The purpose of this section is to describe the mechanisms provided by 11.1375 -\xend to allow a flexible configuration for Xen's virtual networking. 11.1376 - 11.1377 -\subsection{Xen virtual network topology} 11.1378 - 11.1379 -Each domain network interface is connected to a virtual network 11.1380 -interface in dom0 by a point to point link (effectively a `virtual 11.1381 -crossover cable'). These devices are named {\tt 11.1382 -vif$<$domid$>$.$<$vifid$>$} (e.g. {\tt vif1.0} for the first interface 11.1383 -in domain 1, {\tt vif3.1} for the second interface in domain 3). 11.1384 - 11.1385 -Traffic on these virtual interfaces is handled in domain 0 using 11.1386 -standard Linux mechanisms for bridging, routing, rate limiting, etc. 11.1387 -Xend calls on two shell scripts to perform initial configuration of 11.1388 -the network and configuration of new virtual interfaces. By default, 11.1389 -these scripts configure a single bridge for all the virtual 11.1390 -interfaces. Arbitrary routing / bridging configurations can be 11.1391 -configured by customising the scripts, as described in the following 11.1392 -section. 11.1393 - 11.1394 -\subsection{Xen networking scripts} 11.1395 - 11.1396 -Xen's virtual networking is configured by two shell scripts (by 11.1397 -default \path{network} and \path{vif-bridge}). These are 11.1398 -called automatically by \xend when certain events occur, with 11.1399 -arguments to the scripts providing further contextual information. 11.1400 -These scripts are found by default in \path{/etc/xen/scripts}. The 11.1401 -names and locations of the scripts can be configured in 11.1402 -\path{/etc/xen/xend-config.sxp}. 11.1403 - 11.1404 -\begin{description} 11.1405 - 11.1406 -\item[network:] This script is called whenever \xend is started or 11.1407 -stopped to respectively initialise or tear down the Xen virtual 11.1408 -network. In the default configuration initialisation creates the 11.1409 -bridge `xen-br0' and moves eth0 onto that bridge, modifying the 11.1410 -routing accordingly. When \xend exits, it deletes the Xen bridge and 11.1411 -removes eth0, restoring the normal IP and routing configuration. 
11.1412 - 11.1413 -%% In configurations where the bridge already exists, this script could 11.1414 -%% be replaced with a link to \path{/bin/true} (for instance). 11.1415 - 11.1416 -\item[vif-bridge:] This script is called for every domain virtual 11.1417 -interface and can configure firewalling rules and add the vif 11.1418 -to the appropriate bridge. By default, this adds and removes 11.1419 -VIFs on the default Xen bridge. 11.1420 - 11.1421 -\end{description} 11.1422 - 11.1423 -For more complex network setups (e.g. where routing is required or 11.1424 -integration with existing bridges is needed) these scripts may be replaced with 11.1425 -customised variants for your site's preferred configuration. 11.1426 - 11.1427 -%% There are two possible types of privileges: IO privileges and 11.1428 -%% administration privileges. 11.1429 - 11.1430 -\section{Driver Domain Configuration} 11.1431 - 11.1432 -I/O privileges can be assigned to allow a domain to directly access 11.1433 -PCI devices itself. This is used to support driver domains. 11.1434 - 11.1435 -Setting backend privileges is currently only supported in SXP format 11.1436 -config files. To allow a domain to function as a backend for others, 11.1437 -somewhere within the {\tt vm} element of its configuration file must 11.1438 -be a {\tt backend} element of the form {\tt (backend ({\em type}))} 11.1439 -where {\tt \em type} may be either {\tt netif} or {\tt blkif}, 11.1440 -according to the type of virtual device this domain will service. 11.1441 -%% After this domain has been built, \xend will connect all new and 11.1442 -%% existing {\em virtual} devices (of the appropriate type) to that 11.1443 -%% backend. 11.1444 - 11.1445 -Note that a block backend cannot currently import virtual block 11.1446 -devices from other domains, and a network backend cannot import 11.1447 -virtual network devices from other domains. Thus (particularly in the 11.1448 -case of block backends, which cannot import a virtual block device as 11.1449 -their root filesystem), you may need to boot a backend domain from a 11.1450 -ramdisk or a network device. 11.1451 - 11.1452 -Access to PCI devices may be configured on a per-device basis. Xen 11.1453 -will assign the minimal set of hardware privileges to a domain that 11.1454 -are required to control its devices. This can be configured in either 11.1455 -format of configuration file: 11.1456 - 11.1457 -\begin{itemize} 11.1458 -\item SXP Format: Include device elements of the form: \\ 11.1459 -\centerline{ {\tt (device (pci (bus {\em x}) (dev {\em y}) (func {\em z})))}} \\ 11.1460 - inside the top-level {\tt vm} element. Each one specifies the address 11.1461 - of a device this domain is allowed to access --- 11.1462 - the numbers {\em x}, {\em y} and {\em z} may be in either decimal or 11.1463 - hexadecimal format. 11.1464 -\item Flat Format: Include a list of PCI device addresses of the 11.1465 - format: \\ 11.1466 -\centerline{{\tt pci = ['x,y,z', ...]}} \\ 11.1467 -where each element in the 11.1468 - list is a string specifying the components of the PCI device 11.1469 - address, separated by commas. The components ({\tt \em x}, {\tt \em 11.1470 - y} and {\tt \em z}) of the list may be formatted as either decimal 11.1471 - or hexadecimal. 11.1472 -\end{itemize} 11.1473 - 11.1474 -%% \section{Administration Domains} 11.1475 - 11.1476 -%% Administration privileges allow a domain to use the `dom0 11.1477 -%% operations' (so called because they are usually available only to 11.1478 -%% domain 0).
A privileged domain can build other domains, set scheduling 11.1479 -%% parameters, etc. 11.1480 - 11.1481 -% Support for other administrative domains is not yet available... perhaps 11.1482 -% we should plumb it in some time 11.1483 - 11.1484 - 11.1485 - 11.1486 - 11.1487 - 11.1488 -\section{Scheduler Configuration} 11.1489 -\label{s:sched} 11.1490 - 11.1491 - 11.1492 -Xen offers a boot-time choice between multiple schedulers. To select 11.1493 -a scheduler, pass the boot parameter {\em sched=sched\_name} to Xen, 11.1494 -substituting the appropriate scheduler name. Details of the schedulers 11.1495 -and their parameters are included below; future versions of the tools 11.1496 -will provide a higher-level interface to these parameters. 11.1497 +%% Chapter Control Software moved to control_software.tex 11.1498 +\include{src/user/control_software} 11.1499 11.1500 -It is expected that system administrators will configure their system to 11.1501 -use the scheduler most appropriate to their needs. Currently, the BVT 11.1502 -scheduler is the recommended choice. 11.1503 - 11.1504 -\subsection{Borrowed Virtual Time} 11.1505 - 11.1506 -{\tt sched=bvt} (the default) \\ 11.1507 - 11.1508 -BVT provides proportional fair shares of the CPU time. It has been 11.1509 -observed to penalise domains that block frequently (e.g. I/O-intensive 11.1510 -domains), but this can be compensated for by using warping. 11.1511 - 11.1512 -\subsubsection{Global Parameters} 11.1513 - 11.1514 -\begin{description} 11.1515 -\item[ctx\_allow] 11.1516 - The context switch allowance is similar to the `quantum' 11.1517 - in traditional schedulers. It is the minimum time that 11.1518 - a scheduled domain will be allowed to run before being 11.1519 - pre-empted. 11.1520 -\end{description} 11.1521 - 11.1522 -\subsubsection{Per-domain parameters} 11.1523 - 11.1524 -\begin{description} 11.1525 -\item[mcuadv] 11.1526 - The MCU (Minimum Charging Unit) advance determines the 11.1527 - proportional share of the CPU that a domain receives. It 11.1528 - is set inversely proportionally to a domain's sharing weight. 11.1529 -\item[warp] 11.1530 - The amount of `virtual time' the domain is allowed to warp 11.1531 - backwards. 11.1532 -\item[warpl] 11.1533 - The warp limit is the maximum time a domain can run warped for. 11.1534 -\item[warpu] 11.1535 - The unwarp requirement is the minimum time a domain must 11.1536 - run unwarped for before it can warp again. 11.1537 -\end{description} 11.1538 - 11.1539 -\subsection{Atropos} 11.1540 - 11.1541 -{\tt sched=atropos} \\ 11.1542 - 11.1543 -Atropos is a soft real-time scheduler. It provides guarantees about 11.1544 -absolute shares of the CPU, with a facility for sharing 11.1545 -slack CPU time on a best-effort basis. It can provide timeliness 11.1546 -guarantees for latency-sensitive domains. 11.1547 - 11.1548 -Every domain has an associated period and slice. The domain should 11.1549 -receive `slice' nanoseconds every `period' nanoseconds. This allows 11.1550 -the administrator to configure both the absolute share of the CPU a 11.1551 -domain receives and the frequency with which it is scheduled. 11.1552 - 11.1553 -%% When 11.1554 -%% domains unblock, their period is reduced to the value of the latency 11.1555 -%% hint (the slice is scaled accordingly so that they still get the same 11.1556 -%% proportion of the CPU). For each subsequent period, the slice and 11.1557 -%% period times are doubled until they reach their original values.
11.1558 - 11.1559 -Note: don't overcommit the CPU when using Atropos (i.e. don't reserve 11.1560 -more CPU than is available --- the utilisation should be kept to 11.1561 -slightly less than 100\% in order to ensure predictable behaviour). 11.1562 - 11.1563 -\subsubsection{Per-domain parameters} 11.1564 - 11.1565 -\begin{description} 11.1566 -\item[period] The regular time interval during which a domain is 11.1567 - guaranteed to receive its allocation of CPU time. 11.1568 -\item[slice] 11.1569 - The length of time per period that a domain is guaranteed to run 11.1570 - for (in the absence of voluntary yielding of the CPU). 11.1571 -\item[latency] 11.1572 - The latency hint is used to control how soon after 11.1573 - waking up a domain it should be scheduled. 11.1574 -\item[xtratime] This is a boolean flag that specifies whether a domain 11.1575 - should be allowed a share of the system slack time. 11.1576 -\end{description} 11.1577 - 11.1578 -\subsection{Round Robin} 11.1579 - 11.1580 -{\tt sched=rrobin} \\ 11.1581 - 11.1582 -The round robin scheduler is included as a simple demonstration of 11.1583 -Xen's internal scheduler API. It is not intended for production use. 11.1584 - 11.1585 -\subsubsection{Global Parameters} 11.1586 - 11.1587 -\begin{description} 11.1588 -\item[rr\_slice] 11.1589 - The maximum time each domain runs before the next 11.1590 - scheduling decision is made. 11.1591 -\end{description} 11.1592 - 11.1593 - 11.1594 - 11.1595 - 11.1596 - 11.1597 - 11.1598 - 11.1599 - 11.1600 - 11.1601 - 11.1602 - 11.1603 - 11.1604 -\chapter{Build, Boot and Debug options} 11.1605 - 11.1606 -This chapter describes the build- and boot-time options 11.1607 -which may be used to tailor your Xen system. 11.1608 - 11.1609 -\section{Xen Build Options} 11.1610 - 11.1611 -Xen provides a number of build-time options which should be 11.1612 -set as environment variables or passed on make's command-line. 11.1613 - 11.1614 -\begin{description} 11.1615 -\item[verbose=y] Enable debugging messages when Xen detects an unexpected condition. 11.1616 -Also enables console output from all domains. 11.1617 -\item[debug=y] 11.1618 -Enable debug assertions. Implies {\bf verbose=y}. 11.1619 -(Primarily useful for tracing bugs in Xen). 11.1620 -\item[debugger=y] 11.1621 -Enable the in-Xen debugger. This can be used to debug 11.1622 -Xen, guest OSes, and applications. 11.1623 -\item[perfc=y] 11.1624 -Enable performance counters for significant events 11.1625 -within Xen. The counts can be reset or displayed 11.1626 -on Xen's console via console control keys. 11.1627 -\item[trace=y] 11.1628 -Enable per-cpu trace buffers which log a range of 11.1629 -events within Xen for collection by control 11.1630 -software. 11.1631 -\end{description} 11.1632 - 11.1633 -\section{Xen Boot Options} 11.1634 -\label{s:xboot} 11.1635 - 11.1636 -These options are used to configure Xen's behaviour at runtime. They 11.1637 -should be appended to Xen's command line, either manually or by 11.1638 -editing \path{grub.conf}. 11.1639 - 11.1640 -\begin{description} 11.1641 -\item [noreboot ] 11.1642 - Don't reboot the machine automatically on errors. This is 11.1643 - useful to catch debug output if you aren't catching console messages 11.1644 - via the serial line. 11.1645 - 11.1646 -\item [nosmp ] 11.1647 - Disable SMP support. 11.1648 - This option is implied by `ignorebiostables'. 11.1649 - 11.1650 -\item [watchdog ] 11.1651 - Enable NMI watchdog which can report certain failures. 
11.1652 - 11.1653 -\item [noirqbalance ] 11.1654 - Disable software IRQ balancing and affinity. This can be used on 11.1655 - systems such as Dell 1850/2850 that have workarounds in hardware for 11.1656 - IRQ-routing issues. 11.1657 +%% Chapter Domain Configuration moved to domain_configuration.tex 11.1658 +\include{src/user/domain_configuration} 11.1659 11.1660 -\item [badpage=$<$page number$>$,$<$page number$>$, \ldots ] 11.1661 - Specify a list of pages not to be allocated for use 11.1662 - because they contain bad bytes. For example, if your 11.1663 - memory tester says that byte 0x12345678 is bad, you would 11.1664 - place `badpage=0x12345' on Xen's command line. 11.1665 - 11.1666 -\item [com1=$<$baud$>$,DPS,$<$io\_base$>$,$<$irq$>$ 11.1667 - com2=$<$baud$>$,DPS,$<$io\_base$>$,$<$irq$>$ ] \mbox{}\\ 11.1668 - Xen supports up to two 16550-compatible serial ports. 11.1669 - For example: `com1=9600, 8n1, 0x408, 5' maps COM1 to a 11.1670 - 9600-baud port, 8 data bits, no parity, 1 stop bit, 11.1671 - I/O port base 0x408, IRQ 5. 11.1672 - If some configuration options are standard (e.g., I/O base and IRQ), 11.1673 - then only a prefix of the full configuration string need be 11.1674 - specified. If the baud rate is pre-configured (e.g., by the 11.1675 - bootloader) then you can specify `auto' in place of a numeric baud 11.1676 - rate. 11.1677 - 11.1678 -\item [console=$<$specifier list$>$ ] 11.1679 - Specify the destination for Xen console I/O. 11.1680 - This is a comma-separated list of, for example: 11.1681 -\begin{description} 11.1682 - \item[vga] use VGA console and allow keyboard input 11.1683 - \item[com1] use serial port com1 11.1684 - \item[com2H] use serial port com2. Transmitted chars will 11.1685 - have the MSB set. Received chars must have 11.1686 - MSB set. 11.1687 - \item[com2L] use serial port com2. Transmitted chars will 11.1688 - have the MSB cleared. Received chars must 11.1689 - have MSB cleared. 11.1690 -\end{description} 11.1691 - The latter two examples allow a single port to be 11.1692 - shared by two subsystems (e.g. console and 11.1693 - debugger). Sharing is controlled by MSB of each 11.1694 - transmitted/received character. 11.1695 - [NB. Default for this option is `com1,vga'] 11.1696 - 11.1697 -\item [sync\_console ] 11.1698 - Force synchronous console output. This is useful if your system fails 11.1699 - unexpectedly before it has sent all available output to the 11.1700 - console. In most cases Xen will automatically enter synchronous mode 11.1701 - when an exceptional event occurs, but this option provides a manual 11.1702 - fallback. 11.1703 - 11.1704 -\item [conswitch=$<$switch-char$><$auto-switch-char$>$ ] 11.1705 - Specify how to switch serial-console input between 11.1706 - Xen and DOM0. The required sequence is CTRL-$<$switch-char$>$ 11.1707 - pressed three times. Specifying the backtick character 11.1708 - disables switching. 11.1709 - The $<$auto-switch-char$>$ specifies whether Xen should 11.1710 - auto-switch input to DOM0 when it boots --- if it is `x' 11.1711 - then auto-switching is disabled. Any other value, or 11.1712 - omitting the character, enables auto-switching. 11.1713 - [NB. default switch-char is `a'] 11.1714 - 11.1715 -\item [nmi=xxx ] 11.1716 - Specify what to do with an NMI parity or I/O error. \\ 11.1717 - `nmi=fatal': Xen prints a diagnostic and then hangs. \\ 11.1718 - `nmi=dom0': Inform DOM0 of the NMI. \\ 11.1719 - `nmi=ignore': Ignore the NMI. 11.1720 - 11.1721 -\item [mem=xxx ] 11.1722 - Set the physical RAM address limit.
Any RAM appearing beyond this 11.1723 - physical address in the memory map will be ignored. This parameter 11.1724 - may be specified with a B, K, M or G suffix, representing bytes, 11.1725 - kilobytes, megabytes and gigabytes respectively. The 11.1726 - default unit, if no suffix is specified, is kilobytes. 11.1727 - 11.1728 -\item [dom0\_mem=xxx ] 11.1729 - Set the amount of memory to be allocated to domain0. In Xen 3.x the parameter 11.1730 - may be specified with a B, K, M or G suffix, representing bytes, 11.1731 - kilobytes, megabytes and gigabytes respectively; if no suffix is specified, 11.1732 - the parameter defaults to kilobytes. In previous versions of Xen, suffixes 11.1733 - were not supported and the value is always interpreted as kilobytes. 11.1734 - 11.1735 -\item [tbuf\_size=xxx ] 11.1736 - Set the size of the per-cpu trace buffers, in pages 11.1737 - (default 1). Note that the trace buffers are only 11.1738 - enabled in debug builds. Most users can ignore 11.1739 - this feature completely. 11.1740 - 11.1741 -\item [sched=xxx ] 11.1742 - Select the CPU scheduler Xen should use. The current 11.1743 - possibilities are `bvt' (default), `atropos' and `rrobin'. 11.1744 - For more information see Section~\ref{s:sched}. 11.1745 - 11.1746 -\item [apic\_verbosity=debug,verbose ] 11.1747 - Print more detailed information about local APIC and IOAPIC configuration. 11.1748 - 11.1749 -\item [lapic ] 11.1750 - Force use of local APIC even when left disabled by uniprocessor BIOS. 11.1751 - 11.1752 -\item [nolapic ] 11.1753 - Ignore local APIC in a uniprocessor system, even if enabled by the BIOS. 11.1754 - 11.1755 -\item [apic=bigsmp,default,es7000,summit ] 11.1756 - Specify NUMA platform. This can usually be probed automatically. 11.1757 - 11.1758 -\end{description} 11.1759 - 11.1760 -In addition, the following options may be specified on the Xen command 11.1761 -line. Since domain 0 shares responsibility for booting the platform, 11.1762 -Xen will automatically propagate these options to its command 11.1763 -line. These options are taken from Linux's command-line syntax with 11.1764 -unchanged semantics. 11.1765 - 11.1766 -\begin{description} 11.1767 -\item [acpi=off,force,strict,ht,noirq,\ldots ] 11.1768 - Modify how Xen (and domain 0) parses the BIOS ACPI tables. 11.1769 - 11.1770 -\item [acpi\_skip\_timer\_override ] 11.1771 - Instruct Xen (and domain 0) to ignore timer-interrupt override 11.1772 - instructions specified by the BIOS ACPI tables. 11.1773 - 11.1774 -\item [noapic ] 11.1775 - Instruct Xen (and domain 0) to ignore any IOAPICs that are present in 11.1776 - the system, and instead continue to use the legacy PIC. 11.1777 - 11.1778 -\end{description} 11.1779 - 11.1780 -\section{XenLinux Boot Options} 11.1781 - 11.1782 -In addition to the standard Linux kernel boot options, we support: 11.1783 -\begin{description} 11.1784 -\item[xencons=xxx ] Specify the device node to which the Xen virtual 11.1785 -console driver is attached. The following options are supported: 11.1786 -\begin{center} 11.1787 -\begin{tabular}{l} 11.1788 -`xencons=off': disable virtual console \\ 11.1789 -`xencons=tty': attach console to /dev/tty1 (tty0 at boot-time) \\ 11.1790 -`xencons=ttyS': attach console to /dev/ttyS0 11.1791 -\end{tabular} 11.1792 -\end{center} 11.1793 -The default is ttyS for dom0 and tty for all other domains. 
11.1794 -\end{description} 11.1795 - 11.1796 - 11.1797 - 11.1798 -\section{Debugging} 11.1799 -\label{s:keys} 11.1800 - 11.1801 -Xen has a set of debugging features that can be useful to try and 11.1802 -figure out what's going on. Hit `h' on the serial line (if you 11.1803 -specified a baud rate on the Xen command line) or ScrollLock-h on the 11.1804 -keyboard to get a list of supported commands. 11.1805 - 11.1806 -If you have a crash you'll likely get a crash dump containing an EIP 11.1807 -(PC) which, along with an \path{objdump -d image}, can be useful in 11.1808 -figuring out what's happened. Debug a Xenlinux image just as you 11.1809 -would any other Linux kernel. 11.1810 - 11.1811 -%% We supply a handy debug terminal program which you can find in 11.1812 -%% \path{/usr/local/src/xen-2.0.bk/tools/misc/miniterm/} 11.1813 -%% This should be built and executed on another machine that is connected 11.1814 -%% via a null modem cable. Documentation is included. 11.1815 -%% Alternatively, if the Xen machine is connected to a serial-port server 11.1816 -%% then we supply a dumb TCP terminal client, {\tt xencons}. 11.1817 - 11.1818 - 11.1819 +%% Chapter Build, Boot and Debug Options moved to build.tex 11.1820 +\include{src/user/build} 11.1821 11.1822 11.1823 \chapter{Further Support} 11.1824 @@ -1875,6 +108,7 @@ directory of the Xen source distribution 11.1825 %Various HOWTOs are available in \path{docs/HOWTOS} but this content is 11.1826 %being integrated into this manual. 11.1827 11.1828 + 11.1829 \section{Online References} 11.1830 11.1831 The official Xen web site is found at: 11.1832 @@ -1885,6 +119,7 @@ The official Xen web site is found at: 11.1833 This contains links to the latest versions of all on-line 11.1834 documentation (including the latest version of the FAQ). 11.1835 11.1836 + 11.1837 \section{Mailing Lists} 11.1838 11.1839 There are currently four official Xen mailing lists: 11.1840 @@ -1905,326 +140,18 @@ from the unstable and 2.0 trees - develo 11.1841 \end{description} 11.1842 11.1843 11.1844 + 11.1845 \appendix 11.1846 11.1847 +%% Chapter Installing Xen / XenLinux on Debian moved to debian.tex 11.1848 +\include{src/user/debian} 11.1849 + 11.1850 +%% Chapter Installing Xen on Red Hat moved to redhat.tex 11.1851 +\include{src/user/redhat} 11.1852 + 11.1853 11.1854 -\chapter{Installing Xen / XenLinux on Debian} 11.1855 - 11.1856 -The Debian project provides a tool called \path{debootstrap} which 11.1857 -allows a base Debian system to be installed into a filesystem without 11.1858 -requiring the host system to have any Debian-specific software (such 11.1859 -as \path{apt}). 11.1860 - 11.1861 -Here's how to install Debian 3.1 (Sarge) for an unprivileged 11.1862 -Xen domain: 11.1863 - 11.1864 -\begin{enumerate} 11.1865 -\item Set up Xen 2.0 and test that it's working, as described earlier in 11.1866 - this manual. 11.1867 - 11.1868 -\item Create disk images for root-fs and swap (alternatively, you 11.1869 - might create dedicated partitions, LVM logical volumes, etc. if 11.1870 - that suits your setup). 11.1871 -\begin{small}\begin{verbatim} 11.1872 -dd if=/dev/zero of=/path/diskimage bs=1024k count=size_in_mbytes 11.1873 -dd if=/dev/zero of=/path/swapimage bs=1024k count=size_in_mbytes 11.1874 -\end{verbatim}\end{small} 11.1875 - If you're going to use this filesystem / disk image only as a 11.1876 - `template' for other vm disk images, something like 300 MB should 11.1877 -be enough
(of course, it depends on what kind of packages you are 11.1878 - planning to install in the template). 11.1879 - 11.1880 -\item Create the filesystem and initialise the swap image 11.1881 -\begin{small}\begin{verbatim} 11.1882 -mkfs.ext3 /path/diskimage 11.1883 -mkswap /path/swapimage 11.1884 -\end{verbatim}\end{small} 11.1885 - 11.1886 -\item Mount the disk image for installation 11.1887 -\begin{small}\begin{verbatim} 11.1888 -mount -o loop /path/diskimage /mnt/disk 11.1889 -\end{verbatim}\end{small} 11.1890 - 11.1891 -\item Install \path{debootstrap} 11.1892 - 11.1893 -Make sure you have debootstrap installed on the host. If you are 11.1894 -running Debian sarge (3.1 / testing) or unstable you can install it by 11.1895 -running \path{apt-get install debootstrap}. Otherwise, it can be 11.1896 -downloaded from the Debian project website. 11.1897 - 11.1898 -\item Install Debian base to the disk image: 11.1899 -\begin{small}\begin{verbatim} 11.1900 -debootstrap --arch i386 sarge /mnt/disk \ 11.1901 - http://ftp.<countrycode>.debian.org/debian 11.1902 -\end{verbatim}\end{small} 11.1903 - 11.1904 -You can use any other Debian http/ftp mirror you want. 11.1905 - 11.1906 -\item When debootstrap completes successfully, modify settings: 11.1907 -\begin{small}\begin{verbatim} 11.1908 -chroot /mnt/disk /bin/bash 11.1909 -\end{verbatim}\end{small} 11.1910 - 11.1911 -Edit the following files using vi or nano and make needed changes: 11.1912 -\begin{small}\begin{verbatim} 11.1913 -/etc/hostname 11.1914 -/etc/hosts 11.1915 -/etc/resolv.conf 11.1916 -/etc/network/interfaces 11.1917 -/etc/networks 11.1918 -\end{verbatim}\end{small} 11.1919 - 11.1920 -Set up access to the services, edit: 11.1921 -\begin{small}\begin{verbatim} 11.1922 -/etc/hosts.deny 11.1923 -/etc/hosts.allow 11.1924 -/etc/inetd.conf 11.1925 -\end{verbatim}\end{small} 11.1926 - 11.1927 -Add Debian mirror to: 11.1928 -\begin{small}\begin{verbatim} 11.1929 -/etc/apt/sources.list 11.1930 -\end{verbatim}\end{small} 11.1931 - 11.1932 -Create fstab like this: 11.1933 -\begin{small}\begin{verbatim} 11.1934 -/dev/sda1 / ext3 errors=remount-ro 0 1 11.1935 -/dev/sda2 none swap sw 0 0 11.1936 -proc /proc proc defaults 0 0 11.1937 -\end{verbatim}\end{small} 11.1938 - 11.1939 -Log out 11.1940 - 11.1941 -\item Unmount the disk image 11.1942 -\begin{small}\begin{verbatim} 11.1943 -umount /mnt/disk 11.1944 -\end{verbatim}\end{small} 11.1945 - 11.1946 -\item Create a Xen 2.0 configuration file for the new domain. You can 11.1947 - use the example configurations that come with Xen as a template. 11.1948 - 11.1949 - Make sure you have the following set up: 11.1950 -\begin{small}\begin{verbatim} 11.1951 -disk = [ 'file:/path/diskimage,sda1,w', 'file:/path/swapimage,sda2,w' ] 11.1952 -root = "/dev/sda1 ro" 11.1953 -\end{verbatim}\end{small} 11.1954 - 11.1955 -\item Start the new domain 11.1956 -\begin{small}\begin{verbatim} 11.1957 -xm create -f domain_config_file 11.1958 -\end{verbatim}\end{small} 11.1959 - 11.1960 -Check that the new domain is running: 11.1961 -\begin{small}\begin{verbatim} 11.1962 -xm list 11.1963 -\end{verbatim}\end{small} 11.1964 - 11.1965 -\item Attach to the console of the new domain. 11.1966 - You should see something like this when starting the new domain: 11.1967 - 11.1968 -\begin{small}\begin{verbatim} 11.1969 -Started domain testdomain2, console on port 9626 11.1970 -\end{verbatim}\end{small} 11.1971 - 11.1972 - There you can see the ID of the console: 26.
You can also list 11.1973 - the consoles with \path{xm consoles} (the ID is the last two 11.1974 - digits of the port number). 11.1975 - 11.1976 - Attach to the console: 11.1977 - 11.1978 -\begin{small}\begin{verbatim} 11.1979 -xm console 26 11.1980 -\end{verbatim}\end{small} 11.1981 - 11.1982 - or by telnetting to port 9626 on localhost (the xm console 11.1983 - program works better). 11.1984 - 11.1985 -\item Log in and run base-config 11.1986 - 11.1987 - By default there is no root password. 11.1988 - 11.1989 - Check that everything looks OK, and the system started without 11.1990 - errors. Check that the swap is active, and the network settings are 11.1991 - correct. 11.1992 - 11.1993 - Run \path{/usr/sbin/base-config} to set up the Debian settings. 11.1994 - 11.1995 - Set up the password for root using passwd. 11.1996 - 11.1997 -\item Done. You can exit the console by pressing \path{Ctrl + ]}. 11.1998 - 11.1999 -\end{enumerate} 11.2000 - 11.2001 -If you need to create new domains, you can just copy the contents of 11.2002 -the `template' image to the new disk images, either by mounting the 11.2003 -template and the new image, and using \path{cp -a} or \path{tar}, or by 11.2004 -simply copying the image file. Once this is done, modify the 11.2005 -image-specific settings (hostname, network settings, etc.). 11.2006 - 11.2007 -\chapter{Installing Xen / XenLinux on Red Hat or Fedora Core} 11.2008 - 11.2009 -When using Xen / XenLinux on a standard Linux distribution there are 11.2010 -a couple of things to watch out for. 11.2011 - 11.2012 -Note that, because domains>0 don't have any privileged access at all, 11.2013 -certain commands in the default boot sequence will fail, e.g. attempts 11.2014 -to update the hwclock, change the console font, update the keytable 11.2015 -map, start apmd (power management), or gpm (mouse cursor). Either 11.2016 -ignore the errors (they should be harmless), or remove them from the 11.2017 -startup scripts. Deleting the following links is a good start: 11.2018 -{\path{S24pcmcia}}, {\path{S09isdn}}, 11.2019 -{\path{S17keytable}}, {\path{S26apmd}}, 11.2020 -{\path{S85gpm}}. 11.2021 - 11.2022 -If you want to use a single root file system that works cleanly for 11.2023 -both domain 0 and unprivileged domains, a useful trick is to use 11.2024 -different `init' run levels. For example, use 11.2025 -run level 3 for domain 0, and run level 4 for other domains. This 11.2026 -enables different startup scripts to be run depending on the run 11.2027 -level number passed on the kernel command line. 11.2028 - 11.2029 -If using NFS root file systems mounted either from an 11.2030 -external server or from domain0, there are a couple of other gotchas. 11.2031 -The default {\path{/etc/sysconfig/iptables}} rules block NFS, so part 11.2032 -way through the boot sequence things will suddenly go dead. 11.2033 - 11.2034 -If you're planning on having a separate NFS {\path{/usr}} partition, the 11.2035 -RH9 boot scripts don't make life easy --- they attempt to mount NFS file 11.2036 -systems way too late in the boot process.
The easiest way I found to do 11.2037 -this was to have a {\path{/linuxrc}} script run ahead of 11.2038 -{\path{/sbin/init}} that mounts {\path{/usr}}: 11.2039 - 11.2040 -\begin{quote} 11.2041 -\begin{small}\begin{verbatim} 11.2042 - #!/bin/bash 11.2043 - /sbin/ifconfig lo 127.0.0.1 11.2044 - /sbin/portmap 11.2045 - /bin/mount /usr 11.2046 - exec /sbin/init "$@" <>/dev/console 2>&1 11.2047 -\end{verbatim}\end{small} 11.2048 -\end{quote} 11.2049 - 11.2050 -%$ XXX SMH: font lock fix :-) 11.2051 - 11.2052 -The one slight complication with the above is that 11.2053 -{\path{/sbin/portmap}} is dynamically linked against 11.2054 -{\path{/usr/lib/libwrap.so.0}}. Since this is in 11.2055 -{\path{/usr}}, it won't work. This can be solved by copying the 11.2056 -file (and link) below the /usr mount point, and just letting the file be 11.2057 -`covered' when the mount happens. 11.2058 - 11.2059 -In some installations, where a shared read-only {\path{/usr}} is 11.2060 -being used, it may be desirable to move other large directories over 11.2061 -into the read-only {\path{/usr}}. For example, you might replace 11.2062 -{\path{/bin}}, {\path{/lib}} and {\path{/sbin}} with 11.2063 -links into {\path{/usr/root/bin}}, {\path{/usr/root/lib}} 11.2064 -and {\path{/usr/root/sbin}} respectively. This creates other 11.2065 -problems for running the {\path{/linuxrc}} script, requiring 11.2066 -bash, portmap, mount, ifconfig, and a handful of other shared 11.2067 -libraries to be copied below the mount point --- a simple 11.2068 -statically-linked C program would solve this problem. 11.2069 - 11.2070 - 11.2071 - 11.2072 - 11.2073 -\chapter{Glossary of Terms} 11.2074 - 11.2075 -\begin{description} 11.2076 -\item[Atropos] One of the CPU schedulers provided by Xen. 11.2077 - Atropos provides domains with absolute shares 11.2078 - of the CPU, with timeliness guarantees and a 11.2079 - mechanism for sharing out `slack time'. 11.2080 - 11.2081 -\item[BVT] The BVT scheduler is used to give proportional 11.2082 - fair shares of the CPU to domains. 11.2083 - 11.2084 -\item[Exokernel] A minimal piece of privileged code, similar to 11.2085 - a {\bf microkernel} but providing a more 11.2086 - `hardware-like' interface to the tasks it 11.2087 - manages. This is similar to a paravirtualising 11.2088 - VMM like {\bf Xen} but was designed as a new 11.2089 - operating system structure, rather than 11.2090 - specifically to run multiple conventional OSs. 11.2091 - 11.2092 -\item[Domain] A domain is the execution context that 11.2093 - contains a running {\bf virtual machine}. 11.2094 - The relationship between virtual machines 11.2095 - and domains on Xen is similar to that between 11.2096 - programs and processes in an operating 11.2097 - system: a virtual machine is a persistent 11.2098 - entity that resides on disk (somewhat like 11.2099 - a program). When it is loaded for execution, 11.2100 - it runs in a domain. Each domain has a 11.2101 - {\bf domain ID}. 11.2102 - 11.2103 -\item[Domain 0] The first domain to be started on a Xen 11.2104 - machine. Domain 0 is responsible for managing 11.2105 - the system. 11.2106 - 11.2107 -\item[Domain ID] A unique identifier for a {\bf domain}, 11.2108 - analogous to a process ID in an operating 11.2109 - system. 11.2110 - 11.2111 -\item[Full virtualisation] An approach to virtualisation which 11.2112 - requires no modifications to the hosted 11.2113 - operating system, providing the illusion of 11.2114 - a complete system of real hardware devices.
11.2115 - 11.2116 -\item[Hypervisor] An alternative term for {\bf VMM}, used 11.2117 - because it means `beyond supervisor', 11.2118 - since it is responsible for managing multiple 11.2119 - `supervisor' kernels. 11.2120 - 11.2121 -\item[Live migration] A technique for moving a running virtual 11.2122 - machine to another physical host, without 11.2123 - stopping it or the services running on it. 11.2124 - 11.2125 -\item[Microkernel] A small base of code running at the highest 11.2126 - hardware privilege level. A microkernel is 11.2127 - responsible for sharing CPU and memory (and 11.2128 - sometimes other devices) between less 11.2129 - privileged tasks running on the system. 11.2130 - This is similar to a VMM, particularly a 11.2131 - {\bf paravirtualising} VMM, but typically 11.2132 - addressing a different problem space and 11.2133 - providing a different kind of interface. 11.2134 - 11.2135 -\item[NetBSD/Xen] A port of NetBSD to the Xen architecture. 11.2136 - 11.2137 -\item[Paravirtualisation] An approach to virtualisation which requires 11.2138 - modifications to the operating system in 11.2139 - order to run in a virtual machine. Xen 11.2140 - uses paravirtualisation but preserves 11.2141 - binary compatibility for user space 11.2142 - applications. 11.2143 - 11.2144 -\item[Shadow pagetables] A technique for hiding the layout of machine 11.2145 - memory from a virtual machine's operating 11.2146 - system. Used in some {\bf VMMs} to provide 11.2147 - the illusion of contiguous physical memory; 11.2148 - in Xen this is used during 11.2149 - {\bf live migration}. 11.2150 - 11.2151 -\item[Virtual Machine] The environment in which a hosted operating 11.2152 - system runs, providing the abstraction of a 11.2153 - dedicated machine. A virtual machine may 11.2154 - be identical to the underlying hardware (as 11.2155 - in {\bf full virtualisation}), or it may 11.2156 - differ, as in {\bf paravirtualisation}. 11.2157 - 11.2158 -\item[VMM] Virtual Machine Monitor --- the software that 11.2159 - allows multiple virtual machines to be 11.2160 - multiplexed on a single physical machine. 11.2161 - 11.2162 -\item[Xen] Xen is a paravirtualising virtual machine 11.2163 - monitor, developed primarily by the 11.2164 - Systems Research Group at the University 11.2165 - of Cambridge Computer Laboratory. 11.2166 - 11.2167 -\item[XenLinux] Official name for the port of the Linux kernel 11.2168 - that runs on Xen. 11.2169 - 11.2170 -\end{description} 11.2171 +%% Chapter Glossary of Terms moved to glossary.tex 11.2172 +\include{src/user/glossary} 11.2173 11.2174 11.2175 \end{document}
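To tie together the standard configuration variables documented in the Domain Configuration chapter above, a minimal domain configuration file might look like the following sketch (the kernel path, hostname and LVM volume name are illustrative assumptions, not defaults; see \path{/etc/xen/xmexample1} for a real example):

\begin{quote}\begin{small}\begin{verbatim}
# Illustrative minimal configuration -- adjust paths and device
# names to match your own installation.
kernel   = "/boot/vmlinuz-2.6-xenU"     # domain kernel image
memory   = 64                           # memory in megabytes
hostname = "vm1"                        # hostname for the VM
nics     = 1                            # one virtual interface
disk     = ['phy:vg/myvmdisk1,sda1,w']  # LVM volume as /dev/sda1
root     = "/dev/sda1 ro"               # root device parameter
\end{verbatim}\end{small}\end{quote}

A file along these lines can then be started with {\tt xm create}, as in the Debian walkthrough in the appendix.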
12.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 12.2 +++ b/docs/src/user/build.tex Tue Sep 20 09:43:46 2005 +0000 12.3 @@ -0,0 +1,170 @@ 12.4 +\chapter{Build, Boot and Debug Options} 12.5 + 12.6 +This chapter describes the build- and boot-time options which may be 12.7 +used to tailor your Xen system. 12.8 + 12.9 + 12.10 +\section{Xen Build Options} 12.11 + 12.12 +Xen provides a number of build-time options which should be set as 12.13 +environment variables or passed on make's command-line. 12.14 + 12.15 +\begin{description} 12.16 +\item[verbose=y] Enable debugging messages when Xen detects an 12.17 + unexpected condition. Also enables console output from all domains. 12.18 +\item[debug=y] Enable debug assertions. Implies {\bf verbose=y}. 12.19 + (Primarily useful for tracing bugs in Xen). 12.20 +\item[debugger=y] Enable the in-Xen debugger. This can be used to 12.21 + debug Xen, guest OSes, and applications. 12.22 +\item[perfc=y] Enable performance counters for significant events 12.23 + within Xen. The counts can be reset or displayed on Xen's console 12.24 + via console control keys. 12.25 +\item[trace=y] Enable per-cpu trace buffers which log a range of 12.26 + events within Xen for collection by control software. 12.27 +\end{description} 12.28 + 12.29 + 12.30 +\section{Xen Boot Options} 12.31 +\label{s:xboot} 12.32 + 12.33 +These options are used to configure Xen's behaviour at runtime. They 12.34 +should be appended to Xen's command line, either manually or by 12.35 +editing \path{grub.conf}. 12.36 + 12.37 +\begin{description} 12.38 +\item [ noreboot ] Don't reboot the machine automatically on errors. 12.39 + This is useful to catch debug output if you aren't catching console 12.40 + messages via the serial line. 12.41 +\item [ nosmp ] Disable SMP support. This option is implied by 12.42 + `ignorebiostables'. 12.43 +\item [ watchdog ] Enable NMI watchdog which can report certain 12.44 + failures. 12.45 +\item [ noirqbalance ] Disable software IRQ balancing and affinity. 12.46 + This can be used on systems such as Dell 1850/2850 that have 12.47 + workarounds in hardware for IRQ-routing issues. 12.48 +\item [ badpage=$<$page number$>$,$<$page number$>$, \ldots ] Specify 12.49 + a list of pages not to be allocated for use because they contain bad 12.50 + bytes. For example, if your memory tester says that byte 0x12345678 12.51 + is bad, you would place `badpage=0x12345' on Xen's command line. 12.52 +\item [ com1=$<$baud$>$,DPS,$<$io\_base$>$,$<$irq$>$ 12.53 + com2=$<$baud$>$,DPS,$<$io\_base$>$,$<$irq$>$ ] \mbox{}\\ 12.54 + Xen supports up to two 16550-compatible serial ports. For example: 12.55 + `com1=9600, 8n1, 0x408, 5' maps COM1 to a 9600-baud port, 8 data 12.56 + bits, no parity, 1 stop bit, I/O port base 0x408, IRQ 5. If some 12.57 + configuration options are standard (e.g., I/O base and IRQ), then 12.58 + only a prefix of the full configuration string need be specified. If 12.59 + the baud rate is pre-configured (e.g., by the bootloader) then you 12.60 + can specify `auto' in place of a numeric baud rate. 12.61 +\item [ console=$<$specifier list$>$ ] Specify the destination for Xen 12.62 + console I/O. This is a comma-separated list of, for example: 12.63 + \begin{description} 12.64 + \item[ vga ] Use VGA console and allow keyboard input. 12.65 + \item[ com1 ] Use serial port com1. 12.66 + \item[ com2H ] Use serial port com2. Transmitted chars will have the 12.67 + MSB set. Received chars must have MSB set. 12.68 + \item[ com2L] Use serial port com2. 
Transmitted chars will have the 12.69 + MSB cleared. Received chars must have MSB cleared. 12.70 + \end{description} 12.71 + The latter two examples allow a single port to be shared by two 12.72 + subsystems (e.g.\ console and debugger). Sharing is controlled by 12.73 + MSB of each transmitted/received character. [NB. Default for this 12.74 + option is `com1,vga'] 12.75 +\item [ sync\_console ] Force synchronous console output. This is 12.76 + useful if your system fails unexpectedly before it has sent all 12.77 + available output to the console. In most cases Xen will 12.78 + automatically enter synchronous mode when an exceptional event 12.79 + occurs, but this option provides a manual fallback. 12.80 +\item [ conswitch=$<$switch-char$><$auto-switch-char$>$ ] Specify how 12.81 + to switch serial-console input between Xen and DOM0. The required 12.82 + sequence is CTRL-$<$switch-char$>$ pressed three times. Specifying 12.83 + the backtick character disables switching. The 12.84 + $<$auto-switch-char$>$ specifies whether Xen should auto-switch 12.85 + input to DOM0 when it boots --- if it is `x' then auto-switching is 12.86 + disabled. Any other value, or omitting the character, enables 12.87 + auto-switching. [NB. Default switch-char is `a'.] 12.88 +\item [ nmi=xxx ] 12.89 + Specify what to do with an NMI parity or I/O error. \\ 12.90 + `nmi=fatal': Xen prints a diagnostic and then hangs. \\ 12.91 + `nmi=dom0': Inform DOM0 of the NMI. \\ 12.92 + `nmi=ignore': Ignore the NMI. 12.93 +\item [ mem=xxx ] Set the physical RAM address limit. Any RAM 12.94 + appearing beyond this physical address in the memory map will be 12.95 + ignored. This parameter may be specified with a B, K, M or G suffix, 12.96 + representing bytes, kilobytes, megabytes and gigabytes respectively. 12.97 + The default unit, if no suffix is specified, is kilobytes. 12.98 +\item [ dom0\_mem=xxx ] Set the amount of memory to be allocated to 12.99 + domain0. In Xen 3.x the parameter may be specified with a B, K, M or 12.100 + G suffix, representing bytes, kilobytes, megabytes and gigabytes 12.101 + respectively; if no suffix is specified, the parameter defaults to 12.102 + kilobytes. In previous versions of Xen, suffixes were not supported 12.103 + and the value is always interpreted as kilobytes. 12.104 +\item [ tbuf\_size=xxx ] Set the size of the per-cpu trace buffers, in 12.105 + pages (default 1). Note that the trace buffers are only enabled in 12.106 + debug builds. Most users can ignore this feature completely. 12.107 +\item [ sched=xxx ] Select the CPU scheduler Xen should use. The 12.108 + current possibilities are `bvt' (default), `atropos' and `rrobin'. 12.109 + For more information see Section~\ref{s:sched}. 12.110 +\item [ apic\_verbosity=debug,verbose ] Print more detailed 12.111 + information about local APIC and IOAPIC configuration. 12.112 +\item [ lapic ] Force use of local APIC even when left disabled by 12.113 + uniprocessor BIOS. 12.114 +\item [ nolapic ] Ignore local APIC in a uniprocessor system, even if 12.115 + enabled by the BIOS. 12.116 +\item [ apic=bigsmp,default,es7000,summit ] Specify NUMA platform. 12.117 + This can usually be probed automatically. 12.118 +\end{description} 12.119 + 12.120 +In addition, the following options may be specified on the Xen command 12.121 +line. Since domain 0 shares responsibility for booting the platform, 12.122 +Xen will automatically propagate these options to its command line.
12.123 +These options are taken from Linux's command-line syntax with 12.124 +unchanged semantics. 12.125 + 12.126 +\begin{description} 12.127 +\item [ acpi=off,force,strict,ht,noirq,\ldots ] Modify how Xen (and 12.128 + domain 0) parses the BIOS ACPI tables. 12.129 +\item [ acpi\_skip\_timer\_override ] Instruct Xen (and domain~0) to 12.130 + ignore timer-interrupt override instructions specified by the BIOS 12.131 + ACPI tables. 12.132 +\item [ noapic ] Instruct Xen (and domain~0) to ignore any IOAPICs 12.133 + that are present in the system, and instead continue to use the 12.134 + legacy PIC. 12.135 +\end{description} 12.136 + 12.137 + 12.138 +\section{XenLinux Boot Options} 12.139 + 12.140 +In addition to the standard Linux kernel boot options, we support: 12.141 +\begin{description} 12.142 +\item[ xencons=xxx ] Specify the device node to which the Xen virtual 12.143 + console driver is attached. The following options are supported: 12.144 + \begin{center} 12.145 + \begin{tabular}{l} 12.146 + `xencons=off': disable virtual console \\ 12.147 + `xencons=tty': attach console to /dev/tty1 (tty0 at boot-time) \\ 12.148 + `xencons=ttyS': attach console to /dev/ttyS0 12.149 + \end{tabular} 12.150 +\end{center} 12.151 +The default is ttyS for dom0 and tty for all other domains. 12.152 +\end{description} 12.153 + 12.154 + 12.155 +\section{Debugging} 12.156 +\label{s:keys} 12.157 + 12.158 +Xen has a set of debugging features that can be useful when trying to 12.159 +figure out what's going on. Hit `h' on the serial line (if you 12.160 +specified a baud rate on the Xen command line) or ScrollLock-h on the 12.161 +keyboard to get a list of supported commands. 12.162 + 12.163 +If you have a crash, you'll likely get a crash dump containing an EIP 12.164 +(PC) which, along with an \path{objdump -d image}, can be useful in 12.165 +figuring out what's happened. Debug a XenLinux image just as you 12.166 +would any other Linux kernel. 12.167 + 12.168 +%% We supply a handy debug terminal program which you can find in 12.169 +%% \path{/usr/local/src/xen-2.0.bk/tools/misc/miniterm/} This should 12.170 +%% be built and executed on another machine that is connected via a 12.171 +%% null modem cable. Documentation is included. Alternatively, if the 12.172 +%% Xen machine is connected to a serial-port server then we supply a 12.173 +%% dumb TCP terminal client, {\tt xencons}.
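For example, assuming a crash dump reports a (purely hypothetical) {\tt EIP} value of {\tt c01b3a45}, the faulting code in a XenLinux image built with debug symbols could be located with something like:
\begin{quote}
\begin{verbatim}
# objdump -d vmlinux-syms-2.6.11.11-xen0 | grep -B2 "c01b3a45:"
\end{verbatim}
\end{quote}
Substitute the EIP from your own crash dump, and the symbols image produced by your own build.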
13.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 13.2 +++ b/docs/src/user/control_software.tex Tue Sep 20 09:43:46 2005 +0000 13.3 @@ -0,0 +1,115 @@ 13.4 +\chapter{Control Software} 13.5 + 13.6 +The Xen control software includes the \xend\ node control daemon 13.7 +(which must be running), the xm command line tools, and the prototype 13.8 +Xensv web interface. 13.9 + 13.10 +\section{\Xend\ (node control daemon)} 13.11 +\label{s:xend} 13.12 + 13.13 +The Xen Daemon (\Xend) performs system management functions related to 13.14 +virtual machines. It forms a central point of control for a machine 13.15 +and can be controlled using an HTTP-based protocol. \Xend\ must be 13.16 +running in order to start and manage virtual machines. 13.17 + 13.18 +\Xend\ must be run as root because it needs access to privileged 13.19 +system management functions. A small set of commands may be issued on 13.20 +the \xend\ command line: 13.21 + 13.22 +\begin{tabular}{ll} 13.23 + \verb!# xend start! & start \xend, if not already running \\ 13.24 + \verb!# xend stop! & stop \xend\ if already running \\ 13.25 + \verb!# xend restart! & restart \xend\ if running, otherwise start it \\ 13.26 + % \verb!# xend trace_start! & start \xend, with very detailed debug logging \\ 13.27 + \verb!# xend status! & indicate \xend\ status by its return code 13.28 +\end{tabular} 13.29 + 13.30 +A SysV init script called {\tt xend} is provided to start \xend\ at 13.31 +boot time. {\tt make install} installs this script in 13.32 +\path{/etc/init.d}. To enable it, you have to make symbolic links in 13.33 +the appropriate runlevel directories or use the {\tt chkconfig} tool, 13.34 +where available. 13.35 + 13.36 +Once \xend\ is running, more sophisticated administration can be done 13.37 +using the xm tool (see Section~\ref{s:xm}) and the experimental Xensv 13.38 +web interface (see Section~\ref{s:xensv}). 13.39 + 13.40 +As \xend\ runs, events will be logged to \path{/var/log/xend.log} and, 13.41 +if the migration assistant daemon (\path{xfrd}) has been started, 13.42 +\path{/var/log/xfrd.log}. These may be of use for troubleshooting 13.43 +problems. 13.44 + 13.45 +\section{Xm (command line interface)} 13.46 +\label{s:xm} 13.47 + 13.48 +The xm tool is the primary tool for managing Xen from the console. 13.49 +The general format of an xm command line is: 13.50 + 13.51 +\begin{verbatim} 13.52 +# xm command [switches] [arguments] [variables] 13.53 +\end{verbatim} 13.54 + 13.55 +The available \emph{switches} and \emph{arguments} are dependent on 13.56 +the \emph{command} chosen. The \emph{variables} may be set using 13.57 +declarations of the form {\tt variable=value}; command line 13.58 +declarations override any of the values in the configuration file 13.59 +being used, including the standard variables described above and any 13.60 +custom variables (for instance, the \path{xmdefconfig} file uses a 13.61 +{\tt vmid} variable). 13.62 + 13.63 +The available commands are as follows: 13.64 + 13.65 +\begin{description} 13.66 +\item[set-mem] Request a domain to adjust its memory footprint. 13.67 +\item[create] Create a new domain. 13.68 +\item[destroy] Kill a domain immediately. 13.69 +\item[list] List running domains. 13.70 +\item[shutdown] Ask a domain to shut down. 13.71 +\item[dmesg] Fetch the Xen (not Linux!) boot output. 13.72 +\item[consoles] List the available consoles. 13.73 +\item[console] Connect to the console for a domain. 13.74 +\item[help] Get help on xm commands. 13.75 +\item[save] Suspend a domain to disk.
13.76 +\item[restore] Restore a domain from disk. 13.77 +\item[pause] Pause a domain's execution. 13.78 +\item[unpause] Un-pause a domain. 13.79 +\item[pincpu] Pin a domain to a CPU. 13.80 +\item[bvt] Set BVT scheduler parameters for a domain. 13.81 +\item[bvt\_ctxallow] Set the BVT context switching allowance for the 13.82 + system. 13.83 +\item[atropos] Set the Atropos parameters for a domain. 13.84 +\item[rrobin] Set the round robin time slice for the system. 13.85 +\item[info] Get information about the Xen host. 13.86 +\item[call] Call a \xend\ HTTP API function directly. 13.87 +\end{description} 13.88 + 13.89 +For a detailed overview of the switches, arguments and variables for each 13.90 +command, try 13.91 +\begin{quote} 13.92 +\begin{verbatim} 13.93 +# xm help command 13.94 +\end{verbatim} 13.95 +\end{quote} 13.96 + 13.97 +\section{Xensv (web control interface)} 13.98 +\label{s:xensv} 13.99 + 13.100 +Xensv is the experimental web control interface for managing a Xen 13.101 +machine. It can be used to perform some (but not yet all) of the 13.102 +management tasks that can be done using the xm tool. 13.103 + 13.104 +It can be started using: 13.105 +\begin{quote} 13.106 + \verb_# xensv start_ 13.107 +\end{quote} 13.108 +and stopped using: 13.109 +\begin{quote} 13.110 + \verb_# xensv stop_ 13.111 +\end{quote} 13.112 + 13.113 +By default, Xensv will serve the web interface on port 8080. This 13.114 +can be changed by editing 13.115 +\path{/usr/lib/python2.3/site-packages/xen/sv/params.py}. 13.116 + 13.117 +Once Xensv is running, the web interface can be used to create and 13.118 +manage running domains.
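As an example of the variable-passing behaviour described in Section~\ref{s:xm}, a configuration file that uses a {\tt vmid} variable (such as the \path{xmexample2} file shipped with Xen) allows one file to describe many domains; two instances might be started with:
\begin{quote}
\begin{verbatim}
# xm create -f /etc/xen/xmexample2 vmid=1
# xm create -f /etc/xen/xmexample2 vmid=2
\end{verbatim}
\end{quote}
(The path shown is illustrative; use the location of your own configuration files.)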
14.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 14.2 +++ b/docs/src/user/debian.tex Tue Sep 20 09:43:46 2005 +0000 14.3 @@ -0,0 +1,154 @@ 14.4 +\chapter{Installing Xen / XenLinux on Debian} 14.5 + 14.6 +The Debian project provides a tool called \path{debootstrap} which 14.7 +allows a base Debian system to be installed into a filesystem without 14.8 +requiring the host system to have any Debian-specific software (such 14.9 +as \path{apt}). 14.10 + 14.11 +Here is how to install Debian 3.1 (Sarge) in an unprivileged 14.12 +Xen domain: 14.13 + 14.14 +\begin{enumerate} 14.15 + 14.16 +\item Set up Xen and test that it's working, as described earlier in 14.17 + this manual. 14.18 + 14.19 +\item Create disk images for the rootfs and swap. Alternatively, you might 14.20 + create dedicated partitions, LVM logical volumes, etc.\ if that 14.21 + suits your setup. 14.22 +\begin{verbatim} 14.23 +dd if=/dev/zero of=/path/diskimage bs=1024k count=size_in_mbytes 14.24 +dd if=/dev/zero of=/path/swapimage bs=1024k count=size_in_mbytes 14.25 +\end{verbatim} 14.26 + 14.27 + If you're going to use this filesystem / disk image only as a 14.28 + `template' for other VM disk images, something like 300~MB should be 14.29 + enough (though this depends on which packages you are planning 14.30 + to install in the template). 14.31 + 14.32 +\item Create the filesystem and initialise the swap image: 14.33 +\begin{verbatim} 14.34 +mkfs.ext3 /path/diskimage 14.35 +mkswap /path/swapimage 14.36 +\end{verbatim} 14.37 + 14.38 +\item Mount the disk image for installation: 14.39 +\begin{verbatim} 14.40 +mount -o loop /path/diskimage /mnt/disk 14.41 +\end{verbatim} 14.42 + 14.43 +\item Install \path{debootstrap}. Make sure it is 14.44 + installed on the host. If you are running Debian Sarge (3.1 / 14.45 + testing) or unstable you can install it by running \path{apt-get 14.46 + install debootstrap}. Otherwise, it can be downloaded from the 14.47 + Debian project website. 14.48 + 14.49 +\item Install the Debian base system into the disk image: 14.50 +\begin{verbatim} 14.51 +debootstrap --arch i386 sarge /mnt/disk \ 14.52 + http://ftp.<countrycode>.debian.org/debian 14.53 +\end{verbatim} 14.54 + 14.55 + You can use any other Debian http/ftp mirror you want. 14.56 + 14.57 +\item When debootstrap completes successfully, modify the settings: 14.58 +\begin{verbatim} 14.59 +chroot /mnt/disk /bin/bash 14.60 +\end{verbatim} 14.61 + 14.62 +Edit the following files using vi or nano and make the needed changes: 14.63 +\begin{verbatim} 14.64 +/etc/hostname 14.65 +/etc/hosts 14.66 +/etc/resolv.conf 14.67 +/etc/network/interfaces 14.68 +/etc/networks 14.69 +\end{verbatim} 14.70 + 14.71 +Set up access to services by editing: 14.72 +\begin{verbatim} 14.73 +/etc/hosts.deny 14.74 +/etc/hosts.allow 14.75 +/etc/inetd.conf 14.76 +\end{verbatim} 14.77 + 14.78 +Add a Debian mirror to: 14.79 +\begin{verbatim} 14.80 +/etc/apt/sources.list 14.81 +\end{verbatim} 14.82 + 14.83 +Create an fstab like this: 14.84 +\begin{verbatim} 14.85 +/dev/sda1 / ext3 errors=remount-ro 0 1 14.86 +/dev/sda2 none swap sw 0 0 14.87 +proc /proc proc defaults 0 0 14.88 +\end{verbatim} 14.89 + 14.90 +Log out. 14.91 + 14.92 +\item Unmount the disk image: 14.93 +\begin{verbatim} 14.94 +umount /mnt/disk 14.95 +\end{verbatim} 14.96 + 14.97 +\item Create a Xen 2.0 configuration file for the new domain. You can 14.98 + use the example configurations that come with Xen as a template.
14.99 + 14.100 + Make sure you have the following set up: 14.101 +\begin{verbatim} 14.102 +disk = [ 'file:/path/diskimage,sda1,w', 'file:/path/swapimage,sda2,w' ] 14.103 +root = "/dev/sda1 ro" 14.104 +\end{verbatim} 14.105 + 14.106 +\item Start the new domain: 14.107 +\begin{verbatim} 14.108 +xm create -f domain_config_file 14.109 +\end{verbatim} 14.110 + 14.111 +Check that the new domain is running: 14.112 +\begin{verbatim} 14.113 +xm list 14.114 +\end{verbatim} 14.115 + 14.116 +\item Attach to the console of the new domain. You should see 14.117 + something like this when starting the new domain: 14.118 + 14.119 +\begin{verbatim} 14.120 +Started domain testdomain2, console on port 9626 14.121 +\end{verbatim} 14.122 + 14.123 + There you can see the ID of the console: 26. You can also list the 14.124 + consoles with \path{xm consoles} (the ID is the last two digits of the 14.125 + port number). 14.126 + 14.127 + Attach to the console: 14.128 + 14.129 +\begin{verbatim} 14.130 +xm console 26 14.131 +\end{verbatim} 14.132 + 14.133 + or telnet to port 9626 on localhost (the \path{xm console} 14.134 + program works better). 14.135 + 14.136 +\item Log in and run \path{base-config}. 14.137 + 14.138 + By default there is no root password. 14.139 + 14.140 + Check that everything looks OK and that the system started without 14.141 + errors. Check that the swap is active, and the network settings are 14.142 + correct. 14.143 + 14.144 + Run \path{/usr/sbin/base-config} to set up the Debian settings. 14.145 + 14.146 + Set the root password using \path{passwd}. 14.147 + 14.148 +\item Done. You can exit the console by pressing {\path{Ctrl + ]}}. 14.149 + 14.150 +\end{enumerate} 14.151 + 14.152 + 14.153 +If you need to create new domains, you can just copy the contents of 14.154 +the `template' image to the new disk images, either by mounting the 14.155 +template and the new image and using \path{cp -a} or \path{tar}, or by 14.156 +simply copying the image file. Once this is done, modify the 14.157 +image-specific settings (hostname, network settings, etc.).
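For reference, a complete (if minimal) configuration file for a domain built this way might look like the following; the kernel path and memory size are illustrative and should be adapted to your installation (see Chapter~\ref{cha:config} for the full set of variables):
\begin{verbatim}
kernel = "/boot/vmlinuz-2.6-xenU"
memory = 128
disk   = [ 'file:/path/diskimage,sda1,w',
           'file:/path/swapimage,sda2,w' ]
root   = "/dev/sda1 ro"
\end{verbatim}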
15.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 15.2 +++ b/docs/src/user/domain_configuration.tex Tue Sep 20 09:43:46 2005 +0000 15.3 @@ -0,0 +1,281 @@ 15.4 +\chapter{Domain Configuration} 15.5 +\label{cha:config} 15.6 + 15.7 +This chapter describes the syntax of the domain configuration files 15.8 +and how to further specify networking, driver domain 15.9 +and general scheduling behavior. 15.10 + 15.11 + 15.12 +\section{Configuration Files} 15.13 +\label{s:cfiles} 15.14 + 15.15 +Xen configuration files contain the following standard variables. 15.16 +Unless otherwise stated, configuration items should be enclosed in 15.17 +quotes: see \path{/etc/xen/xmexample1} and \path{/etc/xen/xmexample2} 15.18 +for concrete examples of the syntax. 15.19 + 15.20 +\begin{description} 15.21 +\item[kernel] Path to the kernel image. 15.22 +\item[ramdisk] Path to a ramdisk image (optional). 15.23 + % \item[builder] The name of the domain build function (e.g. 15.24 + % {\tt'linux'} or {\tt'netbsd'}). 15.25 +\item[memory] Memory size in megabytes. 15.26 +\item[cpu] CPU to run this domain on, or {\tt -1} for auto-allocation. 15.27 +\item[console] Port to export the domain console on (default 9600 + 15.28 + domain ID). 15.29 +\item[nics] Number of virtual network interfaces. 15.30 +\item[vif] List of MAC addresses (random addresses are assigned if not 15.31 + given) and bridges to use for the domain's network interfaces, e.g.\ 15.32 +\begin{verbatim} 15.33 +vif = [ 'mac=aa:00:00:00:00:11, bridge=xen-br0', 15.34 + 'bridge=xen-br1' ] 15.35 +\end{verbatim} 15.36 + to assign a MAC address and bridge to the first interface and assign 15.37 + a different bridge to the second interface, leaving \xend\ to choose 15.38 + the MAC address. 15.39 +\item[disk] List of block devices to export to the domain, e.g.\ \\ 15.40 + \verb_disk = [ 'phy:hda1,sda1,r' ]_ \\ 15.41 + exports physical device \path{/dev/hda1} to the domain as 15.42 + \path{/dev/sda1} with read-only access. Exporting a disk read-write 15.43 + which is currently mounted is dangerous -- if you are \emph{certain} 15.44 + you wish to do this, you can specify \path{w!} as the mode. 15.45 +\item[dhcp] Set to {\tt `dhcp'} if you want to use DHCP to configure 15.46 + networking. 15.47 +\item[netmask] Manually configured IP netmask. 15.48 +\item[gateway] Manually configured IP gateway. 15.49 +\item[hostname] Set the hostname for the virtual machine. 15.50 +\item[root] Specify the root device parameter on the kernel command 15.51 + line. 15.52 +\item[nfs\_server] IP address for the NFS server (if any). 15.53 +\item[nfs\_root] Path of the root filesystem on the NFS server (if 15.54 + any). 15.55 +\item[extra] Extra string to append to the kernel command line (if 15.56 + any). 15.57 +\item[restart] Three possible options: 15.58 + \begin{description} 15.59 + \item[always] Always restart the domain, no matter what its exit 15.60 + code is. 15.61 + \item[never] Never restart the domain. 15.62 + \item[onreboot] Restart the domain if and only if it requests reboot. 15.63 + \end{description} 15.64 +\end{description} 15.65 + 15.66 +For additional flexibility, it is also possible to include Python 15.67 +scripting commands in configuration files. An example of this is the 15.68 +\path{xmexample2} file, which uses Python code to handle the 15.69 +\path{vmid} variable. 15.70 + 15.71 + 15.72 +%\part{Advanced Topics} 15.73 + 15.74 + 15.75 +\section{Network Configuration} 15.76 + 15.77 +For many users, the default installation should work ``out of the 15.78 +box''.
More complicated network setups, for instance with multiple 15.79 +Ethernet interfaces and/or existing bridging setups, will require some 15.80 +special configuration. 15.81 + 15.82 +The purpose of this section is to describe the mechanisms provided by 15.83 +\xend\ to allow a flexible configuration for Xen's virtual networking. 15.84 + 15.85 +\subsection{Xen virtual network topology} 15.86 + 15.87 +Each domain network interface is connected to a virtual network 15.88 +interface in dom0 by a point-to-point link (effectively a ``virtual 15.89 +crossover cable''). These devices are named {\tt 15.90 + vif$<$domid$>$.$<$vifid$>$} (e.g.\ {\tt vif1.0} for the first 15.91 +interface in domain~1, {\tt vif3.1} for the second interface in 15.92 +domain~3). 15.93 + 15.94 +Traffic on these virtual interfaces is handled in domain~0 using 15.95 +standard Linux mechanisms for bridging, routing, rate limiting, etc. 15.96 +\Xend\ calls on two shell scripts to perform initial configuration of 15.97 +the network and configuration of new virtual interfaces. By default, 15.98 +these scripts configure a single bridge for all the virtual 15.99 +interfaces. Arbitrary routing / bridging configurations can be 15.100 +configured by customizing the scripts, as described in the following 15.101 +section. 15.102 + 15.103 +\subsection{Xen networking scripts} 15.104 + 15.105 +Xen's virtual networking is configured by two shell scripts (by 15.106 +default \path{network} and \path{vif-bridge}). These are called 15.107 +automatically by \xend\ when certain events occur, with arguments to 15.108 +the scripts providing further contextual information. These scripts 15.109 +are found by default in \path{/etc/xen/scripts}. The names and 15.110 +locations of the scripts can be configured in 15.111 +\path{/etc/xen/xend-config.sxp}. 15.112 + 15.113 +\begin{description} 15.114 +\item[network:] This script is called whenever \xend\ is started or 15.115 + stopped, to initialize or tear down the Xen virtual 15.116 + network respectively. In the default configuration, initialization creates the 15.117 + bridge `xen-br0' and moves eth0 onto that bridge, modifying the 15.118 + routing accordingly. When \xend\ exits, it deletes the Xen bridge 15.119 + and removes eth0, restoring the normal IP and routing configuration. 15.120 + 15.121 + %% In configurations where the bridge already exists, this script 15.122 + %% could be replaced with a link to \path{/bin/true} (for instance). 15.123 + 15.124 +\item[vif-bridge:] This script is called for every domain virtual 15.125 + interface and can configure firewalling rules and add the vif to the 15.126 + appropriate bridge. By default, this adds and removes VIFs on the 15.127 + default Xen bridge. 15.128 +\end{description} 15.129 + 15.130 +For more complex network setups (e.g.\ where routing is required or 15.131 +integration with existing bridges is needed) these scripts may be replaced with 15.132 +customized variants for your site's preferred configuration. 15.133 + 15.134 +%% There are two possible types of privileges: IO privileges and 15.135 +%% administration privileges. 15.136 + 15.137 + 15.138 +\section{Driver Domain Configuration} 15.139 + 15.140 +I/O privileges can be assigned to allow a domain to access 15.141 +PCI devices directly. This is used to support driver domains. 15.142 + 15.143 +Setting back-end privileges is currently only supported in SXP format 15.144 +config files.
To allow a domain to function as a back-end for others, 15.145 +somewhere within the {\tt vm} element of its configuration file must 15.146 +be a {\tt back-end} element of the form {\tt (back-end ({\em type}))} 15.147 +where {\tt \em type} may be either {\tt netif} or {\tt blkif}, 15.148 +according to the type of virtual device this domain will service. 15.149 +%% After this domain has been built, \xend will connect all new and 15.150 +%% existing {\em virtual} devices (of the appropriate type) to that 15.151 +%% back-end. 15.152 + 15.153 +Note that a block back-end cannot currently import virtual block 15.154 +devices from other domains, and a network back-end cannot import 15.155 +virtual network devices from other domains. Thus (particularly in the 15.156 +case of block back-ends, which cannot import a virtual block device as 15.157 +their root filesystem), you may need to boot a back-end domain from a 15.158 +ramdisk or a network device. 15.159 + 15.160 +Access to PCI devices may be configured on a per-device basis. Xen 15.161 +will assign a domain the minimal set of hardware privileges 15.162 +required to control its devices. This can be configured in either 15.163 +format of configuration file: 15.164 + 15.165 +\begin{itemize} 15.166 +\item SXP Format: Include device elements of the form: \\ 15.167 + \centerline{ {\tt (device (pci (bus {\em x}) (dev {\em y}) (func {\em z})))}} \\ 15.168 + inside the top-level {\tt vm} element. Each one specifies the 15.169 + address of a device this domain is allowed to access --- the numbers 15.170 + \emph{x}, \emph{y} and \emph{z} may be in either decimal or 15.171 + hexadecimal format. 15.172 +\item Flat Format: Include a list of PCI device addresses of the 15.173 + format: \\ 15.174 + \centerline{{\tt pci = ['x,y,z', \ldots]}} \\ 15.175 + where each element in the list is a string specifying the components 15.176 + of the PCI device address, separated by commas. The components 15.177 + ({\tt \em x}, {\tt \em y} and {\tt \em z}) of the list may be 15.178 + formatted as either decimal or hexadecimal. 15.179 +\end{itemize} 15.180 + 15.181 +%% \section{Administration Domains} 15.182 + 15.183 +%% Administration privileges allow a domain to use the `dom0 15.184 +%% operations' (so called because they are usually available only to 15.185 +%% domain 0). A privileged domain can build other domains, set 15.186 +%% scheduling parameters, etc. 15.187 + 15.188 +% Support for other administrative domains is not yet available... 15.189 +% perhaps we should plumb it in some time 15.190 + 15.191 + 15.192 +\section{Scheduler Configuration} 15.193 +\label{s:sched} 15.194 + 15.195 +Xen offers a boot-time choice between multiple schedulers. To select 15.196 +a scheduler, pass the boot parameter \emph{sched=sched\_name} to Xen, 15.197 +substituting the appropriate scheduler name. Details of the 15.198 +schedulers and their parameters are included below; future versions of 15.199 +the tools will provide a higher-level interface. 15.200 + 15.201 +It is expected that system administrators will configure their system to 15.202 +use the scheduler most appropriate to their needs. Currently, the BVT 15.203 +scheduler is the recommended choice. 15.204 + 15.205 +\subsection{Borrowed Virtual Time} 15.206 + 15.207 +{\tt sched=bvt} (the default) \\ 15.208 + 15.209 +BVT provides proportional fair shares of the CPU time.
It has been 15.210 +observed to penalize domains that block frequently (e.g.\ I/O 15.211 +intensive domains), but this can be compensated for by using warping. 15.212 + 15.213 +\subsubsection{Global Parameters} 15.214 + 15.215 +\begin{description} 15.216 +\item[ctx\_allow] The context switch allowance is similar to the 15.217 + ``quantum'' in traditional schedulers. It is the minimum time that 15.218 + a scheduled domain will be allowed to run before being preempted. 15.219 +\end{description} 15.220 + 15.221 +\subsubsection{Per-domain parameters} 15.222 + 15.223 +\begin{description} 15.224 +\item[mcuadv] The MCU (Minimum Charging Unit) advance determines the 15.225 + proportional share of the CPU that a domain receives. It is set 15.226 + inversely proportionally to a domain's sharing weight. 15.227 +\item[warp] The amount of ``virtual time'' the domain is allowed to 15.228 + warp backwards. 15.229 +\item[warpl] The warp limit is the maximum time a domain can run 15.230 + warped for. 15.231 +\item[warpu] The unwarp requirement is the minimum time a domain must 15.232 + run unwarped for before it can warp again. 15.233 +\end{description} 15.234 + 15.235 +\subsection{Atropos} 15.236 + 15.237 +{\tt sched=atropos} \\ 15.238 + 15.239 +Atropos is a soft real time scheduler. It provides guarantees about 15.240 +absolute shares of the CPU, with a facility for sharing slack CPU time 15.241 +on a best-effort basis. It can provide timeliness guarantees for 15.242 +latency-sensitive domains. 15.243 + 15.244 +Every domain has an associated period and slice. The domain should 15.245 +receive `slice' nanoseconds every `period' nanoseconds. This allows 15.246 +the administrator to configure both the absolute share of the CPU a 15.247 +domain receives and the frequency with which it is scheduled. 15.248 + 15.249 +%% When domains unblock, their period is reduced to the value of the 15.250 +%% latency hint (the slice is scaled accordingly so that they still 15.251 +%% get the same proportion of the CPU). For each subsequent period, 15.252 +%% the slice and period times are doubled until they reach their 15.253 +%% original values. 15.254 + 15.255 +Note: don't over-commit the CPU when using Atropos (i.e.\ don't reserve 15.256 +more CPU than is available --- the utilization should be kept to 15.257 +slightly less than 100\% in order to ensure predictable behavior). 15.258 + 15.259 +\subsubsection{Per-domain parameters} 15.260 + 15.261 +\begin{description} 15.262 +\item[period] The regular time interval during which a domain is 15.263 + guaranteed to receive its allocation of CPU time. 15.264 +\item[slice] The length of time per period that a domain is guaranteed 15.265 + to run for (in the absence of voluntary yielding of the CPU). 15.266 +\item[latency] The latency hint is used to control how soon after 15.267 + waking up a domain it should be scheduled. 15.268 +\item[xtratime] This is a boolean flag that specifies whether a domain 15.269 + should be allowed a share of the system slack time. 15.270 +\end{description} 15.271 + 15.272 +\subsection{Round Robin} 15.273 + 15.274 +{\tt sched=rrobin} \\ 15.275 + 15.276 +The round robin scheduler is included as a simple demonstration of 15.277 +Xen's internal scheduler API. It is not intended for production use. 15.278 + 15.279 +\subsubsection{Global Parameters} 15.280 + 15.281 +\begin{description} 15.282 +\item[rr\_slice] The maximum time each domain runs before the next 15.283 + scheduling decision is made. 15.284 +\end{description}
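As a worked example of the Atropos parameters described above: booting Xen with {\tt sched=atropos} and giving a domain a period of 20ms (i.e.\ 20\,000\,000ns) and a slice of 5ms guarantees it 5/20 = 25\% of one CPU, scheduled with a granularity of at most 20ms. Two such domains together reserve half of the CPU, and the remainder is shared out as slack time among domains that have {\tt xtratime} set. The numbers here are purely illustrative; as noted above, keep the total reservation slightly below 100\%.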
16.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 16.2 +++ b/docs/src/user/domain_filesystem.tex Tue Sep 20 09:43:46 2005 +0000 16.3 @@ -0,0 +1,243 @@ 16.4 +\chapter{Domain Filesystem Storage} 16.5 + 16.6 +It is possible to directly export any Linux block device in dom0 to 16.7 +another domain, or to export filesystems / devices to virtual machines 16.8 +using standard network protocols (e.g.\ NBD, iSCSI, NFS, etc.). This 16.9 +chapter covers some of the possibilities. 16.10 + 16.11 + 16.12 +\section{Exporting Physical Devices as VBDs} 16.13 +\label{s:exporting-physical-devices-as-vbds} 16.14 + 16.15 +One of the simplest configurations is to directly export individual 16.16 +partitions from domain~0 to other domains. To achieve this, use the 16.17 +\path{phy:} specifier in your domain configuration file. For example, a 16.18 +line like 16.19 +\begin{quote} 16.20 + \verb_disk = ['phy:hda3,sda1,w']_ 16.21 +\end{quote} 16.22 +specifies that the partition \path{/dev/hda3} in domain~0 should be 16.23 +exported read-write to the new domain as \path{/dev/sda1}; one could 16.24 +equally well export it as \path{/dev/hda} or \path{/dev/sdb5} should 16.25 +one wish. 16.26 + 16.27 +In addition to local disks and partitions, it is possible to export 16.28 +any device that Linux considers to be ``a disk'' in the same manner. 16.29 +For example, if you have iSCSI disks or GNBD volumes imported into 16.30 +domain~0 you can export these to other domains using the \path{phy:} 16.31 +disk syntax. E.g.: 16.32 +\begin{quote} 16.33 + \verb_disk = ['phy:vg/lvm1,sda2,w']_ 16.34 +\end{quote} 16.35 + 16.36 +\begin{center} 16.37 + \framebox{\bf Warning: Block device sharing} 16.38 +\end{center} 16.39 +\begin{quote} 16.40 + Block devices should typically only be shared between domains in a 16.41 + read-only fashion, otherwise the Linux kernel's file systems will get 16.42 + very confused as the file system structure may change underneath 16.43 + them (having the same ext3 partition mounted \path{rw} twice is a 16.44 + surefire way to cause irreparable damage)! \Xend\ will attempt to 16.45 + prevent you from doing this by checking that the device is not 16.46 + mounted read-write in domain~0, and hasn't already been exported 16.47 + read-write to another domain. If you want read-write sharing, 16.48 + export the directory to other domains via NFS from domain~0 (or use 16.49 + a cluster file system such as GFS or ocfs2). 16.50 +\end{quote} 16.51 + 16.52 + 16.53 +\section{Using File-backed VBDs} 16.54 + 16.55 +It is also possible to use a file in Domain~0 as the primary storage 16.56 +for a virtual machine. As well as being convenient, this also has the 16.57 +advantage that the virtual block device will be \emph{sparse} --- 16.58 +space will only really be allocated as parts of the file are used. So 16.59 +if a virtual machine uses only half of its disk space then the file 16.60 +really takes up half of the size allocated.
16.61 + 16.62 +For example, to create a 2GB sparse file-backed virtual block device 16.63 +(which actually consumes only 1KB of disk): 16.64 +\begin{quote} 16.65 + \verb_# dd if=/dev/zero of=vm1disk bs=1k seek=2048k count=1_ 16.66 +\end{quote} 16.67 + 16.68 +Make a file system in the disk file: 16.69 +\begin{quote} 16.70 + \verb_# mkfs -t ext3 vm1disk_ 16.71 +\end{quote} 16.72 + 16.73 +(when the tool asks for confirmation, answer `y') 16.74 + 16.75 +Populate the file system e.g.\ by copying from the current root: 16.76 +\begin{quote} 16.77 +\begin{verbatim} 16.78 +# mount -o loop vm1disk /mnt 16.79 +# cp -ax /{root,dev,var,etc,usr,bin,sbin,lib} /mnt 16.80 +# mkdir /mnt/{proc,sys,home,tmp} 16.81 +\end{verbatim} 16.82 +\end{quote} 16.83 + 16.84 +Tailor the file system by editing \path{/etc/fstab}, 16.85 +\path{/etc/hostname}, etc.\ Don't forget to edit the files in the 16.86 +mounted file system rather than your domain~0 filesystem; e.g.\ you 16.87 +would edit \path{/mnt/etc/fstab} instead of \path{/etc/fstab}. For 16.88 +this example, set \path{/dev/sda1} as the root device in the new fstab. 16.89 + 16.90 +Now unmount (this is important!): 16.91 +\begin{quote} 16.92 + \verb_# umount /mnt_ 16.93 +\end{quote} 16.94 + 16.95 +In the configuration file, set: 16.96 +\begin{quote} 16.97 + \verb_disk = ['file:/full/path/to/vm1disk,sda1,w']_ 16.98 +\end{quote} 16.99 + 16.100 +As the virtual machine writes to its `disk', the sparse file will be 16.101 +filled in and consume more space up to the original 2GB. 16.102 + 16.103 +{\bf Note that file-backed VBDs may not be appropriate for backing 16.104 + I/O-intensive domains.} File-backed VBDs are known to experience 16.105 +substantial slowdowns under heavy I/O workloads, due to the I/O 16.106 +handling by the loopback block device used to support file-backed VBDs 16.107 +in dom0. Better I/O performance can be achieved by using either 16.108 +LVM-backed VBDs (Section~\ref{s:using-lvm-backed-vbds}) or physical 16.109 +devices as VBDs (Section~\ref{s:exporting-physical-devices-as-vbds}). 16.110 + 16.111 +Linux supports a maximum of eight file-backed VBDs across all domains 16.112 +by default. This limit can be statically increased by using the 16.113 +\emph{max\_loop} module parameter if CONFIG\_BLK\_DEV\_LOOP is 16.114 +compiled as a module in the dom0 kernel, or by using the 16.115 +\emph{max\_loop=n} boot option if CONFIG\_BLK\_DEV\_LOOP is compiled 16.116 +directly into the dom0 kernel. 16.117 + 16.118 + 16.119 +\section{Using LVM-backed VBDs} 16.120 +\label{s:using-lvm-backed-vbds} 16.121 + 16.122 +A particularly appealing solution is to use LVM volumes as backing for 16.123 +domain file-systems since this allows dynamic growing/shrinking of 16.124 +volumes as well as snapshots and other features.
16.125 + 16.126 +To initialize a partition to support LVM volumes: 16.127 +\begin{quote} 16.128 +\begin{verbatim} 16.129 +# pvcreate /dev/sda10 16.130 +\end{verbatim} 16.131 +\end{quote} 16.132 + 16.133 +Create a volume group named `vg' on the physical partition: 16.134 +\begin{quote} 16.135 +\begin{verbatim} 16.136 +# vgcreate vg /dev/sda10 16.137 +\end{verbatim} 16.138 +\end{quote} 16.139 + 16.140 +Create a logical volume of size 4GB named `myvmdisk1': 16.141 +\begin{quote} 16.142 +\begin{verbatim} 16.143 +# lvcreate -L4096M -n myvmdisk1 vg 16.144 +\end{verbatim} 16.145 +\end{quote} 16.146 + 16.147 +You should now see that you have a \path{/dev/vg/myvmdisk1} device. Make a 16.148 +filesystem, mount it and populate it, e.g.: 16.149 +\begin{quote} 16.150 +\begin{verbatim} 16.151 +# mkfs -t ext3 /dev/vg/myvmdisk1 16.152 +# mount /dev/vg/myvmdisk1 /mnt 16.153 +# cp -ax / /mnt 16.154 +# umount /mnt 16.155 +\end{verbatim} 16.156 +\end{quote} 16.157 + 16.158 +Now configure your VM with the following disk configuration: 16.159 +\begin{quote} 16.160 +\begin{verbatim} 16.161 + disk = [ 'phy:vg/myvmdisk1,sda1,w' ] 16.162 +\end{verbatim} 16.163 +\end{quote} 16.164 + 16.165 +LVM enables you to grow the size of logical volumes, but you'll need 16.166 +to resize the corresponding file system to make use of the new space. 16.167 +Some file systems (e.g.\ ext3) now support online resize. See the LVM 16.168 +manuals for more details. 16.169 + 16.170 +You can also use LVM for creating copy-on-write (CoW) clones of LVM 16.171 +volumes (known as writable persistent snapshots in LVM terminology). 16.172 +This facility is new in Linux 2.6.8, so isn't as stable as one might 16.173 +hope. In particular, using lots of CoW LVM disks consumes a lot of 16.174 +dom0 memory, and error conditions such as running out of disk space 16.175 +are not handled well. Hopefully this will improve in the future. 16.176 + 16.177 +To create two copy-on-write clones of the above file system, you would 16.178 +use the following commands: 16.179 + 16.180 +\begin{quote} 16.181 +\begin{verbatim} 16.182 +# lvcreate -s -L1024M -n myclonedisk1 /dev/vg/myvmdisk1 16.183 +# lvcreate -s -L1024M -n myclonedisk2 /dev/vg/myvmdisk1 16.184 +\end{verbatim} 16.185 +\end{quote} 16.186 + 16.187 +Each of these can grow to have 1GB of differences from the master 16.188 +volume. You can grow the amount of space for storing the differences 16.189 +using the lvextend command, e.g.: 16.190 +\begin{quote} 16.191 +\begin{verbatim} 16.192 +# lvextend -L+100M /dev/vg/myclonedisk1 16.193 +\end{verbatim} 16.194 +\end{quote} 16.195 + 16.196 +Don't let the `differences volume' ever fill up, otherwise LVM gets 16.197 +rather confused. It may be possible to automate the growing process by 16.198 +using \path{dmsetup wait} to spot the volume getting full and then 16.199 +issuing an \path{lvextend}. 16.200 + 16.201 +In principle, it is possible to continue writing to the volume that 16.202 +has been cloned (the changes will not be visible to the clones), but 16.203 +we wouldn't recommend this: have the cloned volume as a `pristine' 16.204 +file system install that isn't mounted directly by any of the virtual 16.205 +machines. 16.206 + 16.207 + 16.208 +\section{Using NFS Root} 16.209 + 16.210 +First, populate a root filesystem in a directory on the server 16.211 +machine. This can be on a distinct physical machine, or simply run 16.212 +within a virtual machine on the same node.
16.213 + 16.214 +Now configure the NFS server to export this filesystem over the 16.215 +network by adding a line to \path{/etc/exports}, for instance: 16.216 + 16.217 +\begin{quote} 16.218 + \begin{small} 16.219 +\begin{verbatim} 16.220 +/export/vm1root 1.2.3.4/24(rw,sync,no_root_squash) 16.221 +\end{verbatim} 16.222 + \end{small} 16.223 +\end{quote} 16.224 + 16.225 +Finally, configure the domain to use NFS root. In addition to the 16.226 +normal variables, you should make sure to set the following values in 16.227 +the domain's configuration file: 16.228 + 16.229 +\begin{quote} 16.230 + \begin{small} 16.231 +\begin{verbatim} 16.232 +root = '/dev/nfs' 16.233 +nfs_server = '2.3.4.5' # substitute IP address of server 16.234 +nfs_root = '/path/to/root' # path to root FS on the server 16.235 +\end{verbatim} 16.236 + \end{small} 16.237 +\end{quote} 16.238 + 16.239 +The domain will need network access at boot time, so either statically 16.240 +configure an IP address using the config variables \path{ip}, 16.241 +\path{netmask}, \path{gateway}, \path{hostname}; or enable DHCP 16.242 +(\path{dhcp='dhcp'}). 16.243 + 16.244 +Note that the Linux NFS root implementation is known to have stability 16.245 +problems under high load (this is not a Xen-specific problem), so this 16.246 +configuration may not be appropriate for critical servers.
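As an illustration of the static alternative above, the relevant configuration lines might look like the following (all addresses are placeholders for values appropriate to your own network):
\begin{quote}
  \begin{small}
\begin{verbatim}
ip       = '2.3.4.6'
netmask  = '255.255.255.0'
gateway  = '2.3.4.1'
hostname = 'vm1'
\end{verbatim}
  \end{small}
\end{quote}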
17.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 17.2 +++ b/docs/src/user/domain_mgmt.tex Tue Sep 20 09:43:46 2005 +0000 17.3 @@ -0,0 +1,203 @@ 17.4 +\chapter{Domain Management Tools} 17.5 + 17.6 +The previous chapter described a simple example of how to configure 17.7 +and start a domain. This chapter summarises the tools available to 17.8 +manage running domains. 17.9 + 17.10 + 17.11 +\section{Command-line Management} 17.12 + 17.13 +Command line management tasks are also performed using the \path{xm} 17.14 +tool. For online help for the commands available, type: 17.15 +\begin{quote} 17.16 + \verb_# xm help_ 17.17 +\end{quote} 17.18 + 17.19 +You can also type \path{xm help $<$command$>$} for more information on 17.20 +a given command. 17.21 + 17.22 +\subsection{Basic Management Commands} 17.23 + 17.24 +The most important \path{xm} commands are: 17.25 +\begin{quote} 17.26 + \verb_# xm list_: Lists all running domains.\\ 17.27 + \verb_# xm consoles_: Gives information about the domain consoles.\\ 17.28 + \verb_# xm console_: Opens a console to a domain (e.g.\ 17.29 + \verb_# xm console myVM_) 17.30 +\end{quote} 17.31 + 17.32 +\subsection{\tt xm list} 17.33 + 17.34 +The output of \path{xm list} is in rows of the following format: 17.35 +\begin{center} {\tt name domid memory cpu state cputime console} 17.36 +\end{center} 17.37 + 17.38 +\begin{quote} 17.39 + \begin{description} 17.40 + \item[name] The descriptive name of the virtual machine. 17.41 + \item[domid] The domain ID this virtual machine is 17.42 + running in. 17.43 + \item[memory] Memory size in megabytes. 17.44 + \item[cpu] The CPU this domain is running on. 17.45 + \item[state] Domain state consists of 5 fields: 17.46 + \begin{description} 17.47 + \item[r] running 17.48 + \item[b] blocked 17.49 + \item[p] paused 17.50 + \item[s] shutdown 17.51 + \item[c] crashed 17.52 + \end{description} 17.53 + \item[cputime] How much CPU time (in seconds) the domain has used so 17.54 + far. 17.55 + \item[console] TCP port accepting connections to the domain's 17.56 + console. 17.57 + \end{description} 17.58 +\end{quote} 17.59 + 17.60 +The \path{xm list} command also supports a long output format when the 17.61 +\path{-l} switch is used. This outputs the full details of the 17.62 +running domains in \xend's SXP configuration format. 17.63 + 17.64 +For example, suppose the system is running the ttylinux domain as 17.65 +described earlier. The list command should produce output somewhat 17.66 +like the following: 17.67 +\begin{verbatim} 17.68 +# xm list 17.69 +Name Id Mem(MB) CPU State Time(s) Console 17.70 +Domain-0 0 251 0 r---- 172.2 17.71 +ttylinux 5 63 0 -b--- 3.0 9605 17.72 +\end{verbatim} 17.73 + 17.74 +Here we can see the details for the ttylinux domain, as well as for 17.75 +domain~0 (which, of course, is always running). Note that the console 17.76 +port for the ttylinux domain is 9605. This can be connected to over TCP 17.77 +using a terminal program (e.g.\ \path{telnet} or, better, 17.78 +\path{xencons}). The simplest way to connect is to use the 17.79 +\path{xm~console} command, specifying the domain name or ID.
To 17.80 +connect to the console of the ttylinux domain, we could use any of the 17.81 +following: 17.82 +\begin{verbatim} 17.83 +# xm console ttylinux 17.84 +# xm console 5 17.85 +# xencons localhost 9605 17.86 +\end{verbatim} 17.87 + 17.88 +\section{Domain Save and Restore} 17.89 + 17.90 +The administrator of a Xen system may suspend a virtual machine's 17.91 +current state into a disk file in domain~0, allowing it to be resumed 17.92 +at a later time. 17.93 + 17.94 +The ttylinux domain described earlier can be suspended to disk using 17.95 +the command: 17.96 +\begin{verbatim} 17.97 +# xm save ttylinux ttylinux.xen 17.98 +\end{verbatim} 17.99 + 17.100 +This will stop the domain named `ttylinux' and save its current state 17.101 +into a file called \path{ttylinux.xen}. 17.102 + 17.103 +To resume execution of this domain, use the \path{xm restore} command: 17.104 +\begin{verbatim} 17.105 +# xm restore ttylinux.xen 17.106 +\end{verbatim} 17.107 + 17.108 +This will restore the state of the domain and restart it. The domain 17.109 +will carry on as before and the console may be reconnected using the 17.110 +\path{xm console} command, as above. 17.111 + 17.112 +\section{Live Migration} 17.113 + 17.114 +Live migration is used to transfer a domain between physical hosts 17.115 +whilst that domain continues to perform its usual activities --- from 17.116 +the user's perspective, the migration should be imperceptible. 17.117 + 17.118 +To perform a live migration, both hosts must be running Xen / \xend\ 17.119 +and the destination host must have sufficient resources (e.g.\ memory 17.120 +capacity) to accommodate the domain after the move. Furthermore, we 17.121 +currently require both source and destination machines to be on the 17.122 +same L2 subnet. 17.123 + 17.124 +Currently, there is no support for providing automatic remote access 17.125 +to filesystems stored on local disk when a domain is migrated. 17.126 +Administrators should choose an appropriate storage solution (e.g.\ 17.127 +SAN, NAS, etc.) to ensure that domain filesystems are also available 17.128 +on their destination node. GNBD is a good method for exporting a 17.129 +volume from one machine to another. iSCSI can do a similar job, but is 17.130 +more complex to set up. 17.131 + 17.132 +When a domain migrates, its MAC and IP addresses move with it; thus it 17.133 +is only possible to migrate VMs within the same layer-2 network and IP 17.134 +subnet. If the destination node is on a different subnet, the 17.135 +administrator would need to manually configure a suitable etherip or 17.136 +IP tunnel in domain~0 of the remote node. 17.137 + 17.138 +A domain may be migrated using the \path{xm migrate} command. To live 17.139 +migrate a domain to another machine, we would use the command: 17.140 + 17.141 +\begin{verbatim} 17.142 +# xm migrate --live mydomain destination.ournetwork.com 17.143 +\end{verbatim} 17.144 + 17.145 +Without the \path{--live} flag, \xend\ simply stops the domain, 17.146 +copies the memory image over to the new node, and restarts it. Since 17.147 +domains can have large allocations, this can be quite time consuming, 17.148 +even on a Gigabit network. With the \path{--live} flag \xend\ attempts 17.149 +to keep the domain running while the migration is in progress, 17.150 +resulting in typical `downtimes' of just 60--300ms. 17.151 + 17.152 +For now it will be necessary to reconnect to the domain's console on 17.153 +the new machine using the \path{xm console} command.
If a migrated 17.154 +domain has any open network connections then they will be preserved, 17.155 +so SSH connections do not have this limitation. 17.156 + 17.157 + 17.158 +\section{Managing Domain Memory} 17.159 + 17.160 +XenLinux domains have the ability to relinquish / reclaim machine 17.161 +memory at the request of the administrator or the user of the domain. 17.162 + 17.163 +\subsection{Setting memory footprints from dom0} 17.164 + 17.165 +The machine administrator can request that a domain alter its memory 17.166 +footprint using the \path{xm set-mem} command. For instance, we can 17.167 +request that our example ttylinux domain reduce its memory footprint 17.168 +to 32 megabytes. 17.169 + 17.170 +\begin{verbatim} 17.171 +# xm set-mem ttylinux 32 17.172 +\end{verbatim} 17.173 + 17.174 +We can now see the result of this in the output of \path{xm list}: 17.175 + 17.176 +\begin{verbatim} 17.177 +# xm list 17.178 +Name Id Mem(MB) CPU State Time(s) Console 17.179 +Domain-0 0 251 0 r---- 172.2 17.180 +ttylinux 5 31 0 -b--- 4.3 9605 17.181 +\end{verbatim} 17.182 + 17.183 +The domain has responded to the request by returning memory to Xen. We 17.184 +can restore the domain to its original size using the command line: 17.185 + 17.186 +\begin{verbatim} 17.187 +# xm set-mem ttylinux 64 17.188 +\end{verbatim} 17.189 + 17.190 +\subsection{Setting memory footprints from within a domain} 17.191 + 17.192 +The virtual file \path{/proc/xen/balloon} allows the owner of a domain 17.193 +to adjust their own memory footprint. Reading the file (e.g.\ 17.194 +\path{cat /proc/xen/balloon}) prints out the current memory footprint 17.195 +of the domain. Writing the file (e.g.\ \path{echo new\_target > 17.196 + /proc/xen/balloon}) requests that the kernel adjust the domain's 17.197 +memory footprint to a new value. 17.198 + 17.199 +\subsection{Setting memory limits} 17.200 + 17.201 +Xen associates a memory size limit with each domain. By default, this 17.202 +is the amount of memory the domain is originally started with, 17.203 +preventing the domain from ever growing beyond this size. To permit a 17.204 +domain to grow beyond its original allocation or to prevent a domain 17.205 +you've shrunk from reclaiming the memory it relinquished, use the 17.206 +\path{xm maxmem} command.
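For example, assuming (as with \path{xm set-mem}) that the size argument is interpreted in megabytes, the following would raise the ttylinux domain's limit to 128MB and then ask the domain to grow to that size:
\begin{verbatim}
# xm maxmem ttylinux 128
# xm set-mem ttylinux 128
\end{verbatim}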
18.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 18.2 +++ b/docs/src/user/glossary.tex Tue Sep 20 09:43:46 2005 +0000 18.3 @@ -0,0 +1,79 @@ 18.4 +\chapter{Glossary of Terms} 18.5 + 18.6 +\begin{description} 18.7 + 18.8 +\item[Atropos] One of the CPU schedulers provided by Xen. Atropos 18.9 + provides domains with absolute shares of the CPU, with timeliness 18.10 + guarantees and a mechanism for sharing out `slack time'. 18.11 + 18.12 +\item[BVT] The BVT scheduler is used to give proportional fair shares 18.13 + of the CPU to domains. 18.14 + 18.15 +\item[Exokernel] A minimal piece of privileged code, similar to a {\bf 18.16 + microkernel} but providing a more `hardware-like' interface to the 18.17 + tasks it manages. This is similar to a paravirtualising VMM like 18.18 + {\bf Xen}, but was designed as a new operating system structure, 18.19 + rather than specifically to run multiple conventional OSs. 18.20 + 18.21 +\item[Domain] A domain is the execution context that contains a 18.22 + running {\bf virtual machine}. The relationship between virtual 18.23 + machines and domains on Xen is similar to that between programs and 18.24 + processes in an operating system: a virtual machine is a persistent 18.25 + entity that resides on disk (somewhat like a program). When it is 18.26 + loaded for execution, it runs in a domain. Each domain has a {\bf 18.27 + domain ID}. 18.28 + 18.29 +\item[Domain 0] The first domain to be started on a Xen machine. 18.30 + Domain 0 is responsible for managing the system. 18.31 + 18.32 +\item[Domain ID] A unique identifier for a {\bf domain}, analogous to 18.33 + a process ID in an operating system. 18.34 + 18.35 +\item[Full virtualisation] An approach to virtualisation which 18.36 + requires no modifications to the hosted operating system, providing 18.37 + the illusion of a complete system of real hardware devices. 18.38 + 18.39 +\item[Hypervisor] An alternative term for {\bf VMM}, used because it 18.40 + means `beyond supervisor', since it is responsible for managing 18.41 + multiple `supervisor' kernels. 18.42 + 18.43 +\item[Live migration] A technique for moving a running virtual machine 18.44 + to another physical host, without stopping it or the services 18.45 + running on it. 18.46 + 18.47 +\item[Microkernel] A small base of code running at the highest 18.48 + hardware privilege level. A microkernel is responsible for sharing 18.49 + CPU and memory (and sometimes other devices) between less privileged 18.50 + tasks running on the system. This is similar to a VMM, particularly 18.51 + a {\bf paravirtualising} VMM, but typically addressing a different 18.52 + problem space and providing a different kind of interface. 18.53 + 18.54 +\item[NetBSD/Xen] A port of NetBSD to the Xen architecture. 18.55 + 18.56 +\item[Paravirtualisation] An approach to virtualisation which requires 18.57 + modifications to the operating system in order to run in a virtual 18.58 + machine. Xen uses paravirtualisation but preserves binary 18.59 + compatibility for user space applications. 18.60 + 18.61 +\item[Shadow pagetables] A technique for hiding the layout of machine 18.62 + memory from a virtual machine's operating system. Used in some {\bf 18.63 + VMMs} to provide the illusion of contiguous physical memory; in 18.64 + Xen this is used during {\bf live migration}. 18.65 + 18.66 +\item[Virtual Machine] The environment in which a hosted operating 18.67 + system runs, providing the abstraction of a dedicated machine.
A 18.68 + virtual machine may be identical to the underlying hardware (as in 18.69 + {\bf full virtualisation}), or it may differ, as in {\bf 18.70 + paravirtualisation}. 18.71 + 18.72 +\item[VMM] Virtual Machine Monitor: the software that allows multiple 18.73 + virtual machines to be multiplexed on a single physical machine. 18.74 + 18.75 +\item[Xen] Xen is a paravirtualising virtual machine monitor, 18.76 + developed primarily by the Systems Research Group at the University 18.77 + of Cambridge Computer Laboratory. 18.78 + 18.79 +\item[XenLinux] Official name for the port of the Linux kernel that 18.80 + runs on Xen. 18.81 + 18.82 +\end{description}
19.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 19.2 +++ b/docs/src/user/installation.tex Tue Sep 20 09:43:46 2005 +0000 19.3 @@ -0,0 +1,394 @@ 19.4 +\chapter{Installation} 19.5 + 19.6 +The Xen distribution includes three main components: Xen itself, ports 19.7 +of Linux 2.4 and 2.6 and NetBSD to run on Xen, and the userspace 19.8 +tools required to manage a Xen-based system. This chapter describes 19.9 +how to install the Xen~2.0 distribution from source. Alternatively, 19.10 +there may be pre-built packages available as part of your operating 19.11 +system distribution. 19.12 + 19.13 + 19.14 +\section{Prerequisites} 19.15 +\label{sec:prerequisites} 19.16 + 19.17 +The following is a full list of prerequisites. Items marked `$\dag$' 19.18 +are required by the \xend\ control tools, and hence required if you 19.19 +want to run more than one virtual machine; items marked `$*$' are only 19.20 +required if you wish to build from source. 19.21 +\begin{itemize} 19.22 +\item A working Linux distribution using the GRUB bootloader and 19.23 + running on a P6-class (or newer) CPU. 19.24 +\item [$\dag$] The \path{iproute2} package. 19.25 +\item [$\dag$] The Linux bridge-utils\footnote{Available from {\tt 19.26 + http://bridge.sourceforge.net}} (e.g., \path{/sbin/brctl}) 19.27 +\item [$\dag$] An installation of Twisted~v1.3 or 19.28 + above\footnote{Available from {\tt http://www.twistedmatrix.com}}. 19.29 + There may be a binary package available for your distribution; 19.30 + alternatively it can be installed by running `{\sl make 19.31 + install-twisted}' in the root of the Xen source tree. 19.32 +\item [$*$] Build tools (gcc v3.2.x or v3.3.x, binutils, GNU make). 19.33 +\item [$*$] Development installation of libcurl (e.g., libcurl-devel) 19.34 +\item [$*$] Development installation of zlib (e.g., zlib-dev). 19.35 +\item [$*$] Development installation of Python v2.2 or later (e.g., 19.36 + python-dev). 19.37 +\item [$*$] \LaTeX\ and transfig are required to build the 19.38 + documentation. 19.39 +\end{itemize} 19.40 + 19.41 +Once you have satisfied the relevant prerequisites, you can now 19.42 +install either a binary or source distribution of Xen. 19.43 + 19.44 + 19.45 +\section{Installing from Binary Tarball} 19.46 + 19.47 +Pre-built tarballs are available for download from the Xen download 19.48 +page 19.49 +\begin{quote} {\tt http://xen.sf.net} 19.50 +\end{quote} 19.51 + 19.52 +Once you've downloaded the tarball, simply unpack and install: 19.53 +\begin{verbatim} 19.54 +# tar zxvf xen-2.0-install.tgz 19.55 +# cd xen-2.0-install 19.56 +# sh ./install.sh 19.57 +\end{verbatim} 19.58 + 19.59 +Once you've installed the binaries you need to configure your system 19.60 +as described in Section~\ref{s:configure}. 19.61 + 19.62 + 19.63 +\section{Installing from Source} 19.64 + 19.65 +This section describes how to obtain, build, and install Xen from 19.66 +source. 19.67 + 19.68 +\subsection{Obtaining the Source} 19.69 + 19.70 +The Xen source tree is available as either a compressed source tar 19.71 +ball or as a clone of our master BitKeeper repository. 
19.72 + 19.73 +\begin{description} 19.74 +\item[Obtaining the Source Tarball]\mbox{} \\ 19.75 + Stable versions (and daily snapshots) of the Xen source tree are 19.76 + available as compressed tarballs from the Xen download page 19.77 + \begin{quote} {\tt http://xen.sf.net} 19.78 + \end{quote} 19.79 + 19.80 +\item[Using BitKeeper]\mbox{} \\ 19.81 + If you wish to install Xen from a clone of our latest BitKeeper 19.82 + repository then you will need to install the BitKeeper tools. 19.83 + Download instructions for BitKeeper can be obtained by filling out 19.84 + the form at: 19.85 + \begin{quote} {\tt http://www.bitmover.com/cgi-bin/download.cgi} 19.86 +\end{quote} 19.87 +The public master BK repository for the 2.0 release lives at: 19.88 +\begin{quote} {\tt bk://xen.bkbits.net/xen-2.0.bk} 19.89 +\end{quote} 19.90 +You can use BitKeeper to download it and keep it updated with the 19.91 +latest features and fixes. 19.92 + 19.93 +Change to the directory in which you want to put the source code, then 19.94 +run: 19.95 +\begin{verbatim} 19.96 +# bk clone bk://xen.bkbits.net/xen-2.0.bk 19.97 +\end{verbatim} 19.98 + 19.99 +Under your current directory, a new directory named \path{xen-2.0.bk} 19.100 +will have been created, which contains all the source code for Xen, the OS 19.101 +ports, and the control tools. You can update your repository with the 19.102 +latest changes at any time by running: 19.103 +\begin{verbatim} 19.104 +# cd xen-2.0.bk # to change into the local repository 19.105 +# bk pull # to update the repository 19.106 +\end{verbatim} 19.107 +\end{description} 19.108 + 19.109 +% \section{The distribution} 19.110 +% 19.111 +% The Xen source code repository is structured as follows: 19.112 +% 19.113 +% \begin{description} 19.114 +% \item[\path{tools/}] Xen node controller daemon (Xend), command line 19.115 +% tools, control libraries 19.116 +% \item[\path{xen/}] The Xen VMM. 19.117 +% \item[\path{linux-*-xen-sparse/}] Xen support for Linux. 19.118 +% \item[\path{linux-*-patches/}] Experimental patches for Linux. 19.119 +% \item[\path{netbsd-*-xen-sparse/}] Xen support for NetBSD. 19.120 +% \item[\path{docs/}] Various documentation files for users and 19.121 +% developers. 19.122 +% \item[\path{extras/}] Bonus extras. 19.123 +% \end{description} 19.124 + 19.125 +\subsection{Building from Source} 19.126 + 19.127 +The top-level Xen Makefile includes a target `world' that will do the 19.128 +following: 19.129 + 19.130 +\begin{itemize} 19.131 +\item Build Xen. 19.132 +\item Build the control tools, including \xend. 19.133 +\item Download (if necessary) and unpack the Linux 2.6 source code, 19.134 + and patch it for use with Xen. 19.135 +\item Build a Linux kernel to use in domain 0 and a smaller 19.136 + unprivileged kernel, which can optionally be used for unprivileged 19.137 + virtual machines. 19.138 +\end{itemize} 19.139 + 19.140 +After the build has completed, you should have a top-level directory 19.141 +called \path{dist/} in which all resulting targets will be placed; of 19.142 +particular interest are the two XenLinux kernel images, one 19.143 +with a `-xen0' extension, which contains hardware device drivers and 19.144 +drivers for Xen's virtual devices, and one with a `-xenU' extension 19.145 +that just contains the virtual ones. These are found in 19.146 +\path{dist/install/boot/} along with the image for Xen itself and the 19.147 +configuration files used during the build.
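For example, assuming the prerequisites listed earlier are installed, the entire distribution can be built from the top of a freshly cloned tree with:
\begin{quote}
\begin{verbatim}
# cd xen-2.0.bk
# make world
\end{verbatim}
\end{quote}
On success, the results are placed under \path{dist/} as described above.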
19.148 + 19.149 +The NetBSD port can be built using: 19.150 +\begin{quote} 19.151 +\begin{verbatim} 19.152 +# make netbsd20 19.153 +\end{verbatim} 19.154 +\end{quote} 19.155 +The NetBSD port is built using a snapshot of the netbsd-2-0 CVS branch. 19.156 +The snapshot is downloaded as part of the build process if it is not 19.157 +yet present in the \path{NETBSD\_SRC\_PATH} search path. The build 19.158 +process also downloads a toolchain which includes all the tools 19.159 +necessary to build the NetBSD kernel under Linux. 19.160 + 19.161 +To further customize the set of kernels built, you need to edit the 19.162 +top-level Makefile. Look for the line: 19.163 + 19.164 +\begin{quote} 19.165 +\begin{verbatim} 19.166 +KERNELS ?= mk.linux-2.6-xen0 mk.linux-2.6-xenU 19.167 +\end{verbatim} 19.168 +\end{quote} 19.169 + 19.170 +You can edit this line to include any set of operating system kernels 19.171 +which have configurations in the top-level \path{buildconfigs/} 19.172 +directory, for example \path{mk.linux-2.4-xenU} to build a Linux 2.4 19.173 +kernel containing only virtual device drivers. 19.174 + 19.175 +%% Inspect the Makefile if you want to see what goes on during a 19.176 +%% build. Building Xen and the tools is straightforward, but XenLinux 19.177 +%% is more complicated. The makefile needs a `pristine' Linux kernel 19.178 +%% tree to which it will then add the Xen architecture files. You can 19.179 +%% tell the makefile the location of the appropriate Linux compressed 19.180 +%% tar file by 19.181 +%% setting the LINUX\_SRC environment variable, e.g. \\ 19.182 +%% \verb!# LINUX_SRC=/tmp/linux-2.6.11.tar.bz2 make world! \\ or by 19.183 +%% placing the tar file somewhere in the search path of {\tt 19.184 +%% LINUX\_SRC\_PATH} which defaults to `{\tt .:..}'. If the 19.185 +%% makefile can't find a suitable kernel tar file it attempts to 19.186 +%% download it from kernel.org (this won't work if you're behind a 19.187 +%% firewall). 19.188 + 19.189 +%% After untaring the pristine kernel tree, the makefile uses the {\tt 19.190 +%% mkbuildtree} script to add the Xen patches to the kernel. 19.191 + 19.192 + 19.193 +%% The procedure is similar to build the Linux 2.4 port: \\ 19.194 +%% \verb!# LINUX_SRC=/path/to/linux2.4/source make linux24! 19.195 + 19.196 + 19.197 +%% \framebox{\parbox{5in}{ 19.198 +%% {\bf Distro specific:} \\ 19.199 +%% {\it Gentoo} --- if not using udev (most installations, 19.200 +%% currently), you'll need to enable devfs and devfs mount at boot 19.201 +%% time in the xen0 config. }} 19.202 + 19.203 +\subsection{Custom XenLinux Builds} 19.204 + 19.205 +% If you have an SMP machine you may wish to give the {\tt '-j4'} 19.206 +% argument to make to get a parallel build. 19.207 + 19.208 +If you wish to build a customized XenLinux kernel (e.g.\ to support 19.209 +additional devices or enable distribution-required features), you can 19.210 +use the standard Linux configuration mechanisms, specifying that the 19.211 +architecture being built for is \path{xen}, e.g.: 19.212 +\begin{quote} 19.213 +\begin{verbatim} 19.214 +# cd linux-2.6.11-xen0 19.215 +# make ARCH=xen xconfig 19.216 +# cd ..
19.217 +# make 19.218 +\end{verbatim} 19.219 +\end{quote} 19.220 + 19.221 +You can also copy an existing Linux configuration (\path{.config}) 19.222 +into \path{linux-2.6.11-xen0} and execute: 19.223 +\begin{quote} 19.224 +\begin{verbatim} 19.225 +# make ARCH=xen oldconfig 19.226 +\end{verbatim} 19.227 +\end{quote} 19.228 + 19.229 +You may be prompted with some Xen-specific options; we advise 19.230 +accepting the defaults for these options. 19.231 + 19.232 +Note that the only difference between the two types of Linux kernel 19.233 +that are built is the configuration file used for each. The `U' 19.234 +suffixed (unprivileged) versions don't contain any of the physical 19.235 +hardware device drivers, leading to a 30\% reduction in size; hence 19.236 +you may prefer these for your non-privileged domains. The `0' 19.237 +suffixed privileged versions can be used to boot the system, as well 19.238 +as in driver domains and unprivileged domains. 19.239 + 19.240 +\subsection{Installing the Binaries} 19.241 + 19.242 +The files produced by the build process are stored under the 19.243 +\path{dist/install/} directory. To install them in their default 19.244 +locations, do: 19.245 +\begin{quote} 19.246 +\begin{verbatim} 19.247 +# make install 19.248 +\end{verbatim} 19.249 +\end{quote} 19.250 + 19.251 +Alternatively, users with special installation requirements may wish 19.252 +to install them manually by copying the files to their appropriate 19.253 +destinations. 19.254 + 19.255 +%% Files in \path{install/boot/} include: 19.256 +%% \begin{itemize} 19.257 +%% \item \path{install/boot/xen-2.0.gz} Link to the Xen 'kernel' 19.258 +%% \item \path{install/boot/vmlinuz-2.6-xen0} Link to domain 0 19.259 +%% XenLinux kernel 19.260 +%% \item \path{install/boot/vmlinuz-2.6-xenU} Link to unprivileged 19.261 +%% XenLinux kernel 19.262 +%% \end{itemize} 19.263 + 19.264 +The \path{dist/install/boot} directory will also contain the config 19.265 +files used for building the XenLinux kernels, and also versions of Xen 19.266 +and XenLinux kernels that contain debug symbols (\path{xen-syms-2.0.6} 19.267 +and \path{vmlinux-syms-2.6.11.11-xen0}) which are essential for 19.268 +interpreting crash dumps. Retain these files as the developers may 19.269 +wish to see them if you post to the mailing list. 19.270 + 19.271 + 19.272 +\section{Configuration} 19.273 +\label{s:configure} 19.274 + 19.275 +Once you have built and installed the Xen distribution, it is simple 19.276 +to prepare the machine for booting and running Xen. 19.277 + 19.278 +\subsection{GRUB Configuration} 19.279 + 19.280 +An entry should be added to \path{grub.conf} (often found under 19.281 +\path{/boot/} or \path{/boot/grub/}) to allow Xen / XenLinux to boot. 19.282 +This file is sometimes called \path{menu.lst}, depending on your 19.283 +distribution. The entry should look something like the following: 19.284 + 19.285 +{\small 19.286 +\begin{verbatim} 19.287 +title Xen 2.0 / XenLinux 2.6 19.288 + kernel /boot/xen-2.0.gz dom0_mem=131072 19.289 + module /boot/vmlinuz-2.6-xen0 root=/dev/sda4 ro console=tty0 19.290 +\end{verbatim} 19.291 +} 19.292 + 19.293 +The kernel line tells GRUB where to find Xen itself and what boot 19.294 +parameters should be passed to it (in this case, setting domain 0's 19.295 +memory allocation in kilobytes). 19.296 +For more details on the various Xen boot parameters see 19.297 +Section~\ref{s:xboot}.
19.298 + 19.299 +The module line of the configuration describes the location of the 19.300 +XenLinux kernel that Xen should start and the parameters that should 19.301 +be passed to it (these are standard Linux parameters, identifying the 19.302 +root device and specifying it be initially mounted read only and 19.303 +instructing that console output be sent to the screen). Some 19.304 +distributions such as SuSE do not require the \path{ro} parameter. 19.305 + 19.306 +%% \framebox{\parbox{5in}{ 19.307 +%% {\bf Distro specific:} \\ 19.308 +%% {\it SuSE} --- Omit the {\tt ro} option from the XenLinux 19.309 +%% kernel command line, since the partition won't be remounted rw 19.310 +%% during boot. }} 19.311 + 19.312 + 19.313 +If you want to use an initrd, just add another \path{module} line to 19.314 +the configuration, as usual: 19.315 + 19.316 +{\small 19.317 +\begin{verbatim} 19.318 + module /boot/my_initrd.gz 19.319 +\end{verbatim} 19.320 +} 19.321 + 19.322 +As always when installing a new kernel, it is recommended that you do 19.323 +not delete existing menu options from \path{menu.lst} --- you may want 19.324 +to boot your old Linux kernel in future, particularly if you have 19.325 +problems. 19.326 + 19.327 +\subsection{Serial Console (optional)} 19.328 + 19.329 +%% kernel /boot/xen-2.0.gz dom0_mem=131072 com1=115200,8n1 19.330 +%% module /boot/vmlinuz-2.6-xen0 root=/dev/sda4 ro 19.331 + 19.332 + 19.333 +In order to configure Xen serial console output, it is necessary to 19.334 +add a boot option to your GRUB config; e.g.\ replace the above kernel 19.335 +line with: 19.336 +\begin{quote} 19.337 +{\small 19.338 +\begin{verbatim} 19.339 + kernel /boot/xen.gz dom0_mem=131072 com1=115200,8n1 19.340 +\end{verbatim}} 19.341 +\end{quote} 19.342 + 19.343 +This configures Xen to output on COM1 at 115,200 baud, 8 data bits, 1 19.344 +stop bit and no parity. Modify these parameters for your setup. 19.345 + 19.346 +One can also configure XenLinux to share the serial console; to 19.347 +achieve this append ``\path{console=ttyS0}'' to your module line. 19.348 + 19.349 +If you wish to be able to log in over the XenLinux serial console it 19.350 +is necessary to add a line to \path{/etc/inittab}, just as per 19.351 +regular Linux. Simply add the line: 19.352 +\begin{quote} {\small {\tt c:2345:respawn:/sbin/mingetty ttyS0}} 19.353 +\end{quote} 19.354 + 19.355 +and you should be able to log in. Note that logging in as 19.356 +root over the serial line requires adding \path{ttyS0} to 19.357 +\path{/etc/securetty} on most modern distributions. 19.358 + 19.359 +\subsection{TLS Libraries} 19.360 + 19.361 +Users of the XenLinux 2.6 kernel should disable Thread Local Storage 19.362 +(e.g.\ by doing a \path{mv /lib/tls /lib/tls.disabled}) before 19.363 +attempting to run with a XenLinux kernel\footnote{If you boot without 19.364 + first disabling TLS, you will get a warning message during the boot 19.365 + process. In this case, simply perform the rename after the machine 19.366 + is up and then run \texttt{/sbin/ldconfig} to make it take effect.}. 19.367 +You can always re-enable it by restoring the directory to its original 19.368 +location (i.e.\ \path{mv /lib/tls.disabled /lib/tls}). 19.369 + 19.370 +The reason for this is that the current TLS implementation uses 19.371 +segmentation in a way that is not permissible under Xen. If TLS is 19.372 +not disabled, an emulation mode is used within Xen which reduces 19.373 +performance substantially.
19.374 + 19.375 +We hope that this issue can be resolved by working with Linux 19.376 +distribution vendors to implement a minor backward-compatible change 19.377 +to the TLS library. 19.378 + 19.379 + 19.380 +\section{Booting Xen} 19.381 + 19.382 +It should now be possible to restart the system and use Xen. Reboot 19.383 +as usual but choose the new Xen option when the Grub screen appears. 19.384 + 19.385 +What follows should look much like a conventional Linux boot. The 19.386 +first portion of the output comes from Xen itself, supplying low level 19.387 +information about itself and the machine it is running on. The 19.388 +following portion of the output comes from XenLinux. 19.389 + 19.390 +You may see some errors during the XenLinux boot. These are not 19.391 +necessarily anything to worry about --- they may result from kernel 19.392 +configuration differences between your XenLinux kernel and the one you 19.393 +usually use. 19.394 + 19.395 +When the boot completes, you should be able to log into your system as 19.396 +usual. If you are unable to log in to your system running Xen, you 19.397 +should still be able to reboot with your normal Linux kernel.
20.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 20.2 +++ b/docs/src/user/introduction.tex Tue Sep 20 09:43:46 2005 +0000 20.3 @@ -0,0 +1,143 @@ 20.4 +\chapter{Introduction} 20.5 + 20.6 + 20.7 +Xen is a \emph{paravirtualising} virtual machine monitor (VMM), or 20.8 +`hypervisor', for the x86 processor architecture. Xen can securely 20.9 +execute multiple virtual machines on a single physical system with 20.10 +close-to-native performance. The virtual machine technology 20.11 +facilitates enterprise-grade functionality, including: 20.12 + 20.13 +\begin{itemize} 20.14 +\item Virtual machines with performance close to native hardware. 20.15 +\item Live migration of running virtual machines between physical 20.16 + hosts. 20.17 +\item Excellent hardware support (supports most Linux device drivers). 20.18 +\item Sandboxed, re-startable device drivers. 20.19 +\end{itemize} 20.20 + 20.21 +Paravirtualisation permits very high performance virtualisation, even 20.22 +on architectures like x86 that are traditionally very hard to 20.23 +virtualise. 20.24 + 20.25 +The drawback of this approach is that it requires operating systems to 20.26 +be \emph{ported} to run on Xen. Porting an OS to run on Xen is 20.27 +similar to supporting a new hardware platform; however, the process is 20.28 +simplified because the paravirtual machine architecture is very 20.29 +similar to the underlying native hardware. Even though operating 20.30 +system kernels must explicitly support Xen, a key feature is that user 20.31 +space applications and libraries \emph{do not} require modification. 20.32 + 20.33 +Xen support is available for a growing number of operating systems: 20.34 +right now, Linux 2.4, Linux 2.6 and NetBSD are available for Xen 2.0. 20.35 +A FreeBSD port is undergoing testing and will be incorporated into the 20.36 +release soon. Other OS ports, including Plan 9, are in progress. We 20.37 +hope that the arch-xen patches will be incorporated into the 20.38 +mainstream releases of these operating systems in due course (as has 20.39 +already happened for NetBSD). 20.40 + 20.41 +Possible usage scenarios for Xen include: 20.42 + 20.43 +\begin{description} 20.44 +\item [Kernel development.] Test and debug kernel modifications in a 20.45 + sandboxed virtual machine --- no need for a separate test machine. 20.46 +\item [Multiple OS configurations.] Run multiple operating systems 20.47 + simultaneously, for instance for compatibility or QA purposes. 20.48 +\item [Server consolidation.] Move multiple servers onto a single 20.49 + physical host with performance and fault isolation provided at 20.50 + virtual machine boundaries. 20.51 +\item [Cluster computing.] Management at VM granularity provides more 20.52 + flexibility than separately managing each physical host, but better 20.53 + control and isolation than single-system image solutions, 20.54 + particularly by using live migration for load balancing. 20.55 +\item [Hardware support for custom OSes.] Allow development of new 20.56 + OSes while benefiting from the wide-ranging hardware support of 20.57 + existing OSes such as Linux. 20.58 +\end{description} 20.59 + 20.60 + 20.61 +\section{Structure of a Xen-Based System} 20.62 + 20.63 +A Xen system has multiple layers, the lowest and most privileged of 20.64 +which is Xen itself. 20.65 + 20.66 +Xen in turn may host multiple \emph{guest} operating systems, each of 20.67 +which is executed within a secure virtual machine (in Xen terminology, 20.68 +a \emph{domain}).
Domains are scheduled by Xen to make effective use 20.69 +of the available physical CPUs. Each guest OS manages its own 20.70 +applications, which includes responsibility for scheduling each 20.71 +application within the time allotted to the VM by Xen. 20.72 + 20.73 +The first domain, \emph{domain 0}, is created automatically when the 20.74 +system boots and has special management privileges. Domain 0 builds 20.75 +other domains and manages their virtual devices. It also performs 20.76 +administrative tasks such as suspending, resuming and migrating other 20.77 +virtual machines. 20.78 + 20.79 +Within domain 0, a process called \emph{xend} runs to manage the 20.80 +system. \Xend is responsible for managing virtual machines and 20.81 +providing access to their consoles. Commands are issued to \xend over 20.82 +an HTTP interface, either from a command-line tool or from a web 20.83 +browser. 20.84 + 20.85 + 20.86 +\section{Hardware Support} 20.87 + 20.88 +Xen currently runs only on the x86 architecture, requiring a `P6' or 20.89 +newer processor (e.g. Pentium Pro, Celeron, Pentium II, Pentium III, 20.90 +Pentium IV, Xeon, AMD Athlon, AMD Duron). Multiprocessor machines are 20.91 +supported, and we also have basic support for HyperThreading (SMT), 20.92 +although this remains a topic for ongoing research. A port 20.93 +specifically for x86/64 is in progress, although Xen already runs on 20.94 +such systems in 32-bit legacy mode. In addition a port to the IA64 20.95 +architecture is approaching completion. We hope to add other 20.96 +architectures such as PPC and ARM in due course. 20.97 + 20.98 +Xen can currently use up to 4GB of memory. It is possible for x86 20.99 +machines to address up to 64GB of physical memory but there are no 20.100 +current plans to support these systems: The x86/64 port is the planned 20.101 +route to supporting larger memory sizes. 20.102 + 20.103 +Xen offloads most of the hardware support issues to the guest OS 20.104 +running in Domain~0. Xen itself contains only the code required to 20.105 +detect and start secondary processors, set up interrupt routing, and 20.106 +perform PCI bus enumeration. Device drivers run within a privileged 20.107 +guest OS rather than within Xen itself. This approach provides 20.108 +compatibility with the majority of device hardware supported by Linux. 20.109 +The default XenLinux build contains support for relatively modern 20.110 +server-class network and disk hardware, but you can add support for 20.111 +other hardware by configuring your XenLinux kernel in the normal way. 20.112 + 20.113 + 20.114 +\section{History} 20.115 + 20.116 +Xen was originally developed by the Systems Research Group at the 20.117 +University of Cambridge Computer Laboratory as part of the XenoServers 20.118 +project, funded by the UK-EPSRC. 20.119 + 20.120 +XenoServers aim to provide a `public infrastructure for global 20.121 +distributed computing', and Xen plays a key part in that, allowing us 20.122 +to efficiently partition a single machine to enable multiple 20.123 +independent clients to run their operating systems and applications in 20.124 +an environment providing protection, resource isolation and 20.125 +accounting. 
The project web page contains further information along 20.126 +with pointers to papers and technical reports: 20.127 +\path{http://www.cl.cam.ac.uk/xeno} 20.128 + 20.129 +Xen has since grown into a fully-fledged project in its own right, 20.130 +enabling us to investigate interesting research issues regarding the 20.131 +best techniques for virtualising resources such as the CPU, memory, 20.132 +disk and network. The project has been bolstered by support from 20.133 +Intel Research Cambridge, and HP Labs, who are now working closely 20.134 +with us. 20.135 + 20.136 +Xen was first described in a paper presented at SOSP in 20.137 +2003\footnote{\tt 20.138 + http://www.cl.cam.ac.uk/netos/papers/2003-xensosp.pdf}, and the 20.139 +first public release (1.0) was made that October. Since then, Xen has 20.140 +significantly matured and is now used in production scenarios on many 20.141 +sites. 20.142 + 20.143 +Xen 2.0 features greatly enhanced hardware support, configuration 20.144 +flexibility, usability and a larger complement of supported operating 20.145 +systems. This latest release takes Xen a step closer to becoming the 20.146 +definitive open source solution for virtualisation.
21.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 21.2 +++ b/docs/src/user/redhat.tex Tue Sep 20 09:43:46 2005 +0000 21.3 @@ -0,0 +1,61 @@ 21.4 +\chapter{Installing Xen / XenLinux on Red~Hat or Fedora Core} 21.5 + 21.6 +When using Xen / XenLinux on a standard Linux distribution there are a 21.7 +couple of things to watch out for. 21.8 + 21.9 +Note that, because domains greater than 0 don't have any privileged 21.10 +access at all, certain commands in the default boot sequence will fail, 21.11 +e.g.\ attempts to update the hwclock, change the console font, update 21.12 +the keytable map, start apmd (power management), or gpm (mouse 21.13 +cursor). Either ignore the errors (they should be harmless), or 21.14 +remove them from the startup scripts. Deleting the following links 21.15 +is a good start: {\path{S24pcmcia}}, {\path{S09isdn}}, 21.16 +{\path{S17keytable}}, {\path{S26apmd}}, {\path{S85gpm}}. 21.17 + 21.18 +If you want to use a single root file system that works cleanly for 21.19 +both domain~0 and unprivileged domains, a useful trick is to use 21.20 +different `init' run levels. For example, use run level 3 for 21.21 +domain~0, and run level 4 for other domains. This enables different 21.22 +startup scripts to be run depending on the run level number passed 21.23 +on the kernel command line. 21.24 + 21.25 +If using NFS root file systems mounted either from an external server 21.26 +or from domain~0, there are a couple of other gotchas. The default 21.27 +{\path{/etc/sysconfig/iptables}} rules block NFS, so part way through 21.28 +the boot sequence things will suddenly go dead. 21.29 + 21.30 +If you're planning on having a separate NFS {\path{/usr}} partition, 21.31 +the RH9 boot scripts don't make life easy --- they attempt to mount NFS 21.32 +file systems way too late in the boot process. The easiest way I found 21.33 +to do this was to have a {\path{/linuxrc}} script run ahead of 21.34 +{\path{/sbin/init}} that mounts {\path{/usr}}: 21.35 + 21.36 +\begin{quote} 21.37 + \begin{small}\begin{verbatim} 21.38 + #!/bin/bash 21.39 + /sbin/ifconfig lo 127.0.0.1 21.40 + /sbin/portmap 21.41 + /bin/mount /usr 21.42 + exec /sbin/init "$@" <>/dev/console 2>&1 21.43 +\end{verbatim}\end{small} 21.44 +\end{quote} 21.45 + 21.46 +%% $ XXX SMH: font lock fix :-) 21.47 + 21.48 +The one slight complication with the above is that 21.49 +{\path{/sbin/portmap}} is dynamically linked against 21.50 +{\path{/usr/lib/libwrap.so.0}}. Since this is in {\path{/usr}}, it 21.51 +won't work. This can be solved by copying the file (and link) below 21.52 +the {\path{/usr}} mount point, and just letting the file be `covered' when 21.53 +the mount happens. 21.54 + 21.55 +In some installations, where a shared read-only {\path{/usr}} is being 21.56 +used, it may be desirable to move other large directories over into 21.57 +the read-only {\path{/usr}}. For example, you might replace 21.58 +{\path{/bin}}, {\path{/lib}} and {\path{/sbin}} with links into 21.59 +{\path{/usr/root/bin}}, {\path{/usr/root/lib}} and 21.60 +{\path{/usr/root/sbin}} respectively. This creates other problems for 21.61 +running the {\path{/linuxrc}} script, requiring bash, portmap, mount, 21.62 +ifconfig, and a handful of other shared libraries to be copied below 21.63 +the mount point --- a simple statically-linked C program would solve 21.64 +this problem.
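The chapter above closes by suggesting a statically-linked C program in place of the bash /linuxrc script. What follows is a minimal sketch of such a program, not part of the changeset: it assumes a local ext3 /usr on /dev/sda6, so the device, filesystem type and paths are illustrative; an NFS-mounted /usr would additionally need the loopback interface up and portmap running before the mount, which this sketch does not attempt. Build with `gcc -static' and install as /linuxrc.

/* Hypothetical statically-linked /linuxrc: mount /usr, then hand
 * control to the real init.  All names here are assumptions --
 * adjust the device and fs type to match your system. */
#include <stdio.h>
#include <sys/mount.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
        (void)argc;

        /* For a local ext3 /usr partition; an NFS /usr would need
         * fs type "nfs" and kernel NFS mount data instead. */
        if (mount("/dev/sda6", "/usr", "ext3", MS_RDONLY, NULL) != 0)
                perror("mount /usr");   /* carry on; init may still work */

        execv("/sbin/init", argv);      /* replaces this process on success */
        perror("exec /sbin/init");
        return 1;
}

Because this binary is statically linked and calls mount(2) directly, none of bash, mount, or their shared libraries need to be copied below the mount point.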
22.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 22.2 +++ b/docs/src/user/start_addl_dom.tex Tue Sep 20 09:43:46 2005 +0000 22.3 @@ -0,0 +1,172 @@ 22.4 +\chapter{Starting Additional Domains} 22.5 + 22.6 +The first step in creating a new domain is to prepare a root 22.7 +filesystem for it to boot from. Typically, this might be stored in a 22.8 +normal partition, an LVM or other volume manager partition, a disk 22.9 +file or on an NFS server. A simple way to do this is to boot 22.10 +from your standard OS install CD and install the distribution into 22.11 +another partition on your hard drive. 22.12 + 22.13 +To start the \xend\ control daemon, type 22.14 +\begin{quote} 22.15 + \verb!# xend start! 22.16 +\end{quote} 22.17 + 22.18 +If you wish the daemon to start automatically, see the instructions in 22.19 +Section~\ref{s:xend}. Once the daemon is running, you can use the 22.20 +\path{xm} tool to monitor and maintain the domains running on your 22.21 +system. This chapter provides only a brief tutorial. We provide full 22.22 +details of the \path{xm} tool in the next chapter. 22.23 + 22.24 +% \section{From the web interface} 22.25 +% 22.26 +% Boot the Xen machine and start Xensv (see Chapter~\ref{cha:xensv} 22.27 +% for more details) using the command: \\ 22.28 +% \verb_# xensv start_ \\ 22.29 +% This will also start Xend (see Chapter~\ref{cha:xend} for more 22.30 +% information). 22.31 +% 22.32 +% The domain management interface will then be available at {\tt 22.33 +% http://your\_machine:8080/}. This provides a user friendly wizard 22.34 +% for starting domains and functions for managing running domains. 22.35 +% 22.36 +% \section{From the command line} 22.37 + 22.38 + 22.39 +\section{Creating a Domain Configuration File} 22.40 + 22.41 +Before you can start an additional domain, you must create a 22.42 +configuration file. We provide two example files which you can use as 22.43 +a starting point: 22.44 +\begin{itemize} 22.45 +\item \path{/etc/xen/xmexample1} is a simple template configuration 22.46 + file for describing a single VM. 22.47 + 22.48 +\item \path{/etc/xen/xmexample2} is a template description that 22.49 + is intended to be reused for multiple virtual machines. Setting the 22.50 + value of the \path{vmid} variable on the \path{xm} command line 22.51 + fills in parts of this template. 22.52 +\end{itemize} 22.53 + 22.54 +Copy one of these files and edit it as appropriate. Typical values 22.55 +you may wish to edit include: 22.56 + 22.57 +\begin{quote} 22.58 +\begin{description} 22.59 +\item[kernel] Set this to the path of the kernel you compiled for use 22.60 + with Xen (e.g.\ \path{kernel = `/boot/vmlinuz-2.6-xenU'}) 22.61 +\item[memory] Set this to the size of the domain's memory in megabytes 22.62 + (e.g.\ \path{memory = 64}) 22.63 +\item[disk] Set the first entry in this list to calculate the offset 22.64 + of the domain's root partition, based on the domain ID. Set the 22.65 + second to the location of \path{/usr} if you are sharing it between 22.66 + domains (e.g.\ \path{disk = [`phy:your\_hard\_drive\%d,sda1,w' \% 22.67 + (base\_partition\_number + vmid), 22.68 + `phy:your\_usr\_partition,sda6,r' ]}) 22.69 +\item[dhcp] Uncomment the dhcp variable, so that the domain will 22.70 + receive its IP address from a DHCP server (e.g.\ \path{dhcp=`dhcp'}) 22.71 +\end{description} 22.72 +\end{quote} 22.73 + 22.74 +You may also want to edit the {\bf vif} variable in order to choose 22.75 +the MAC address of the virtual ethernet interface yourself.
For 22.76 +example: 22.77 +\begin{quote} 22.78 +\verb_vif = [`mac=00:06:AA:F6:BB:B3']_ 22.79 +\end{quote} 22.80 +If you do not set this variable, \xend\ will automatically generate a 22.81 +random MAC address from an unused range. 22.82 + 22.83 + 22.84 +\section{Booting the Domain} 22.85 + 22.86 +The \path{xm} tool provides a variety of commands for managing 22.87 +domains. Use the \path{create} command to start new domains. Assuming 22.88 +you've created a configuration file \path{myvmconf} based around 22.89 +\path{/etc/xen/xmexample2}, to start a domain with virtual machine 22.90 +ID~1 you should type: 22.91 + 22.92 +\begin{quote} 22.93 +\begin{verbatim} 22.94 +# xm create -c myvmconf vmid=1 22.95 +\end{verbatim} 22.96 +\end{quote} 22.97 + 22.98 +The \path{-c} switch causes \path{xm} to turn into the domain's 22.99 +console after creation. The \path{vmid=1} argument sets the \path{vmid} 22.100 +variable used in the \path{myvmconf} file. 22.101 + 22.102 +You should see the console boot messages from the new domain appearing 22.103 +in the terminal in which you typed the command, culminating in a login 22.104 +prompt. 22.105 + 22.106 + 22.107 +\section{Example: ttylinux} 22.108 + 22.109 +Ttylinux is a very small Linux distribution, designed to require very 22.110 +few resources. We will use it as a concrete example of how to start a 22.111 +Xen domain. Most users will probably want to install a full-featured 22.112 +distribution once they have mastered the basics\footnote{ttylinux is 22.113 + maintained by Pascal Schmidt. You can download source packages from 22.114 + the distribution's home page: {\tt 22.115 + http://www.minimalinux.org/ttylinux/}}. 22.116 + 22.117 +\begin{enumerate} 22.118 +\item Download and extract the ttylinux disk image from the Files 22.119 + section of the project's SourceForge site (see 22.120 + \path{http://sf.net/projects/xen/}). 22.121 +\item Create a configuration file like the following: 22.122 +\begin{verbatim} 22.123 +kernel = "/boot/vmlinuz-2.6-xenU" 22.124 +memory = 64 22.125 +name = "ttylinux" 22.126 +nics = 1 22.127 +ip = "1.2.3.4" 22.128 +disk = ['file:/path/to/ttylinux/rootfs,sda1,w'] 22.129 +root = "/dev/sda1 ro" 22.130 +\end{verbatim} 22.131 +\item Now start the domain and connect to its console: 22.132 +\begin{verbatim} 22.133 +xm create configfile -c 22.134 +\end{verbatim} 22.135 +\item Log in as root, password root. 22.136 +\end{enumerate} 22.137 + 22.138 + 22.139 +\section{Starting / Stopping Domains Automatically} 22.140 + 22.141 +It is possible to have certain domains start automatically at boot 22.142 +time and to have dom0 wait for all running domains to shut down before 22.143 +it shuts down the system. 22.144 + 22.145 +To specify that a domain should start at boot time, place its configuration 22.146 +file (or a link to it) under \path{/etc/xen/auto/}. 22.147 + 22.148 +A Sys-V style init script for Red Hat and LSB-compliant systems is 22.149 +provided and will be automatically copied to \path{/etc/init.d/} 22.150 +during install. You can then enable it in the appropriate way for 22.151 +your distribution. 22.152 + 22.153 +For instance, on Red Hat: 22.154 + 22.155 +\begin{quote} 22.156 + \verb_# chkconfig --add xendomains_ 22.157 +\end{quote} 22.158 + 22.159 +By default, this will start the boot-time domains in runlevels 3, 4 22.160 +and 5.
22.161 + 22.162 +You can also use the \path{service} command to run this script 22.163 +manually, e.g: 22.164 + 22.165 +\begin{quote} 22.166 + \verb_# service xendomains start_ 22.167 + 22.168 + Starts all the domains with config files under /etc/xen/auto/. 22.169 +\end{quote} 22.170 + 22.171 +\begin{quote} 22.172 + \verb_# service xendomains stop_ 22.173 + 22.174 + Shuts down ALL running Xen domains. 22.175 +\end{quote}
23.1 --- a/tools/firmware/acpi/acpi_madt.c Tue Sep 20 09:43:29 2005 +0000 23.2 +++ b/tools/firmware/acpi/acpi_madt.c Tue Sep 20 09:43:46 2005 +0000 23.3 @@ -37,44 +37,7 @@ ACPI_MULTIPLE_APIC_DESCRIPTION_TABLE Mad 23.4 ACPI_LOCAL_APIC_ADDRESS, 23.5 ACPI_MULTIPLE_APIC_FLAGS, 23.6 }, 23.7 - // 23.8 - // LOCAL APIC Entries for 4 processors. 23.9 - // 23.10 - { 23.11 - { 23.12 - ACPI_PROCESSOR_LOCAL_APIC, 23.13 - sizeof (ACPI_LOCAL_APIC_STRUCTURE), 23.14 - 0x00, 23.15 - 0x00, 23.16 - 0x00000001, 23.17 - }, 23.18 - 23.19 - { 23.20 - ACPI_PROCESSOR_LOCAL_APIC, 23.21 - sizeof (ACPI_LOCAL_APIC_STRUCTURE), 23.22 - 0x01, 23.23 - 0x00, 23.24 - 0x00000000 23.25 - }, 23.26 - 23.27 - { 23.28 - ACPI_PROCESSOR_LOCAL_APIC, 23.29 - sizeof (ACPI_LOCAL_APIC_STRUCTURE), 23.30 - 0x02, 23.31 - 0x00, 23.32 - 0x00000000 23.33 - }, 23.34 - 23.35 - { 23.36 - ACPI_PROCESSOR_LOCAL_APIC, 23.37 - sizeof (ACPI_LOCAL_APIC_STRUCTURE), 23.38 - 0x03, 23.39 - 0x00, 23.40 - 0x00000000 23.41 - } 23.42 - } 23.43 - , 23.44 - 23.45 + 23.46 // 23.47 // IO APIC 23.48 // 23.49 @@ -87,5 +50,19 @@ ACPI_MULTIPLE_APIC_DESCRIPTION_TABLE Mad 23.50 ACPI_IO_APIC_ADDRESS_1, 23.51 0x0000 23.52 } 23.53 + }, 23.54 + 23.55 + // 23.56 + // LOCAL APIC Entries for up to 32 processors. 23.57 + // 23.58 + { 23.59 + { 23.60 + ACPI_PROCESSOR_LOCAL_APIC, 23.61 + sizeof (ACPI_LOCAL_APIC_STRUCTURE), 23.62 + 0x00, 23.63 + 0x00, 23.64 + 0x00000001, 23.65 + } 23.66 + 23.67 } 23.68 };
24.1 --- a/tools/firmware/acpi/acpi_madt.h Tue Sep 20 09:43:29 2005 +0000 24.2 +++ b/tools/firmware/acpi/acpi_madt.h Tue Sep 20 09:43:46 2005 +0000 24.3 @@ -35,9 +35,9 @@ 24.4 // 24.5 #pragma pack (1) 24.6 typedef struct { 24.7 - ACPI_2_0_MADT Header; 24.8 - ACPI_LOCAL_APIC_STRUCTURE LocalApic[4]; 24.9 - ACPI_IO_APIC_STRUCTURE IoApic[1]; 24.10 + ACPI_2_0_MADT Header; 24.11 + ACPI_IO_APIC_STRUCTURE IoApic[1]; 24.12 + ACPI_LOCAL_APIC_STRUCTURE LocalApic[32]; 24.13 } ACPI_MULTIPLE_APIC_DESCRIPTION_TABLE; 24.14 #pragma pack () 24.15
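Note the reordering in this header: the fixed-size IoApic entry now precedes the LocalApic array, so the local APIC entries are the trailing bytes of the table and the MADT length can simply be truncated when fewer than 32 VCPUs are configured, as acpi_madt_set_local_apics() in the new acpi_madt.c below does. A rough worked check of that length arithmetic follows; the 8-byte entry size and the combined header/IO-APIC size are assumptions for illustration, not the real sizeof() values from acpi_madt.h.

/* Illustrative check of the MADT truncation arithmetic; the sizes
 * used here are assumed, not taken from acpi_madt.h. */
#include <stdio.h>

#define MAX_VIRT_CPUS   32
#define APIC_ENTRY_SIZE 8   /* assumed sizeof(ACPI_LOCAL_APIC_STRUCTURE) */
#define FULL_TABLE_SIZE (44 + 12 + MAX_VIRT_CPUS * APIC_ENTRY_SIZE)
                            /* assumed header + IO APIC + 32 entries */

int main(void)
{
        int vcpus;

        /* same formula as acpi_madt_set_local_apics() */
        for (vcpus = 1; vcpus <= 8; vcpus *= 2)
                printf("%2d vcpu(s) -> MADT length %d bytes\n", vcpus,
                       FULL_TABLE_SIZE -
                       (MAX_VIRT_CPUS - vcpus) * APIC_ENTRY_SIZE);
        return 0;
}

Had the LocalApic array stayed in the middle of the struct, truncating the length this way would have cut off the IO APIC entry instead of the unused processor entries.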
25.1 --- a/tools/firmware/vmxassist/Makefile Tue Sep 20 09:43:29 2005 +0000 25.2 +++ b/tools/firmware/vmxassist/Makefile Tue Sep 20 09:43:46 2005 +0000 25.3 @@ -41,9 +41,9 @@ OBJECTS = head.o trap.o vm86.o setup.o u 25.4 25.5 all: vmxloader 25.6 25.7 -vmxloader: roms.h vmxloader.c acpi.h 25.8 - ${CC} ${CFLAGS} ${DEFINES} -c vmxloader.c 25.9 - $(CC) -o vmxloader.tmp -m32 -nostdlib -Wl,-N -Wl,-Ttext -Wl,0x100000 vmxloader.o 25.10 +vmxloader: roms.h vmxloader.c acpi.h acpi_madt.c 25.11 + ${CC} ${CFLAGS} ${DEFINES} -c vmxloader.c -c acpi_madt.c 25.12 + $(CC) -o vmxloader.tmp -m32 -nostdlib -Wl,-N -Wl,-Ttext -Wl,0x100000 vmxloader.o acpi_madt.o 25.13 objcopy --change-addresses=0xC0000000 vmxloader.tmp vmxloader 25.14 rm -f vmxloader.tmp 25.15
26.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 26.2 +++ b/tools/firmware/vmxassist/acpi_madt.c Tue Sep 20 09:43:46 2005 +0000 26.3 @@ -0,0 +1,145 @@ 26.4 +/* 26.5 + * acpi_madt.c: Update ACPI MADT table for multiple processor guest. 26.6 + * 26.7 + * Yu Ke, ke.yu@intel.com 26.8 + * Copyright (c) 2005, Intel Corporation. 26.9 + * 26.10 + * This program is free software; you can redistribute it and/or modify it 26.11 + * under the terms and conditions of the GNU General Public License, 26.12 + * version 2, as published by the Free Software Foundation. 26.13 + * 26.14 + * This program is distributed in the hope it will be useful, but WITHOUT 26.15 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 26.16 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 26.17 + * more details. 26.18 + * 26.19 + * You should have received a copy of the GNU General Public License along with 26.20 + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple 26.21 + * Place - Suite 330, Boston, MA 02111-1307 USA. 26.22 + */ 26.23 +#include "../acpi/acpi2_0.h" 26.24 +#include "../acpi/acpi_madt.h" 26.25 + 26.26 +#define NULL ((void*)0) 26.27 + 26.28 +extern int puts(const char *s); 26.29 + 26.30 +#define VCPU_MAGIC 0x76637075 /* "vcpu" */ 26.31 + 26.32 +/* xc_vmx_builder wrote vcpu block at 0x9F800. Return it. */ 26.33 +static int 26.34 +get_vcpus(void) 26.35 +{ 26.36 + unsigned long *vcpus; 26.37 + 26.38 + vcpus = (unsigned long *)0x9F800; 26.39 + if (vcpus[0] != VCPU_MAGIC) { 26.40 + puts("Bad vcpus magic, set vcpu number=1\n"); 26.41 + return 1; 26.42 + } 26.43 + 26.44 + return vcpus[1]; 26.45 +} 26.46 + 26.47 +static void * 26.48 +acpi_madt_get_madt(unsigned char *acpi_start) 26.49 +{ 26.50 + ACPI_2_0_RSDP *rsdp=NULL; 26.51 + ACPI_2_0_RSDT *rsdt=NULL; 26.52 + ACPI_MULTIPLE_APIC_DESCRIPTION_TABLE *madt; 26.53 + 26.54 + rsdp = (ACPI_2_0_RSDP *)(acpi_start + sizeof(ACPI_2_0_FACS)); 26.55 + if (rsdp->Signature != ACPI_2_0_RSDP_SIGNATURE) { 26.56 + puts("Bad RSDP signature\n"); 26.57 + return NULL; 26.58 + } 26.59 + 26.60 + rsdt= (ACPI_2_0_RSDT *) 26.61 + (acpi_start + rsdp->RsdtAddress - ACPI_PHYSICAL_ADDRESS); 26.62 + if (rsdt->Header.Signature != ACPI_2_0_RSDT_SIGNATURE) { 26.63 + puts("Bad RSDT signature\n"); 26.64 + return NULL; 26.65 + } 26.66 + 26.67 + madt = (ACPI_MULTIPLE_APIC_DESCRIPTION_TABLE *) 26.68 + ( acpi_start+ rsdt->Entry[1] - ACPI_PHYSICAL_ADDRESS); 26.69 + if (madt->Header.Header.Signature != 26.70 + ACPI_MULTIPLE_APIC_DESCRIPTION_TABLE_SIGNATURE) { 26.71 + puts("Bad MADT signature \n"); 26.72 + return NULL; 26.73 + } 26.74 + 26.75 + return madt; 26.76 +} 26.77 + 26.78 +static void 26.79 +set_checksum(void *start, int checksum_offset, int len) 26.80 +{ 26.81 + unsigned char sum = 0; 26.82 + unsigned char *ptr; 26.83 + 26.84 + ptr = start; 26.85 + ptr[checksum_offset] = 0; 26.86 + while (len--) 26.87 + sum += *ptr++; 26.88 + 26.89 + ptr = start; 26.90 + ptr[checksum_offset] = -sum; 26.91 +} 26.92 + 26.93 +static int 26.94 +acpi_madt_set_local_apics( 26.95 + int nr_vcpu, 26.96 + ACPI_MULTIPLE_APIC_DESCRIPTION_TABLE *madt) 26.97 +{ 26.98 + int i; 26.99 + 26.100 + if ((nr_vcpu > MAX_VIRT_CPUS) || (nr_vcpu < 0) || !madt) 26.101 + return -1; 26.102 + 26.103 + for (i = 0; i < nr_vcpu; i++) { 26.104 + madt->LocalApic[i].Type = ACPI_PROCESSOR_LOCAL_APIC; 26.105 + madt->LocalApic[i].Length = sizeof (ACPI_LOCAL_APIC_STRUCTURE); 26.106 + madt->LocalApic[i].AcpiProcessorId = i; 26.107 + madt->LocalApic[i].ApicId = i; 26.108 + 
madt->LocalApic[i].Flags = 1; 26.109 + } 26.110 + 26.111 + madt->Header.Header.Length = 26.112 + sizeof(ACPI_MULTIPLE_APIC_DESCRIPTION_TABLE) - 26.113 + (MAX_VIRT_CPUS - nr_vcpu)* sizeof(ACPI_LOCAL_APIC_STRUCTURE); 26.114 + 26.115 + return 0; 26.116 +} 26.117 + 26.118 +#define FIELD_OFFSET(TYPE,Field) ((unsigned int)(&(((TYPE *) 0)->Field))) 26.119 + 26.120 +int acpi_madt_update(unsigned char *acpi_start) 26.121 +{ 26.122 + int rc; 26.123 + ACPI_MULTIPLE_APIC_DESCRIPTION_TABLE *madt; 26.124 + 26.125 + madt = acpi_madt_get_madt(acpi_start); 26.126 + if (!madt) 26.127 + return -1; 26.128 + 26.129 + rc = acpi_madt_set_local_apics(get_vcpus(), madt); 26.130 + if (rc != 0) 26.131 + return rc; 26.132 + 26.133 + set_checksum( 26.134 + madt, FIELD_OFFSET(ACPI_TABLE_HEADER, Checksum), 26.135 + madt->Header.Header.Length); 26.136 + 26.137 + return 0; 26.138 +} 26.139 + 26.140 +/* 26.141 + * Local variables: 26.142 + * c-file-style: "linux" 26.143 + * indent-tabs-mode: t 26.144 + * c-indent-level: 8 26.145 + * c-basic-offset: 8 26.146 + * tab-width: 8 26.147 + * End: 26.148 + */
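set_checksum() above relies on the ACPI rule that all bytes of a table, including the checksum byte itself, must sum to zero modulo 256: it zeroes the checksum slot, sums the table, and stores the negated sum. A self-contained sketch of that invariant, using a made-up 8-byte table and checksum offset rather than a real MADT:

/* Sketch: demonstrate the checksum rule that set_checksum() in
 * acpi_madt.c establishes.  The table contents and offset here are
 * fabricated for illustration. */
#include <stdio.h>

static int checksum_ok(const unsigned char *table, int len)
{
        unsigned char sum = 0;
        while (len--)
                sum += *table++;
        return sum == 0;    /* ACPI requires byte-sum == 0 mod 256 */
}

int main(void)
{
        unsigned char fake_table[8] = { 1, 2, 3, 0, 5, 6, 7, 8 };
        int i, checksum_offset = 3;         /* illustrative offset */
        unsigned char sum = 0;

        /* same algorithm as set_checksum(): slot already zeroed */
        for (i = 0; i < 8; i++)
                sum += fake_table[i];
        fake_table[checksum_offset] = -sum; /* wraps to 256 - sum */

        printf("checksum ok: %d\n", checksum_ok(fake_table, 8));
        return 0;
}

This is also why set_checksum() must be called after acpi_madt_update() rewrites the table length and entries, as vmxloader does below.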
27.1 --- a/tools/firmware/vmxassist/vmxloader.c Tue Sep 20 09:43:29 2005 +0000 27.2 +++ b/tools/firmware/vmxassist/vmxloader.c Tue Sep 20 09:43:46 2005 +0000 27.3 @@ -27,6 +27,7 @@ 27.4 #ifdef _ACPI_ 27.5 #include "acpi.h" 27.6 #include "../acpi/acpi2_0.h" // for ACPI_PHYSICAL_ADDRESS 27.7 +int acpi_madt_update(unsigned char* acpi_start); 27.8 #endif 27.9 27.10 27.11 @@ -110,7 +111,10 @@ main() 27.12 } 27.13 #ifdef _ACPI_ 27.14 puts("Loading ACPI ...\n"); 27.15 - if (ACPI_PHYSICAL_ADDRESS+sizeof(acpi) <= 0xF0000 ){ 27.16 + 27.17 + acpi_madt_update(acpi); 27.18 + 27.19 + if (ACPI_PHYSICAL_ADDRESS+sizeof(acpi) <= 0xF0000) { 27.20 /* make sure acpi table does not overlap rombios 27.21 * currently acpi less than 8K will be OK. 27.22 */
28.1 --- a/tools/libxc/xc_vmx_build.c Tue Sep 20 09:43:29 2005 +0000 28.2 +++ b/tools/libxc/xc_vmx_build.c Tue Sep 20 09:43:46 2005 +0000 28.3 @@ -107,6 +107,33 @@ static void build_e820map(struct mem_map 28.4 mem_mapp->nr_map = nr_map; 28.5 } 28.6 28.7 +/* 28.8 + * Use E820 reserved memory 0x9F800 to pass number of vcpus to vmxloader 28.9 + * vmxloader will use it to config ACPI MADT table 28.10 + */ 28.11 +#define VCPU_MAGIC 0x76637075 /* "vcpu" */ 28.12 +static int 28.13 +set_nr_vcpus(int xc_handle, u32 dom, unsigned long *pfn_list, 28.14 + struct domain_setup_info *dsi, unsigned long vcpus) 28.15 +{ 28.16 + char *va_map; 28.17 + unsigned long *va_vcpus; 28.18 + 28.19 + va_map = xc_map_foreign_range( 28.20 + xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, 28.21 + pfn_list[(0x9F000 - dsi->v_start) >> PAGE_SHIFT]); 28.22 + if ( va_map == NULL ) 28.23 + return -1; 28.24 + 28.25 + va_vcpus = (unsigned long *)(va_map + 0x800); 28.26 + *va_vcpus++ = VCPU_MAGIC; 28.27 + *va_vcpus++ = vcpus; 28.28 + 28.29 + munmap(va_map, PAGE_SIZE); 28.30 + 28.31 + return 0; 28.32 +} 28.33 + 28.34 #ifdef __i386__ 28.35 static int zap_mmio_range(int xc_handle, u32 dom, 28.36 l2_pgentry_32_t *vl2tab, 28.37 @@ -496,7 +523,8 @@ static int setup_guest(int xc_handle, 28.38 MMU_MACHPHYS_UPDATE, count) ) 28.39 goto error_out; 28.40 } 28.41 - 28.42 + 28.43 + set_nr_vcpus(xc_handle, dom, page_array, &dsi, vcpus); 28.44 28.45 if ((boot_paramsp = xc_map_foreign_range( 28.46 xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE,
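The builder side above and get_vcpus() in vmxassist agree only by convention on the two-word layout at guest-physical 0x9F800 (page 0x9F000, offset 0x800): the magic word "vcpu" followed by the VCPU count. A hypothetical struct naming that layout is sketched here; the struct does not exist in the source, which uses raw unsigned long stores and loads on both sides.

#include <stdio.h>

#define VCPU_MAGIC 0x76637075UL /* "vcpu", as in both files above */

/* Hypothetical named view of the block the builder writes at
 * guest-physical 0x9F800 inside E820-reserved memory. */
struct vcpu_block {
        unsigned long magic;     /* must equal VCPU_MAGIC */
        unsigned long nr_vcpus;  /* number of virtual CPUs */
};

/* Mirrors get_vcpus() in vmxassist: fall back to one VCPU if the
 * builder never wrote (or corrupted) the block. */
static int read_nr_vcpus(const struct vcpu_block *blk)
{
        return (blk->magic == VCPU_MAGIC) ? (int)blk->nr_vcpus : 1;
}

int main(void)
{
        /* simulate what set_nr_vcpus() writes for a 4-VCPU guest */
        struct vcpu_block blk = { VCPU_MAGIC, 4 };
        printf("firmware sees %d vcpus\n", read_nr_vcpus(&blk));
        return 0;
}

The magic word guards the firmware against builders that predate this change: an old builder leaves the reserved page untouched, the magic check fails, and the guest falls back to a single-processor MADT.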
30.1 --- a/tools/vtpm/Makefile Tue Sep 20 09:43:29 2005 +0000 30.2 +++ b/tools/vtpm/Makefile Tue Sep 20 09:43:46 2005 +0000 30.3 @@ -4,7 +4,7 @@ XEN_ROOT = ../.. 30.4 include $(XEN_ROOT)/tools/vtpm/Rules.mk 30.5 30.6 # Dir name for emulator (as dom0 tpm driver) 30.7 -TPM_EMULATOR_DIR = tpm_emulator-0.2 30.8 +TPM_EMULATOR_DIR = tpm_emulator 30.9 # Dir name for vtpm instance 30.10 VTPM_DIR = vtpm 30.11 30.12 @@ -13,7 +13,7 @@ TPM_EMULATOR_TARFILE = tpm_emulator-0.2b 30.13 30.14 all: build 30.15 30.16 -build: $(TPM_EMULATOR_TARFILE) extract patch build_sub 30.17 +build: $(TPM_EMULATOR_DIR) $(VTPM_DIR) build_sub 30.18 30.19 install: build 30.20 $(MAKE) -C $(TPM_EMULATOR_DIR) $@ 30.21 @@ -26,36 +26,32 @@ clean: 30.22 if [ -d $(VTPM_DIR) ]; \ 30.23 then $(MAKE) -C $(VTPM_DIR) clean; \ 30.24 fi 30.25 + 30.26 +mrproper: 30.27 + rm -f $(TPM_EMULATOR_TARFILE) 30.28 rm -rf $(TPM_EMULATOR_DIR) 30.29 rm -rf $(VTPM_DIR) 30.30 30.31 -mrproper: clean 30.32 - rm -f $(TPM_EMULATOR_TARFILE) 30.33 - 30.34 # Download Swiss emulator 30.35 $(TPM_EMULATOR_TARFILE): 30.36 wget http://download.berlios.de/tpm-emulator/$(TPM_EMULATOR_TARFILE) 30.37 30.38 # Create vtpm and TPM emulator dirs 30.39 -extract: $(TPM_EMULATOR_DIR)/README $(VTPM_DIR)/README 30.40 - 30.41 -$(TPM_EMULATOR_DIR)/README: 30.42 - -rm -rf $(TPM_EMULATOR_DIR) 30.43 - tar -xzf $(TPM_EMULATOR_TARFILE) 30.44 - 30.45 -$(VTPM_DIR)/README: 30.46 - -rm -rf $(VTPM_DIR) 30.47 - cp -r --preserve $(TPM_EMULATOR_DIR) $(VTPM_DIR) 30.48 - 30.49 # apply patches for 1) used as dom0 tpm driver 2) used as vtpm device instance 30.50 -patch: $(TPM_EMULATOR_DIR)/Makefile $(VTPM_DIR)/Makefile 30.51 - 30.52 -$(TPM_EMULATOR_DIR)/Makefile: tpm_emulator.patch 30.53 +$(TPM_EMULATOR_DIR): $(TPM_EMULATOR_TARFILE) 30.54 + tar -xzf $(TPM_EMULATOR_TARFILE); 30.55 + mv tpm_emulator-0.2 $(TPM_EMULATOR_DIR); 30.56 + 30.57 -cd $(TPM_EMULATOR_DIR); \ 30.58 + patch -p1 < ../tpm_emulator-0.2b-x86_64.patch; \ 30.59 patch -p1 <../tpm_emulator.patch 30.60 30.61 -$(VTPM_DIR)/Makefile: vtpm.patch 30.62 +$(VTPM_DIR): $(TPM_EMULATOR_TARFILE) 30.63 + tar -xzf $(TPM_EMULATOR_TARFILE); 30.64 + mv tpm_emulator-0.2 $(VTPM_DIR); 30.65 + 30.66 -cd $(VTPM_DIR); \ 30.67 + patch -p1 < ../tpm_emulator-0.2b-x86_64.patch; \ 30.68 patch -p1 <../vtpm.patch 30.69 30.70 build_sub:
31.1 --- a/tools/vtpm/README Tue Sep 20 09:43:29 2005 +0000 31.2 +++ b/tools/vtpm/README Tue Sep 20 09:43:46 2005 +0000 31.3 @@ -23,6 +23,7 @@ Requirements 31.4 - xen-unstable 31.5 - IBM frontend/backend vtpm driver patch 31.6 - vtpm_managerd 31.7 +- GNU MP Big number library (GMP) 31.8 31.9 vtpmd Flow (for vtpm_manager. vtpmd never run by default) 31.10 ============================
32.1 --- a/tools/vtpm/tpm_emulator.patch Tue Sep 20 09:43:29 2005 +0000 32.2 +++ b/tools/vtpm/tpm_emulator.patch Tue Sep 20 09:43:46 2005 +0000 32.3 @@ -1,12 +1,12 @@ 32.4 -diff -uprN orig/tpm_emulator-0.2/AUTHORS tpm_emulator-0.2/AUTHORS 32.5 ---- orig/tpm_emulator-0.2/AUTHORS 2005-08-17 10:58:36.000000000 -0700 32.6 -+++ tpm_emulator-0.2/AUTHORS 2005-08-17 10:55:52.000000000 -0700 32.7 +diff -uprN orig/tpm_emulator-0.2-x86_64/AUTHORS tpm_emulator/AUTHORS 32.8 +--- orig/tpm_emulator-0.2-x86_64/AUTHORS 2005-08-15 00:58:57.000000000 -0700 32.9 ++++ tpm_emulator/AUTHORS 2005-09-14 20:27:22.000000000 -0700 32.10 @@ -1 +1,2 @@ 32.11 Mario Strasser <mast@gmx.net> 32.12 +INTEL Corp <> 32.13 -diff -uprN orig/tpm_emulator-0.2/ChangeLog tpm_emulator-0.2/ChangeLog 32.14 ---- orig/tpm_emulator-0.2/ChangeLog 2005-08-17 10:58:36.000000000 -0700 32.15 -+++ tpm_emulator-0.2/ChangeLog 2005-08-17 10:55:52.000000000 -0700 32.16 +diff -uprN orig/tpm_emulator-0.2-x86_64/ChangeLog tpm_emulator/ChangeLog 32.17 +--- orig/tpm_emulator-0.2-x86_64/ChangeLog 2005-08-15 00:58:57.000000000 -0700 32.18 ++++ tpm_emulator/ChangeLog 2005-09-14 20:27:22.000000000 -0700 32.19 @@ -1,3 +1,7 @@ 32.20 +2005-08-16: INTEL Corp 32.21 + * Set default permissions to PCRs 32.22 @@ -15,10 +15,29 @@ diff -uprN orig/tpm_emulator-0.2/ChangeL 32.23 2005-08-15 Mario Strasser <mast@gmx.net> 32.24 * all: some typos corrected 32.25 * tpm_integrity.c: bug in TPM_Extend fixed 32.26 -diff -uprN orig/tpm_emulator-0.2/Makefile tpm_emulator-0.2/Makefile 32.27 ---- orig/tpm_emulator-0.2/Makefile 2005-08-17 10:58:36.000000000 -0700 32.28 -+++ tpm_emulator-0.2/Makefile 2005-08-17 10:55:52.000000000 -0700 32.29 -@@ -1,15 +1,19 @@ 32.30 +diff -uprN orig/tpm_emulator-0.2-x86_64/linux_module.h tpm_emulator/linux_module.h 32.31 +--- orig/tpm_emulator-0.2-x86_64/linux_module.h 2005-09-15 19:21:14.844078720 -0700 32.32 ++++ tpm_emulator/linux_module.h 2005-09-14 20:27:22.000000000 -0700 32.33 +@@ -1,5 +1,6 @@ 32.34 + /* Software-Based Trusted Platform Module (TPM) Emulator for Linux 32.35 + * Copyright (C) 2004 Mario Strasser <mast@gmx.net>, 32.36 ++ * Copyright (C) 2005 INTEL Corp. 32.37 + * 32.38 + * This module is free software; you can redistribute it and/or modify 32.39 + * it under the terms of the GNU General Public License as published 32.40 +@@ -35,7 +36,7 @@ 32.41 + #include "tpm_version.h" 32.42 + 32.43 + #define TPM_DEVICE_MINOR 224 32.44 +-#define TPM_DEVICE_NAME "tpm" 32.45 ++#define TPM_DEVICE_NAME "tpm0" 32.46 + #define TPM_MODULE_NAME "tpm_emulator" 32.47 + 32.48 + /* debug and log output functions */ 32.49 +diff -uprN orig/tpm_emulator-0.2-x86_64/Makefile tpm_emulator/Makefile 32.50 +--- orig/tpm_emulator-0.2-x86_64/Makefile 2005-09-15 19:21:14.845078568 -0700 32.51 ++++ tpm_emulator/Makefile 2005-09-14 20:27:22.000000000 -0700 32.52 +@@ -1,16 +1,20 @@ 32.53 # Software-Based Trusted Platform Module (TPM) Emulator for Linux 32.54 # Copyright (C) 2004 Mario Strasser <mast@gmx.net> 32.55 +# Copyright (C) 2005 INTEL Corp. 32.56 @@ -33,6 +52,7 @@ diff -uprN orig/tpm_emulator-0.2/Makefil 32.57 -KERNEL_BUILD := /lib/modules/$(KERNEL_RELEASE)/build 32.58 +KERNEL_BUILD := $(XEN_ROOT)/linux-2.6.12-xen0 32.59 MOD_SUBDIR := misc 32.60 + COMPILE_ARCH ?= $(shell uname -m | sed -e s/i.86/x86_32/) 32.61 32.62 # module settings 32.63 -MODULE_NAME := tpm_emulator 32.64 @@ -40,7 +60,7 @@ diff -uprN orig/tpm_emulator-0.2/Makefil 32.65 VERSION_MAJOR := 0 32.66 VERSION_MINOR := 2 32.67 VERSION_BUILD := $(shell date +"%s") 32.68 -@@ -27,11 +30,9 @@ DIRS := . 
crypto tpm 32.69 +@@ -34,11 +38,9 @@ DIRS := . crypto tpm 32.70 SRCS := $(foreach dir, $(DIRS), $(wildcard $(src)/$(dir)/*.c)) 32.71 OBJS := $(patsubst %.c, %.o, $(SRCS)) 32.72 SRCS += $(foreach dir, $(DIRS), $(wildcard $(src)/$(dir)/*.h)) 32.73 @@ -54,7 +74,7 @@ diff -uprN orig/tpm_emulator-0.2/Makefil 32.74 32.75 EXTRA_CFLAGS += -I$(src) -I$(src)/crypto -I$(src)/tpm 32.76 32.77 -@@ -42,23 +43,17 @@ all: $(src)/crypto/gmp.h $(src)/crypto/l 32.78 +@@ -49,23 +51,17 @@ all: $(src)/crypto/gmp.h $(src)/crypto/l 32.79 @$(MAKE) -C $(KERNEL_BUILD) M=$(CURDIR) modules 32.80 32.81 install: 32.82 @@ -84,9 +104,9 @@ diff -uprN orig/tpm_emulator-0.2/Makefil 32.83 32.84 $(src)/crypto/libgmp.a: 32.85 test -f $(src)/crypto/libgmp.a || ln -s $(GMP_LIB) $(src)/crypto/libgmp.a 32.86 -diff -uprN orig/tpm_emulator-0.2/README tpm_emulator-0.2/README 32.87 ---- orig/tpm_emulator-0.2/README 2005-08-17 10:58:36.000000000 -0700 32.88 -+++ tpm_emulator-0.2/README 2005-08-17 10:55:52.000000000 -0700 32.89 +diff -uprN orig/tpm_emulator-0.2-x86_64/README tpm_emulator/README 32.90 +--- orig/tpm_emulator-0.2-x86_64/README 2005-08-15 00:58:57.000000000 -0700 32.91 ++++ tpm_emulator/README 2005-09-14 20:27:22.000000000 -0700 32.92 @@ -13,7 +13,8 @@ $Id: README 8 2005-01-25 21:11:45Z jmoli 32.93 Copyright 32.94 -------------------------------------------------------------------------- 32.95 @@ -97,28 +117,9 @@ diff -uprN orig/tpm_emulator-0.2/README 32.96 32.97 This program is free software; you can redistribute it and/or modify 32.98 it under the terms of the GNU General Public License as published by 32.99 -diff -uprN orig/tpm_emulator-0.2/linux_module.h tpm_emulator-0.2/linux_module.h 32.100 ---- orig/tpm_emulator-0.2/linux_module.h 2005-08-17 10:58:36.000000000 -0700 32.101 -+++ tpm_emulator-0.2/linux_module.h 2005-08-17 10:55:52.000000000 -0700 32.102 -@@ -1,5 +1,6 @@ 32.103 - /* Software-Based Trusted Platform Module (TPM) Emulator for Linux 32.104 - * Copyright (C) 2004 Mario Strasser <mast@gmx.net>, 32.105 -+ * Copyright (C) 2005 INTEL Corp. 
32.106 - * 32.107 - * This module is free software; you can redistribute it and/or modify 32.108 - * it under the terms of the GNU General Public License as published 32.109 -@@ -33,7 +34,7 @@ 32.110 - #include "tpm_version.h" 32.111 - 32.112 - #define TPM_DEVICE_MINOR 224 32.113 --#define TPM_DEVICE_NAME "tpm" 32.114 -+#define TPM_DEVICE_NAME "tpm0" 32.115 - #define TPM_MODULE_NAME "tpm_emulator" 32.116 - 32.117 - /* debug and log output functions */ 32.118 -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_data.c tpm_emulator-0.2/tpm/tpm_data.c 32.119 ---- orig/tpm_emulator-0.2/tpm/tpm_data.c 2005-08-17 10:58:36.000000000 -0700 32.120 -+++ tpm_emulator-0.2/tpm/tpm_data.c 2005-08-17 10:55:52.000000000 -0700 32.121 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_data.c tpm_emulator/tpm/tpm_data.c 32.122 +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_data.c 2005-09-15 19:21:14.847078264 -0700 32.123 ++++ tpm_emulator/tpm/tpm_data.c 2005-09-14 20:27:22.000000000 -0700 32.124 @@ -1,6 +1,7 @@ 32.125 /* Software-Based Trusted Platform Module (TPM) Emulator for Linux 32.126 * Copyright (C) 2004 Mario Strasser <mast@gmx.net>, 32.127 @@ -139,13 +140,3 @@ diff -uprN orig/tpm_emulator-0.2/tpm/tpm 32.128 tpmData.permanent.data.pcrAttrib[i].pcrReset = TRUE; 32.129 } 32.130 /* set tick type */ 32.131 -diff -uprN orig/tpm_emulator-0.2/tpm_version.h tpm_emulator-0.2/tpm_version.h 32.132 ---- orig/tpm_emulator-0.2/tpm_version.h 2005-08-17 10:58:36.000000000 -0700 32.133 -+++ tpm_emulator-0.2/tpm_version.h 2005-08-17 10:55:53.000000000 -0700 32.134 -@@ -2,5 +2,5 @@ 32.135 - #define _TPM_VERSION_H_ 32.136 - #define VERSION_MAJOR 0 32.137 - #define VERSION_MINOR 2 32.138 --#define VERSION_BUILD 1123950310 32.139 -+#define VERSION_BUILD 1124301353 32.140 - #endif /* _TPM_VERSION_H_ */
33.1 --- a/tools/vtpm/vtpm.patch Tue Sep 20 09:43:29 2005 +0000 33.2 +++ b/tools/vtpm/vtpm.patch Tue Sep 20 09:43:46 2005 +0000 33.3 @@ -1,12 +1,12 @@ 33.4 -diff -uprN orig/tpm_emulator-0.2/AUTHORS vtpm/AUTHORS 33.5 ---- orig/tpm_emulator-0.2/AUTHORS 2005-08-17 10:58:36.000000000 -0700 33.6 -+++ vtpm/AUTHORS 2005-08-17 10:55:52.000000000 -0700 33.7 +diff -uprN orig/tpm_emulator-0.2-x86_64/AUTHORS vtpm/AUTHORS 33.8 +--- orig/tpm_emulator-0.2-x86_64/AUTHORS 2005-08-15 00:58:57.000000000 -0700 33.9 ++++ vtpm/AUTHORS 2005-09-14 20:27:22.000000000 -0700 33.10 @@ -1 +1,2 @@ 33.11 Mario Strasser <mast@gmx.net> 33.12 +INTEL Corp <> 33.13 -diff -uprN orig/tpm_emulator-0.2/ChangeLog vtpm/ChangeLog 33.14 ---- orig/tpm_emulator-0.2/ChangeLog 2005-08-17 10:58:36.000000000 -0700 33.15 -+++ vtpm/ChangeLog 2005-08-17 10:55:52.000000000 -0700 33.16 +diff -uprN orig/tpm_emulator-0.2-x86_64/ChangeLog vtpm/ChangeLog 33.17 +--- orig/tpm_emulator-0.2-x86_64/ChangeLog 2005-08-15 00:58:57.000000000 -0700 33.18 ++++ vtpm/ChangeLog 2005-09-14 20:27:22.000000000 -0700 33.19 @@ -1,3 +1,7 @@ 33.20 +2005-08-16 Intel Corp 33.21 + Moved module out of kernel to run as a ring 3 app 33.22 @@ -15,115 +15,9 @@ diff -uprN orig/tpm_emulator-0.2/ChangeL 33.23 2005-08-15 Mario Strasser <mast@gmx.net> 33.24 * all: some typos corrected 33.25 * tpm_integrity.c: bug in TPM_Extend fixed 33.26 -diff -uprN orig/tpm_emulator-0.2/Makefile vtpm/Makefile 33.27 ---- orig/tpm_emulator-0.2/Makefile 2005-08-17 10:58:36.000000000 -0700 33.28 -+++ vtpm/Makefile 2005-08-17 10:55:52.000000000 -0700 33.29 -@@ -1,21 +1,29 @@ 33.30 - # Software-Based Trusted Platform Module (TPM) Emulator for Linux 33.31 - # Copyright (C) 2004 Mario Strasser <mast@gmx.net> 33.32 -+# Copyright (C) 2005 INTEL Corp. 33.33 - # 33.34 - # $Id: Makefile 10 2005-04-26 20:59:50Z mast $ 33.35 - 33.36 --# kernel settings 33.37 --KERNEL_RELEASE := $(shell uname -r) 33.38 --KERNEL_BUILD := /lib/modules/$(KERNEL_RELEASE)/build 33.39 --MOD_SUBDIR := misc 33.40 -- 33.41 - # module settings 33.42 --MODULE_NAME := tpm_emulator 33.43 -+BIN := vtpmd 33.44 - VERSION_MAJOR := 0 33.45 - VERSION_MINOR := 2 33.46 - VERSION_BUILD := $(shell date +"%s") 33.47 - 33.48 --# enable/disable DEBUG messages 33.49 --EXTRA_CFLAGS += -DDEBUG -g 33.50 -+# Installation program and options 33.51 -+INSTALL = install 33.52 -+INSTALL_PROG = $(INSTALL) -m0755 33.53 -+INSTALL_DIR = $(INSTALL) -d -m0755 33.54 -+ 33.55 -+# Xen tools installation directory 33.56 -+TOOLS_INSTALL_DIR = $(DESTDIR)/usr/bin 33.57 -+ 33.58 -+CC := gcc 33.59 -+CFLAGS += -g -Wall $(INCLUDE) -DDEBUG 33.60 -+CFLAGS += -I. -Itpm 33.61 -+ 33.62 -+# Is the simulator running in it's own vm? 33.63 -+#CFLAGS += -DVTPM_MULTI_VM 33.64 - 33.65 - # GNU MP configuration 33.66 - GMP_LIB := /usr/lib/libgmp.a 33.67 -@@ -27,38 +35,31 @@ DIRS := . crypto tpm 33.68 - SRCS := $(foreach dir, $(DIRS), $(wildcard $(src)/$(dir)/*.c)) 33.69 - OBJS := $(patsubst %.c, %.o, $(SRCS)) 33.70 - SRCS += $(foreach dir, $(DIRS), $(wildcard $(src)/$(dir)/*.h)) 33.71 --DISTSRC := ./README ./AUTHORS ./ChangeLog ./Makefile $(SRCS) 33.72 --DISTDIR := tpm_emulator-$(VERSION_MAJOR).$(VERSION_MINOR) 33.73 - 33.74 --obj-m := $(MODULE_NAME).o 33.75 --$(MODULE_NAME)-objs := $(patsubst $(src)/%.o, %.o, $(OBJS)) crypto/libgmp.a 33.76 -+obj-m := $(BIN) 33.77 -+$(BIN)-objs := $(patsubst $(src)/%.o, %.o, $(OBJS)) crypto/libgmp.a 33.78 - 33.79 - EXTRA_CFLAGS += -I$(src) -I$(src)/crypto -I$(src)/tpm 33.80 - 33.81 - # do not print "Entering directory ..." 
33.82 - MAKEFLAGS += --no-print-directory 33.83 - 33.84 --all: $(src)/crypto/gmp.h $(src)/crypto/libgmp.a version 33.85 -- @$(MAKE) -C $(KERNEL_BUILD) M=$(CURDIR) modules 33.86 -+all: $(BIN) 33.87 -+ 33.88 -+$(BIN): $(src)/crypto/gmp.h $(src)/crypto/libgmp.a version $(SRCS) $(OBJS) 33.89 -+ $(CC) $(CFLAGS) $(OBJS) $(src)/crypto/libgmp.a -o $(BIN) 33.90 -+ 33.91 -+%.o: %.c 33.92 -+ $(CC) $(CFLAGS) -c $< -o $@ 33.93 - 33.94 - install: 33.95 -- @$(MAKE) -C $(KERNEL_BUILD) M=$(CURDIR) modules_install 33.96 -- test -d /var/tpm || mkdir /var/tpm 33.97 -- test -c /dev/tpm || mknod /dev/tpm c 10 224 33.98 -- chmod 666 /dev/tpm 33.99 -- depmod -a 33.100 -+ $(INSTALL_PROG) $(BIN) $(TOOLS_INSTALL_DIR) 33.101 - 33.102 - clean: 33.103 -- @$(MAKE) -C $(KERNEL_BUILD) M=$(CURDIR) clean 33.104 -- rm -f $(src)/crypto/gmp.h $(src)/crypto/libgmp.a 33.105 -+ rm -f $(src)/crypto/gmp.h $(src)/crypto/libgmp.a $(OBJS) 33.106 - 33.107 --dist: $(DISTSRC) 33.108 -- rm -rf $(DISTDIR) 33.109 -- mkdir $(DISTDIR) 33.110 -- cp --parents $(DISTSRC) $(DISTDIR)/ 33.111 -- rm -f $(DISTDIR)/crypto/gmp.h 33.112 -- tar -chzf $(DISTDIR).tar.gz $(DISTDIR) 33.113 -- rm -rf $(DISTDIR) 33.114 -+mrproper: clean 33.115 -+ rm -f $(BIN) 33.116 - 33.117 - $(src)/crypto/libgmp.a: 33.118 - test -f $(src)/crypto/libgmp.a || ln -s $(GMP_LIB) $(src)/crypto/libgmp.a 33.119 -diff -uprN orig/tpm_emulator-0.2/README vtpm/README 33.120 ---- orig/tpm_emulator-0.2/README 2005-08-17 10:58:36.000000000 -0700 33.121 -+++ vtpm/README 2005-08-17 10:55:52.000000000 -0700 33.122 -@@ -13,7 +13,8 @@ $Id: README 8 2005-01-25 21:11:45Z jmoli 33.123 - Copyright 33.124 - -------------------------------------------------------------------------- 33.125 - Copyright (C) 2004 Mario Strasser <mast@gmx.net> and Swiss Federal 33.126 --Institute of Technology (ETH) Zurich. 33.127 -+ Institute of Technology (ETH) Zurich. 
33.128 -+Copyright (C) 2005 INTEL Corp 33.129 - 33.130 - This program is free software; you can redistribute it and/or modify 33.131 - it under the terms of the GNU General Public License as published by 33.132 -diff -uprN orig/tpm_emulator-0.2/crypto/gmp_kernel_wrapper.c vtpm/crypto/gmp_kernel_wrapper.c 33.133 ---- orig/tpm_emulator-0.2/crypto/gmp_kernel_wrapper.c 2005-08-17 10:58:36.000000000 -0700 33.134 -+++ vtpm/crypto/gmp_kernel_wrapper.c 2005-08-17 10:55:52.000000000 -0700 33.135 +diff -uprN orig/tpm_emulator-0.2-x86_64/crypto/gmp_kernel_wrapper.c vtpm/crypto/gmp_kernel_wrapper.c 33.136 +--- orig/tpm_emulator-0.2-x86_64/crypto/gmp_kernel_wrapper.c 2005-09-15 19:21:42.508873032 -0700 33.137 ++++ vtpm/crypto/gmp_kernel_wrapper.c 2005-09-15 19:25:37.319176440 -0700 33.138 @@ -1,5 +1,6 @@ 33.139 /* Software-Based Trusted Platform Module (TPM) Emulator for Linux 33.140 * Copyright (C) 2004 Mario Strasser <mast@gmx.net>, 33.141 @@ -154,9 +48,9 @@ diff -uprN orig/tpm_emulator-0.2/crypto/ 33.142 { 33.143 - void *ret = (void*)kmalloc(size, GFP_KERNEL); 33.144 - if (!ret) panic(KERN_CRIT TPM_MODULE_NAME 33.145 -- "GMP: cannot allocate memory (size=%u)\n", size); 33.146 +- "GMP: cannot allocate memory (size=%Zu)\n", size); 33.147 + void *ret = (void*)malloc(size); 33.148 -+ if (!ret) error("GMP: cannot allocate memory (size=%u)\n", size); 33.149 ++ if (!ret) error("GMP: cannot allocate memory (size=%Zu)\n", size); 33.150 return ret; 33.151 } 33.152 33.153 @@ -165,9 +59,10 @@ diff -uprN orig/tpm_emulator-0.2/crypto/ 33.154 { 33.155 - void *ret = (void*)kmalloc(new_size, GFP_KERNEL); 33.156 - if (!ret) panic(KERN_CRIT TPM_MODULE_NAME "GMP: Cannot reallocate memory " 33.157 +- "(old_size=%Zu new_size=%Zu)\n", old_size, new_size); 33.158 + void *ret = (void*)malloc(new_size); 33.159 + if (!ret) error("GMP: Cannot reallocate memory " 33.160 - "(old_size=%u new_size=%u)\n", old_size, new_size); 33.161 ++ "(old_size=%Zu new_size=%Zu)\n", old_size, new_size); 33.162 memcpy(ret, oldptr, old_size); 33.163 - kfree(oldptr); 33.164 + free(oldptr); 33.165 @@ -183,9 +78,9 @@ diff -uprN orig/tpm_emulator-0.2/crypto/ 33.166 } 33.167 } 33.168 33.169 -diff -uprN orig/tpm_emulator-0.2/crypto/rsa.c vtpm/crypto/rsa.c 33.170 ---- orig/tpm_emulator-0.2/crypto/rsa.c 2005-08-17 10:58:36.000000000 -0700 33.171 -+++ vtpm/crypto/rsa.c 2005-08-17 10:55:52.000000000 -0700 33.172 +diff -uprN orig/tpm_emulator-0.2-x86_64/crypto/rsa.c vtpm/crypto/rsa.c 33.173 +--- orig/tpm_emulator-0.2-x86_64/crypto/rsa.c 2005-08-15 00:58:57.000000000 -0700 33.174 ++++ vtpm/crypto/rsa.c 2005-09-14 20:27:22.000000000 -0700 33.175 @@ -1,5 +1,6 @@ 33.176 /* Software-Based Trusted Platform Module (TPM) Emulator for Linux 33.177 * Copyright (C) 2004 Mario Strasser <mast@gmx.net>, 33.178 @@ -211,8 +106,8 @@ diff -uprN orig/tpm_emulator-0.2/crypto/ 33.179 sha1_final(&ctx, &msg[1]); 33.180 if (memcmp(&msg[1], &msg[1 + SHA1_DIGEST_LENGTH], 33.181 SHA1_DIGEST_LENGTH) != 0) return -1; 33.182 -diff -uprN orig/tpm_emulator-0.2/linux_module.c vtpm/linux_module.c 33.183 ---- orig/tpm_emulator-0.2/linux_module.c 2005-08-17 10:58:36.000000000 -0700 33.184 +diff -uprN orig/tpm_emulator-0.2-x86_64/linux_module.c vtpm/linux_module.c 33.185 +--- orig/tpm_emulator-0.2-x86_64/linux_module.c 2005-09-15 19:22:40.343080896 -0700 33.186 +++ vtpm/linux_module.c 1969-12-31 16:00:00.000000000 -0800 33.187 @@ -1,163 +0,0 @@ 33.188 -/* Software-Based Trusted Platform Module (TPM) Emulator for Linux 33.189 @@ -283,7 +178,7 @@ diff -uprN orig/tpm_emulator-0.2/linux_m 33.190 - 
33.191 -static ssize_t tpm_read(struct file *file, char *buf, size_t count, loff_t *ppos) 33.192 -{ 33.193 -- debug("%s(%d)", __FUNCTION__, count); 33.194 +- debug("%s(%Zu)", __FUNCTION__, count); 33.195 - down(&tpm_mutex); 33.196 - if (tpm_response.data != NULL) { 33.197 - count = min(count, (size_t)tpm_response.size - (size_t)*ppos); 33.198 @@ -298,7 +193,7 @@ diff -uprN orig/tpm_emulator-0.2/linux_m 33.199 - 33.200 -static ssize_t tpm_write(struct file *file, const char *buf, size_t count, loff_t *ppos) 33.201 -{ 33.202 -- debug("%s(%d)", __FUNCTION__, count); 33.203 +- debug("%s(%Zu)", __FUNCTION__, count); 33.204 - down(&tpm_mutex); 33.205 - *ppos = 0; 33.206 - if (tpm_response.data != NULL) kfree(tpm_response.data); 33.207 @@ -378,9 +273,9 @@ diff -uprN orig/tpm_emulator-0.2/linux_m 33.208 - return (ticks > 0) ? ticks : 1; 33.209 -} 33.210 - 33.211 -diff -uprN orig/tpm_emulator-0.2/linux_module.h vtpm/linux_module.h 33.212 ---- orig/tpm_emulator-0.2/linux_module.h 2005-08-17 10:58:36.000000000 -0700 33.213 -+++ vtpm/linux_module.h 2005-08-17 10:55:52.000000000 -0700 33.214 +diff -uprN orig/tpm_emulator-0.2-x86_64/linux_module.h vtpm/linux_module.h 33.215 +--- orig/tpm_emulator-0.2-x86_64/linux_module.h 2005-09-15 19:21:14.844078720 -0700 33.216 ++++ vtpm/linux_module.h 2005-09-14 20:27:22.000000000 -0700 33.217 @@ -1,5 +1,6 @@ 33.218 /* Software-Based Trusted Platform Module (TPM) Emulator for Linux 33.219 * Copyright (C) 2004 Mario Strasser <mast@gmx.net>, 33.220 @@ -416,17 +311,20 @@ diff -uprN orig/tpm_emulator-0.2/linux_m 33.221 33.222 +/* module settings */ 33.223 +#define min(A,B) ((A)<(B)?(A):(B)) 33.224 + #ifndef STR 33.225 #define STR(s) __STR__(s) 33.226 #define __STR__(s) #s 33.227 - #include "tpm_version.h" 33.228 -@@ -39,32 +45,35 @@ 33.229 +@@ -39,34 +45,38 @@ 33.230 + #define TPM_MODULE_NAME "tpm_emulator" 33.231 + 33.232 /* debug and log output functions */ 33.233 ++extern int dmi_id; 33.234 33.235 #ifdef DEBUG 33.236 -#define debug(fmt, ...) printk(KERN_DEBUG "%s %s:%d: Debug: " fmt "\n", \ 33.237 - TPM_MODULE_NAME, __FILE__, __LINE__, ## __VA_ARGS__) 33.238 -+#define debug(fmt, ...) printf("%s:%d: Debug: " fmt "\n", \ 33.239 -+ __FILE__, __LINE__, ## __VA_ARGS__) 33.240 ++#define debug(fmt, ...) printf("TPMD[%d]: %s:%d: Debug: " fmt "\n", \ 33.241 ++ dmi_id, __FILE__, __LINE__, ## __VA_ARGS__) 33.242 #else 33.243 #define debug(fmt, ...) 33.244 #endif 33.245 @@ -436,12 +334,12 @@ diff -uprN orig/tpm_emulator-0.2/linux_m 33.246 - TPM_MODULE_NAME, __FILE__, __LINE__, ## __VA_ARGS__) 33.247 -#define alert(fmt, ...) printk(KERN_ALERT "%s %s:%d: Alert: " fmt "\n", \ 33.248 - TPM_MODULE_NAME, __FILE__, __LINE__, ## __VA_ARGS__) 33.249 -+#define info(fmt, ...) printf("%s:%d: Info: " fmt "\n", \ 33.250 -+ __FILE__, __LINE__, ## __VA_ARGS__) 33.251 -+#define error(fmt, ...) printf("%s:%d: Error: " fmt "\n", \ 33.252 -+ __FILE__, __LINE__, ## __VA_ARGS__) 33.253 -+#define alert(fmt, ...) printf("%s:%d: Alert: " fmt "\n", \ 33.254 -+ __FILE__, __LINE__, ## __VA_ARGS__) 33.255 ++#define info(fmt, ...) printf("TPMD[%d]: %s:%d: Info: " fmt "\n", \ 33.256 ++ dmi_id, __FILE__, __LINE__, ## __VA_ARGS__) 33.257 ++#define error(fmt, ...) printf("TPMD[%d]: %s:%d: Error: " fmt "\n", \ 33.258 ++ dmi_id, __FILE__, __LINE__, ## __VA_ARGS__) 33.259 ++#define alert(fmt, ...) 
printf("TPMD[%d]: %s:%d: Alert: " fmt "\n", \ 33.260 ++ dmi_id, __FILE__, __LINE__, ## __VA_ARGS__) 33.261 33.262 /* memory allocation */ 33.263 33.264 @@ -465,7 +363,7 @@ diff -uprN orig/tpm_emulator-0.2/linux_m 33.265 static inline void tpm_get_random_bytes(void *buf, int nbytes) 33.266 { 33.267 get_random_bytes(buf, nbytes); 33.268 -@@ -84,9 +93,9 @@ uint64_t tpm_get_ticks(void); 33.269 +@@ -86,9 +96,9 @@ uint64_t tpm_get_ticks(void); 33.270 #define CPU_TO_LE16(x) __cpu_to_le16(x) 33.271 33.272 #define BE64_TO_CPU(x) __be64_to_cpu(x) 33.273 @@ -477,9 +375,116 @@ diff -uprN orig/tpm_emulator-0.2/linux_m 33.274 #define BE16_TO_CPU(x) __be16_to_cpu(x) 33.275 #define LE16_TO_CPU(x) __le16_to_cpu(x) 33.276 33.277 -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_audit.c vtpm/tpm/tpm_audit.c 33.278 ---- orig/tpm_emulator-0.2/tpm/tpm_audit.c 2005-08-17 10:58:36.000000000 -0700 33.279 -+++ vtpm/tpm/tpm_audit.c 2005-08-17 10:55:52.000000000 -0700 33.280 +diff -uprN orig/tpm_emulator-0.2-x86_64/Makefile vtpm/Makefile 33.281 +--- orig/tpm_emulator-0.2-x86_64/Makefile 2005-09-15 19:21:14.845078568 -0700 33.282 ++++ vtpm/Makefile 2005-09-14 20:27:22.000000000 -0700 33.283 +@@ -1,22 +1,31 @@ 33.284 + # Software-Based Trusted Platform Module (TPM) Emulator for Linux 33.285 + # Copyright (C) 2004 Mario Strasser <mast@gmx.net> 33.286 ++# Copyright (C) 2005 INTEL Corp. 33.287 + # 33.288 + # $Id: Makefile 10 2005-04-26 20:59:50Z mast $ 33.289 + 33.290 +-# kernel settings 33.291 +-KERNEL_RELEASE := $(shell uname -r) 33.292 +-KERNEL_BUILD := /lib/modules/$(KERNEL_RELEASE)/build 33.293 +-MOD_SUBDIR := misc 33.294 + COMPILE_ARCH ?= $(shell uname -m | sed -e s/i.86/x86_32/) 33.295 + 33.296 + # module settings 33.297 +-MODULE_NAME := tpm_emulator 33.298 ++BIN := vtpmd 33.299 + VERSION_MAJOR := 0 33.300 + VERSION_MINOR := 2 33.301 + VERSION_BUILD := $(shell date +"%s") 33.302 + 33.303 +-# enable/disable DEBUG messages 33.304 +-EXTRA_CFLAGS += -DDEBUG -g 33.305 ++# Installation program and options 33.306 ++INSTALL = install 33.307 ++INSTALL_PROG = $(INSTALL) -m0755 33.308 ++INSTALL_DIR = $(INSTALL) -d -m0755 33.309 ++ 33.310 ++# Xen tools installation directory 33.311 ++TOOLS_INSTALL_DIR = $(DESTDIR)/usr/bin 33.312 ++ 33.313 ++CC := gcc 33.314 ++CFLAGS += -g -Wall $(INCLUDE) -DDEBUG 33.315 ++CFLAGS += -I. -Itpm 33.316 ++ 33.317 ++# Is the simulator running in it's own vm? 33.318 ++#CFLAGS += -DVTPM_MULTI_VM 33.319 + 33.320 + ifeq ($(COMPILE_ARCH),x86_64) 33.321 + LIBDIR = lib64 33.322 +@@ -34,38 +43,31 @@ DIRS := . crypto tpm 33.323 + SRCS := $(foreach dir, $(DIRS), $(wildcard $(src)/$(dir)/*.c)) 33.324 + OBJS := $(patsubst %.c, %.o, $(SRCS)) 33.325 + SRCS += $(foreach dir, $(DIRS), $(wildcard $(src)/$(dir)/*.h)) 33.326 +-DISTSRC := ./README ./AUTHORS ./ChangeLog ./Makefile $(SRCS) 33.327 +-DISTDIR := tpm_emulator-$(VERSION_MAJOR).$(VERSION_MINOR) 33.328 + 33.329 +-obj-m := $(MODULE_NAME).o 33.330 +-$(MODULE_NAME)-objs := $(patsubst $(src)/%.o, %.o, $(OBJS)) crypto/libgmp.a 33.331 ++obj-m := $(BIN) 33.332 ++$(BIN)-objs := $(patsubst $(src)/%.o, %.o, $(OBJS)) crypto/libgmp.a 33.333 + 33.334 + EXTRA_CFLAGS += -I$(src) -I$(src)/crypto -I$(src)/tpm 33.335 + 33.336 + # do not print "Entering directory ..." 
33.337 + MAKEFLAGS += --no-print-directory 33.338 + 33.339 +-all: $(src)/crypto/gmp.h $(src)/crypto/libgmp.a version 33.340 +- @$(MAKE) -C $(KERNEL_BUILD) M=$(CURDIR) modules 33.341 ++all: $(BIN) 33.342 ++ 33.343 ++$(BIN): $(src)/crypto/gmp.h $(src)/crypto/libgmp.a version $(SRCS) $(OBJS) 33.344 ++ $(CC) $(CFLAGS) $(OBJS) $(src)/crypto/libgmp.a -o $(BIN) 33.345 ++ 33.346 ++%.o: %.c 33.347 ++ $(CC) $(CFLAGS) -c $< -o $@ 33.348 + 33.349 + install: 33.350 +- @$(MAKE) -C $(KERNEL_BUILD) M=$(CURDIR) modules_install 33.351 +- test -d /var/tpm || mkdir /var/tpm 33.352 +- test -c /dev/tpm || mknod /dev/tpm c 10 224 33.353 +- chmod 666 /dev/tpm 33.354 +- depmod -a 33.355 ++ $(INSTALL_PROG) $(BIN) $(TOOLS_INSTALL_DIR) 33.356 + 33.357 + clean: 33.358 +- @$(MAKE) -C $(KERNEL_BUILD) M=$(CURDIR) clean 33.359 +- rm -f $(src)/crypto/gmp.h $(src)/crypto/libgmp.a 33.360 ++ rm -f $(src)/crypto/gmp.h $(src)/crypto/libgmp.a $(OBJS) 33.361 + 33.362 +-dist: $(DISTSRC) 33.363 +- rm -rf $(DISTDIR) 33.364 +- mkdir $(DISTDIR) 33.365 +- cp --parents $(DISTSRC) $(DISTDIR)/ 33.366 +- rm -f $(DISTDIR)/crypto/gmp.h 33.367 +- tar -chzf $(DISTDIR).tar.gz $(DISTDIR) 33.368 +- rm -rf $(DISTDIR) 33.369 ++mrproper: clean 33.370 ++ rm -f $(BIN) tpm_version.h 33.371 + 33.372 + $(src)/crypto/libgmp.a: 33.373 + test -f $(src)/crypto/libgmp.a || ln -s $(GMP_LIB) $(src)/crypto/libgmp.a 33.374 +diff -uprN orig/tpm_emulator-0.2-x86_64/README vtpm/README 33.375 +--- orig/tpm_emulator-0.2-x86_64/README 2005-08-15 00:58:57.000000000 -0700 33.376 ++++ vtpm/README 2005-09-14 20:27:22.000000000 -0700 33.377 +@@ -13,7 +13,8 @@ $Id: README 8 2005-01-25 21:11:45Z jmoli 33.378 + Copyright 33.379 + -------------------------------------------------------------------------- 33.380 + Copyright (C) 2004 Mario Strasser <mast@gmx.net> and Swiss Federal 33.381 +-Institute of Technology (ETH) Zurich. 33.382 ++ Institute of Technology (ETH) Zurich. 
33.383 ++Copyright (C) 2005 INTEL Corp 33.384 + 33.385 + This program is free software; you can redistribute it and/or modify 33.386 + it under the terms of the GNU General Public License as published by 33.387 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_audit.c vtpm/tpm/tpm_audit.c 33.388 +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_audit.c 2005-08-15 00:58:57.000000000 -0700 33.389 ++++ vtpm/tpm/tpm_audit.c 2005-09-14 20:27:22.000000000 -0700 33.390 @@ -1,6 +1,7 @@ 33.391 /* Software-Based Trusted Platform Module (TPM) Emulator for Linux 33.392 * Copyright (C) 2004 Mario Strasser <mast@gmx.net>, 33.393 @@ -542,9 +547,9 @@ diff -uprN orig/tpm_emulator-0.2/tpm/tpm 33.394 return TPM_SUCCESS; 33.395 } 33.396 - 33.397 -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_authorization.c vtpm/tpm/tpm_authorization.c 33.398 ---- orig/tpm_emulator-0.2/tpm/tpm_authorization.c 2005-08-17 10:58:36.000000000 -0700 33.399 -+++ vtpm/tpm/tpm_authorization.c 2005-08-17 10:55:52.000000000 -0700 33.400 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_authorization.c vtpm/tpm/tpm_authorization.c 33.401 +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_authorization.c 2005-08-15 00:58:57.000000000 -0700 33.402 ++++ vtpm/tpm/tpm_authorization.c 2005-09-14 20:27:22.000000000 -0700 33.403 @@ -1,6 +1,7 @@ 33.404 /* Software-Based Trusted Platform Module (TPM) Emulator for Linux 33.405 * Copyright (C) 2004 Mario Strasser <mast@gmx.net>, 33.406 @@ -568,9 +573,9 @@ diff -uprN orig/tpm_emulator-0.2/tpm/tpm 33.407 } 33.408 - 33.409 - 33.410 -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_capability.c vtpm/tpm/tpm_capability.c 33.411 ---- orig/tpm_emulator-0.2/tpm/tpm_capability.c 2005-08-17 10:58:36.000000000 -0700 33.412 -+++ vtpm/tpm/tpm_capability.c 2005-08-17 10:55:52.000000000 -0700 33.413 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_capability.c vtpm/tpm/tpm_capability.c 33.414 +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_capability.c 2005-08-15 00:58:57.000000000 -0700 33.415 ++++ vtpm/tpm/tpm_capability.c 2005-09-14 20:27:22.000000000 -0700 33.416 @@ -1,6 +1,7 @@ 33.417 /* Software-Based Trusted Platform Module (TPM) Emulator for Linux 33.418 * Copyright (C) 2004 Mario Strasser <mast@gmx.net>, 33.419 @@ -593,9 +598,9 @@ diff -uprN orig/tpm_emulator-0.2/tpm/tpm 33.420 } 33.421 } 33.422 - 33.423 -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_cmd_handler.c vtpm/tpm/tpm_cmd_handler.c 33.424 ---- orig/tpm_emulator-0.2/tpm/tpm_cmd_handler.c 2005-08-17 10:58:36.000000000 -0700 33.425 -+++ vtpm/tpm/tpm_cmd_handler.c 2005-08-17 10:55:52.000000000 -0700 33.426 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_cmd_handler.c vtpm/tpm/tpm_cmd_handler.c 33.427 +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_cmd_handler.c 2005-08-15 00:58:57.000000000 -0700 33.428 ++++ vtpm/tpm/tpm_cmd_handler.c 2005-09-14 20:27:22.000000000 -0700 33.429 @@ -1,6 +1,7 @@ 33.430 /* Software-Based Trusted Platform Module (TPM) Emulator for Linux 33.431 * Copyright (C) 2004 Mario Strasser <mast@gmx.net>, 33.432 @@ -658,9 +663,9 @@ diff -uprN orig/tpm_emulator-0.2/tpm/tpm 33.433 return 0; 33.434 } 33.435 - 33.436 -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_crypto.c vtpm/tpm/tpm_crypto.c 33.437 ---- orig/tpm_emulator-0.2/tpm/tpm_crypto.c 2005-08-17 10:58:36.000000000 -0700 33.438 -+++ vtpm/tpm/tpm_crypto.c 2005-08-17 10:55:52.000000000 -0700 33.439 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_crypto.c vtpm/tpm/tpm_crypto.c 33.440 +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_crypto.c 2005-09-15 19:21:14.846078416 -0700 33.441 ++++ vtpm/tpm/tpm_crypto.c 2005-09-14 20:27:22.000000000 
-0700 33.442 @@ -1,6 +1,7 @@ 33.443 /* Software-Based Trusted Platform Module (TPM) Emulator for Linux 33.444 * Copyright (C) 2004 Mario Strasser <mast@gmx.net>, 33.445 @@ -678,14 +683,14 @@ diff -uprN orig/tpm_emulator-0.2/tpm/tpm 33.446 memcpy(&buf[30], areaToSign, areaToSignSize); 33.447 if (rsa_sign(&key->key, RSA_SSA_PKCS1_SHA1, 33.448 buf, areaToSignSize + 30, *sig)) { 33.449 -@@ -379,4 +380,3 @@ TPM_RESULT TPM_CertifyKey2(TPM_KEY_HANDL 33.450 +@@ -383,4 +384,3 @@ TPM_RESULT TPM_CertifyKey2(TPM_KEY_HANDL 33.451 } 33.452 return TPM_SUCCESS; 33.453 } 33.454 - 33.455 -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_data.c vtpm/tpm/tpm_data.c 33.456 ---- orig/tpm_emulator-0.2/tpm/tpm_data.c 2005-08-17 10:58:36.000000000 -0700 33.457 -+++ vtpm/tpm/tpm_data.c 2005-08-17 10:55:52.000000000 -0700 33.458 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_data.c vtpm/tpm/tpm_data.c 33.459 +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_data.c 2005-09-15 19:21:14.847078264 -0700 33.460 ++++ vtpm/tpm/tpm_data.c 2005-09-14 20:27:22.000000000 -0700 33.461 @@ -1,6 +1,7 @@ 33.462 /* Software-Based Trusted Platform Module (TPM) Emulator for Linux 33.463 * Copyright (C) 2004 Mario Strasser <mast@gmx.net>, 33.464 @@ -1005,7 +1010,7 @@ diff -uprN orig/tpm_emulator-0.2/tpm/tpm 33.465 } 33.466 33.467 #else 33.468 -@@ -231,7 +431,6 @@ int tpm_restore_permanent_data(void) 33.469 +@@ -232,7 +432,6 @@ int tpm_restore_permanent_data(void) 33.470 33.471 int tpm_erase_permanent_data(void) 33.472 { 33.473 @@ -1014,9 +1019,9 @@ diff -uprN orig/tpm_emulator-0.2/tpm/tpm 33.474 return res; 33.475 } 33.476 - 33.477 -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_deprecated.c vtpm/tpm/tpm_deprecated.c 33.478 ---- orig/tpm_emulator-0.2/tpm/tpm_deprecated.c 2005-08-17 10:58:36.000000000 -0700 33.479 -+++ vtpm/tpm/tpm_deprecated.c 2005-08-17 10:55:52.000000000 -0700 33.480 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_deprecated.c vtpm/tpm/tpm_deprecated.c 33.481 +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_deprecated.c 2005-08-15 00:58:57.000000000 -0700 33.482 ++++ vtpm/tpm/tpm_deprecated.c 2005-09-14 20:27:22.000000000 -0700 33.483 @@ -1,6 +1,7 @@ 33.484 /* Software-Based Trusted Platform Module (TPM) Emulator for Linux 33.485 * Copyright (C) 2004 Mario Strasser <mast@gmx.net>, 33.486 @@ -1043,9 +1048,9 @@ diff -uprN orig/tpm_emulator-0.2/tpm/tpm 33.487 authContextSize, &contextBlob); 33.488 if (res != TPM_SUCCESS) return res; 33.489 len = *authContextSize; 33.490 -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_emulator.h vtpm/tpm/tpm_emulator.h 33.491 ---- orig/tpm_emulator-0.2/tpm/tpm_emulator.h 2005-08-17 10:58:36.000000000 -0700 33.492 -+++ vtpm/tpm/tpm_emulator.h 2005-08-17 10:55:52.000000000 -0700 33.493 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_emulator.h vtpm/tpm/tpm_emulator.h 33.494 +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_emulator.h 2005-08-15 00:58:57.000000000 -0700 33.495 ++++ vtpm/tpm/tpm_emulator.h 2005-09-14 20:27:22.000000000 -0700 33.496 @@ -1,5 +1,6 @@ 33.497 /* Software-Based Trusted Platform Module (TPM) Emulator for Linux 33.498 * Copyright (C) 2004 Mario Strasser <mast@gmx.net>, 33.499 @@ -1063,9 +1068,9 @@ diff -uprN orig/tpm_emulator-0.2/tpm/tpm 33.500 33.501 /** 33.502 * tpm_emulator_init - initialises and starts the TPM emulator 33.503 -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_integrity.c vtpm/tpm/tpm_integrity.c 33.504 ---- orig/tpm_emulator-0.2/tpm/tpm_integrity.c 2005-08-17 10:58:36.000000000 -0700 33.505 -+++ vtpm/tpm/tpm_integrity.c 2005-08-17 10:55:52.000000000 -0700 33.506 +diff -uprN 
orig/tpm_emulator-0.2-x86_64/tpm/tpm_integrity.c vtpm/tpm/tpm_integrity.c 33.507 +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_integrity.c 2005-08-15 00:58:57.000000000 -0700 33.508 ++++ vtpm/tpm/tpm_integrity.c 2005-09-14 20:27:22.000000000 -0700 33.509 @@ -1,6 +1,7 @@ 33.510 /* Software-Based Trusted Platform Module (TPM) Emulator for Linux 33.511 * Copyright (C) 2004 Mario Strasser <mast@gmx.net>, 33.512 @@ -1079,9 +1084,9 @@ diff -uprN orig/tpm_emulator-0.2/tpm/tpm 33.513 return TPM_SUCCESS; 33.514 } 33.515 - 33.516 -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_structures.h vtpm/tpm/tpm_structures.h 33.517 ---- orig/tpm_emulator-0.2/tpm/tpm_structures.h 2005-08-17 10:58:36.000000000 -0700 33.518 -+++ vtpm/tpm/tpm_structures.h 2005-08-17 10:55:52.000000000 -0700 33.519 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_structures.h vtpm/tpm/tpm_structures.h 33.520 +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_structures.h 2005-08-15 00:58:57.000000000 -0700 33.521 ++++ vtpm/tpm/tpm_structures.h 2005-09-14 20:27:22.000000000 -0700 33.522 @@ -1,6 +1,7 @@ 33.523 /* Software-Based Trusted Platform Module (TPM) Emulator for Linux 33.524 * Copyright (C) 2004 Mario Strasser <mast@gmx.net>, 33.525 @@ -1099,9 +1104,9 @@ diff -uprN orig/tpm_emulator-0.2/tpm/tpm 33.526 #include "crypto/rsa.h" 33.527 33.528 /* 33.529 -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_testing.c vtpm/tpm/tpm_testing.c 33.530 ---- orig/tpm_emulator-0.2/tpm/tpm_testing.c 2005-08-17 10:58:36.000000000 -0700 33.531 -+++ vtpm/tpm/tpm_testing.c 2005-08-17 10:55:52.000000000 -0700 33.532 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_testing.c vtpm/tpm/tpm_testing.c 33.533 +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_testing.c 2005-08-15 00:58:57.000000000 -0700 33.534 ++++ vtpm/tpm/tpm_testing.c 2005-09-14 20:27:22.000000000 -0700 33.535 @@ -1,6 +1,7 @@ 33.536 /* Software-Based Trusted Platform Module (TPM) Emulator for Linux 33.537 * Copyright (C) 2004 Mario Strasser <mast@gmx.net>, 33.538 @@ -1217,9 +1222,9 @@ diff -uprN orig/tpm_emulator-0.2/tpm/tpm 33.539 rsa_private_key_t priv_key; 33.540 rsa_public_key_t pub_key; 33.541 33.542 -diff -uprN orig/tpm_emulator-0.2/tpm/tpm_ticks.c vtpm/tpm/tpm_ticks.c 33.543 ---- orig/tpm_emulator-0.2/tpm/tpm_ticks.c 2005-08-17 10:58:36.000000000 -0700 33.544 -+++ vtpm/tpm/tpm_ticks.c 2005-08-17 10:55:52.000000000 -0700 33.545 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/tpm_ticks.c vtpm/tpm/tpm_ticks.c 33.546 +--- orig/tpm_emulator-0.2-x86_64/tpm/tpm_ticks.c 2005-08-15 00:58:57.000000000 -0700 33.547 ++++ vtpm/tpm/tpm_ticks.c 2005-09-14 20:27:22.000000000 -0700 33.548 @@ -1,6 +1,7 @@ 33.549 /* Software-Based Trusted Platform Module (TPM) Emulator for Linux 33.550 * Copyright (C) 2004 Mario Strasser <mast@gmx.net>, 33.551 @@ -1302,9 +1307,9 @@ diff -uprN orig/tpm_emulator-0.2/tpm/tpm 33.552 } 33.553 33.554 33.555 -diff -uprN orig/tpm_emulator-0.2/tpm/vtpm_manager.h vtpm/tpm/vtpm_manager.h 33.556 ---- orig/tpm_emulator-0.2/tpm/vtpm_manager.h 1969-12-31 16:00:00.000000000 -0800 33.557 -+++ vtpm/tpm/vtpm_manager.h 2005-08-17 10:55:52.000000000 -0700 33.558 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpm/vtpm_manager.h vtpm/tpm/vtpm_manager.h 33.559 +--- orig/tpm_emulator-0.2-x86_64/tpm/vtpm_manager.h 1969-12-31 16:00:00.000000000 -0800 33.560 ++++ vtpm/tpm/vtpm_manager.h 2005-09-14 20:27:22.000000000 -0700 33.561 @@ -0,0 +1,126 @@ 33.562 +// =================================================================== 33.563 +// 33.564 @@ -1432,9 +1437,9 @@ diff -uprN orig/tpm_emulator-0.2/tpm/vtp 33.565 
+*********************************************************************/ 33.566 + 33.567 +#endif //_VTPM_MANAGER_H_ 33.568 -diff -uprN orig/tpm_emulator-0.2/tpmd.c vtpm/tpmd.c 33.569 ---- orig/tpm_emulator-0.2/tpmd.c 1969-12-31 16:00:00.000000000 -0800 33.570 -+++ vtpm/tpmd.c 2005-08-17 10:55:52.000000000 -0700 33.571 +diff -uprN orig/tpm_emulator-0.2-x86_64/tpmd.c vtpm/tpmd.c 33.572 +--- orig/tpm_emulator-0.2-x86_64/tpmd.c 1969-12-31 16:00:00.000000000 -0800 33.573 ++++ vtpm/tpmd.c 2005-09-15 19:28:55.783005352 -0700 33.574 @@ -0,0 +1,207 @@ 33.575 +/* Software-Based Trusted Platform Module (TPM) Emulator for Linux 33.576 + * Copyright (C) 2005 INTEL Corp 33.577 @@ -1468,9 +1473,9 @@ diff -uprN orig/tpm_emulator-0.2/tpmd.c 33.578 +#else 33.579 + #define GUEST_RX_FIFO_D "/var/vtpm/fifos/guest-to-%d.fifo" 33.580 + #define GUEST_TX_FIFO "/var/vtpm/fifos/guest-from-all.fifo" 33.581 ++#endif 33.582 + 33.583 + int dmi_id; 33.584 -+#endif 33.585 + 33.586 +#define BUFFER_SIZE 2048 33.587 + 33.588 @@ -1506,7 +1511,7 @@ diff -uprN orig/tpm_emulator-0.2/tpmd.c 33.589 +{ 33.590 + uint8_t in[BUFFER_SIZE], *out, *addressed_out; 33.591 + uint32_t out_size; 33.592 -+ int in_size, written ; 33.593 ++ int in_size, written; 33.594 + int i, guest_id=-1; 33.595 + 33.596 + int vtpm_tx_fh=-1, vtpm_rx_fh=-1; 33.597 @@ -1602,7 +1607,7 @@ diff -uprN orig/tpm_emulator-0.2/tpmd.c 33.598 + written = write(vtpm_tx_fh, ctrl_msg, sizeof(ctrl_msg)); 33.599 + 33.600 + if (written != sizeof(ctrl_msg)) { 33.601 -+ printf("ERROR: Part of response not written %d/%d.\n", written, sizeof(ctrl_msg)); 33.602 ++ printf("ERROR: Part of response not written %d/%Zu.\n", written, sizeof(ctrl_msg)); 33.603 + } else { 33.604 + printf("Send Ctrl Message confermation\n"); 33.605 + } 33.606 @@ -1623,7 +1628,7 @@ diff -uprN orig/tpm_emulator-0.2/tpmd.c 33.607 + printf("%x ", addressed_out[i]); 33.608 + printf("\n"); 33.609 + } else { 33.610 -+ printf("Sent[%d]: ", out_size + sizeof(uint32_t)); 33.611 ++ printf("Sent[%Zu]: ", out_size + sizeof(uint32_t)); 33.612 + for (i=0; i< out_size+ sizeof(uint32_t); i++) 33.613 + printf("%x ", addressed_out[i]); 33.614 + printf("\n");
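Note on the tpmd.c addition above: the daemon funnels every instance's traffic through one shared transmit fifo (guest-from-all.fifo), so each outbound message carries a 4-byte instance prefix before the TPM response bytes — that is why the send path reports out_size + sizeof(uint32_t) bytes. Below is a minimal sketch of that framing, assuming the prefix is simply the DMI id in host byte order; the authoritative header layout lives in the vtpm frontend/backend driver, which this patch does not show.

/* Hedged sketch: frame a TPM response with a 4-byte DMI id before
 * writing it to the shared guest-from-all fifo. BUFFER_SIZE matches
 * the 2048-byte buffer used by tpmd.c above. */
#include <stdint.h>
#include <string.h>
#include <unistd.h>

#define BUFFER_SIZE 2048

static ssize_t send_framed(int tx_fh, uint32_t dmi_id,
                           const uint8_t *out, uint32_t out_size)
{
    uint8_t addressed_out[BUFFER_SIZE + sizeof(uint32_t)];
    if (out_size > BUFFER_SIZE)
        return -1;                                    /* response too large */
    memcpy(addressed_out, &dmi_id, sizeof(uint32_t)); /* assumed host byte order */
    memcpy(addressed_out + sizeof(uint32_t), out, out_size);
    /* a short write here corresponds to the "Part of response not
       written" error printed by tpmd.c */
    return write(tx_fh, addressed_out, out_size + sizeof(uint32_t));
}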
34.1 --- a/tools/vtpm_manager/README	Tue Sep 20 09:43:29 2005 +0000
34.2 +++ b/tools/vtpm_manager/README	Tue Sep 20 09:43:46 2005 +0000
34.3 @@ -51,14 +51,24 @@ VTPM_MULTI_VM -> Defined:
34.4  DUMMY_BACKEND          -> vtpm_manager listens on /tmp/in.fifo and
34.5                            /tmp/out.fifo rather than backend
34.6  
34.7 -MANUAL_DM_LAUNCH       -> User must manually launch & kill VTPMs
34.8 +MANUAL_DM_LAUNCH       -> Must manually launch & kill VTPMs
34.9  
34.10 -USE_FIXED_SRK_AUTH     -> Do not randomly generate a random SRK & Owner auth
34.11 +WELL_KNOWN_SRK_AUTH    -> Rather than randomly generating the password for the SRK,
34.12 +                          use a well known value. This is necessary for sharing use
34.13 +                          of the SRK across applications, such as VTPM and Dom0
34.14 +                          measurement software.
34.15 +
34.16 +WELL_KNOWN_OWNER_AUTH  -> Rather than randomly generating the password for the owner,
34.17 +                          use a well known value. This is useful for debugging and for
34.18 +                          poor BIOSes which do not support clearing the TPM if OwnerAuth
34.19 +                          is lost. However, this offers no protection from a malicious
34.20 +                          app issuing a TPM_OwnerClear to wipe the TPM.
34.21  
34.22  Requirements
34.23  ============
34.24  - xen-unstable
34.25 -- IBM frontend/backend vtpm driver patch
34.26 +- vtpm frontend/backend driver patch
34.27 +- OpenSSL Library
34.28  
34.29  Single-VM Flow
34.30  ============================
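Both WELL_KNOWN_* options above trade secrecy for a secret that other software can reproduce. For reference, the conventional "well known" TPM authorization value is the all-zero 20-byte TPM_AUTHDATA (the TSS well-known secret); the actual constant vtpm_manager compiles in is defined elsewhere in its sources, so treat the sketch below as illustrative only.

/* Hedged sketch: selecting a fixed, well-known 20-byte auth secret at
 * compile time instead of drawing one from a cryptographic RNG. */
#define TPM_AUTHDATA_LEN 20

#ifdef WELL_KNOWN_SRK_AUTH
static const unsigned char srk_auth[TPM_AUTHDATA_LEN] = { 0 };  /* all zeros */
#else
/* otherwise srk_auth would be filled with TPM_AUTHDATA_LEN random bytes */
static unsigned char srk_auth[TPM_AUTHDATA_LEN];
#endif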
35.1 --- a/tools/vtpm_manager/Rules.mk	Tue Sep 20 09:43:29 2005 +0000
35.2 +++ b/tools/vtpm_manager/Rules.mk	Tue Sep 20 09:43:46 2005 +0000
35.3 @@ -57,7 +57,8 @@ CFLAGS += -DLOGGING_MODULES="(BITMASK(VT
35.4  #CFLAGS += -DMANUAL_DM_LAUNCH
35.5  
35.6  # Fixed SRK
35.7 -CFLAGS += -DUSE_FIXED_SRK_AUTH
35.8 +CFLAGS += -DWELL_KNOWN_SRK_AUTH
35.9 +#CFLAGS += -DWELL_KNOWN_OWNER_AUTH
35.10  
35.11  # TPM Hardware Device or TPM Simulator
35.12  #CFLAGS += -DTPM_HWDEV
36.1 --- a/tools/vtpm_manager/crypto/Makefile	Tue Sep 20 09:43:29 2005 +0000
36.2 +++ b/tools/vtpm_manager/crypto/Makefile	Tue Sep 20 09:43:46 2005 +0000
36.3 @@ -13,6 +13,7 @@ clean:
36.4  	rm -f *.a *.so *.o *.rpm $(DEP_FILES)
36.5  
36.6  mrproper: clean
36.7 +	rm -f *~
36.8  
36.9  $(BIN): $(OBJS)
36.10  	$(AR) rcs $(BIN) $(OBJS)
37.1 --- a/tools/vtpm_manager/manager/Makefile	Tue Sep 20 09:43:29 2005 +0000
37.2 +++ b/tools/vtpm_manager/manager/Makefile	Tue Sep 20 09:43:46 2005 +0000
37.3 @@ -17,7 +17,7 @@ clean:
37.4  	rm -f *.a *.so *.o *.rpm $(DEP_FILES)
37.5  
37.6  mrproper: clean
37.7 -	rm -f $(BIN)
37.8 +	rm -f $(BIN) *~
37.9  
37.10 $(BIN): $(OBJS)
37.11 	$(CC) $(LDFLAGS) $^ $(LIBS) -o $@
38.1 --- a/tools/vtpm_manager/manager/dmictl.c Tue Sep 20 09:43:29 2005 +0000 38.2 +++ b/tools/vtpm_manager/manager/dmictl.c Tue Sep 20 09:43:46 2005 +0000 38.3 @@ -1,339 +1,344 @@ 38.4 -// =================================================================== 38.5 -// 38.6 -// Copyright (c) 2005, Intel Corp. 38.7 -// All rights reserved. 38.8 -// 38.9 -// Redistribution and use in source and binary forms, with or without 38.10 -// modification, are permitted provided that the following conditions 38.11 -// are met: 38.12 -// 38.13 -// * Redistributions of source code must retain the above copyright 38.14 -// notice, this list of conditions and the following disclaimer. 38.15 -// * Redistributions in binary form must reproduce the above 38.16 -// copyright notice, this list of conditions and the following 38.17 -// disclaimer in the documentation and/or other materials provided 38.18 -// with the distribution. 38.19 -// * Neither the name of Intel Corporation nor the names of its 38.20 -// contributors may be used to endorse or promote products derived 38.21 -// from this software without specific prior written permission. 38.22 -// 38.23 -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 38.24 -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 38.25 -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 38.26 -// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 38.27 -// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 38.28 -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 38.29 -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 38.30 -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 38.31 -// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 38.32 -// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 38.33 -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 38.34 -// OF THE POSSIBILITY OF SUCH DAMAGE. 
38.35 -// =================================================================== 38.36 -// 38.37 -// dmictl.c 38.38 -// 38.39 -// Functions for creating and destroying DMIs 38.40 -// 38.41 -// ================================================================== 38.42 - 38.43 -#include <stdio.h> 38.44 -#include <unistd.h> 38.45 -#include <string.h> 38.46 - 38.47 -#ifndef VTPM_MUTLI_VM 38.48 - #include <sys/types.h> 38.49 - #include <sys/stat.h> 38.50 - #include <fcntl.h> 38.51 - #include <signal.h> 38.52 - #include <wait.h> 38.53 -#endif 38.54 - 38.55 -#include "vtpmpriv.h" 38.56 -#include "bsg.h" 38.57 -#include "buffer.h" 38.58 -#include "log.h" 38.59 -#include "hashtable.h" 38.60 -#include "hashtable_itr.h" 38.61 - 38.62 -#define TPM_EMULATOR_PATH "/usr/bin/vtpmd" 38.63 - 38.64 -TPM_RESULT close_dmi( VTPM_DMI_RESOURCE *dmi_res) { 38.65 - TPM_RESULT status = TPM_FAIL; 38.66 - 38.67 - if (dmi_res == NULL) 38.68 - return TPM_SUCCESS; 38.69 - 38.70 - status = TCS_CloseContext(dmi_res->TCSContext); 38.71 - free ( dmi_res->NVMLocation ); 38.72 - dmi_res->connected = FALSE; 38.73 - 38.74 -#ifndef VTPM_MULTI_VM 38.75 - free(dmi_res->guest_tx_fname); 38.76 - free(dmi_res->vtpm_tx_fname); 38.77 - 38.78 - close(dmi_res->guest_tx_fh); dmi_res->guest_tx_fh = -1; 38.79 - close(dmi_res->vtpm_tx_fh); dmi_res->vtpm_tx_fh = -1; 38.80 - 38.81 - 38.82 - #ifndef MANUAL_DM_LAUNCH 38.83 - if (dmi_res->dmi_id != VTPM_CTL_DM) { 38.84 - if (dmi_res->dmi_pid != 0) { 38.85 - vtpmloginfo(VTPM_LOG_VTPM, "Killing dmi on pid %d.\n", dmi_res->dmi_pid); 38.86 - if ((kill(dmi_res->dmi_pid, SIGKILL) !=0) || 38.87 - (waitpid(dmi_res->dmi_pid, NULL, 0) != dmi_res->dmi_pid)){ 38.88 - vtpmlogerror(VTPM_LOG_VTPM, "Could not kill dmi on pid %d.\n", dmi_res->dmi_pid); 38.89 - status = TPM_FAIL; 38.90 - } 38.91 - } else 38.92 - vtpmlogerror(VTPM_LOG_VTPM, "Could not kill dmi because it's pid was 0.\n"); 38.93 - } 38.94 - #endif 38.95 -#endif 38.96 - 38.97 - return status; 38.98 -} 38.99 - 38.100 -TPM_RESULT VTPM_Handle_New_DMI( const buffer_t *param_buf) { 38.101 - 38.102 - VTPM_DMI_RESOURCE *new_dmi=NULL; 38.103 - TPM_RESULT status=TPM_FAIL; 38.104 - BYTE type; 38.105 - UINT32 dmi_id, domain_id, *dmi_id_key; 38.106 - int fh; 38.107 - 38.108 -#ifndef VTPM_MUTLI_VM 38.109 - char dmi_id_str[11]; // UINT32s are up to 10 digits + NULL 38.110 - struct stat file_info; 38.111 -#endif 38.112 - 38.113 - if (param_buf == NULL) { // Assume creation of Dom 0 control 38.114 - type = 0; 38.115 - domain_id = VTPM_CTL_DM; 38.116 - dmi_id = VTPM_CTL_DM; 38.117 - } else if (buffer_len(param_buf) != sizeof(BYTE) + sizeof(UINT32) *2) { 38.118 - vtpmloginfo(VTPM_LOG_VTPM, "New DMI command wrong length: %d.\n", buffer_len(param_buf)); 38.119 - status = TPM_BAD_PARAMETER; 38.120 - goto abort_egress; 38.121 - } else { 38.122 - BSG_UnpackList( param_buf->bytes, 3, 38.123 - BSG_TYPE_BYTE, &type, 38.124 - BSG_TYPE_UINT32, &domain_id, 38.125 - BSG_TYPE_UINT32, &dmi_id); 38.126 - } 38.127 - 38.128 - new_dmi = (VTPM_DMI_RESOURCE *) hashtable_search(vtpm_globals->dmi_map, &dmi_id); 38.129 - if (new_dmi == NULL) { 38.130 - vtpmloginfo(VTPM_LOG_VTPM, "Creating new DMI instance %d attached on domain %d.\n", dmi_id, domain_id); 38.131 - // Brand New DMI. 
Initialize the persistent pieces 38.132 - if ((new_dmi = (VTPM_DMI_RESOURCE *) malloc (sizeof(VTPM_DMI_RESOURCE))) == NULL) { 38.133 - status = TPM_RESOURCES; 38.134 - goto abort_egress; 38.135 - } 38.136 - memset(new_dmi, 0, sizeof(VTPM_DMI_RESOURCE)); 38.137 - new_dmi->dmi_id = dmi_id; 38.138 - new_dmi->connected = FALSE; 38.139 - 38.140 - if ((dmi_id_key = (UINT32 *) malloc (sizeof(UINT32))) == NULL) { 38.141 - status = TPM_RESOURCES; 38.142 - goto abort_egress; 38.143 - } 38.144 - *dmi_id_key = new_dmi->dmi_id; 38.145 - 38.146 - // install into map 38.147 - if (!hashtable_insert(vtpm_globals->dmi_map, dmi_id_key, new_dmi)){ 38.148 - free(new_dmi); 38.149 - free(dmi_id_key); 38.150 - status = TPM_FAIL; 38.151 - goto egress; 38.152 - } 38.153 - 38.154 - } else 38.155 - vtpmloginfo(VTPM_LOG_VTPM, "Re-attaching DMI instance %d on domain %d .\n", dmi_id, domain_id); 38.156 - 38.157 - if (new_dmi->connected) { 38.158 - vtpmlogerror(VTPM_LOG_VTPM, "Attempt to re-attach, currently attached instance %d. Ignoring\n", dmi_id); 38.159 - status = TPM_BAD_PARAMETER; 38.160 - goto egress; 38.161 - } 38.162 - 38.163 - // Initialize the Non-persistent pieces 38.164 - new_dmi->dmi_domain_id = domain_id; 38.165 - new_dmi->NVMLocation = NULL; 38.166 - 38.167 - new_dmi->TCSContext = 0; 38.168 - TPMTRYRETURN( TCS_OpenContext(&new_dmi->TCSContext) ); 38.169 - 38.170 - new_dmi->NVMLocation = (char *) malloc(11 + strlen(DMI_NVM_FILE)); 38.171 - sprintf(new_dmi->NVMLocation, DMI_NVM_FILE, (uint32_t) new_dmi->dmi_id); 38.172 - 38.173 - // Measure DMI 38.174 - // FIXME: This will measure DMI. Until then use a fixed DMI_Measurement value 38.175 - /* 38.176 - fh = open(TPM_EMULATOR_PATH, O_RDONLY); 38.177 - stat_ret = fstat(fh, &file_stat); 38.178 - if (stat_ret == 0) 38.179 - dmi_size = file_stat.st_size; 38.180 - else { 38.181 - vtpmlogerror(VTPM_LOG_VTPM, "Could not open tpm_emulator!!\n"); 38.182 - status = TPM_IOERROR; 38.183 - goto abort_egress; 38.184 - } 38.185 - dmi_buffer 38.186 - */ 38.187 - memset(&new_dmi->DMI_measurement, 0xcc, sizeof(TPM_DIGEST)); 38.188 - 38.189 -#ifndef VTPM_MULTI_VM 38.190 - if (dmi_id != VTPM_CTL_DM) { 38.191 - // Create a pair of fifo pipes 38.192 - if( (new_dmi->guest_tx_fname = (char *) malloc(11 + strlen(GUEST_TX_FIFO))) == NULL){ 38.193 - status = TPM_RESOURCES; 38.194 - goto abort_egress; 38.195 - } 38.196 - sprintf(new_dmi->guest_tx_fname, GUEST_TX_FIFO, (uint32_t) dmi_id); 38.197 - 38.198 - if ((new_dmi->vtpm_tx_fname = (char *) malloc(11 + strlen(VTPM_TX_FIFO))) == NULL) { 38.199 - status = TPM_RESOURCES; 38.200 - goto abort_egress; 38.201 - } 38.202 - sprintf(new_dmi->vtpm_tx_fname, VTPM_TX_FIFO, (uint32_t) dmi_id); 38.203 - 38.204 - new_dmi->guest_tx_fh = -1; 38.205 - new_dmi->vtpm_tx_fh= -1; 38.206 - 38.207 - if ( stat(new_dmi->guest_tx_fname, &file_info) == -1) { 38.208 - if ( mkfifo(new_dmi->guest_tx_fname, S_IWUSR | S_IRUSR ) ){ 38.209 - status = TPM_FAIL; 38.210 - goto abort_egress; 38.211 - } 38.212 - } 38.213 - 38.214 - if ( (fh = open(new_dmi->vtpm_tx_fname, O_RDWR)) == -1) { 38.215 - if ( mkfifo(new_dmi->vtpm_tx_fname, S_IWUSR | S_IRUSR ) ) { 38.216 - status = TPM_FAIL; 38.217 - goto abort_egress; 38.218 - } 38.219 - } 38.220 - 38.221 - // Launch DMI 38.222 - sprintf(dmi_id_str, "%d", (int) dmi_id); 38.223 -#ifdef MANUAL_DM_LAUNCH 38.224 - vtpmlogerror(VTPM_LOG_VTPM, "FAKING starting vtpm with dmi=%s\n", dmi_id_str); 38.225 - new_dmi->dmi_pid = 0; 38.226 -#else 38.227 - pid_t pid = fork(); 38.228 - 38.229 - if (pid == -1) { 38.230 - vtpmlogerror(VTPM_LOG_VTPM, 
"Could not fork to launch vtpm\n"); 38.231 - status = TPM_RESOURCES; 38.232 - goto abort_egress; 38.233 - } else if (pid == 0) { 38.234 - if ( stat(new_dmi->NVMLocation, &file_info) == -1) 38.235 - execl (TPM_EMULATOR_PATH, "vtmpd", "clear", dmi_id_str, NULL); 38.236 - else 38.237 - execl (TPM_EMULATOR_PATH, "vtpmd", "save", dmi_id_str, NULL); 38.238 - 38.239 - // Returning from these at all is an error. 38.240 - vtpmlogerror(VTPM_LOG_VTPM, "Could not exec to launch vtpm\n"); 38.241 - } else { 38.242 - new_dmi->dmi_pid = pid; 38.243 - vtpmloginfo(VTPM_LOG_VTPM, "Launching DMI on PID = %d\n", pid); 38.244 - } 38.245 -#endif // MANUAL_DM_LAUNCH 38.246 - } 38.247 -#else // VTPM_MUTLI_VM 38.248 - // FIXME: Measure DMI through call to Measurement agent in platform. 38.249 -#endif 38.250 - 38.251 - vtpm_globals->DMI_table_dirty = TRUE; 38.252 - new_dmi->connected = TRUE; 38.253 - status=TPM_SUCCESS; 38.254 - goto egress; 38.255 - 38.256 - abort_egress: 38.257 - close_dmi( new_dmi ); 38.258 - 38.259 - egress: 38.260 - return status; 38.261 -} 38.262 - 38.263 -TPM_RESULT VTPM_Handle_Close_DMI( const buffer_t *param_buf) { 38.264 - 38.265 - TPM_RESULT status=TPM_FAIL; 38.266 - VTPM_DMI_RESOURCE *dmi_res=NULL; 38.267 - UINT32 dmi_id; 38.268 - 38.269 - if ((param_buf == NULL) || (buffer_len(param_buf) != sizeof(UINT32)) ) { 38.270 - vtpmlogerror(VTPM_LOG_VTPM, "Closing DMI has bad size."); 38.271 - status = TPM_BAD_PARAMETER; 38.272 - goto abort_egress; 38.273 - } 38.274 - 38.275 - BSG_UnpackList( param_buf->bytes, 1, 38.276 - BSG_TYPE_UINT32, &dmi_id); 38.277 - 38.278 - vtpmloginfo(VTPM_LOG_VTPM, "Closing DMI %d.\n", dmi_id); 38.279 - 38.280 - dmi_res = (VTPM_DMI_RESOURCE *) hashtable_search(vtpm_globals->dmi_map, &dmi_id); 38.281 - if (dmi_res == NULL ) { 38.282 - vtpmlogerror(VTPM_LOG_VTPM, "Trying to close nonexistent DMI.\n"); 38.283 - status = TPM_BAD_PARAMETER; 38.284 - goto abort_egress; 38.285 - } 38.286 - 38.287 - if (!dmi_res->connected) { 38.288 - vtpmlogerror(VTPM_LOG_VTPM, "Closing non-connected DMI.\n"); 38.289 - status = TPM_BAD_PARAMETER; 38.290 - goto abort_egress; 38.291 - } 38.292 - 38.293 - // Close Dmi 38.294 - TPMTRYRETURN(close_dmi( dmi_res )); 38.295 - 38.296 - status=TPM_SUCCESS; 38.297 - goto egress; 38.298 - 38.299 - abort_egress: 38.300 - egress: 38.301 - 38.302 - return status; 38.303 -} 38.304 - 38.305 -TPM_RESULT VTPM_Handle_Delete_DMI( const buffer_t *param_buf) { 38.306 - 38.307 - TPM_RESULT status=TPM_FAIL; 38.308 - VTPM_DMI_RESOURCE *dmi_res=NULL; 38.309 - UINT32 dmi_id; 38.310 - 38.311 - if ((param_buf == NULL) || (buffer_len(param_buf) != sizeof(UINT32)) ) { 38.312 - vtpmlogerror(VTPM_LOG_VTPM, "Closing DMI has bad size.\n"); 38.313 - status = TPM_BAD_PARAMETER; 38.314 - goto abort_egress; 38.315 - } 38.316 - 38.317 - BSG_UnpackList( param_buf->bytes, 1, 38.318 - BSG_TYPE_UINT32, &dmi_id); 38.319 - 38.320 - vtpmloginfo(VTPM_LOG_VTPM, "Deleting DMI %d.\n", dmi_id); 38.321 - 38.322 - dmi_res = (VTPM_DMI_RESOURCE *) hashtable_remove(vtpm_globals->dmi_map, &dmi_id); 38.323 - if (dmi_res == NULL) { 38.324 - vtpmlogerror(VTPM_LOG_VTPM, "Closing non-existent DMI.\n"); 38.325 - status = TPM_BAD_PARAMETER; 38.326 - goto abort_egress; 38.327 - } 38.328 - 38.329 - //TODO: Automatically delete file dmi_res->NVMLocation 38.330 - 38.331 - // Close DMI first 38.332 - TPMTRYRETURN(close_dmi( dmi_res )); 38.333 - free ( dmi_res ); 38.334 - 38.335 - status=TPM_SUCCESS; 38.336 - goto egress; 38.337 - 38.338 - abort_egress: 38.339 - egress: 38.340 - 38.341 - return status; 38.342 -} 
38.343 +// =================================================================== 38.344 +// 38.345 +// Copyright (c) 2005, Intel Corp. 38.346 +// All rights reserved. 38.347 +// 38.348 +// Redistribution and use in source and binary forms, with or without 38.349 +// modification, are permitted provided that the following conditions 38.350 +// are met: 38.351 +// 38.352 +// * Redistributions of source code must retain the above copyright 38.353 +// notice, this list of conditions and the following disclaimer. 38.354 +// * Redistributions in binary form must reproduce the above 38.355 +// copyright notice, this list of conditions and the following 38.356 +// disclaimer in the documentation and/or other materials provided 38.357 +// with the distribution. 38.358 +// * Neither the name of Intel Corporation nor the names of its 38.359 +// contributors may be used to endorse or promote products derived 38.360 +// from this software without specific prior written permission. 38.361 +// 38.362 +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 38.363 +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 38.364 +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 38.365 +// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 38.366 +// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 38.367 +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 38.368 +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 38.369 +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 38.370 +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 38.371 +// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 38.372 +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 38.373 +// OF THE POSSIBILITY OF SUCH DAMAGE. 
38.374 +// =================================================================== 38.375 +// 38.376 +// dmictl.c 38.377 +// 38.378 +// Functions for creating and destroying DMIs 38.379 +// 38.380 +// ================================================================== 38.381 + 38.382 +#include <stdio.h> 38.383 +#include <unistd.h> 38.384 +#include <string.h> 38.385 + 38.386 +#ifndef VTPM_MUTLI_VM 38.387 + #include <sys/types.h> 38.388 + #include <sys/stat.h> 38.389 + #include <fcntl.h> 38.390 + #include <signal.h> 38.391 + #include <wait.h> 38.392 +#endif 38.393 + 38.394 +#include "vtpmpriv.h" 38.395 +#include "bsg.h" 38.396 +#include "buffer.h" 38.397 +#include "log.h" 38.398 +#include "hashtable.h" 38.399 +#include "hashtable_itr.h" 38.400 + 38.401 +#define TPM_EMULATOR_PATH "/usr/bin/vtpmd" 38.402 + 38.403 +TPM_RESULT close_dmi( VTPM_DMI_RESOURCE *dmi_res) { 38.404 + TPM_RESULT status = TPM_FAIL; 38.405 + 38.406 + if (dmi_res == NULL) 38.407 + return TPM_SUCCESS; 38.408 + 38.409 + status = TCS_CloseContext(dmi_res->TCSContext); 38.410 + free ( dmi_res->NVMLocation ); 38.411 + dmi_res->connected = FALSE; 38.412 + 38.413 +#ifndef VTPM_MULTI_VM 38.414 + free(dmi_res->guest_tx_fname); 38.415 + free(dmi_res->vtpm_tx_fname); 38.416 + 38.417 + close(dmi_res->guest_tx_fh); dmi_res->guest_tx_fh = -1; 38.418 + close(dmi_res->vtpm_tx_fh); dmi_res->vtpm_tx_fh = -1; 38.419 + 38.420 + #ifndef MANUAL_DM_LAUNCH 38.421 + if (dmi_res->dmi_id != VTPM_CTL_DM) { 38.422 + if (dmi_res->dmi_pid != 0) { 38.423 + vtpmloginfo(VTPM_LOG_VTPM, "Killing dmi on pid %d.\n", dmi_res->dmi_pid); 38.424 + if (kill(dmi_res->dmi_pid, SIGKILL) !=0) { 38.425 + vtpmloginfo(VTPM_LOG_VTPM, "DMI on pid %d is already dead.\n", dmi_res->dmi_pid); 38.426 + } else if (waitpid(dmi_res->dmi_pid, NULL, 0) != dmi_res->dmi_pid) { 38.427 + vtpmlogerror(VTPM_LOG_VTPM, "DMI on pid %d failed to stop.\n", dmi_res->dmi_pid); 38.428 + status = TPM_FAIL; 38.429 + } 38.430 + } else { 38.431 + vtpmlogerror(VTPM_LOG_VTPM, "Could not kill dmi because it's pid was 0.\n"); 38.432 + status = TPM_FAIL; 38.433 + } 38.434 + } 38.435 + #endif 38.436 +#endif 38.437 + 38.438 + return status; 38.439 +} 38.440 + 38.441 +TPM_RESULT VTPM_Handle_New_DMI( const buffer_t *param_buf) { 38.442 + 38.443 + VTPM_DMI_RESOURCE *new_dmi=NULL; 38.444 + TPM_RESULT status=TPM_FAIL; 38.445 + BYTE type; 38.446 + UINT32 dmi_id, domain_id, *dmi_id_key; 38.447 + 38.448 +#ifndef VTPM_MULTI_VM 38.449 + int fh; 38.450 + char dmi_id_str[11]; // UINT32s are up to 10 digits + NULL 38.451 + struct stat file_info; 38.452 +#endif 38.453 + 38.454 + if (param_buf == NULL) { // Assume creation of Dom 0 control 38.455 + type = 0; 38.456 + domain_id = VTPM_CTL_DM; 38.457 + dmi_id = VTPM_CTL_DM; 38.458 + } else if (buffer_len(param_buf) != sizeof(BYTE) + sizeof(UINT32) *2) { 38.459 + vtpmloginfo(VTPM_LOG_VTPM, "New DMI command wrong length: %d.\n", buffer_len(param_buf)); 38.460 + status = TPM_BAD_PARAMETER; 38.461 + goto abort_egress; 38.462 + } else { 38.463 + BSG_UnpackList( param_buf->bytes, 3, 38.464 + BSG_TYPE_BYTE, &type, 38.465 + BSG_TYPE_UINT32, &domain_id, 38.466 + BSG_TYPE_UINT32, &dmi_id); 38.467 + } 38.468 + 38.469 + new_dmi = (VTPM_DMI_RESOURCE *) hashtable_search(vtpm_globals->dmi_map, &dmi_id); 38.470 + if (new_dmi == NULL) { 38.471 + vtpmloginfo(VTPM_LOG_VTPM, "Creating new DMI instance %d attached on domain %d.\n", dmi_id, domain_id); 38.472 + // Brand New DMI. 
Initialize the persistent pieces 38.473 + if ((new_dmi = (VTPM_DMI_RESOURCE *) malloc (sizeof(VTPM_DMI_RESOURCE))) == NULL) { 38.474 + status = TPM_RESOURCES; 38.475 + goto abort_egress; 38.476 + } 38.477 + memset(new_dmi, 0, sizeof(VTPM_DMI_RESOURCE)); 38.478 + new_dmi->dmi_id = dmi_id; 38.479 + new_dmi->connected = FALSE; 38.480 + 38.481 + if ((dmi_id_key = (UINT32 *) malloc (sizeof(UINT32))) == NULL) { 38.482 + status = TPM_RESOURCES; 38.483 + goto abort_egress; 38.484 + } 38.485 + *dmi_id_key = new_dmi->dmi_id; 38.486 + 38.487 + // install into map 38.488 + if (!hashtable_insert(vtpm_globals->dmi_map, dmi_id_key, new_dmi)){ 38.489 + free(new_dmi); 38.490 + free(dmi_id_key); 38.491 + status = TPM_FAIL; 38.492 + goto egress; 38.493 + } 38.494 + 38.495 + } else 38.496 + vtpmloginfo(VTPM_LOG_VTPM, "Re-attaching DMI instance %d on domain %d .\n", dmi_id, domain_id); 38.497 + 38.498 + if (new_dmi->connected) { 38.499 + vtpmlogerror(VTPM_LOG_VTPM, "Attempt to re-attach, currently attached instance %d. Ignoring\n", dmi_id); 38.500 + status = TPM_BAD_PARAMETER; 38.501 + goto egress; 38.502 + } 38.503 + 38.504 + // Initialize the Non-persistent pieces 38.505 + new_dmi->dmi_domain_id = domain_id; 38.506 + new_dmi->NVMLocation = NULL; 38.507 + 38.508 + new_dmi->TCSContext = 0; 38.509 + TPMTRYRETURN( TCS_OpenContext(&new_dmi->TCSContext) ); 38.510 + 38.511 + new_dmi->NVMLocation = (char *) malloc(11 + strlen(DMI_NVM_FILE)); 38.512 + sprintf(new_dmi->NVMLocation, DMI_NVM_FILE, (uint32_t) new_dmi->dmi_id); 38.513 + 38.514 + // Measure DMI 38.515 + // FIXME: This will measure DMI. Until then use a fixed DMI_Measurement value 38.516 + /* 38.517 + fh = open(TPM_EMULATOR_PATH, O_RDONLY); 38.518 + stat_ret = fstat(fh, &file_stat); 38.519 + if (stat_ret == 0) 38.520 + dmi_size = file_stat.st_size; 38.521 + else { 38.522 + vtpmlogerror(VTPM_LOG_VTPM, "Could not open tpm_emulator!!\n"); 38.523 + status = TPM_IOERROR; 38.524 + goto abort_egress; 38.525 + } 38.526 + dmi_buffer 38.527 + */ 38.528 + memset(&new_dmi->DMI_measurement, 0xcc, sizeof(TPM_DIGEST)); 38.529 + 38.530 +#ifndef VTPM_MULTI_VM 38.531 + if (dmi_id != VTPM_CTL_DM) { 38.532 + // Create a pair of fifo pipes 38.533 + if( (new_dmi->guest_tx_fname = (char *) malloc(11 + strlen(GUEST_TX_FIFO))) == NULL){ 38.534 + status = TPM_RESOURCES; 38.535 + goto abort_egress; 38.536 + } 38.537 + sprintf(new_dmi->guest_tx_fname, GUEST_TX_FIFO, (uint32_t) dmi_id); 38.538 + 38.539 + if ((new_dmi->vtpm_tx_fname = (char *) malloc(11 + strlen(VTPM_TX_FIFO))) == NULL) { 38.540 + status = TPM_RESOURCES; 38.541 + goto abort_egress; 38.542 + } 38.543 + sprintf(new_dmi->vtpm_tx_fname, VTPM_TX_FIFO, (uint32_t) dmi_id); 38.544 + 38.545 + new_dmi->guest_tx_fh = -1; 38.546 + new_dmi->vtpm_tx_fh= -1; 38.547 + 38.548 + if ( stat(new_dmi->guest_tx_fname, &file_info) == -1) { 38.549 + if ( mkfifo(new_dmi->guest_tx_fname, S_IWUSR | S_IRUSR ) ){ 38.550 + vtpmlogerror(VTPM_LOG_VTPM, "Failed to create dmi fifo.\n"); 38.551 + status = TPM_IOERROR; 38.552 + goto abort_egress; 38.553 + } 38.554 + } 38.555 + 38.556 + if ( (fh = open(new_dmi->vtpm_tx_fname, O_RDWR)) == -1) { 38.557 + if ( mkfifo(new_dmi->vtpm_tx_fname, S_IWUSR | S_IRUSR ) ) { 38.558 + vtpmlogerror(VTPM_LOG_VTPM, "Failed to create dmi fifo.\n"); 38.559 + status = TPM_IOERROR; 38.560 + goto abort_egress; 38.561 + } 38.562 + } 38.563 + 38.564 + // Launch DMI 38.565 + sprintf(dmi_id_str, "%d", (int) dmi_id); 38.566 +#ifdef MANUAL_DM_LAUNCH 38.567 + vtpmlogerror(VTPM_LOG_VTPM, "FAKING starting vtpm with dmi=%s\n", dmi_id_str); 
38.568 + new_dmi->dmi_pid = 0; 38.569 +#else 38.570 + pid_t pid = fork(); 38.571 + 38.572 + if (pid == -1) { 38.573 + vtpmlogerror(VTPM_LOG_VTPM, "Could not fork to launch vtpm\n"); 38.574 + status = TPM_RESOURCES; 38.575 + goto abort_egress; 38.576 + } else if (pid == 0) { 38.577 + if ( stat(new_dmi->NVMLocation, &file_info) == -1) 38.578 + execl (TPM_EMULATOR_PATH, "vtmpd", "clear", dmi_id_str, NULL); 38.579 + else 38.580 + execl (TPM_EMULATOR_PATH, "vtpmd", "save", dmi_id_str, NULL); 38.581 + 38.582 + // Returning from these at all is an error. 38.583 + vtpmlogerror(VTPM_LOG_VTPM, "Could not exec to launch vtpm\n"); 38.584 + } else { 38.585 + new_dmi->dmi_pid = pid; 38.586 + vtpmloginfo(VTPM_LOG_VTPM, "Launching DMI on PID = %d\n", pid); 38.587 + } 38.588 +#endif // MANUAL_DM_LAUNCH 38.589 + } 38.590 +#else // VTPM_MUTLI_VM 38.591 + // FIXME: Measure DMI through call to Measurement agent in platform. 38.592 +#endif 38.593 + 38.594 + vtpm_globals->DMI_table_dirty = TRUE; 38.595 + new_dmi->connected = TRUE; 38.596 + status=TPM_SUCCESS; 38.597 + goto egress; 38.598 + 38.599 + abort_egress: 38.600 + vtpmlogerror(VTPM_LOG_VTPM, "Failed to create DMI id=%d due to status=%s. Cleaning.\n", dmi_id, tpm_get_error_name(status)); 38.601 + close_dmi( new_dmi ); 38.602 + 38.603 + egress: 38.604 + return status; 38.605 +} 38.606 + 38.607 +TPM_RESULT VTPM_Handle_Close_DMI( const buffer_t *param_buf) { 38.608 + 38.609 + TPM_RESULT status=TPM_FAIL; 38.610 + VTPM_DMI_RESOURCE *dmi_res=NULL; 38.611 + UINT32 dmi_id; 38.612 + 38.613 + if ((param_buf == NULL) || (buffer_len(param_buf) != sizeof(UINT32)) ) { 38.614 + vtpmlogerror(VTPM_LOG_VTPM, "Closing DMI has bad size."); 38.615 + status = TPM_BAD_PARAMETER; 38.616 + goto abort_egress; 38.617 + } 38.618 + 38.619 + BSG_UnpackList( param_buf->bytes, 1, 38.620 + BSG_TYPE_UINT32, &dmi_id); 38.621 + 38.622 + vtpmloginfo(VTPM_LOG_VTPM, "Closing DMI %d.\n", dmi_id); 38.623 + 38.624 + dmi_res = (VTPM_DMI_RESOURCE *) hashtable_search(vtpm_globals->dmi_map, &dmi_id); 38.625 + if (dmi_res == NULL ) { 38.626 + vtpmlogerror(VTPM_LOG_VTPM, "Trying to close nonexistent DMI.\n"); 38.627 + status = TPM_BAD_PARAMETER; 38.628 + goto abort_egress; 38.629 + } 38.630 + 38.631 + if (!dmi_res->connected) { 38.632 + vtpmlogerror(VTPM_LOG_VTPM, "Closing non-connected DMI.\n"); 38.633 + status = TPM_BAD_PARAMETER; 38.634 + goto abort_egress; 38.635 + } 38.636 + 38.637 + // Close Dmi 38.638 + TPMTRYRETURN(close_dmi( dmi_res )); 38.639 + 38.640 + status=TPM_SUCCESS; 38.641 + goto egress; 38.642 + 38.643 + abort_egress: 38.644 + egress: 38.645 + 38.646 + return status; 38.647 +} 38.648 + 38.649 +TPM_RESULT VTPM_Handle_Delete_DMI( const buffer_t *param_buf) { 38.650 + 38.651 + TPM_RESULT status=TPM_FAIL; 38.652 + VTPM_DMI_RESOURCE *dmi_res=NULL; 38.653 + UINT32 dmi_id; 38.654 + 38.655 + if ((param_buf == NULL) || (buffer_len(param_buf) != sizeof(UINT32)) ) { 38.656 + vtpmlogerror(VTPM_LOG_VTPM, "Closing DMI has bad size.\n"); 38.657 + status = TPM_BAD_PARAMETER; 38.658 + goto abort_egress; 38.659 + } 38.660 + 38.661 + BSG_UnpackList( param_buf->bytes, 1, 38.662 + BSG_TYPE_UINT32, &dmi_id); 38.663 + 38.664 + vtpmloginfo(VTPM_LOG_VTPM, "Deleting DMI %d.\n", dmi_id); 38.665 + 38.666 + dmi_res = (VTPM_DMI_RESOURCE *) hashtable_remove(vtpm_globals->dmi_map, &dmi_id); 38.667 + if (dmi_res == NULL) { 38.668 + vtpmlogerror(VTPM_LOG_VTPM, "Closing non-existent DMI.\n"); 38.669 + status = TPM_BAD_PARAMETER; 38.670 + goto abort_egress; 38.671 + } 38.672 + 38.673 + //TODO: Automatically delete file 
dmi_res->NVMLocation 38.674 + 38.675 + // Close DMI first 38.676 + TPMTRYRETURN(close_dmi( dmi_res )); 38.677 + free ( dmi_res ); 38.678 + 38.679 + status=TPM_SUCCESS; 38.680 + goto egress; 38.681 + 38.682 + abort_egress: 38.683 + egress: 38.684 + 38.685 + return status; 38.686 +}
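VTPM_Handle_New_DMI above accepts either a NULL buffer (the Dom0 control instance) or a buffer of exactly one BYTE plus two UINT32s, unpacked as type, domain id, and DMI id. A hedged sketch of how a caller would assemble that request, assuming BSG_PackList is the packing counterpart of the BSG_UnpackList call in the handler (it is used that way in securestorage.c below); BYTE, UINT32, and the BSG_* names come from tcg.h and bsg.h as in the sources above.

/* Hedged sketch: build the 9-byte New-DMI parameter buffer that
 * VTPM_Handle_New_DMI unpacks (BYTE type, UINT32 domain, UINT32 dmi).
 * req must have room for sizeof(BYTE) + 2 * sizeof(UINT32) bytes. */
static void build_new_dmi_request(BYTE *req, UINT32 domain_id, UINT32 dmi_id)
{
    BYTE type = 0;   /* 0 matches the default used on the Dom0 path above */
    BSG_PackList(req, 3,
                 BSG_TYPE_BYTE,   &type,
                 BSG_TYPE_UINT32, &domain_id,
                 BSG_TYPE_UINT32, &dmi_id);
}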
39.1 --- a/tools/vtpm_manager/manager/securestorage.c Tue Sep 20 09:43:29 2005 +0000 39.2 +++ b/tools/vtpm_manager/manager/securestorage.c Tue Sep 20 09:43:46 2005 +0000 39.3 @@ -1,401 +1,401 @@ 39.4 -// =================================================================== 39.5 -// 39.6 -// Copyright (c) 2005, Intel Corp. 39.7 -// All rights reserved. 39.8 -// 39.9 -// Redistribution and use in source and binary forms, with or without 39.10 -// modification, are permitted provided that the following conditions 39.11 -// are met: 39.12 -// 39.13 -// * Redistributions of source code must retain the above copyright 39.14 -// notice, this list of conditions and the following disclaimer. 39.15 -// * Redistributions in binary form must reproduce the above 39.16 -// copyright notice, this list of conditions and the following 39.17 -// disclaimer in the documentation and/or other materials provided 39.18 -// with the distribution. 39.19 -// * Neither the name of Intel Corporation nor the names of its 39.20 -// contributors may be used to endorse or promote products derived 39.21 -// from this software without specific prior written permission. 39.22 -// 39.23 -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 39.24 -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 39.25 -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 39.26 -// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 39.27 -// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 39.28 -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 39.29 -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 39.30 -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 39.31 -// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 39.32 -// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 39.33 -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 39.34 -// OF THE POSSIBILITY OF SUCH DAMAGE. 39.35 -// =================================================================== 39.36 -// 39.37 -// securestorage.c 39.38 -// 39.39 -// Functions regarding securely storing DMI secrets. 
39.40 -// 39.41 -// ================================================================== 39.42 - 39.43 -#include <sys/types.h> 39.44 -#include <sys/stat.h> 39.45 -#include <fcntl.h> 39.46 -#include <unistd.h> 39.47 -#include <string.h> 39.48 - 39.49 -#include "tcg.h" 39.50 -#include "vtpm_manager.h" 39.51 -#include "vtpmpriv.h" 39.52 -#include "vtsp.h" 39.53 -#include "bsg.h" 39.54 -#include "crypto.h" 39.55 -#include "hashtable.h" 39.56 -#include "hashtable_itr.h" 39.57 -#include "buffer.h" 39.58 -#include "log.h" 39.59 - 39.60 -TPM_RESULT VTPM_Handle_Save_NVM(VTPM_DMI_RESOURCE *myDMI, 39.61 - const buffer_t *inbuf, 39.62 - buffer_t *outbuf) { 39.63 - 39.64 - TPM_RESULT status = TPM_SUCCESS; 39.65 - symkey_t symkey; 39.66 - buffer_t state_cipher = NULL_BUF, 39.67 - symkey_cipher = NULL_BUF; 39.68 - int fh; 39.69 - long bytes_written; 39.70 - BYTE *sealed_NVM=NULL; 39.71 - UINT32 sealed_NVM_size, i; 39.72 - struct pack_constbuf_t symkey_cipher32, state_cipher32; 39.73 - 39.74 - vtpmloginfo(VTPM_LOG_VTPM_DEEP, "Save_NVMing[%d]: 0x", buffer_len(inbuf)); 39.75 - for (i=0; i< buffer_len(inbuf); i++) 39.76 - vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", inbuf->bytes[i]); 39.77 - vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); 39.78 - 39.79 - // Generate a sym key and encrypt state with it 39.80 - TPMTRY(TPM_ENCRYPT_ERROR, Crypto_symcrypto_genkey (&symkey) ); 39.81 - TPMTRY(TPM_ENCRYPT_ERROR, Crypto_symcrypto_encrypt (&symkey, inbuf, &state_cipher) ); 39.82 - 39.83 - // Encrypt symmetric key 39.84 - TPMTRYRETURN( VTSP_Bind( &vtpm_globals->storageKey, 39.85 - &symkey.key, 39.86 - &symkey_cipher) ); 39.87 - 39.88 - // Create output blob: symkey_size + symkey_cipher + state_cipher_size + state_cipher 39.89 - 39.90 - symkey_cipher32.size = buffer_len(&symkey_cipher); 39.91 - symkey_cipher32.data = symkey_cipher.bytes; 39.92 - 39.93 - state_cipher32.size = buffer_len(&state_cipher); 39.94 - state_cipher32.data = state_cipher.bytes; 39.95 - 39.96 - sealed_NVM = (BYTE *) malloc( 2 * sizeof(UINT32) + symkey_cipher32.size + state_cipher32.size); 39.97 - 39.98 - sealed_NVM_size = BSG_PackList(sealed_NVM, 2, 39.99 - BSG_TPM_SIZE32_DATA, &symkey_cipher32, 39.100 - BSG_TPM_SIZE32_DATA, &state_cipher32); 39.101 - 39.102 - // Mark DMI Table so new save state info will get pushed to disk on return. 39.103 - vtpm_globals->DMI_table_dirty = TRUE; 39.104 - 39.105 - // Write sealed blob off disk from NVMLocation 39.106 - // TODO: How to properly return from these. Do we care if we return failure 39.107 - // after writing the file? We can't get the old one back. 39.108 - // TODO: Backup old file and try and recover that way. 39.109 - fh = open(myDMI->NVMLocation, O_WRONLY | O_CREAT, S_IREAD | S_IWRITE); 39.110 - if ( (bytes_written = write(fh, sealed_NVM, sealed_NVM_size) ) != (long) sealed_NVM_size) { 39.111 - vtpmlogerror(VTPM_LOG_VTPM, "We just overwrote a DMI_NVM and failed to finish. 
%ld/%ld bytes.\n", bytes_written, (long)sealed_NVM_size); 39.112 - status = TPM_IOERROR; 39.113 - goto abort_egress; 39.114 - } 39.115 - close(fh); 39.116 - 39.117 - Crypto_SHA1Full (sealed_NVM, sealed_NVM_size, (BYTE *) &myDMI->NVM_measurement); 39.118 - 39.119 - vtpmloginfo(VTPM_LOG_VTPM, "Saved %d bytes of E(symkey) + %d bytes of E(NVM)\n", buffer_len(&symkey_cipher), buffer_len(&state_cipher)); 39.120 - goto egress; 39.121 - 39.122 - abort_egress: 39.123 - vtpmlogerror(VTPM_LOG_VTPM, "Failed to load NVM\n."); 39.124 - 39.125 - egress: 39.126 - 39.127 - buffer_free ( &state_cipher); 39.128 - buffer_free ( &symkey_cipher); 39.129 - free(sealed_NVM); 39.130 - Crypto_symcrypto_freekey (&symkey); 39.131 - 39.132 - return status; 39.133 -} 39.134 - 39.135 - 39.136 -/* inbuf = null outbuf = sealed blob size, sealed blob.*/ 39.137 -TPM_RESULT VTPM_Handle_Load_NVM(VTPM_DMI_RESOURCE *myDMI, 39.138 - const buffer_t *inbuf, 39.139 - buffer_t *outbuf) { 39.140 - 39.141 - TPM_RESULT status = TPM_SUCCESS; 39.142 - symkey_t symkey; 39.143 - buffer_t state_cipher = NULL_BUF, 39.144 - symkey_clear = NULL_BUF, 39.145 - symkey_cipher = NULL_BUF; 39.146 - struct pack_buf_t symkey_cipher32, state_cipher32; 39.147 - 39.148 - UINT32 sealed_NVM_size; 39.149 - BYTE *sealed_NVM = NULL; 39.150 - long fh_size; 39.151 - int fh, stat_ret, i; 39.152 - struct stat file_stat; 39.153 - TPM_DIGEST sealedNVMHash; 39.154 - 39.155 - memset(&symkey, 0, sizeof(symkey_t)); 39.156 - 39.157 - if (myDMI->NVMLocation == NULL) { 39.158 - vtpmlogerror(VTPM_LOG_VTPM, "Unable to load NVM because the file name NULL.\n"); 39.159 - status = TPM_AUTHFAIL; 39.160 - goto abort_egress; 39.161 - } 39.162 - 39.163 - //Read sealed blob off disk from NVMLocation 39.164 - fh = open(myDMI->NVMLocation, O_RDONLY); 39.165 - stat_ret = fstat(fh, &file_stat); 39.166 - if (stat_ret == 0) 39.167 - fh_size = file_stat.st_size; 39.168 - else { 39.169 - status = TPM_IOERROR; 39.170 - goto abort_egress; 39.171 - } 39.172 - 39.173 - sealed_NVM = (BYTE *) malloc(fh_size); 39.174 - if (read(fh, sealed_NVM, fh_size) != fh_size) { 39.175 - status = TPM_IOERROR; 39.176 - goto abort_egress; 39.177 - } 39.178 - close(fh); 39.179 - 39.180 - vtpmloginfo(VTPM_LOG_VTPM_DEEP, "Load_NVMing[%ld]: 0x", fh_size); 39.181 - for (i=0; i< fh_size; i++) 39.182 - vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", sealed_NVM[i]); 39.183 - vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); 39.184 - 39.185 - sealed_NVM_size = BSG_UnpackList(sealed_NVM, 2, 39.186 - BSG_TPM_SIZE32_DATA, &symkey_cipher32, 39.187 - BSG_TPM_SIZE32_DATA, &state_cipher32); 39.188 - 39.189 - TPMTRYRETURN( buffer_init_convert (&symkey_cipher, 39.190 - symkey_cipher32.size, 39.191 - symkey_cipher32.data) ); 39.192 - 39.193 - TPMTRYRETURN( buffer_init_convert (&state_cipher, 39.194 - state_cipher32.size, 39.195 - state_cipher32.data) ); 39.196 - 39.197 - Crypto_SHA1Full(sealed_NVM, sealed_NVM_size, (BYTE *) &sealedNVMHash); 39.198 - 39.199 - // Verify measurement of sealed blob. 
39.200 - if (memcmp(&sealedNVMHash, &myDMI->NVM_measurement, sizeof(TPM_DIGEST)) ) { 39.201 - vtpmlogerror(VTPM_LOG_VTPM, "VTPM LoadNVM NVM measurement check failed.\n"); 39.202 - vtpmloginfo(VTPM_LOG_VTPM_DEEP, "Correct hash: "); 39.203 - for (i=0; i< sizeof(TPM_DIGEST); i++) 39.204 - vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", ((BYTE*)&myDMI->NVM_measurement)[i]); 39.205 - vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); 39.206 - 39.207 - vtpmloginfo(VTPM_LOG_VTPM_DEEP, "Measured hash: "); 39.208 - for (i=0; i< sizeof(TPM_DIGEST); i++) 39.209 - vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", ((BYTE*)&sealedNVMHash)[i]); 39.210 - vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); 39.211 - 39.212 - status = TPM_AUTHFAIL; 39.213 - goto abort_egress; 39.214 - } 39.215 - 39.216 - // Decrypt Symmetric Key 39.217 - TPMTRYRETURN( VTSP_Unbind( myDMI->TCSContext, 39.218 - vtpm_globals->storageKeyHandle, 39.219 - &symkey_cipher, 39.220 - (const TPM_AUTHDATA*)&vtpm_globals->storage_key_usage_auth, 39.221 - &symkey_clear, 39.222 - &(vtpm_globals->keyAuth) ) ); 39.223 - 39.224 - // create symmetric key using saved