ia64/xen-unstable
changeset 7027:06d84bf87159
Merge latest xen-unstable into xen-ia64-unstable
1.1 --- a/.hgignore Thu Sep 22 11:34:14 2005 -0600 1.2 +++ b/.hgignore Thu Sep 22 11:42:01 2005 -0600 1.3 @@ -86,6 +86,9 @@ 1.4 ^tools/check/\..*$ 1.5 ^tools/console/xenconsoled$ 1.6 ^tools/console/xenconsole$ 1.7 +^tools/debugger/gdb/gdb-6\.2\.1\.tar\.bz2$ 1.8 +^tools/debugger/gdb/gdb-6\.2\.1/.*$ 1.9 +^tools/debugger/gdb/gdb-6\.2\.1-linux-i386-xen/.*$ 1.10 ^tools/debugger/pdb/pdb$ 1.11 ^tools/debugger/pdb/linux-[0-9.]*-module/.*\.ko$ 1.12 ^tools/debugger/pdb/linux-[0-9.]*-module/.*\.mod.c$ 1.13 @@ -136,9 +139,10 @@ 1.14 ^tools/vnet/vnet-module/\..*\.cmd$ 1.15 ^tools/vnet/vnet-module/\.tmp_versions/.*$ 1.16 ^tools/vnet/vnet-module/vnet_module\.mod\..*$ 1.17 -^tools/vtpm/vtpm* 1.18 -^tools/vtpm/tpm_emulator-* 1.19 -^tools/vtpm_manager/manager/vtpm_managerd 1.20 +^tools/vtpm/tpm_emulator/.*$ 1.21 +^tools/vtpm/tpm_emulator-.*\.tar\.gz$ 1.22 +^tools/vtpm/vtpm/.*$ 1.23 +^tools/vtpm_manager/manager/vtpm_managerd$ 1.24 ^tools/xcutils/xc_restore$ 1.25 ^tools/xcutils/xc_save$ 1.26 ^tools/xenstat/xentop/xentop$ 1.27 @@ -156,6 +160,7 @@ 1.28 ^tools/xenstore/xs_stress$ 1.29 ^tools/xenstore/xs_test$ 1.30 ^tools/xenstore/xs_watch_stress$ 1.31 +^tools/xentrace/xenctx$ 1.32 ^tools/xentrace/xentrace$ 1.33 ^xen/BLOG$ 1.34 ^xen/TAGS$
3.1 --- a/Makefile Thu Sep 22 11:34:14 2005 -0600 3.2 +++ b/Makefile Thu Sep 22 11:42:01 2005 -0600 3.3 @@ -98,12 +98,15 @@ clean:: 3.4 $(MAKE) -C tools clean 3.5 $(MAKE) -C docs clean 3.6 3.7 -# clean, but blow away kernel build tree plus tar balls 3.8 -mrproper: clean 3.9 +# clean, but blow away kernel build tree plus tarballs 3.10 +distclean: clean 3.11 rm -rf dist patches/tmp 3.12 for i in $(ALLKERNELS) ; do $(MAKE) $$i-delete ; done 3.13 for i in $(ALLSPARSETREES) ; do $(MAKE) $$i-mrproper ; done 3.14 3.15 +# Linux name for GNU distclean 3.16 +mrproper: distclean 3.17 + 3.18 install-logging: LOGGING=logging-0.4.9.2 3.19 install-logging: 3.20 [ -f $(LOGGING).tar.gz ] || wget http://www.red-dove.com/$(LOGGING).tar.gz 3.21 @@ -142,7 +145,7 @@ help: 3.22 @echo 'Cleaning targets:' 3.23 @echo ' clean - clean the Xen, tools and docs (but not' 3.24 @echo ' guest kernel) trees' 3.25 - @echo ' mrproper - clean plus delete kernel tarballs and kernel' 3.26 + @echo ' distclean - clean plus delete kernel tarballs and kernel' 3.27 @echo ' build trees' 3.28 @echo ' kdelete - delete guest kernel build trees' 3.29 @echo ' kclean - clean guest kernel build trees' 3.30 @@ -163,27 +166,25 @@ uninstall: D=$(DESTDIR) 3.31 uninstall: 3.32 [ -d $(D)/etc/xen ] && mv -f $(D)/etc/xen $(D)/etc/xen.old-`date +%s` 3.33 rm -rf $(D)/etc/init.d/xend* 3.34 - rm -rf $(D)/usr/$(LIBDIR)/libxc* $(D)/usr/$(LIBDIR)/libxutil* 3.35 - rm -rf $(D)/usr/$(LIBDIR)/python/xen $(D)/usr/include/xen 3.36 - rm -rf $(D)/usr/$(LIBDIR)/share/xen $(D)/usr/$(LIBDIR)/libxenstore* 3.37 + rm -rf $(D)/etc/hotplug/xen-backend.agent 3.38 rm -rf $(D)/var/run/xen* $(D)/var/lib/xen* 3.39 - rm -rf $(D)/usr/include/xcs_proto.h $(D)/usr/include/xc.h 3.40 - rm -rf $(D)/usr/include/xs_lib.h $(D)/usr/include/xs.h 3.41 - rm -rf $(D)/usr/sbin/xcs $(D)/usr/sbin/xcsdump $(D)/usr/sbin/xen* 3.42 - rm -rf $(D)/usr/sbin/netfix 3.43 - rm -rf $(D)/usr/sbin/xfrd $(D)/usr/sbin/xm 3.44 - rm -rf $(D)/usr/share/doc/xen $(D)/usr/man/man*/xentrace* 3.45 - rm -rf $(D)/usr/bin/xen* $(D)/usr/bin/miniterm 3.46 rm -rf $(D)/boot/*xen* 3.47 rm -rf $(D)/lib/modules/*xen* 3.48 + rm -rf $(D)/usr/bin/xen* $(D)/usr/bin/lomount 3.49 rm -rf $(D)/usr/bin/cpuperf-perfcntr $(D)/usr/bin/cpuperf-xen 3.50 rm -rf $(D)/usr/bin/xc_shadow 3.51 - rm -rf $(D)/usr/share/xen $(D)/usr/libexec/xen 3.52 + rm -rf $(D)/usr/include/xenctrl.h 3.53 + rm -rf $(D)/usr/include/xs_lib.h $(D)/usr/include/xs.h 3.54 + rm -rf $(D)/usr/include/xen 3.55 + rm -rf $(D)/usr/$(LIBDIR)/libxenctrl* $(D)/usr/$(LIBDIR)/libxenguest* 3.56 + rm -rf $(D)/usr/$(LIBDIR)/libxenstore* 3.57 + rm -rf $(D)/usr/$(LIBDIR)/python/xen $(D)/usr/$(LIBDIR)/xen 3.58 + rm -rf $(D)/usr/libexec/xen 3.59 + rm -rf $(D)/usr/sbin/xen* $(D)/usr/sbin/netfix $(D)/usr/sbin/xm 3.60 + rm -rf $(D)/usr/share/doc/xen 3.61 + rm -rf $(D)/usr/share/xen 3.62 rm -rf $(D)/usr/share/man/man1/xen* 3.63 rm -rf $(D)/usr/share/man/man8/xen* 3.64 - rm -rf $(D)/usr/lib/xen 3.65 - rm -rf $(D)/etc/hotplug.d/xen-backend 3.66 - rm -rf $(D)/etc/hotplug/xen-backend.agent 3.67 3.68 # Legacy targets for compatibility 3.69 linux24:
11.1 --- a/docs/Makefile Thu Sep 22 11:34:14 2005 -0600 11.2 +++ b/docs/Makefile Thu Sep 22 11:42:01 2005 -0600 11.3 @@ -12,7 +12,7 @@ DOXYGEN := doxygen 11.4 11.5 pkgdocdir := /usr/share/doc/xen 11.6 11.7 -DOC_TEX := $(wildcard src/*.tex) 11.8 +DOC_TEX := src/user.tex src/interface.tex 11.9 DOC_PS := $(patsubst src/%.tex,ps/%.ps,$(DOC_TEX)) 11.10 DOC_PDF := $(patsubst src/%.tex,pdf/%.pdf,$(DOC_TEX)) 11.11 DOC_HTML := $(patsubst src/%.tex,html/%/index.html,$(DOC_TEX)) 11.12 @@ -36,11 +36,12 @@ html: 11.13 $(MAKE) $(DOC_HTML); fi 11.14 11.15 python-dev-docs: 11.16 - mkdir -p api/tools/python 11.17 + @mkdir -v -p api/tools/python 11.18 @if which $(DOXYGEN) 1>/dev/null 2>/dev/null; then \ 11.19 echo "Running doxygen to generate Python tools APIs ... "; \ 11.20 $(DOXYGEN) Doxyfile; \ 11.21 - $(MAKE) -C api/tools/python/latex ; fi 11.22 + $(MAKE) -C api/tools/python/latex ; else \ 11.23 + echo "Doxygen not installed; skipping python-dev-docs."; fi 11.24 11.25 clean: 11.26 rm -rf .word_count *.aux *.dvi *.bbl *.blg *.glo *.idx *~
15.1 --- a/docs/src/interface.tex Thu Sep 22 11:34:14 2005 -0600 15.2 +++ b/docs/src/interface.tex Thu Sep 22 11:42:01 2005 -0600 15.3 @@ -87,1084 +87,23 @@ itself, allows the Xen framework to sepa 15.4 mechanism and policy within the system. 15.5 15.6 15.7 - 15.8 -\chapter{Virtual Architecture} 15.9 - 15.10 -On a Xen-based system, the hypervisor itself runs in {\it ring 0}. It 15.11 -has full access to the physical memory available in the system and is 15.12 -responsible for allocating portions of it to the domains. Guest 15.13 -operating systems run in and use {\it rings 1}, {\it 2} and {\it 3} as 15.14 -they see fit. Segmentation is used to prevent the guest OS from 15.15 -accessing the portion of the address space that is reserved for 15.16 -Xen. We expect most guest operating systems will use ring 1 for their 15.17 -own operation and place applications in ring 3. 15.18 - 15.19 -In this chapter we consider the basic virtual architecture provided 15.20 -by Xen: the basic CPU state, exception and interrupt handling, and 15.21 -time. Other aspects such as memory and device access are discussed 15.22 -in later chapters. 15.23 - 15.24 -\section{CPU state} 15.25 - 15.26 -All privileged state must be handled by Xen. The guest OS has no 15.27 -direct access to CR3 and is not permitted to update privileged bits in 15.28 -EFLAGS. Guest OSes use \emph{hypercalls} to invoke operations in Xen; 15.29 -these are analogous to system calls but occur from ring 1 to ring 0. 15.30 - 15.31 -A list of all hypercalls is given in Appendix~\ref{a:hypercalls}. 15.32 - 15.33 - 15.34 - 15.35 -\section{Exceptions} 15.36 - 15.37 -A virtual IDT is provided --- a domain can submit a table of trap 15.38 -handlers to Xen via the {\tt set\_trap\_table()} hypercall. Most trap 15.39 -handlers are identical to native x86 handlers, although the page-fault 15.40 -handler is somewhat different. 15.41 - 15.42 - 15.43 -\section{Interrupts and events} 15.44 - 15.45 -Interrupts are virtualized by mapping them to \emph{events}, which are 15.46 -delivered asynchronously to the target domain using a callback 15.47 -supplied via the {\tt set\_callbacks()} hypercall. A guest OS can map 15.48 -these events onto its standard interrupt dispatch mechanisms. Xen is 15.49 -responsible for determining the target domain that will handle each 15.50 -physical interrupt source. For more details on the binding of event 15.51 -sources to events, see Chapter~\ref{c:devices}. 15.52 - 15.53 - 15.54 - 15.55 -\section{Time} 15.56 - 15.57 -Guest operating systems need to be aware of the passage of both real 15.58 -(or wallclock) time and their own `virtual time' (the time for 15.59 -which they have been executing). Furthermore, Xen has a notion of 15.60 -time which is used for scheduling. The following notions of 15.61 -time are provided: 15.62 - 15.63 -\begin{description} 15.64 -\item[Cycle counter time.] 15.65 - 15.66 -This provides a fine-grained time reference. The cycle counter time is 15.67 -used to accurately extrapolate the other time references. On SMP machines 15.68 -it is currently assumed that the cycle counter time is synchronized between 15.69 -CPUs. The current x86-based implementation achieves this within inter-CPU 15.70 -communication latencies. 15.71 - 15.72 -\item[System time.] 15.73 - 15.74 -This is a 64-bit counter which holds the number of nanoseconds that 15.75 -have elapsed since system boot. 15.76 - 15.77 - 15.78 -\item[Wall clock time.] 
15.79 - 15.80 -This is the time of day in a Unix-style {\tt struct timeval} (seconds 15.81 -and microseconds since 1 January 1970, adjusted by leap seconds). An 15.82 -NTP client hosted by {\it domain 0} can keep this value accurate. 15.83 - 15.84 - 15.85 -\item[Domain virtual time.] 15.86 - 15.87 -This progresses at the same pace as system time, but only while a 15.88 -domain is executing --- it stops while a domain is de-scheduled. 15.89 -Therefore the share of the CPU that a domain receives is indicated by 15.90 -the rate at which its virtual time increases. 15.91 - 15.92 -\end{description} 15.93 - 15.94 - 15.95 -Xen exports timestamps for system time and wall-clock time to guest 15.96 -operating systems through a shared page of memory. Xen also provides 15.97 -the cycle counter time at the instant the timestamps were calculated, 15.98 -and the CPU frequency in Hertz. This allows the guest to extrapolate 15.99 -system and wall-clock times accurately based on the current cycle 15.100 -counter time. 15.101 - 15.102 -Since all time stamps need to be updated and read \emph{atomically} 15.103 -two version numbers are also stored in the shared info page. The 15.104 -first is incremented prior to an update, while the second is only 15.105 -incremented afterwards. Thus a guest can be sure that it read a consistent 15.106 -state by checking the two version numbers are equal. 15.107 - 15.108 -Xen includes a periodic ticker which sends a timer event to the 15.109 -currently executing domain every 10ms. The Xen scheduler also sends a 15.110 -timer event whenever a domain is scheduled; this allows the guest OS 15.111 -to adjust for the time that has passed while it has been inactive. In 15.112 -addition, Xen allows each domain to request that they receive a timer 15.113 -event sent at a specified system time by using the {\tt 15.114 -set\_timer\_op()} hypercall. Guest OSes may use this timer to 15.115 -implement timeout values when they block. 15.116 - 15.117 - 15.118 - 15.119 -%% % akw: demoting this to a section -- not sure if there is any point 15.120 -%% % though, maybe just remove it. 15.121 - 15.122 -\section{Xen CPU Scheduling} 15.123 - 15.124 -Xen offers a uniform API for CPU schedulers. It is possible to choose 15.125 -from a number of schedulers at boot and it should be easy to add more. 15.126 -The BVT, Atropos and Round Robin schedulers are part of the normal 15.127 -Xen distribution. BVT provides proportional fair shares of the CPU to 15.128 -the running domains. Atropos can be used to reserve absolute shares 15.129 -of the CPU for each domain. Round-robin is provided as an example of 15.130 -Xen's internal scheduler API. 15.131 - 15.132 -\paragraph*{Note: SMP host support} 15.133 -Xen has always supported SMP host systems. Domains are statically assigned to 15.134 -CPUs, either at creation time or when manually pinning to a particular CPU. 15.135 -The current schedulers then run locally on each CPU to decide which of the 15.136 -assigned domains should be run there. The user-level control software 15.137 -can be used to perform coarse-grain load-balancing between CPUs. 15.138 +%% chapter Virtual Architecture moved to architecture.tex 15.139 +\include{src/interface/architecture} 15.140 15.141 - 15.142 -%% More information on the characteristics and use of these schedulers is 15.143 -%% available in {\tt Sched-HOWTO.txt}. 15.144 - 15.145 - 15.146 -\section{Privileged operations} 15.147 - 15.148 -Xen exports an extended interface to privileged domains (viz.\ {\it 15.149 - Domain 0}). 
This allows such domains to build and boot other domains 15.150 -on the server, and provides control interfaces for managing 15.151 -scheduling, memory, networking, and block devices. 15.152 - 15.153 - 15.154 -\chapter{Memory} 15.155 -\label{c:memory} 15.156 - 15.157 -Xen is responsible for managing the allocation of physical memory to 15.158 -domains, and for ensuring safe use of the paging and segmentation 15.159 -hardware. 15.160 - 15.161 - 15.162 -\section{Memory Allocation} 15.163 - 15.164 - 15.165 -Xen resides within a small fixed portion of physical memory; it also 15.166 -reserves the top 64MB of every virtual address space. The remaining 15.167 -physical memory is available for allocation to domains at a page 15.168 -granularity. Xen tracks the ownership and use of each page, which 15.169 -allows it to enforce secure partitioning between domains. 15.170 - 15.171 -Each domain has a maximum and current physical memory allocation. 15.172 -A guest OS may run a `balloon driver' to dynamically adjust its 15.173 -current memory allocation up to its limit. 15.174 - 15.175 - 15.176 -%% XXX SMH: I use machine and physical in the next section (which 15.177 -%% is kinda required for consistency with code); wonder if this 15.178 -%% section should use same terms? 15.179 -%% 15.180 -%% Probably. 15.181 -%% 15.182 -%% Merging this and below section at some point prob makes sense. 15.183 - 15.184 -\section{Pseudo-Physical Memory} 15.185 - 15.186 -Since physical memory is allocated and freed on a page granularity, 15.187 -there is no guarantee that a domain will receive a contiguous stretch 15.188 -of physical memory. However most operating systems do not have good 15.189 -support for operating in a fragmented physical address space. To aid 15.190 -porting such operating systems to run on top of Xen, we make a 15.191 -distinction between \emph{machine memory} and \emph{pseudo-physical 15.192 -memory}. 15.193 - 15.194 -Put simply, machine memory refers to the entire amount of memory 15.195 -installed in the machine, including that reserved by Xen, in use by 15.196 -various domains, or currently unallocated. We consider machine memory 15.197 -to comprise a set of 4K \emph{machine page frames} numbered 15.198 -consecutively starting from 0. Machine frame numbers mean the same 15.199 -within Xen or any domain. 15.200 - 15.201 -Pseudo-physical memory, on the other hand, is a per-domain 15.202 -abstraction. It allows a guest operating system to consider its memory 15.203 -allocation to consist of a contiguous range of physical page frames 15.204 -starting at physical frame 0, despite the fact that the underlying 15.205 -machine page frames may be sparsely allocated and in any order. 15.206 - 15.207 -To achieve this, Xen maintains a globally readable {\it 15.208 -machine-to-physical} table which records the mapping from machine page 15.209 -frames to pseudo-physical ones. In addition, each domain is supplied 15.210 -with a {\it physical-to-machine} table which performs the inverse 15.211 -mapping. Clearly the machine-to-physical table has size proportional 15.212 -to the amount of RAM installed in the machine, while each 15.213 -physical-to-machine table has size proportional to the memory 15.214 -allocation of the given domain. 15.215 - 15.216 -Architecture dependent code in guest operating systems can then use 15.217 -the two tables to provide the abstraction of pseudo-physical 15.218 -memory. 
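The two translation tables described above can be made concrete with a short sketch. This is illustrative only and is not code from the Xen tree; the table names machine_to_phys_table and phys_to_machine_table, and the way they are made visible to the guest, are assumptions for the purpose of the example.

/*
 * Minimal sketch (not code from the Xen tree): how a guest's
 * architecture-dependent code might translate between pseudo-physical
 * frame numbers (PFNs) and machine frame numbers (MFNs) using the two
 * tables described above.  Table names and linkage are assumed.
 */
#include <stdint.h>

/* Globally readable table maintained by Xen, indexed by machine frame. */
extern const unsigned long *machine_to_phys_table;

/* Per-domain table supplied to the guest, indexed by pseudo-physical frame. */
extern const unsigned long *phys_to_machine_table;

static inline unsigned long pfn_to_mfn(unsigned long pfn)
{
    return phys_to_machine_table[pfn];   /* guest frame -> machine frame */
}

static inline unsigned long mfn_to_pfn(unsigned long mfn)
{
    return machine_to_phys_table[mfn];   /* machine frame -> guest frame */
}

/*
 * Example use: convert a guest "physical" address into a machine address
 * before placing it in a page-table entry handed to Xen for validation.
 */
static inline uint64_t phys_to_machine(uint64_t paddr)
{
    unsigned long pfn = (unsigned long)(paddr >> 12);   /* 4K pages */
    return ((uint64_t)pfn_to_mfn(pfn) << 12) | (paddr & 0xfff);
}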
In general, only certain specialized parts of the operating 15.219 -system (such as page table management) needs to understand the 15.220 -difference between machine and pseudo-physical addresses. 15.221 - 15.222 -\section{Page Table Updates} 15.223 - 15.224 -In the default mode of operation, Xen enforces read-only access to 15.225 -page tables and requires guest operating systems to explicitly request 15.226 -any modifications. Xen validates all such requests and only applies 15.227 -updates that it deems safe. This is necessary to prevent domains from 15.228 -adding arbitrary mappings to their page tables. 15.229 - 15.230 -To aid validation, Xen associates a type and reference count with each 15.231 -memory page. A page has one of the following 15.232 -mutually-exclusive types at any point in time: page directory ({\sf 15.233 -PD}), page table ({\sf PT}), local descriptor table ({\sf LDT}), 15.234 -global descriptor table ({\sf GDT}), or writable ({\sf RW}). Note that 15.235 -a guest OS may always create readable mappings of its own memory 15.236 -regardless of its current type. 15.237 -%%% XXX: possibly explain more about ref count 'lifecyle' here? 15.238 -This mechanism is used to 15.239 -maintain the invariants required for safety; for example, a domain 15.240 -cannot have a writable mapping to any part of a page table as this 15.241 -would require the page concerned to simultaneously be of types {\sf 15.242 - PT} and {\sf RW}. 15.243 - 15.244 - 15.245 -%\section{Writable Page Tables} 15.246 - 15.247 -Xen also provides an alternative mode of operation in which guests be 15.248 -have the illusion that their page tables are directly writable. Of 15.249 -course this is not really the case, since Xen must still validate 15.250 -modifications to ensure secure partitioning. To this end, Xen traps 15.251 -any write attempt to a memory page of type {\sf PT} (i.e., that is 15.252 -currently part of a page table). If such an access occurs, Xen 15.253 -temporarily allows write access to that page while at the same time 15.254 -{\em disconnecting} it from the page table that is currently in 15.255 -use. This allows the guest to safely make updates to the page because 15.256 -the newly-updated entries cannot be used by the MMU until Xen 15.257 -revalidates and reconnects the page. 15.258 -Reconnection occurs automatically in a number of situations: for 15.259 -example, when the guest modifies a different page-table page, when the 15.260 -domain is preempted, or whenever the guest uses Xen's explicit 15.261 -page-table update interfaces. 15.262 - 15.263 -Finally, Xen also supports a form of \emph{shadow page tables} in 15.264 -which the guest OS uses a independent copy of page tables which are 15.265 -unknown to the hardware (i.e.\ which are never pointed to by {\tt 15.266 -cr3}). Instead Xen propagates changes made to the guest's tables to the 15.267 -real ones, and vice versa. This is useful for logging page writes 15.268 -(e.g.\ for live migration or checkpoint). A full version of the shadow 15.269 -page tables also allows guest OS porting with less effort. 15.270 - 15.271 -\section{Segment Descriptor Tables} 15.272 +%% chapter Memory moved to memory.tex 15.273 +\include{src/interface/memory} 15.274 15.275 -On boot a guest is supplied with a default GDT, which does not reside 15.276 -within its own memory allocation. 
If the guest wishes to use other 15.277 -than the default `flat' ring-1 and ring-3 segments that this GDT 15.278 -provides, it must register a custom GDT and/or LDT with Xen, 15.279 -allocated from its own memory. Note that a number of GDT 15.280 -entries are reserved by Xen -- any custom GDT must also include 15.281 -sufficient space for these entries. 15.282 - 15.283 -For example, the following hypercall is used to specify a new GDT: 15.284 - 15.285 -\begin{quote} 15.286 -int {\bf set\_gdt}(unsigned long *{\em frame\_list}, int {\em entries}) 15.287 - 15.288 -{\em frame\_list}: An array of up to 16 machine page frames within 15.289 -which the GDT resides. Any frame registered as a GDT frame may only 15.290 -be mapped read-only within the guest's address space (e.g., no 15.291 -writable mappings, no use as a page-table page, and so on). 15.292 - 15.293 -{\em entries}: The number of descriptor-entry slots in the GDT. Note 15.294 -that the table must be large enough to contain Xen's reserved entries; 15.295 -thus we must have `{\em entries $>$ LAST\_RESERVED\_GDT\_ENTRY}\ '. 15.296 -Note also that, after registering the GDT, slots {\em FIRST\_} through 15.297 -{\em LAST\_RESERVED\_GDT\_ENTRY} are no longer usable by the guest and 15.298 -may be overwritten by Xen. 15.299 -\end{quote} 15.300 - 15.301 -The LDT is updated via the generic MMU update mechanism (i.e., via 15.302 -the {\tt mmu\_update()} hypercall. 15.303 - 15.304 -\section{Start of Day} 15.305 - 15.306 -The start-of-day environment for guest operating systems is rather 15.307 -different to that provided by the underlying hardware. In particular, 15.308 -the processor is already executing in protected mode with paging 15.309 -enabled. 15.310 - 15.311 -{\it Domain 0} is created and booted by Xen itself. For all subsequent 15.312 -domains, the analogue of the boot-loader is the {\it domain builder}, 15.313 -user-space software running in {\it domain 0}. The domain builder 15.314 -is responsible for building the initial page tables for a domain 15.315 -and loading its kernel image at the appropriate virtual address. 15.316 - 15.317 - 15.318 - 15.319 -\chapter{Devices} 15.320 -\label{c:devices} 15.321 - 15.322 -Devices such as network and disk are exported to guests using a 15.323 -split device driver. The device driver domain, which accesses the 15.324 -physical device directly also runs a {\em backend} driver, serving 15.325 -requests to that device from guests. Each guest will use a simple 15.326 -{\em frontend} driver, to access the backend. Communication between these 15.327 -domains is composed of two parts: First, data is placed onto a shared 15.328 -memory page between the domains. Second, an event channel between the 15.329 -two domains is used to pass notification that data is outstanding. 15.330 -This separation of notification from data transfer allows message 15.331 -batching, and results in very efficient device access. 15.332 - 15.333 -Event channels are used extensively in device virtualization; each 15.334 -domain has a number of end-points or \emph{ports} each of which 15.335 -may be bound to one of the following \emph{event sources}: 15.336 -\begin{itemize} 15.337 - \item a physical interrupt from a real device, 15.338 - \item a virtual interrupt (callback) from Xen, or 15.339 - \item a signal from another domain 15.340 -\end{itemize} 15.341 - 15.342 -Events are lightweight and do not carry much information beyond 15.343 -the source of the notification. 
Hence when performing bulk data 15.344 -transfer, events are typically used as synchronization primitives 15.345 -over a shared memory transport. Event channels are managed via 15.346 -the {\tt event\_channel\_op()} hypercall; for more details see 15.347 -Section~\ref{s:idc}. 15.348 - 15.349 -This chapter focuses on some individual device interfaces 15.350 -available to Xen guests. 15.351 - 15.352 -\section{Network I/O} 15.353 - 15.354 -Virtual network device services are provided by shared memory 15.355 -communication with a backend domain. From the point of view of 15.356 -other domains, the backend may be viewed as a virtual ethernet switch 15.357 -element with each domain having one or more virtual network interfaces 15.358 -connected to it. 15.359 - 15.360 -\subsection{Backend Packet Handling} 15.361 - 15.362 -The backend driver is responsible for a variety of actions relating to 15.363 -the transmission and reception of packets from the physical device. 15.364 -With regard to transmission, the backend performs these key actions: 15.365 - 15.366 -\begin{itemize} 15.367 -\item {\bf Validation:} To ensure that domains do not attempt to 15.368 - generate invalid (e.g. spoofed) traffic, the backend driver may 15.369 - validate headers ensuring that source MAC and IP addresses match the 15.370 - interface that they have been sent from. 15.371 - 15.372 - Validation functions can be configured using standard firewall rules 15.373 - ({\small{\tt iptables}} in the case of Linux). 15.374 - 15.375 -\item {\bf Scheduling:} Since a number of domains can share a single 15.376 - physical network interface, the backend must mediate access when 15.377 - several domains each have packets queued for transmission. This 15.378 - general scheduling function subsumes basic shaping or rate-limiting 15.379 - schemes. 15.380 - 15.381 -\item {\bf Logging and Accounting:} The backend domain can be 15.382 - configured with classifier rules that control how packets are 15.383 - accounted or logged. For example, log messages might be generated 15.384 - whenever a domain attempts to send a TCP packet containing a SYN. 15.385 -\end{itemize} 15.386 - 15.387 -On receipt of incoming packets, the backend acts as a simple 15.388 -demultiplexer: Packets are passed to the appropriate virtual 15.389 -interface after any necessary logging and accounting have been carried 15.390 -out. 15.391 - 15.392 -\subsection{Data Transfer} 15.393 - 15.394 -Each virtual interface uses two ``descriptor rings'', one for transmit, 15.395 -the other for receive. Each descriptor identifies a block of contiguous 15.396 -physical memory allocated to the domain. 15.397 - 15.398 -The transmit ring carries packets to transmit from the guest to the 15.399 -backend domain. The return path of the transmit ring carries messages 15.400 -indicating that the contents have been physically transmitted and the 15.401 -backend no longer requires the associated pages of memory. 15.402 +%% chapter Devices moved to devices.tex 15.403 +\include{src/interface/devices} 15.404 15.405 -To receive packets, the guest places descriptors of unused pages on 15.406 -the receive ring. The backend will return received packets by 15.407 -exchanging these pages in the domain's memory with new pages 15.408 -containing the received data, and passing back descriptors regarding 15.409 -the new packets on the ring. 
This zero-copy approach allows the 15.410 -backend to maintain a pool of free pages to receive packets into, and 15.411 -then deliver them to appropriate domains after examining their 15.412 -headers. 15.413 - 15.414 -% 15.415 -%Real physical addresses are used throughout, with the domain performing 15.416 -%translation from pseudo-physical addresses if that is necessary. 15.417 - 15.418 -If a domain does not keep its receive ring stocked with empty buffers then 15.419 -packets destined to it may be dropped. This provides some defence against 15.420 -receive livelock problems because an overload domain will cease to receive 15.421 -further data. Similarly, on the transmit path, it provides the application 15.422 -with feedback on the rate at which packets are able to leave the system. 15.423 - 15.424 - 15.425 -Flow control on rings is achieved by including a pair of producer 15.426 -indexes on the shared ring page. Each side will maintain a private 15.427 -consumer index indicating the next outstanding message. In this 15.428 -manner, the domains cooperate to divide the ring into two message 15.429 -lists, one in each direction. Notification is decoupled from the 15.430 -immediate placement of new messages on the ring; the event channel 15.431 -will be used to generate notification when {\em either} a certain 15.432 -number of outstanding messages are queued, {\em or} a specified number 15.433 -of nanoseconds have elapsed since the oldest message was placed on the 15.434 -ring. 15.435 - 15.436 -% Not sure if my version is any better -- here is what was here before: 15.437 -%% Synchronization between the backend domain and the guest is achieved using 15.438 -%% counters held in shared memory that is accessible to both. Each ring has 15.439 -%% associated producer and consumer indices indicating the area in the ring 15.440 -%% that holds descriptors that contain data. After receiving {\it n} packets 15.441 -%% or {\t nanoseconds} after receiving the first packet, the hypervisor sends 15.442 -%% an event to the domain. 15.443 - 15.444 -\section{Block I/O} 15.445 - 15.446 -All guest OS disk access goes through the virtual block device VBD 15.447 -interface. This interface allows domains access to portions of block 15.448 -storage devices visible to the the block backend device. The VBD 15.449 -interface is a split driver, similar to the network interface 15.450 -described above. A single shared memory ring is used between the 15.451 -frontend and backend drivers, across which read and write messages are 15.452 -sent. 15.453 - 15.454 -Any block device accessible to the backend domain, including 15.455 -network-based block (iSCSI, *NBD, etc), loopback and LVM/MD devices, 15.456 -can be exported as a VBD. Each VBD is mapped to a device node in the 15.457 -guest, specified in the guest's startup configuration. 15.458 - 15.459 -Old (Xen 1.2) virtual disks are not supported under Xen 2.0, since 15.460 -similar functionality can be achieved using the more complete LVM 15.461 -system, which is already in widespread use. 15.462 - 15.463 -\subsection{Data Transfer} 15.464 - 15.465 -The single ring between the guest and the block backend supports three 15.466 -messages: 15.467 - 15.468 -\begin{description} 15.469 -\item [{\small {\tt PROBE}}:] Return a list of the VBDs available to this guest 15.470 - from the backend. The request includes a descriptor of a free page 15.471 - into which the reply will be written by the backend. 
15.472 - 15.473 -\item [{\small {\tt READ}}:] Read data from the specified block device. The 15.474 - front end identifies the device and location to read from and 15.475 - attaches pages for the data to be copied to (typically via DMA from 15.476 - the device). The backend acknowledges completed read requests as 15.477 - they finish. 15.478 - 15.479 -\item [{\small {\tt WRITE}}:] Write data to the specified block device. This 15.480 - functions essentially as {\small {\tt READ}}, except that the data moves to 15.481 - the device instead of from it. 15.482 -\end{description} 15.483 - 15.484 -% um... some old text 15.485 -%% In overview, the same style of descriptor-ring that is used for 15.486 -%% network packets is used here. Each domain has one ring that carries 15.487 -%% operation requests to the hypervisor and carries the results back 15.488 -%% again. 15.489 - 15.490 -%% Rather than copying data, the backend simply maps the domain's buffers 15.491 -%% in order to enable direct DMA to them. The act of mapping the buffers 15.492 -%% also increases the reference counts of the underlying pages, so that 15.493 -%% the unprivileged domain cannot try to return them to the hypervisor, 15.494 -%% install them as page tables, or any other unsafe behaviour. 15.495 -%% %block API here 15.496 - 15.497 - 15.498 -\chapter{Further Information} 15.499 - 15.500 - 15.501 -If you have questions that are not answered by this manual, the 15.502 -sources of information listed below may be of interest to you. Note 15.503 -that bug reports, suggestions and contributions related to the 15.504 -software (or the documentation) should be sent to the Xen developers' 15.505 -mailing list (address below). 15.506 - 15.507 -\section{Other documentation} 15.508 - 15.509 -If you are mainly interested in using (rather than developing for) 15.510 -Xen, the {\em Xen Users' Manual} is distributed in the {\tt docs/} 15.511 -directory of the Xen source distribution. 15.512 - 15.513 -% Various HOWTOs are also available in {\tt docs/HOWTOS}. 15.514 - 15.515 -\section{Online references} 15.516 - 15.517 -The official Xen web site is found at: 15.518 -\begin{quote} 15.519 -{\tt http://www.cl.cam.ac.uk/Research/SRG/netos/xen/} 15.520 -\end{quote} 15.521 - 15.522 -This contains links to the latest versions of all on-line 15.523 -documentation. 15.524 - 15.525 -\section{Mailing lists} 15.526 - 15.527 -There are currently four official Xen mailing lists: 15.528 - 15.529 -\begin{description} 15.530 -\item[xen-devel@lists.xensource.com] Used for development 15.531 -discussions and bug reports. Subscribe at: \\ 15.532 -{\small {\tt http://lists.xensource.com/xen-devel}} 15.533 -\item[xen-users@lists.xensource.com] Used for installation and usage 15.534 -discussions and requests for help. Subscribe at: \\ 15.535 -{\small {\tt http://lists.xensource.com/xen-users}} 15.536 -\item[xen-announce@lists.xensource.com] Used for announcements only. 15.537 -Subscribe at: \\ 15.538 -{\small {\tt http://lists.xensource.com/xen-announce}} 15.539 -\item[xen-changelog@lists.xensource.com] Changelog feed 15.540 -from the unstable and 2.0 trees - developer oriented. Subscribe at: \\ 15.541 -{\small {\tt http://lists.xensource.com/xen-changelog}} 15.542 -\end{description} 15.543 - 15.544 -Of these, xen-devel is the most active. 
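The ring flow control described in the Data Transfer subsections above (a pair of producer indices on the shared page, with each side keeping a private consumer index for the direction it reads) can be sketched as follows. This is illustrative only; the structure layout and names are assumptions, not the definitions used by the actual frontend and backend drivers.

/*
 * Illustrative sketch only: producer/consumer bookkeeping on a shared
 * descriptor ring, as described in the Data Transfer subsections above.
 */
#include <stdint.h>

#define RING_SIZE 256                 /* assumed power-of-two ring size */
#define RING_MASK (RING_SIZE - 1)

struct shared_ring {
    /* Producer indices live in the shared page, visible to both domains. */
    volatile uint32_t req_prod;       /* frontend advances when queuing requests  */
    volatile uint32_t resp_prod;      /* backend advances when queuing responses  */
    uint64_t ring[RING_SIZE];         /* request/response descriptors             */
};

/* Each end keeps a private consumer index for the direction it reads. */
struct frontend_state {
    struct shared_ring *sring;
    uint32_t resp_cons;               /* next response not yet examined */
};

/* Consume any responses the backend has produced since we last looked. */
static int poll_responses(struct frontend_state *fe)
{
    int handled = 0;
    while (fe->resp_cons != fe->sring->resp_prod) {
        uint64_t resp = fe->sring->ring[fe->resp_cons & RING_MASK];
        (void)resp;                   /* a real driver would act on the response */
        fe->resp_cons++;
        handled++;
    }
    return handled;                   /* notification arrives via the event channel */
}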
15.545 - 15.546 - 15.547 +%% chapter Further Information moved to further_info.tex 15.548 +\include{src/interface/further_info} 15.549 15.550 15.551 \appendix 15.552 15.553 -%\newcommand{\hypercall}[1]{\vspace{5mm}{\large\sf #1}} 15.554 - 15.555 - 15.556 - 15.557 - 15.558 - 15.559 -\newcommand{\hypercall}[1]{\vspace{2mm}{\sf #1}} 15.560 - 15.561 - 15.562 - 15.563 - 15.564 - 15.565 - 15.566 -\chapter{Xen Hypercalls} 15.567 -\label{a:hypercalls} 15.568 - 15.569 -Hypercalls represent the procedural interface to Xen; this appendix 15.570 -categorizes and describes the current set of hypercalls. 15.571 - 15.572 -\section{Invoking Hypercalls} 15.573 - 15.574 -Hypercalls are invoked in a manner analogous to system calls in a 15.575 -conventional operating system; a software interrupt is issued which 15.576 -vectors to an entry point within Xen. On x86\_32 machines the 15.577 -instruction required is {\tt int \$82}; the (real) IDT is setup so 15.578 -that this may only be issued from within ring 1. The particular 15.579 -hypercall to be invoked is contained in {\tt EAX} --- a list 15.580 -mapping these values to symbolic hypercall names can be found 15.581 -in {\tt xen/include/public/xen.h}. 15.582 - 15.583 -On some occasions a set of hypercalls will be required to carry 15.584 -out a higher-level function; a good example is when a guest 15.585 -operating wishes to context switch to a new process which 15.586 -requires updating various privileged CPU state. As an optimization 15.587 -for these cases, there is a generic mechanism to issue a set of 15.588 -hypercalls as a batch: 15.589 - 15.590 -\begin{quote} 15.591 -\hypercall{multicall(void *call\_list, int nr\_calls)} 15.592 - 15.593 -Execute a series of hypervisor calls; {\tt nr\_calls} is the length of 15.594 -the array of {\tt multicall\_entry\_t} structures pointed to be {\tt 15.595 -call\_list}. Each entry contains the hypercall operation code followed 15.596 -by up to 7 word-sized arguments. 15.597 -\end{quote} 15.598 - 15.599 -Note that multicalls are provided purely as an optimization; there is 15.600 -no requirement to use them when first porting a guest operating 15.601 -system. 15.602 - 15.603 - 15.604 -\section{Virtual CPU Setup} 15.605 - 15.606 -At start of day, a guest operating system needs to setup the virtual 15.607 -CPU it is executing on. This includes installing vectors for the 15.608 -virtual IDT so that the guest OS can handle interrupts, page faults, 15.609 -etc. However the very first thing a guest OS must setup is a pair 15.610 -of hypervisor callbacks: these are the entry points which Xen will 15.611 -use when it wishes to notify the guest OS of an occurrence. 15.612 - 15.613 -\begin{quote} 15.614 -\hypercall{set\_callbacks(unsigned long event\_selector, unsigned long 15.615 - event\_address, unsigned long failsafe\_selector, unsigned long 15.616 - failsafe\_address) } 15.617 - 15.618 -Register the normal (``event'') and failsafe callbacks for 15.619 -event processing. In each case the code segment selector and 15.620 -address within that segment are provided. The selectors must 15.621 -have RPL 1; in XenLinux we simply use the kernel's CS for both 15.622 -{\tt event\_selector} and {\tt failsafe\_selector}. 15.623 - 15.624 -The value {\tt event\_address} specifies the address of the guest OSes 15.625 -event handling and dispatch routine; the {\tt failsafe\_address} 15.626 -specifies a separate entry point which is used only if a fault occurs 15.627 -when Xen attempts to use the normal callback. 
15.628 -\end{quote} 15.629 - 15.630 - 15.631 -After installing the hypervisor callbacks, the guest OS can 15.632 -install a `virtual IDT' by using the following hypercall: 15.633 - 15.634 -\begin{quote} 15.635 -\hypercall{set\_trap\_table(trap\_info\_t *table)} 15.636 - 15.637 -Install one or more entries into the per-domain 15.638 -trap handler table (essentially a software version of the IDT). 15.639 -Each entry in the array pointed to by {\tt table} includes the 15.640 -exception vector number with the corresponding segment selector 15.641 -and entry point. Most guest OSes can use the same handlers on 15.642 -Xen as when running on the real hardware; an exception is the 15.643 -page fault handler (exception vector 14) where a modified 15.644 -stack-frame layout is used. 15.645 - 15.646 - 15.647 -\end{quote} 15.648 - 15.649 - 15.650 - 15.651 -\section{Scheduling and Timer} 15.652 - 15.653 -Domains are preemptively scheduled by Xen according to the 15.654 -parameters installed by domain 0 (see Section~\ref{s:dom0ops}). 15.655 -In addition, however, a domain may choose to explicitly 15.656 -control certain behavior with the following hypercall: 15.657 - 15.658 -\begin{quote} 15.659 -\hypercall{sched\_op(unsigned long op)} 15.660 - 15.661 -Request scheduling operation from hypervisor. The options are: {\it 15.662 -yield}, {\it block}, and {\it shutdown}. {\it yield} keeps the 15.663 -calling domain runnable but may cause a reschedule if other domains 15.664 -are runnable. {\it block} removes the calling domain from the run 15.665 -queue and cause is to sleeps until an event is delivered to it. {\it 15.666 -shutdown} is used to end the domain's execution; the caller can 15.667 -additionally specify whether the domain should reboot, halt or 15.668 -suspend. 15.669 -\end{quote} 15.670 - 15.671 -To aid the implementation of a process scheduler within a guest OS, 15.672 -Xen provides a virtual programmable timer: 15.673 - 15.674 -\begin{quote} 15.675 -\hypercall{set\_timer\_op(uint64\_t timeout)} 15.676 - 15.677 -Request a timer event to be sent at the specified system time (time 15.678 -in nanoseconds since system boot). The hypercall actually passes the 15.679 -64-bit timeout value as a pair of 32-bit values. 15.680 - 15.681 -\end{quote} 15.682 - 15.683 -Note that calling {\tt set\_timer\_op()} prior to {\tt sched\_op} 15.684 -allows block-with-timeout semantics. 15.685 - 15.686 - 15.687 -\section{Page Table Management} 15.688 - 15.689 -Since guest operating systems have read-only access to their page 15.690 -tables, Xen must be involved when making any changes. The following 15.691 -multi-purpose hypercall can be used to modify page-table entries, 15.692 -update the machine-to-physical mapping table, flush the TLB, install 15.693 -a new page-table base pointer, and more. 15.694 - 15.695 -\begin{quote} 15.696 -\hypercall{mmu\_update(mmu\_update\_t *req, int count, int *success\_count)} 15.697 - 15.698 -Update the page table for the domain; a set of {\tt count} updates are 15.699 -submitted for processing in a batch, with {\tt success\_count} being 15.700 -updated to report the number of successful updates. 
15.701 - 15.702 -Each element of {\tt req[]} contains a pointer (address) and value; 15.703 -the least significant 2-bits of the pointer are used to distinguish 15.704 -the type of update requested as follows: 15.705 -\begin{description} 15.706 - 15.707 -\item[\it MMU\_NORMAL\_PT\_UPDATE:] update a page directory entry or 15.708 -page table entry to the associated value; Xen will check that the 15.709 -update is safe, as described in Chapter~\ref{c:memory}. 15.710 - 15.711 -\item[\it MMU\_MACHPHYS\_UPDATE:] update an entry in the 15.712 - machine-to-physical table. The calling domain must own the machine 15.713 - page in question (or be privileged). 15.714 - 15.715 -\item[\it MMU\_EXTENDED\_COMMAND:] perform additional MMU operations. 15.716 -The set of additional MMU operations is considerable, and includes 15.717 -updating {\tt cr3} (or just re-installing it for a TLB flush), 15.718 -flushing the cache, installing a new LDT, or pinning \& unpinning 15.719 -page-table pages (to ensure their reference count doesn't drop to zero 15.720 -which would require a revalidation of all entries). 15.721 - 15.722 -Further extended commands are used to deal with granting and 15.723 -acquiring page ownership; see Section~\ref{s:idc}. 15.724 - 15.725 - 15.726 -\end{description} 15.727 - 15.728 -More details on the precise format of all commands can be 15.729 -found in {\tt xen/include/public/xen.h}. 15.730 - 15.731 - 15.732 -\end{quote} 15.733 - 15.734 -Explicitly updating batches of page table entries is extremely 15.735 -efficient, but can require a number of alterations to the guest 15.736 -OS. Using the writable page table mode (Chapter~\ref{c:memory}) is 15.737 -recommended for new OS ports. 15.738 - 15.739 -Regardless of which page table update mode is being used, however, 15.740 -there are some occasions (notably handling a demand page fault) where 15.741 -a guest OS will wish to modify exactly one PTE rather than a 15.742 -batch. This is catered for by the following: 15.743 - 15.744 -\begin{quote} 15.745 -\hypercall{update\_va\_mapping(unsigned long page\_nr, unsigned long 15.746 -val, \\ unsigned long flags)} 15.747 - 15.748 -Update the currently installed PTE for the page {\tt page\_nr} to 15.749 -{\tt val}. As with {\tt mmu\_update()}, Xen checks the modification 15.750 -is safe before applying it. The {\tt flags} determine which kind 15.751 -of TLB flush, if any, should follow the update. 15.752 - 15.753 -\end{quote} 15.754 - 15.755 -Finally, sufficiently privileged domains may occasionally wish to manipulate 15.756 -the pages of others: 15.757 -\begin{quote} 15.758 - 15.759 -\hypercall{update\_va\_mapping\_otherdomain(unsigned long page\_nr, 15.760 -unsigned long val, unsigned long flags, uint16\_t domid)} 15.761 - 15.762 -Identical to {\tt update\_va\_mapping()} save that the pages being 15.763 -mapped must belong to the domain {\tt domid}. 15.764 - 15.765 -\end{quote} 15.766 - 15.767 -This privileged operation is currently used by backend virtual device 15.768 -drivers to safely map pages containing I/O data. 15.769 - 15.770 - 15.771 - 15.772 -\section{Segmentation Support} 15.773 - 15.774 -Xen allows guest OSes to install a custom GDT if they require it; 15.775 -this is context switched transparently whenever a domain is 15.776 -[de]scheduled. 
The following hypercall is effectively a 15.777 -`safe' version of {\tt lgdt}: 15.778 - 15.779 -\begin{quote} 15.780 -\hypercall{set\_gdt(unsigned long *frame\_list, int entries)} 15.781 - 15.782 -Install a global descriptor table for a domain; {\tt frame\_list} is 15.783 -an array of up to 16 machine page frames within which the GDT resides, 15.784 -with {\tt entries} being the actual number of descriptor-entry 15.785 -slots. All page frames must be mapped read-only within the guest's 15.786 -address space, and the table must be large enough to contain Xen's 15.787 -reserved entries (see {\tt xen/include/public/arch-x86\_32.h}). 15.788 - 15.789 -\end{quote} 15.790 - 15.791 -Many guest OSes will also wish to install LDTs; this is achieved by 15.792 -using {\tt mmu\_update()} with an extended command, passing the 15.793 -linear address of the LDT base along with the number of entries. No 15.794 -special safety checks are required; Xen needs to perform this task 15.795 -simply since {\tt lldt} requires CPL 0. 15.796 - 15.797 - 15.798 -Xen also allows guest operating systems to update just an 15.799 -individual segment descriptor in the GDT or LDT: 15.800 - 15.801 -\begin{quote} 15.802 -\hypercall{update\_descriptor(unsigned long ma, unsigned long word1, 15.803 -unsigned long word2)} 15.804 - 15.805 -Update the GDT/LDT entry at machine address {\tt ma}; the new 15.806 -8-byte descriptor is stored in {\tt word1} and {\tt word2}. 15.807 -Xen performs a number of checks to ensure the descriptor is 15.808 -valid. 15.809 - 15.810 -\end{quote} 15.811 - 15.812 -Guest OSes can use the above in place of context switching entire 15.813 -LDTs (or the GDT) when the number of changing descriptors is small. 15.814 - 15.815 -\section{Context Switching} 15.816 - 15.817 -When a guest OS wishes to context switch between two processes, 15.818 -it can use the page table and segmentation hypercalls described 15.819 -above to perform the the bulk of the privileged work. In addition, 15.820 -however, it will need to invoke Xen to switch the kernel (ring 1) 15.821 -stack pointer: 15.822 - 15.823 -\begin{quote} 15.824 -\hypercall{stack\_switch(unsigned long ss, unsigned long esp)} 15.825 - 15.826 -Request kernel stack switch from hypervisor; {\tt ss} is the new 15.827 -stack segment, which {\tt esp} is the new stack pointer. 15.828 - 15.829 -\end{quote} 15.830 - 15.831 -A final useful hypercall for context switching allows ``lazy'' 15.832 -save and restore of floating point state: 15.833 - 15.834 -\begin{quote} 15.835 -\hypercall{fpu\_taskswitch(void)} 15.836 - 15.837 -This call instructs Xen to set the {\tt TS} bit in the {\tt cr0} 15.838 -control register; this means that the next attempt to use floating 15.839 -point will cause a trap which the guest OS can trap. Typically it will 15.840 -then save/restore the FP state, and clear the {\tt TS} bit. 15.841 -\end{quote} 15.842 - 15.843 -This is provided as an optimization only; guest OSes can also choose 15.844 -to save and restore FP state on all context switches for simplicity. 15.845 - 15.846 - 15.847 -\section{Physical Memory Management} 15.848 - 15.849 -As mentioned previously, each domain has a maximum and current 15.850 -memory allocation. The maximum allocation, set at domain creation 15.851 -time, cannot be modified. 
However a domain can choose to reduce 15.852 -and subsequently grow its current allocation by using the 15.853 -following call: 15.854 - 15.855 -\begin{quote} 15.856 -\hypercall{dom\_mem\_op(unsigned int op, unsigned long *extent\_list, 15.857 - unsigned long nr\_extents, unsigned int extent\_order)} 15.858 - 15.859 -Increase or decrease current memory allocation (as determined by 15.860 -the value of {\tt op}). Each invocation provides a list of 15.861 -extents each of which is $2^s$ pages in size, 15.862 -where $s$ is the value of {\tt extent\_order}. 15.863 - 15.864 -\end{quote} 15.865 - 15.866 -In addition to simply reducing or increasing the current memory 15.867 -allocation via a `balloon driver', this call is also useful for 15.868 -obtaining contiguous regions of machine memory when required (e.g. 15.869 -for certain PCI devices, or if using superpages). 15.870 - 15.871 - 15.872 -\section{Inter-Domain Communication} 15.873 -\label{s:idc} 15.874 - 15.875 -Xen provides a simple asynchronous notification mechanism via 15.876 -\emph{event channels}. Each domain has a set of end-points (or 15.877 -\emph{ports}) which may be bound to an event source (e.g. a physical 15.878 -IRQ, a virtual IRQ, or an port in another domain). When a pair of 15.879 -end-points in two different domains are bound together, then a `send' 15.880 -operation on one will cause an event to be received by the destination 15.881 -domain. 15.882 - 15.883 -The control and use of event channels involves the following hypercall: 15.884 - 15.885 -\begin{quote} 15.886 -\hypercall{event\_channel\_op(evtchn\_op\_t *op)} 15.887 - 15.888 -Inter-domain event-channel management; {\tt op} is a discriminated 15.889 -union which allows the following 7 operations: 15.890 - 15.891 -\begin{description} 15.892 - 15.893 -\item[\it alloc\_unbound:] allocate a free (unbound) local 15.894 - port and prepare for connection from a specified domain. 15.895 -\item[\it bind\_virq:] bind a local port to a virtual 15.896 -IRQ; any particular VIRQ can be bound to at most one port per domain. 15.897 -\item[\it bind\_pirq:] bind a local port to a physical IRQ; 15.898 -once more, a given pIRQ can be bound to at most one port per 15.899 -domain. Furthermore the calling domain must be sufficiently 15.900 -privileged. 15.901 -\item[\it bind\_interdomain:] construct an interdomain event 15.902 -channel; in general, the target domain must have previously allocated 15.903 -an unbound port for this channel, although this can be bypassed by 15.904 -privileged domains during domain setup. 15.905 -\item[\it close:] close an interdomain event channel. 15.906 -\item[\it send:] send an event to the remote end of a 15.907 -interdomain event channel. 15.908 -\item[\it status:] determine the current status of a local port. 15.909 -\end{description} 15.910 - 15.911 -For more details see 15.912 -{\tt xen/include/public/event\_channel.h}. 15.913 - 15.914 -\end{quote} 15.915 - 15.916 -Event channels are the fundamental communication primitive between 15.917 -Xen domains and seamlessly support SMP. However they provide little 15.918 -bandwidth for communication {\sl per se}, and hence are typically 15.919 -married with a piece of shared memory to produce effective and 15.920 -high-performance inter-domain communication. 15.921 - 15.922 -Safe sharing of memory pages between guest OSes is carried out by 15.923 -granting access on a per page basis to individual domains. This is 15.924 -achieved by using the {\tt grant\_table\_op()} hypercall. 
15.925 - 15.926 -\begin{quote} 15.927 -\hypercall{grant\_table\_op(unsigned int cmd, void *uop, unsigned int count)} 15.928 - 15.929 -Grant or remove access to a particular page to a particular domain. 15.930 - 15.931 -\end{quote} 15.932 - 15.933 -This is not currently widely in use by guest operating systems, but 15.934 -we intend to integrate support more fully in the near future. 15.935 - 15.936 -\section{PCI Configuration} 15.937 - 15.938 -Domains with physical device access (i.e.\ driver domains) receive 15.939 -limited access to certain PCI devices (bus address space and 15.940 -interrupts). However many guest operating systems attempt to 15.941 -determine the PCI configuration by directly access the PCI BIOS, 15.942 -which cannot be allowed for safety. 15.943 - 15.944 -Instead, Xen provides the following hypercall: 15.945 - 15.946 -\begin{quote} 15.947 -\hypercall{physdev\_op(void *physdev\_op)} 15.948 - 15.949 -Perform a PCI configuration option; depending on the value 15.950 -of {\tt physdev\_op} this can be a PCI config read, a PCI config 15.951 -write, or a small number of other queries. 15.952 - 15.953 -\end{quote} 15.954 - 15.955 - 15.956 -For examples of using {\tt physdev\_op()}, see the 15.957 -Xen-specific PCI code in the linux sparse tree. 15.958 - 15.959 -\section{Administrative Operations} 15.960 -\label{s:dom0ops} 15.961 - 15.962 -A large number of control operations are available to a sufficiently 15.963 -privileged domain (typically domain 0). These allow the creation and 15.964 -management of new domains, for example. A complete list is given 15.965 -below: for more details on any or all of these, please see 15.966 -{\tt xen/include/public/dom0\_ops.h} 15.967 - 15.968 - 15.969 -\begin{quote} 15.970 -\hypercall{dom0\_op(dom0\_op\_t *op)} 15.971 - 15.972 -Administrative domain operations for domain management. The options are: 15.973 - 15.974 -\begin{description} 15.975 -\item [\it DOM0\_CREATEDOMAIN:] create a new domain 15.976 - 15.977 -\item [\it DOM0\_PAUSEDOMAIN:] remove a domain from the scheduler run 15.978 -queue. 15.979 - 15.980 -\item [\it DOM0\_UNPAUSEDOMAIN:] mark a paused domain as schedulable 15.981 - once again. 
15.982 - 15.983 -\item [\it DOM0\_DESTROYDOMAIN:] deallocate all resources associated 15.984 -with a domain 15.985 - 15.986 -\item [\it DOM0\_GETMEMLIST:] get list of pages used by the domain 15.987 - 15.988 -\item [\it DOM0\_SCHEDCTL:] 15.989 - 15.990 -\item [\it DOM0\_ADJUSTDOM:] adjust scheduling priorities for domain 15.991 - 15.992 -\item [\it DOM0\_BUILDDOMAIN:] do final guest OS setup for domain 15.993 - 15.994 -\item [\it DOM0\_GETDOMAINFO:] get statistics about the domain 15.995 - 15.996 -\item [\it DOM0\_GETPAGEFRAMEINFO:] 15.997 - 15.998 -\item [\it DOM0\_GETPAGEFRAMEINFO2:] 15.999 - 15.1000 -\item [\it DOM0\_IOPL:] set I/O privilege level 15.1001 - 15.1002 -\item [\it DOM0\_MSR:] read or write model specific registers 15.1003 - 15.1004 -\item [\it DOM0\_DEBUG:] interactively invoke the debugger 15.1005 - 15.1006 -\item [\it DOM0\_SETTIME:] set system time 15.1007 - 15.1008 -\item [\it DOM0\_READCONSOLE:] read console content from hypervisor buffer ring 15.1009 - 15.1010 -\item [\it DOM0\_PINCPUDOMAIN:] pin domain to a particular CPU 15.1011 - 15.1012 -\item [\it DOM0\_GETTBUFS:] get information about the size and location of 15.1013 - the trace buffers (only on trace-buffer enabled builds) 15.1014 - 15.1015 -\item [\it DOM0\_PHYSINFO:] get information about the host machine 15.1016 - 15.1017 -\item [\it DOM0\_PCIDEV\_ACCESS:] modify PCI device access permissions 15.1018 - 15.1019 -\item [\it DOM0\_SCHED\_ID:] get the ID of the current Xen scheduler 15.1020 - 15.1021 -\item [\it DOM0\_SHADOW\_CONTROL:] switch between shadow page-table modes 15.1022 - 15.1023 -\item [\it DOM0\_SETDOMAININITIALMEM:] set initial memory allocation of a domain 15.1024 - 15.1025 -\item [\it DOM0\_SETDOMAINMAXMEM:] set maximum memory allocation of a domain 15.1026 - 15.1027 -\item [\it DOM0\_SETDOMAINVMASSIST:] set domain VM assist options 15.1028 -\end{description} 15.1029 -\end{quote} 15.1030 - 15.1031 -Most of the above are best understood by looking at the code 15.1032 -implementing them (in {\tt xen/common/dom0\_ops.c}) and in 15.1033 -the user-space tools that use them (mostly in {\tt tools/libxc}). 15.1034 - 15.1035 -\section{Debugging Hypercalls} 15.1036 - 15.1037 -A few additional hypercalls are mainly useful for debugging: 15.1038 - 15.1039 -\begin{quote} 15.1040 -\hypercall{console\_io(int cmd, int count, char *str)} 15.1041 - 15.1042 -Use Xen to interact with the console; operations are: 15.1043 - 15.1044 -{\it CONSOLEIO\_write}: Output count characters from buffer str. 15.1045 - 15.1046 -{\it CONSOLEIO\_read}: Input at most count characters into buffer str. 15.1047 -\end{quote} 15.1048 - 15.1049 -A pair of hypercalls allows access to the underlying debug registers: 15.1050 -\begin{quote} 15.1051 -\hypercall{set\_debugreg(int reg, unsigned long value)} 15.1052 - 15.1053 -Set debug register {\tt reg} to {\tt value} 15.1054 - 15.1055 -\hypercall{get\_debugreg(int reg)} 15.1056 - 15.1057 -Return the contents of the debug register {\tt reg} 15.1058 -\end{quote} 15.1059 - 15.1060 -And finally: 15.1061 -\begin{quote} 15.1062 -\hypercall{xen\_version(int cmd)} 15.1063 - 15.1064 -Request Xen version number. 15.1065 -\end{quote} 15.1066 - 15.1067 -This is useful to ensure that user-space tools are in sync 15.1068 -with the underlying hypervisor. 15.1069 - 15.1070 -\section{Deprecated Hypercalls} 15.1071 - 15.1072 -Xen is under constant development and refinement; as such there 15.1073 -are plans to improve the way in which various pieces of functionality 15.1074 -are exposed to guest OSes. 
15.1075 - 15.1076 -\begin{quote} 15.1077 -\hypercall{vm\_assist(unsigned int cmd, unsigned int type)} 15.1078 - 15.1079 -Toggle various memory management modes (in particular wrritable page 15.1080 -tables and superpage support). 15.1081 - 15.1082 -\end{quote} 15.1083 - 15.1084 -This is likely to be replaced with mode values in the shared 15.1085 -information page since this is more resilient for resumption 15.1086 -after migration or checkpoint. 15.1087 - 15.1088 - 15.1089 - 15.1090 - 15.1091 - 15.1092 - 15.1093 +%% chapter hypercalls moved to hypercalls.tex 15.1094 +\include{src/interface/hypercalls} 15.1095 15.1096 15.1097 %% 15.1098 @@ -1173,279 +112,9 @@ after migration or checkpoint. 15.1099 %% new scheduler... not clear how many of them there are... 15.1100 %% 15.1101 15.1102 -\begin{comment} 15.1103 - 15.1104 -\chapter{Scheduling API} 15.1105 - 15.1106 -The scheduling API is used by both the schedulers described above and should 15.1107 -also be used by any new schedulers. It provides a generic interface and also 15.1108 -implements much of the ``boilerplate'' code. 15.1109 - 15.1110 -Schedulers conforming to this API are described by the following 15.1111 -structure: 15.1112 - 15.1113 -\begin{verbatim} 15.1114 -struct scheduler 15.1115 -{ 15.1116 - char *name; /* full name for this scheduler */ 15.1117 - char *opt_name; /* option name for this scheduler */ 15.1118 - unsigned int sched_id; /* ID for this scheduler */ 15.1119 - 15.1120 - int (*init_scheduler) (); 15.1121 - int (*alloc_task) (struct task_struct *); 15.1122 - void (*add_task) (struct task_struct *); 15.1123 - void (*free_task) (struct task_struct *); 15.1124 - void (*rem_task) (struct task_struct *); 15.1125 - void (*wake_up) (struct task_struct *); 15.1126 - void (*do_block) (struct task_struct *); 15.1127 - task_slice_t (*do_schedule) (s_time_t); 15.1128 - int (*control) (struct sched_ctl_cmd *); 15.1129 - int (*adjdom) (struct task_struct *, 15.1130 - struct sched_adjdom_cmd *); 15.1131 - s32 (*reschedule) (struct task_struct *); 15.1132 - void (*dump_settings) (void); 15.1133 - void (*dump_cpu_state) (int); 15.1134 - void (*dump_runq_el) (struct task_struct *); 15.1135 -}; 15.1136 -\end{verbatim} 15.1137 - 15.1138 -The only method that {\em must} be implemented is 15.1139 -{\tt do\_schedule()}. However, if there is not some implementation for the 15.1140 -{\tt wake\_up()} method then waking tasks will not get put on the runqueue! 15.1141 - 15.1142 -The fields of the above structure are described in more detail below. 15.1143 - 15.1144 -\subsubsection{name} 15.1145 - 15.1146 -The name field should point to a descriptive ASCII string. 15.1147 - 15.1148 -\subsubsection{opt\_name} 15.1149 - 15.1150 -This field is the value of the {\tt sched=} boot-time option that will select 15.1151 -this scheduler. 15.1152 - 15.1153 -\subsubsection{sched\_id} 15.1154 - 15.1155 -This is an integer that uniquely identifies this scheduler. There should be a 15.1156 -macro corrsponding to this scheduler ID in {\tt <xen/sched-if.h>}. 15.1157 - 15.1158 -\subsubsection{init\_scheduler} 15.1159 - 15.1160 -\paragraph*{Purpose} 15.1161 - 15.1162 -This is a function for performing any scheduler-specific initialisation. For 15.1163 -instance, it might allocate memory for per-CPU scheduler data and initialise it 15.1164 -appropriately. 15.1165 - 15.1166 -\paragraph*{Call environment} 15.1167 - 15.1168 -This function is called after the initialisation performed by the generic 15.1169 -layer. 
The function is called exactly once, for the scheduler that has been 15.1170 -selected. 15.1171 - 15.1172 -\paragraph*{Return values} 15.1173 - 15.1174 -This should return negative on failure --- this will cause an 15.1175 -immediate panic and the system will fail to boot. 15.1176 - 15.1177 -\subsubsection{alloc\_task} 15.1178 - 15.1179 -\paragraph*{Purpose} 15.1180 -Called when a {\tt task\_struct} is allocated by the generic scheduler 15.1181 -layer. A particular scheduler implementation may use this method to 15.1182 -allocate per-task data for this task. It may use the {\tt 15.1183 -sched\_priv} pointer in the {\tt task\_struct} to point to this data. 15.1184 - 15.1185 -\paragraph*{Call environment} 15.1186 -The generic layer guarantees that the {\tt sched\_priv} field will 15.1187 -remain intact from the time this method is called until the task is 15.1188 -deallocated (so long as the scheduler implementation does not change 15.1189 -it explicitly!). 15.1190 - 15.1191 -\paragraph*{Return values} 15.1192 -Negative on failure. 15.1193 - 15.1194 -\subsubsection{add\_task} 15.1195 - 15.1196 -\paragraph*{Purpose} 15.1197 - 15.1198 -Called when a task is initially added by the generic layer. 15.1199 - 15.1200 -\paragraph*{Call environment} 15.1201 - 15.1202 -The fields in the {\tt task\_struct} are now filled out and available for use. 15.1203 -Schedulers should implement appropriate initialisation of any per-task private 15.1204 -information in this method. 15.1205 - 15.1206 -\subsubsection{free\_task} 15.1207 - 15.1208 -\paragraph*{Purpose} 15.1209 - 15.1210 -Schedulers should free the space used by any associated private data 15.1211 -structures. 15.1212 - 15.1213 -\paragraph*{Call environment} 15.1214 - 15.1215 -This is called when a {\tt task\_struct} is about to be deallocated. 15.1216 -The generic layer will have done generic task removal operations and 15.1217 -(if implemented) called the scheduler's {\tt rem\_task} method before 15.1218 -this method is called. 15.1219 - 15.1220 -\subsubsection{rem\_task} 15.1221 - 15.1222 -\paragraph*{Purpose} 15.1223 - 15.1224 -This is called when a task is being removed from scheduling (but is 15.1225 -not yet being freed). 15.1226 - 15.1227 -\subsubsection{wake\_up} 15.1228 - 15.1229 -\paragraph*{Purpose} 15.1230 - 15.1231 -Called when a task is woken up, this method should put the task on the runqueue 15.1232 -(or do the scheduler-specific equivalent action). 15.1233 - 15.1234 -\paragraph*{Call environment} 15.1235 - 15.1236 -The task is already set to state RUNNING. 15.1237 - 15.1238 -\subsubsection{do\_block} 15.1239 - 15.1240 -\paragraph*{Purpose} 15.1241 - 15.1242 -This function is called when a task is blocked. This function should 15.1243 -not remove the task from the runqueue. 15.1244 - 15.1245 -\paragraph*{Call environment} 15.1246 - 15.1247 -The EVENTS\_MASTER\_ENABLE\_BIT is already set and the task state changed to 15.1248 -TASK\_INTERRUPTIBLE on entry to this method. A call to the {\tt 15.1249 - do\_schedule} method will be made after this method returns, in 15.1250 -order to select the next task to run. 15.1251 - 15.1252 -\subsubsection{do\_schedule} 15.1253 - 15.1254 -This method must be implemented. 15.1255 - 15.1256 -\paragraph*{Purpose} 15.1257 - 15.1258 -The method is called each time a new task must be chosen for scheduling on the 15.1259 -current CPU. The current time as passed as the single argument (the current 15.1260 -task can be found using the {\tt current} macro). 
15.1261 - 15.1262 -This method should select the next task to run on this CPU and set it's minimum 15.1263 -time to run as well as returning the data described below. 15.1264 - 15.1265 -This method should also take the appropriate action if the previous 15.1266 -task has blocked, e.g. removing it from the runqueue. 15.1267 - 15.1268 -\paragraph*{Call environment} 15.1269 - 15.1270 -The other fields in the {\tt task\_struct} are updated by the generic layer, 15.1271 -which also performs all Xen-specific tasks and performs the actual task switch 15.1272 -(unless the previous task has been chosen again). 15.1273 - 15.1274 -This method is called with the {\tt schedule\_lock} held for the current CPU 15.1275 -and local interrupts disabled. 15.1276 - 15.1277 -\paragraph*{Return values} 15.1278 - 15.1279 -Must return a {\tt struct task\_slice} describing what task to run and how long 15.1280 -for (at maximum). 15.1281 - 15.1282 -\subsubsection{control} 15.1283 - 15.1284 -\paragraph*{Purpose} 15.1285 - 15.1286 -This method is called for global scheduler control operations. It takes a 15.1287 -pointer to a {\tt struct sched\_ctl\_cmd}, which it should either 15.1288 -source data from or populate with data, depending on the value of the 15.1289 -{\tt direction} field. 15.1290 - 15.1291 -\paragraph*{Call environment} 15.1292 - 15.1293 -The generic layer guarantees that when this method is called, the 15.1294 -caller selected the correct scheduler ID, hence the scheduler's 15.1295 -implementation does not need to sanity-check these parts of the call. 15.1296 - 15.1297 -\paragraph*{Return values} 15.1298 - 15.1299 -This function should return the value to be passed back to user space, hence it 15.1300 -should either be 0 or an appropriate errno value. 15.1301 - 15.1302 -\subsubsection{sched\_adjdom} 15.1303 - 15.1304 -\paragraph*{Purpose} 15.1305 - 15.1306 -This method is called to adjust the scheduling parameters of a particular 15.1307 -domain, or to query their current values. The function should check 15.1308 -the {\tt direction} field of the {\tt sched\_adjdom\_cmd} it receives in 15.1309 -order to determine which of these operations is being performed. 15.1310 - 15.1311 -\paragraph*{Call environment} 15.1312 - 15.1313 -The generic layer guarantees that the caller has specified the correct 15.1314 -control interface version and scheduler ID and that the supplied {\tt 15.1315 -task\_struct} will not be deallocated during the call (hence it is not 15.1316 -necessary to {\tt get\_task\_struct}). 15.1317 - 15.1318 -\paragraph*{Return values} 15.1319 - 15.1320 -This function should return the value to be passed back to user space, hence it 15.1321 -should either be 0 or an appropriate errno value. 15.1322 - 15.1323 -\subsubsection{reschedule} 15.1324 - 15.1325 -\paragraph*{Purpose} 15.1326 - 15.1327 -This method is called to determine if a reschedule is required as a result of a 15.1328 -particular task. 15.1329 - 15.1330 -\paragraph*{Call environment} 15.1331 -The generic layer will cause a reschedule if the current domain is the idle 15.1332 -task or it has exceeded its minimum time slice before a reschedule. The 15.1333 -generic layer guarantees that the task passed is not currently running but is 15.1334 -on the runqueue. 15.1335 - 15.1336 -\paragraph*{Return values} 15.1337 - 15.1338 -Should return a mask of CPUs to cause a reschedule on. 
15.1339 - 15.1340 -\subsubsection{dump\_settings} 15.1341 - 15.1342 -\paragraph*{Purpose} 15.1343 - 15.1344 -If implemented, this should dump any private global settings for this 15.1345 -scheduler to the console. 15.1346 - 15.1347 -\paragraph*{Call environment} 15.1348 - 15.1349 -This function is called with interrupts enabled. 15.1350 - 15.1351 -\subsubsection{dump\_cpu\_state} 15.1352 - 15.1353 -\paragraph*{Purpose} 15.1354 - 15.1355 -This method should dump any private settings for the specified CPU. 15.1356 - 15.1357 -\paragraph*{Call environment} 15.1358 - 15.1359 -This function is called with interrupts disabled and the {\tt schedule\_lock} 15.1360 -for the specified CPU held. 15.1361 - 15.1362 -\subsubsection{dump\_runq\_el} 15.1363 - 15.1364 -\paragraph*{Purpose} 15.1365 - 15.1366 -This method should dump any private settings for the specified task. 15.1367 - 15.1368 -\paragraph*{Call environment} 15.1369 - 15.1370 -This function is called with interrupts disabled and the {\tt schedule\_lock} 15.1371 -for the task's CPU held. 15.1372 - 15.1373 -\end{comment} 15.1374 - 15.1375 +%% \include{src/interface/scheduling} 15.1376 +%% scheduling information moved to scheduling.tex 15.1377 +%% still commented out 15.1378 15.1379 15.1380 15.1381 @@ -1457,74 +126,9 @@ for the task's CPU held. 15.1382 %% (and/or kip's stuff?) and write about that instead? 15.1383 %% 15.1384 15.1385 -\begin{comment} 15.1386 - 15.1387 -\chapter{Debugging} 15.1388 - 15.1389 -Xen provides tools for debugging both Xen and guest OSes. Currently, the 15.1390 -Pervasive Debugger provides a GDB stub, which provides facilities for symbolic 15.1391 -debugging of Xen itself and of OS kernels running on top of Xen. The Trace 15.1392 -Buffer provides a lightweight means to log data about Xen's internal state and 15.1393 -behaviour at runtime, for later analysis. 15.1394 - 15.1395 -\section{Pervasive Debugger} 15.1396 - 15.1397 -Information on using the pervasive debugger is available in pdb.txt. 15.1398 - 15.1399 - 15.1400 -\section{Trace Buffer} 15.1401 - 15.1402 -The trace buffer provides a means to observe Xen's operation from domain 0. 15.1403 -Trace events, inserted at key points in Xen's code, record data that can be 15.1404 -read by the {\tt xentrace} tool. Recording these events has a low overhead 15.1405 -and hence the trace buffer may be useful for debugging timing-sensitive 15.1406 -behaviours. 15.1407 - 15.1408 -\subsection{Internal API} 15.1409 - 15.1410 -To use the trace buffer functionality from within Xen, you must {\tt \#include 15.1411 -<xen/trace.h>}, which contains definitions related to the trace buffer. Trace 15.1412 -events are inserted into the buffer using the {\tt TRACE\_xD} ({\tt x} = 0, 1, 15.1413 -2, 3, 4 or 5) macros. These all take an event number, plus {\tt x} additional 15.1414 -(32-bit) data as their arguments. For trace buffer-enabled builds of Xen these 15.1415 -will insert the event ID and data into the trace buffer, along with the current 15.1416 -value of the CPU cycle-counter. For builds without the trace buffer enabled, 15.1417 -the macros expand to no-ops and thus can be left in place without incurring 15.1418 -overheads. 15.1419 - 15.1420 -\subsection{Trace-enabled builds} 15.1421 - 15.1422 -By default, the trace buffer is enabled only in debug builds (i.e. {\tt NDEBUG} 15.1423 -is not defined). It can be enabled separately by defining {\tt TRACE\_BUFFER}, 15.1424 -either in {\tt <xen/config.h>} or on the gcc command line. 
15.1425 - 15.1426 -The size (in pages) of the per-CPU trace buffers can be specified using the 15.1427 -{\tt tbuf\_size=n } boot parameter to Xen. If the size is set to 0, the trace 15.1428 -buffers will be disabled. 15.1429 - 15.1430 -\subsection{Dumping trace data} 15.1431 - 15.1432 -When running a trace buffer build of Xen, trace data are written continuously 15.1433 -into the buffer data areas, with newer data overwriting older data. This data 15.1434 -can be captured using the {\tt xentrace} program in domain 0. 15.1435 - 15.1436 -The {\tt xentrace} tool uses {\tt /dev/mem} in domain 0 to map the trace 15.1437 -buffers into its address space. It then periodically polls all the buffers for 15.1438 -new data, dumping out any new records from each buffer in turn. As a result, 15.1439 -for machines with multiple (logical) CPUs, the trace buffer output will not be 15.1440 -in overall chronological order. 15.1441 - 15.1442 -The output from {\tt xentrace} can be post-processed using {\tt 15.1443 -xentrace\_cpusplit} (used to split trace data out into per-cpu log files) and 15.1444 -{\tt xentrace\_format} (used to pretty-print trace data). For the predefined 15.1445 -trace points, there is an example format file in {\tt tools/xentrace/formats }. 15.1446 - 15.1447 -For more information, see the manual pages for {\tt xentrace}, {\tt 15.1448 -xentrace\_format} and {\tt xentrace\_cpusplit}. 15.1449 - 15.1450 -\end{comment} 15.1451 - 15.1452 - 15.1453 +%% \include{src/interface/debugging} 15.1454 +%% debugging information moved to debugging.tex 15.1455 +%% still commented out 15.1456 15.1457 15.1458 \end{document}
16.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 16.2 +++ b/docs/src/interface/architecture.tex Thu Sep 22 11:42:01 2005 -0600 16.3 @@ -0,0 +1,140 @@ 16.4 +\chapter{Virtual Architecture} 16.5 + 16.6 +On a Xen-based system, the hypervisor itself runs in {\it ring 0}. It 16.7 +has full access to the physical memory available in the system and is 16.8 +responsible for allocating portions of it to the domains. Guest 16.9 +operating systems run in and use {\it rings 1}, {\it 2} and {\it 3} as 16.10 +they see fit. Segmentation is used to prevent the guest OS from 16.11 +accessing the portion of the address space that is reserved for Xen. 16.12 +We expect most guest operating systems will use ring 1 for their own 16.13 +operation and place applications in ring 3. 16.14 + 16.15 +In this chapter we consider the basic virtual architecture provided by 16.16 +Xen: the basic CPU state, exception and interrupt handling, and time. 16.17 +Other aspects such as memory and device access are discussed in later 16.18 +chapters. 16.19 + 16.20 + 16.21 +\section{CPU state} 16.22 + 16.23 +All privileged state must be handled by Xen. The guest OS has no 16.24 +direct access to CR3 and is not permitted to update privileged bits in 16.25 +EFLAGS. Guest OSes use \emph{hypercalls} to invoke operations in Xen; 16.26 +these are analogous to system calls but occur from ring 1 to ring 0. 16.27 + 16.28 +A list of all hypercalls is given in Appendix~\ref{a:hypercalls}. 16.29 + 16.30 + 16.31 +\section{Exceptions} 16.32 + 16.33 +A virtual IDT is provided --- a domain can submit a table of trap 16.34 +handlers to Xen via the {\tt set\_trap\_table()} hypercall. Most trap 16.35 +handlers are identical to native x86 handlers, although the page-fault 16.36 +handler is somewhat different. 16.37 + 16.38 + 16.39 +\section{Interrupts and events} 16.40 + 16.41 +Interrupts are virtualized by mapping them to \emph{events}, which are 16.42 +delivered asynchronously to the target domain using a callback 16.43 +supplied via the {\tt set\_callbacks()} hypercall. A guest OS can map 16.44 +these events onto its standard interrupt dispatch mechanisms. Xen is 16.45 +responsible for determining the target domain that will handle each 16.46 +physical interrupt source. For more details on the binding of event 16.47 +sources to events, see Chapter~\ref{c:devices}. 16.48 + 16.49 + 16.50 +\section{Time} 16.51 + 16.52 +Guest operating systems need to be aware of the passage of both real 16.53 +(or wallclock) time and their own `virtual time' (the time for which 16.54 +they have been executing). Furthermore, Xen has a notion of time which 16.55 +is used for scheduling. The following notions of time are provided: 16.56 + 16.57 +\begin{description} 16.58 +\item[Cycle counter time.] 16.59 + 16.60 + This provides a fine-grained time reference. The cycle counter time 16.61 + is used to accurately extrapolate the other time references. On SMP 16.62 + machines it is currently assumed that the cycle counter time is 16.63 + synchronized between CPUs. The current x86-based implementation 16.64 + achieves this within inter-CPU communication latencies. 16.65 + 16.66 +\item[System time.] 16.67 + 16.68 + This is a 64-bit counter which holds the number of nanoseconds that 16.69 + have elapsed since system boot. 16.70 + 16.71 +\item[Wall clock time.] 16.72 + 16.73 + This is the time of day in a Unix-style {\tt struct timeval} 16.74 + (seconds and microseconds since 1 January 1970, adjusted by leap 16.75 + seconds). 
An NTP client hosted by {\it domain 0} can keep this 16.76 + value accurate. 16.77 + 16.78 +\item[Domain virtual time.] 16.79 + 16.80 + This progresses at the same pace as system time, but only while a 16.81 + domain is executing --- it stops while a domain is de-scheduled. 16.82 + Therefore the share of the CPU that a domain receives is indicated 16.83 + by the rate at which its virtual time increases. 16.84 + 16.85 +\end{description} 16.86 + 16.87 + 16.88 +Xen exports timestamps for system time and wall-clock time to guest 16.89 +operating systems through a shared page of memory. Xen also provides 16.90 +the cycle counter time at the instant the timestamps were calculated, 16.91 +and the CPU frequency in Hertz. This allows the guest to extrapolate 16.92 +system and wall-clock times accurately based on the current cycle 16.93 +counter time. 16.94 + 16.95 +Since all time stamps need to be updated and read \emph{atomically} 16.96 +two version numbers are also stored in the shared info page. The first 16.97 +is incremented prior to an update, while the second is only 16.98 +incremented afterwards. Thus a guest can be sure that it read a 16.99 +consistent state by checking the two version numbers are equal. 16.100 + 16.101 +Xen includes a periodic ticker which sends a timer event to the 16.102 +currently executing domain every 10ms. The Xen scheduler also sends a 16.103 +timer event whenever a domain is scheduled; this allows the guest OS 16.104 +to adjust for the time that has passed while it has been inactive. In 16.105 +addition, Xen allows each domain to request that they receive a timer 16.106 +event sent at a specified system time by using the {\tt 16.107 + set\_timer\_op()} hypercall. Guest OSes may use this timer to 16.108 +implement timeout values when they block. 16.109 + 16.110 + 16.111 + 16.112 +%% % akw: demoting this to a section -- not sure if there is any point 16.113 +%% % though, maybe just remove it. 16.114 + 16.115 +\section{Xen CPU Scheduling} 16.116 + 16.117 +Xen offers a uniform API for CPU schedulers. It is possible to choose 16.118 +from a number of schedulers at boot and it should be easy to add more. 16.119 +The BVT, Atropos and Round Robin schedulers are part of the normal Xen 16.120 +distribution. BVT provides proportional fair shares of the CPU to the 16.121 +running domains. Atropos can be used to reserve absolute shares of 16.122 +the CPU for each domain. Round-robin is provided as an example of 16.123 +Xen's internal scheduler API. 16.124 + 16.125 +\paragraph*{Note: SMP host support} 16.126 +Xen has always supported SMP host systems. Domains are statically 16.127 +assigned to CPUs, either at creation time or when manually pinning to 16.128 +a particular CPU. The current schedulers then run locally on each CPU 16.129 +to decide which of the assigned domains should be run there. The 16.130 +user-level control software can be used to perform coarse-grain 16.131 +load-balancing between CPUs. 16.132 + 16.133 + 16.134 +%% More information on the characteristics and use of these schedulers 16.135 +%% is available in {\tt Sched-HOWTO.txt}. 16.136 + 16.137 + 16.138 +\section{Privileged operations} 16.139 + 16.140 +Xen exports an extended interface to privileged domains (viz.\ {\it 16.141 + Domain 0}). This allows such domains to build and boot other domains 16.142 +on the server, and provides control interfaces for managing 16.143 +scheduling, memory, networking, and block devices.
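As a concrete illustration of the timestamp versioning scheme described in the Time section above, a guest might read system time with a retry loop along the following lines. This is only a sketch: the structure and field names are invented for the example and are not the exact layout of the shared info page, and real code would also need appropriate memory barriers.

\begin{verbatim}
/* Illustrative layout only -- the real shared-info structure differs. */
struct shared_time {
    volatile unsigned long version1;          /* bumped before an update */
    volatile unsigned long version2;          /* bumped after an update  */
    volatile unsigned long long system_time_ns;  /* ns since system boot */
};

/* Return a consistent snapshot of system time.  Read version2 (updated
 * last) before the data and version1 (updated first) after it; if the
 * two differ, Xen was mid-update, so retry. */
static unsigned long long read_system_time(struct shared_time *s)
{
    unsigned long pre, post;
    unsigned long long t;

    do {
        pre  = s->version2;
        t    = s->system_time_ns;
        post = s->version1;
    } while (pre != post);

    return t;
}
\end{verbatim}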
17.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 17.2 +++ b/docs/src/interface/debugging.tex Thu Sep 22 11:42:01 2005 -0600 17.3 @@ -0,0 +1,62 @@ 17.4 +\chapter{Debugging} 17.5 + 17.6 +Xen provides tools for debugging both Xen and guest OSes. Currently, the 17.7 +Pervasive Debugger provides a GDB stub, which provides facilities for symbolic 17.8 +debugging of Xen itself and of OS kernels running on top of Xen. The Trace 17.9 +Buffer provides a lightweight means to log data about Xen's internal state and 17.10 +behaviour at runtime, for later analysis. 17.11 + 17.12 +\section{Pervasive Debugger} 17.13 + 17.14 +Information on using the pervasive debugger is available in pdb.txt. 17.15 + 17.16 + 17.17 +\section{Trace Buffer} 17.18 + 17.19 +The trace buffer provides a means to observe Xen's operation from domain 0. 17.20 +Trace events, inserted at key points in Xen's code, record data that can be 17.21 +read by the {\tt xentrace} tool. Recording these events has a low overhead 17.22 +and hence the trace buffer may be useful for debugging timing-sensitive 17.23 +behaviours. 17.24 + 17.25 +\subsection{Internal API} 17.26 + 17.27 +To use the trace buffer functionality from within Xen, you must {\tt \#include 17.28 +<xen/trace.h>}, which contains definitions related to the trace buffer. Trace 17.29 +events are inserted into the buffer using the {\tt TRACE\_xD} ({\tt x} = 0, 1, 17.30 +2, 3, 4 or 5) macros. These all take an event number, plus {\tt x} additional 17.31 +(32-bit) data as their arguments. For trace buffer-enabled builds of Xen these 17.32 +will insert the event ID and data into the trace buffer, along with the current 17.33 +value of the CPU cycle-counter. For builds without the trace buffer enabled, 17.34 +the macros expand to no-ops and thus can be left in place without incurring 17.35 +overheads. 17.36 + 17.37 +\subsection{Trace-enabled builds} 17.38 + 17.39 +By default, the trace buffer is enabled only in debug builds (i.e. {\tt NDEBUG} 17.40 +is not defined). It can be enabled separately by defining {\tt TRACE\_BUFFER}, 17.41 +either in {\tt <xen/config.h>} or on the gcc command line. 17.42 + 17.43 +The size (in pages) of the per-CPU trace buffers can be specified using the 17.44 +{\tt tbuf\_size=n } boot parameter to Xen. If the size is set to 0, the trace 17.45 +buffers will be disabled. 17.46 + 17.47 +\subsection{Dumping trace data} 17.48 + 17.49 +When running a trace buffer build of Xen, trace data are written continuously 17.50 +into the buffer data areas, with newer data overwriting older data. This data 17.51 +can be captured using the {\tt xentrace} program in domain 0. 17.52 + 17.53 +The {\tt xentrace} tool uses {\tt /dev/mem} in domain 0 to map the trace 17.54 +buffers into its address space. It then periodically polls all the buffers for 17.55 +new data, dumping out any new records from each buffer in turn. As a result, 17.56 +for machines with multiple (logical) CPUs, the trace buffer output will not be 17.57 +in overall chronological order. 17.58 + 17.59 +The output from {\tt xentrace} can be post-processed using {\tt 17.60 +xentrace\_cpusplit} (used to split trace data out into per-cpu log files) and 17.61 +{\tt xentrace\_format} (used to pretty-print trace data). For the predefined 17.62 +trace points, there is an example format file in {\tt tools/xentrace/formats }. 17.63 + 17.64 +For more information, see the manual pages for {\tt xentrace}, {\tt 17.65 +xentrace\_format} and {\tt xentrace\_cpusplit}.
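To make the trace-buffer API described above concrete, the following sketch shows how an event might be recorded from within Xen. The event number {\tt TRC\_EXAMPLE} is a made-up placeholder; only the {\tt TRACE\_xD} macro form and the {\tt <xen/trace.h>} header come from the description above.

\begin{verbatim}
#include <xen/trace.h>

#define TRC_EXAMPLE 0x00020001   /* hypothetical event number */

static void record_example_event(unsigned long domid, unsigned long val)
{
    /* Inserts the event ID, the two 32-bit data words and the current
     * CPU cycle counter into the trace buffer; in builds without the
     * trace buffer enabled this expands to a no-op. */
    TRACE_2D(TRC_EXAMPLE, domid, val);
}
\end{verbatim}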
18.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 18.2 +++ b/docs/src/interface/devices.tex Thu Sep 22 11:42:01 2005 -0600 18.3 @@ -0,0 +1,178 @@ 18.4 +\chapter{Devices} 18.5 +\label{c:devices} 18.6 + 18.7 +Devices such as network and disk are exported to guests using a split 18.8 +device driver. The device driver domain, which accesses the physical 18.9 +device directly also runs a \emph{backend} driver, serving requests to 18.10 +that device from guests. Each guest will use a simple \emph{frontend} 18.11 +driver, to access the backend. Communication between these domains is 18.12 +composed of two parts: First, data is placed onto a shared memory page 18.13 +between the domains. Second, an event channel between the two domains 18.14 +is used to pass notification that data is outstanding. This 18.15 +separation of notification from data transfer allows message batching, 18.16 +and results in very efficient device access. 18.17 + 18.18 +Event channels are used extensively in device virtualization; each 18.19 +domain has a number of end-points or \emph{ports} each of which may be 18.20 +bound to one of the following \emph{event sources}: 18.21 +\begin{itemize} 18.22 + \item a physical interrupt from a real device, 18.23 + \item a virtual interrupt (callback) from Xen, or 18.24 + \item a signal from another domain 18.25 +\end{itemize} 18.26 + 18.27 +Events are lightweight and do not carry much information beyond the 18.28 +source of the notification. Hence when performing bulk data transfer, 18.29 +events are typically used as synchronization primitives over a shared 18.30 +memory transport. Event channels are managed via the {\tt 18.31 + event\_channel\_op()} hypercall; for more details see 18.32 +Section~\ref{s:idc}. 18.33 + 18.34 +This chapter focuses on some individual device interfaces available to 18.35 +Xen guests. 18.36 + 18.37 + 18.38 +\section{Network I/O} 18.39 + 18.40 +Virtual network device services are provided by shared memory 18.41 +communication with a backend domain. From the point of view of other 18.42 +domains, the backend may be viewed as a virtual ethernet switch 18.43 +element with each domain having one or more virtual network interfaces 18.44 +connected to it. 18.45 + 18.46 +\subsection{Backend Packet Handling} 18.47 + 18.48 +The backend driver is responsible for a variety of actions relating to 18.49 +the transmission and reception of packets from the physical device. 18.50 +With regard to transmission, the backend performs these key actions: 18.51 + 18.52 +\begin{itemize} 18.53 +\item {\bf Validation:} To ensure that domains do not attempt to 18.54 + generate invalid (e.g. spoofed) traffic, the backend driver may 18.55 + validate headers ensuring that source MAC and IP addresses match the 18.56 + interface that they have been sent from. 18.57 + 18.58 + Validation functions can be configured using standard firewall rules 18.59 + ({\small{\tt iptables}} in the case of Linux). 18.60 + 18.61 +\item {\bf Scheduling:} Since a number of domains can share a single 18.62 + physical network interface, the backend must mediate access when 18.63 + several domains each have packets queued for transmission. This 18.64 + general scheduling function subsumes basic shaping or rate-limiting 18.65 + schemes. 18.66 + 18.67 +\item {\bf Logging and Accounting:} The backend domain can be 18.68 + configured with classifier rules that control how packets are 18.69 + accounted or logged. 
For example, log messages might be generated 18.70 + whenever a domain attempts to send a TCP packet containing a SYN. 18.71 +\end{itemize} 18.72 + 18.73 +On receipt of incoming packets, the backend acts as a simple 18.74 +demultiplexer: Packets are passed to the appropriate virtual interface 18.75 +after any necessary logging and accounting have been carried out. 18.76 + 18.77 +\subsection{Data Transfer} 18.78 + 18.79 +Each virtual interface uses two ``descriptor rings'', one for 18.80 +transmit, the other for receive. Each descriptor identifies a block 18.81 +of contiguous physical memory allocated to the domain. 18.82 + 18.83 +The transmit ring carries packets to transmit from the guest to the 18.84 +backend domain. The return path of the transmit ring carries messages 18.85 +indicating that the contents have been physically transmitted and the 18.86 +backend no longer requires the associated pages of memory. 18.87 + 18.88 +To receive packets, the guest places descriptors of unused pages on 18.89 +the receive ring. The backend will return received packets by 18.90 +exchanging these pages in the domain's memory with new pages 18.91 +containing the received data, and passing back descriptors regarding 18.92 +the new packets on the ring. This zero-copy approach allows the 18.93 +backend to maintain a pool of free pages to receive packets into, and 18.94 +then deliver them to appropriate domains after examining their 18.95 +headers. 18.96 + 18.97 +% Real physical addresses are used throughout, with the domain 18.98 +% performing translation from pseudo-physical addresses if that is 18.99 +% necessary. 18.100 + 18.101 +If a domain does not keep its receive ring stocked with empty buffers 18.102 +then packets destined to it may be dropped. This provides some 18.103 +defence against receive livelock problems because an overloaded domain 18.104 +will cease to receive further data. Similarly, on the transmit path, 18.105 +it provides the application with feedback on the rate at which packets 18.106 +are able to leave the system. 18.107 + 18.108 +Flow control on rings is achieved by including a pair of producer 18.109 +indexes on the shared ring page. Each side will maintain a private 18.110 +consumer index indicating the next outstanding message. In this 18.111 +manner, the domains cooperate to divide the ring into two message 18.112 +lists, one in each direction. Notification is decoupled from the 18.113 +immediate placement of new messages on the ring; the event channel 18.114 +will be used to generate notification when {\em either} a certain 18.115 +number of outstanding messages are queued, {\em or} a specified number 18.116 +of nanoseconds have elapsed since the oldest message was placed on the 18.117 +ring. 18.118 + 18.119 +%% Not sure if my version is any better -- here is what was here 18.120 +%% before: Synchronization between the backend domain and the guest is 18.121 +%% achieved using counters held in shared memory that is accessible to 18.122 +%% both. Each ring has associated producer and consumer indices 18.123 +%% indicating the area in the ring that holds descriptors that contain 18.124 +%% data. After receiving {\it n} packets or {\t nanoseconds} after 18.125 +%% receiving the first packet, the hypervisor sends an event to the 18.126 +%% domain. 18.127 + 18.128 + 18.129 +\section{Block I/O} 18.130 + 18.131 +All guest OS disk access goes through the virtual block device VBD 18.132 +interface.
This interface allows domains access to 18.133 +storage devices visible to the block backend device. The VBD 18.134 +interface is a split driver, similar to the network interface 18.135 +described above. A single shared memory ring is used between the 18.136 +frontend and backend drivers, across which read and write messages are 18.137 +sent. 18.138 + 18.139 +Any block device accessible to the backend domain, including 18.140 +network-based block (iSCSI, *NBD, etc), loopback and LVM/MD devices, 18.141 +can be exported as a VBD. Each VBD is mapped to a device node in the 18.142 +guest, specified in the guest's startup configuration. 18.143 + 18.144 +Old (Xen 1.2) virtual disks are not supported under Xen 2.0, since 18.145 +similar functionality can be achieved using the more complete LVM 18.146 +system, which is already in widespread use. 18.147 + 18.148 +\subsection{Data Transfer} 18.149 + 18.150 +The single ring between the guest and the block backend supports three 18.151 +messages: 18.152 + 18.153 +\begin{description} 18.154 +\item [{\small {\tt PROBE}}:] Return a list of the VBDs available to 18.155 + this guest from the backend. The request includes a descriptor of a 18.156 + free page into which the reply will be written by the backend. 18.157 + 18.158 +\item [{\small {\tt READ}}:] Read data from the specified block 18.159 + device. The front end identifies the device and location to read 18.160 + from and attaches pages for the data to be copied to (typically via 18.161 + DMA from the device). The backend acknowledges completed read 18.162 + requests as they finish. 18.163 + 18.164 +\item [{\small {\tt WRITE}}:] Write data to the specified block 18.165 + device. This functions essentially as {\small {\tt READ}}, except 18.166 + that the data moves to the device instead of from it. 18.167 +\end{description} 18.168 + 18.169 +%% um... some old text: In overview, the same style of descriptor-ring 18.170 +%% that is used for network packets is used here. Each domain has one 18.171 +%% ring that carries operation requests to the hypervisor and carries 18.172 +%% the results back again. 18.173 + 18.174 +%% Rather than copying data, the backend simply maps the domain's 18.175 +%% buffers in order to enable direct DMA to them. The act of mapping 18.176 +%% the buffers also increases the reference counts of the underlying 18.177 +%% pages, so that the unprivileged domain cannot try to return them to 18.178 +%% the hypervisor, install them as page tables, or any other unsafe 18.179 +%% behaviour. 18.180 +%% 18.181 +%% % block API here
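The producer/consumer flow control used by the network and block rings described above can be sketched as follows. The structure and field names are purely illustrative and do not match the exact ring layouts used by Xen; the point is only to show the split between shared producer indices and private consumer indices.

\begin{verbatim}
#define RING_SIZE 256                  /* illustrative; a power of two  */

struct desc { unsigned long frame; unsigned int len; };  /* placeholder */

/* Lives in the shared page: producer indices are visible to both ends. */
struct shared_ring {
    unsigned int req_prod;             /* frontend -> backend requests  */
    unsigned int rsp_prod;             /* backend -> frontend responses */
    struct desc  ring[RING_SIZE];
};

/* Each side keeps a private consumer index for the messages it reads. */
static unsigned int rsp_cons;          /* frontend's private index      */

static void handle_response(struct desc *d) { /* process one message */ }

/* Frontend: consume any responses the backend has produced so far.
 * Event-channel notification is decoupled from this and is sent either
 * after a batch of messages or after a timeout, as described above. */
static void frontend_poll_responses(struct shared_ring *s)
{
    while (rsp_cons != s->rsp_prod) {
        handle_response(&s->ring[rsp_cons % RING_SIZE]);
        rsp_cons++;
    }
}
\end{verbatim}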
19.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 19.2 +++ b/docs/src/interface/further_info.tex Thu Sep 22 11:42:01 2005 -0600 19.3 @@ -0,0 +1,49 @@ 19.4 +\chapter{Further Information} 19.5 + 19.6 +If you have questions that are not answered by this manual, the 19.7 +sources of information listed below may be of interest to you. Note 19.8 +that bug reports, suggestions and contributions related to the 19.9 +software (or the documentation) should be sent to the Xen developers' 19.10 +mailing list (address below). 19.11 + 19.12 + 19.13 +\section{Other documentation} 19.14 + 19.15 +If you are mainly interested in using (rather than developing for) 19.16 +Xen, the \emph{Xen Users' Manual} is distributed in the {\tt docs/} 19.17 +directory of the Xen source distribution. 19.18 + 19.19 +% Various HOWTOs are also available in {\tt docs/HOWTOS}. 19.20 + 19.21 + 19.22 +\section{Online references} 19.23 + 19.24 +The official Xen web site is found at: 19.25 +\begin{quote} 19.26 +{\tt http://www.cl.cam.ac.uk/Research/SRG/netos/xen/} 19.27 +\end{quote} 19.28 + 19.29 +This contains links to the latest versions of all on-line 19.30 +documentation. 19.31 + 19.32 + 19.33 +\section{Mailing lists} 19.34 + 19.35 +There are currently four official Xen mailing lists: 19.36 + 19.37 +\begin{description} 19.38 +\item[xen-devel@lists.xensource.com] Used for development 19.39 + discussions and bug reports. Subscribe at: \\ 19.40 + {\small {\tt http://lists.xensource.com/xen-devel}} 19.41 +\item[xen-users@lists.xensource.com] Used for installation and usage 19.42 + discussions and requests for help. Subscribe at: \\ 19.43 + {\small {\tt http://lists.xensource.com/xen-users}} 19.44 +\item[xen-announce@lists.xensource.com] Used for announcements only. 19.45 + Subscribe at: \\ 19.46 + {\small {\tt http://lists.xensource.com/xen-announce}} 19.47 +\item[xen-changelog@lists.xensource.com] Changelog feed 19.48 + from the unstable and 2.0 trees - developer oriented. Subscribe at: \\ 19.49 + {\small {\tt http://lists.xensource.com/xen-changelog}} 19.50 +\end{description} 19.51 + 19.52 +Of these, xen-devel is the most active.
20.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 20.2 +++ b/docs/src/interface/hypercalls.tex Thu Sep 22 11:42:01 2005 -0600 20.3 @@ -0,0 +1,524 @@ 20.4 + 20.5 +\newcommand{\hypercall}[1]{\vspace{2mm}{\sf #1}} 20.6 + 20.7 +\chapter{Xen Hypercalls} 20.8 +\label{a:hypercalls} 20.9 + 20.10 +Hypercalls represent the procedural interface to Xen; this appendix 20.11 +categorizes and describes the current set of hypercalls. 20.12 + 20.13 +\section{Invoking Hypercalls} 20.14 + 20.15 +Hypercalls are invoked in a manner analogous to system calls in a 20.16 +conventional operating system; a software interrupt is issued which 20.17 +vectors to an entry point within Xen. On x86\_32 machines the 20.18 +instruction required is {\tt int \$0x82}; the (real) IDT is set up so 20.19 +that this may only be issued from within ring 1. The particular 20.20 +hypercall to be invoked is contained in {\tt EAX} --- a list 20.21 +mapping these values to symbolic hypercall names can be found 20.22 +in {\tt xen/include/public/xen.h}. 20.23 + 20.24 +On some occasions a set of hypercalls will be required to carry 20.25 +out a higher-level function; a good example is when a guest 20.26 +operating system wishes to context switch to a new process which 20.27 +requires updating various privileged CPU state. As an optimization 20.28 +for these cases, there is a generic mechanism to issue a set of 20.29 +hypercalls as a batch: 20.30 + 20.31 +\begin{quote} 20.32 +\hypercall{multicall(void *call\_list, int nr\_calls)} 20.33 + 20.34 +Execute a series of hypervisor calls; {\tt nr\_calls} is the length of 20.35 +the array of {\tt multicall\_entry\_t} structures pointed to by {\tt 20.36 +call\_list}. Each entry contains the hypercall operation code followed 20.37 +by up to 7 word-sized arguments. 20.38 +\end{quote} 20.39 + 20.40 +Note that multicalls are provided purely as an optimization; there is 20.41 +no requirement to use them when first porting a guest operating 20.42 +system. 20.43 + 20.44 + 20.45 +\section{Virtual CPU Setup} 20.46 + 20.47 +At start of day, a guest operating system needs to set up the virtual 20.48 +CPU it is executing on. This includes installing vectors for the 20.49 +virtual IDT so that the guest OS can handle interrupts, page faults, 20.50 +etc. However the very first thing a guest OS must set up is a pair 20.51 +of hypervisor callbacks: these are the entry points which Xen will 20.52 +use when it wishes to notify the guest OS of an occurrence. 20.53 + 20.54 +\begin{quote} 20.55 +\hypercall{set\_callbacks(unsigned long event\_selector, unsigned long 20.56 + event\_address, unsigned long failsafe\_selector, unsigned long 20.57 + failsafe\_address) } 20.58 + 20.59 +Register the normal (``event'') and failsafe callbacks for 20.60 +event processing. In each case the code segment selector and 20.61 +address within that segment are provided. The selectors must 20.62 +have RPL 1; in XenLinux we simply use the kernel's CS for both 20.63 +{\tt event\_selector} and {\tt failsafe\_selector}. 20.64 + 20.65 +The value {\tt event\_address} specifies the address of the guest OS's 20.66 +event handling and dispatch routine; the {\tt failsafe\_address} 20.67 +specifies a separate entry point which is used only if a fault occurs 20.68 +when Xen attempts to use the normal callback.
20.69 +\end{quote} 20.70 + 20.71 + 20.72 +After installing the hypervisor callbacks, the guest OS can 20.73 +install a `virtual IDT' by using the following hypercall: 20.74 + 20.75 +\begin{quote} 20.76 +\hypercall{set\_trap\_table(trap\_info\_t *table)} 20.77 + 20.78 +Install one or more entries into the per-domain 20.79 +trap handler table (essentially a software version of the IDT). 20.80 +Each entry in the array pointed to by {\tt table} includes the 20.81 +exception vector number with the corresponding segment selector 20.82 +and entry point. Most guest OSes can use the same handlers on 20.83 +Xen as when running on the real hardware; an exception is the 20.84 +page fault handler (exception vector 14) where a modified 20.85 +stack-frame layout is used. 20.86 + 20.87 + 20.88 +\end{quote} 20.89 + 20.90 + 20.91 + 20.92 +\section{Scheduling and Timer} 20.93 + 20.94 +Domains are preemptively scheduled by Xen according to the 20.95 +parameters installed by domain 0 (see Section~\ref{s:dom0ops}). 20.96 +In addition, however, a domain may choose to explicitly 20.97 +control certain behavior with the following hypercall: 20.98 + 20.99 +\begin{quote} 20.100 +\hypercall{sched\_op(unsigned long op)} 20.101 + 20.102 +Request a scheduling operation from the hypervisor. The options are: {\it 20.103 +yield}, {\it block}, and {\it shutdown}. {\it yield} keeps the 20.104 +calling domain runnable but may cause a reschedule if other domains 20.105 +are runnable. {\it block} removes the calling domain from the run 20.106 +queue and causes it to sleep until an event is delivered to it. {\it 20.107 +shutdown} is used to end the domain's execution; the caller can 20.108 +additionally specify whether the domain should reboot, halt or 20.109 +suspend. 20.110 +\end{quote} 20.111 + 20.112 +To aid the implementation of a process scheduler within a guest OS, 20.113 +Xen provides a virtual programmable timer: 20.114 + 20.115 +\begin{quote} 20.116 +\hypercall{set\_timer\_op(uint64\_t timeout)} 20.117 + 20.118 +Request a timer event to be sent at the specified system time (time 20.119 +in nanoseconds since system boot). The hypercall actually passes the 20.120 +64-bit timeout value as a pair of 32-bit values. 20.121 + 20.122 +\end{quote} 20.123 + 20.124 +Note that calling {\tt set\_timer\_op()} prior to {\tt sched\_op} 20.125 +allows block-with-timeout semantics. 20.126 + 20.127 + 20.128 +\section{Page Table Management} 20.129 + 20.130 +Since guest operating systems have read-only access to their page 20.131 +tables, Xen must be involved when making any changes. The following 20.132 +multi-purpose hypercall can be used to modify page-table entries, 20.133 +update the machine-to-physical mapping table, flush the TLB, install 20.134 +a new page-table base pointer, and more. 20.135 + 20.136 +\begin{quote} 20.137 +\hypercall{mmu\_update(mmu\_update\_t *req, int count, int *success\_count)} 20.138 + 20.139 +Update the page table for the domain; a set of {\tt count} updates are 20.140 +submitted for processing in a batch, with {\tt success\_count} being 20.141 +updated to report the number of successful updates.
20.142 + 20.143 +Each element of {\tt req[]} contains a pointer (address) and value; 20.144 +the least significant 2-bits of the pointer are used to distinguish 20.145 +the type of update requested as follows: 20.146 +\begin{description} 20.147 + 20.148 +\item[\it MMU\_NORMAL\_PT\_UPDATE:] update a page directory entry or 20.149 +page table entry to the associated value; Xen will check that the 20.150 +update is safe, as described in Chapter~\ref{c:memory}. 20.151 + 20.152 +\item[\it MMU\_MACHPHYS\_UPDATE:] update an entry in the 20.153 + machine-to-physical table. The calling domain must own the machine 20.154 + page in question (or be privileged). 20.155 + 20.156 +\item[\it MMU\_EXTENDED\_COMMAND:] perform additional MMU operations. 20.157 +The set of additional MMU operations is considerable, and includes 20.158 +updating {\tt cr3} (or just re-installing it for a TLB flush), 20.159 +flushing the cache, installing a new LDT, or pinning \& unpinning 20.160 +page-table pages (to ensure their reference count doesn't drop to zero 20.161 +which would require a revalidation of all entries). 20.162 + 20.163 +Further extended commands are used to deal with granting and 20.164 +acquiring page ownership; see Section~\ref{s:idc}. 20.165 + 20.166 + 20.167 +\end{description} 20.168 + 20.169 +More details on the precise format of all commands can be 20.170 +found in {\tt xen/include/public/xen.h}. 20.171 + 20.172 + 20.173 +\end{quote} 20.174 + 20.175 +Explicitly updating batches of page table entries is extremely 20.176 +efficient, but can require a number of alterations to the guest 20.177 +OS. Using the writable page table mode (Chapter~\ref{c:memory}) is 20.178 +recommended for new OS ports. 20.179 + 20.180 +Regardless of which page table update mode is being used, however, 20.181 +there are some occasions (notably handling a demand page fault) where 20.182 +a guest OS will wish to modify exactly one PTE rather than a 20.183 +batch. This is catered for by the following: 20.184 + 20.185 +\begin{quote} 20.186 +\hypercall{update\_va\_mapping(unsigned long page\_nr, unsigned long 20.187 +val, \\ unsigned long flags)} 20.188 + 20.189 +Update the currently installed PTE for the page {\tt page\_nr} to 20.190 +{\tt val}. As with {\tt mmu\_update()}, Xen checks the modification 20.191 +is safe before applying it. The {\tt flags} determine which kind 20.192 +of TLB flush, if any, should follow the update. 20.193 + 20.194 +\end{quote} 20.195 + 20.196 +Finally, sufficiently privileged domains may occasionally wish to manipulate 20.197 +the pages of others: 20.198 +\begin{quote} 20.199 + 20.200 +\hypercall{update\_va\_mapping\_otherdomain(unsigned long page\_nr, 20.201 +unsigned long val, unsigned long flags, uint16\_t domid)} 20.202 + 20.203 +Identical to {\tt update\_va\_mapping()} save that the pages being 20.204 +mapped must belong to the domain {\tt domid}. 20.205 + 20.206 +\end{quote} 20.207 + 20.208 +This privileged operation is currently used by backend virtual device 20.209 +drivers to safely map pages containing I/O data. 20.210 + 20.211 + 20.212 + 20.213 +\section{Segmentation Support} 20.214 + 20.215 +Xen allows guest OSes to install a custom GDT if they require it; 20.216 +this is context switched transparently whenever a domain is 20.217 +[de]scheduled. 
The following hypercall is effectively a 20.218 +`safe' version of {\tt lgdt}: 20.219 + 20.220 +\begin{quote} 20.221 +\hypercall{set\_gdt(unsigned long *frame\_list, int entries)} 20.222 + 20.223 +Install a global descriptor table for a domain; {\tt frame\_list} is 20.224 +an array of up to 16 machine page frames within which the GDT resides, 20.225 +with {\tt entries} being the actual number of descriptor-entry 20.226 +slots. All page frames must be mapped read-only within the guest's 20.227 +address space, and the table must be large enough to contain Xen's 20.228 +reserved entries (see {\tt xen/include/public/arch-x86\_32.h}). 20.229 + 20.230 +\end{quote} 20.231 + 20.232 +Many guest OSes will also wish to install LDTs; this is achieved by 20.233 +using {\tt mmu\_update()} with an extended command, passing the 20.234 +linear address of the LDT base along with the number of entries. No 20.235 +special safety checks are required; Xen needs to perform this task 20.236 +simply since {\tt lldt} requires CPL 0. 20.237 + 20.238 + 20.239 +Xen also allows guest operating systems to update just an 20.240 +individual segment descriptor in the GDT or LDT: 20.241 + 20.242 +\begin{quote} 20.243 +\hypercall{update\_descriptor(unsigned long ma, unsigned long word1, 20.244 +unsigned long word2)} 20.245 + 20.246 +Update the GDT/LDT entry at machine address {\tt ma}; the new 20.247 +8-byte descriptor is stored in {\tt word1} and {\tt word2}. 20.248 +Xen performs a number of checks to ensure the descriptor is 20.249 +valid. 20.250 + 20.251 +\end{quote} 20.252 + 20.253 +Guest OSes can use the above in place of context switching entire 20.254 +LDTs (or the GDT) when the number of changing descriptors is small. 20.255 + 20.256 +\section{Context Switching} 20.257 + 20.258 +When a guest OS wishes to context switch between two processes, 20.259 +it can use the page table and segmentation hypercalls described 20.260 +above to perform the bulk of the privileged work. In addition, 20.261 +however, it will need to invoke Xen to switch the kernel (ring 1) 20.262 +stack pointer: 20.263 + 20.264 +\begin{quote} 20.265 +\hypercall{stack\_switch(unsigned long ss, unsigned long esp)} 20.266 + 20.267 +Request a kernel stack switch from the hypervisor; {\tt ss} is the new 20.268 +stack segment, while {\tt esp} is the new stack pointer. 20.269 + 20.270 +\end{quote} 20.271 + 20.272 +A final useful hypercall for context switching allows ``lazy'' 20.273 +save and restore of floating point state: 20.274 + 20.275 +\begin{quote} 20.276 +\hypercall{fpu\_taskswitch(void)} 20.277 + 20.278 +This call instructs Xen to set the {\tt TS} bit in the {\tt cr0} 20.279 +control register; this means that the next attempt to use floating 20.280 +point will cause a trap which the guest OS can catch. Typically it will 20.281 +then save/restore the FP state, and clear the {\tt TS} bit. 20.282 +\end{quote} 20.283 + 20.284 +This is provided as an optimization only; guest OSes can also choose 20.285 +to save and restore FP state on all context switches for simplicity. 20.286 + 20.287 + 20.288 +\section{Physical Memory Management} 20.289 + 20.290 +As mentioned previously, each domain has a maximum and current 20.291 +memory allocation. The maximum allocation, set at domain creation 20.292 +time, cannot be modified.
However a domain can choose to reduce 20.293 +and subsequently grow its current allocation by using the 20.294 +following call: 20.295 + 20.296 +\begin{quote} 20.297 +\hypercall{dom\_mem\_op(unsigned int op, unsigned long *extent\_list, 20.298 + unsigned long nr\_extents, unsigned int extent\_order)} 20.299 + 20.300 +Increase or decrease current memory allocation (as determined by 20.301 +the value of {\tt op}). Each invocation provides a list of 20.302 +extents each of which is $2^s$ pages in size, 20.303 +where $s$ is the value of {\tt extent\_order}. 20.304 + 20.305 +\end{quote} 20.306 + 20.307 +In addition to simply reducing or increasing the current memory 20.308 +allocation via a `balloon driver', this call is also useful for 20.309 +obtaining contiguous regions of machine memory when required (e.g. 20.310 +for certain PCI devices, or if using superpages). 20.311 + 20.312 + 20.313 +\section{Inter-Domain Communication} 20.314 +\label{s:idc} 20.315 + 20.316 +Xen provides a simple asynchronous notification mechanism via 20.317 +\emph{event channels}. Each domain has a set of end-points (or 20.318 +\emph{ports}) which may be bound to an event source (e.g. a physical 20.319 +IRQ, a virtual IRQ, or a port in another domain). When a pair of 20.320 +end-points in two different domains are bound together, then a `send' 20.321 +operation on one will cause an event to be received by the destination 20.322 +domain. 20.323 + 20.324 +The control and use of event channels involves the following hypercall: 20.325 + 20.326 +\begin{quote} 20.327 +\hypercall{event\_channel\_op(evtchn\_op\_t *op)} 20.328 + 20.329 +Inter-domain event-channel management; {\tt op} is a discriminated 20.330 +union which allows the following 7 operations: 20.331 + 20.332 +\begin{description} 20.333 + 20.334 +\item[\it alloc\_unbound:] allocate a free (unbound) local 20.335 + port and prepare for connection from a specified domain. 20.336 +\item[\it bind\_virq:] bind a local port to a virtual 20.337 +IRQ; any particular VIRQ can be bound to at most one port per domain. 20.338 +\item[\it bind\_pirq:] bind a local port to a physical IRQ; 20.339 +once more, a given pIRQ can be bound to at most one port per 20.340 +domain. Furthermore the calling domain must be sufficiently 20.341 +privileged. 20.342 +\item[\it bind\_interdomain:] construct an interdomain event 20.343 +channel; in general, the target domain must have previously allocated 20.344 +an unbound port for this channel, although this can be bypassed by 20.345 +privileged domains during domain setup. 20.346 +\item[\it close:] close an interdomain event channel. 20.347 +\item[\it send:] send an event to the remote end of an 20.348 +interdomain event channel. 20.349 +\item[\it status:] determine the current status of a local port. 20.350 +\end{description} 20.351 + 20.352 +For more details see 20.353 +{\tt xen/include/public/event\_channel.h}. 20.354 + 20.355 +\end{quote} 20.356 + 20.357 +Event channels are the fundamental communication primitive between 20.358 +Xen domains and seamlessly support SMP. However they provide little 20.359 +bandwidth for communication {\sl per se}, and hence are typically 20.360 +married with a piece of shared memory to produce effective and 20.361 +high-performance inter-domain communication. 20.362 + 20.363 +Safe sharing of memory pages between guest OSes is carried out by 20.364 +granting access on a per page basis to individual domains. This is 20.365 +achieved by using the {\tt grant\_table\_op()} hypercall.
20.366 + 20.367 +\begin{quote} 20.368 +\hypercall{grant\_table\_op(unsigned int cmd, void *uop, unsigned int count)} 20.369 + 20.370 +Grant or remove access to a particular page for a particular domain. 20.371 + 20.372 +\end{quote} 20.373 + 20.374 +This is not currently widely in use by guest operating systems, but 20.375 +we intend to integrate support more fully in the near future. 20.376 + 20.377 +\section{PCI Configuration} 20.378 + 20.379 +Domains with physical device access (i.e.\ driver domains) receive 20.380 +limited access to certain PCI devices (bus address space and 20.381 +interrupts). However many guest operating systems attempt to 20.382 +determine the PCI configuration by directly accessing the PCI BIOS, 20.383 +which cannot be allowed for safety. 20.384 + 20.385 +Instead, Xen provides the following hypercall: 20.386 + 20.387 +\begin{quote} 20.388 +\hypercall{physdev\_op(void *physdev\_op)} 20.389 + 20.390 +Perform a PCI configuration operation; depending on the value 20.391 +of {\tt physdev\_op} this can be a PCI config read, a PCI config 20.392 +write, or a small number of other queries. 20.393 + 20.394 +\end{quote} 20.395 + 20.396 + 20.397 +For examples of using {\tt physdev\_op()}, see the 20.398 +Xen-specific PCI code in the linux sparse tree. 20.399 + 20.400 +\section{Administrative Operations} 20.401 +\label{s:dom0ops} 20.402 + 20.403 +A large number of control operations are available to a sufficiently 20.404 +privileged domain (typically domain 0). These allow the creation and 20.405 +management of new domains, for example. A complete list is given 20.406 +below: for more details on any or all of these, please see 20.407 +{\tt xen/include/public/dom0\_ops.h}. 20.408 + 20.409 + 20.410 +\begin{quote} 20.411 +\hypercall{dom0\_op(dom0\_op\_t *op)} 20.412 + 20.413 +Administrative domain operations for domain management. The options are: 20.414 + 20.415 +\begin{description} 20.416 +\item [\it DOM0\_CREATEDOMAIN:] create a new domain 20.417 + 20.418 +\item [\it DOM0\_PAUSEDOMAIN:] remove a domain from the scheduler run 20.419 +queue. 20.420 + 20.421 +\item [\it DOM0\_UNPAUSEDOMAIN:] mark a paused domain as schedulable 20.422 + once again.
20.423 + 20.424 +\item [\it DOM0\_DESTROYDOMAIN:] deallocate all resources associated 20.425 +with a domain 20.426 + 20.427 +\item [\it DOM0\_GETMEMLIST:] get list of pages used by the domain 20.428 + 20.429 +\item [\it DOM0\_SCHEDCTL:] 20.430 + 20.431 +\item [\it DOM0\_ADJUSTDOM:] adjust scheduling priorities for domain 20.432 + 20.433 +\item [\it DOM0\_BUILDDOMAIN:] do final guest OS setup for domain 20.434 + 20.435 +\item [\it DOM0\_GETDOMAINFO:] get statistics about the domain 20.436 + 20.437 +\item [\it DOM0\_GETPAGEFRAMEINFO:] 20.438 + 20.439 +\item [\it DOM0\_GETPAGEFRAMEINFO2:] 20.440 + 20.441 +\item [\it DOM0\_IOPL:] set I/O privilege level 20.442 + 20.443 +\item [\it DOM0\_MSR:] read or write model specific registers 20.444 + 20.445 +\item [\it DOM0\_DEBUG:] interactively invoke the debugger 20.446 + 20.447 +\item [\it DOM0\_SETTIME:] set system time 20.448 + 20.449 +\item [\it DOM0\_READCONSOLE:] read console content from hypervisor buffer ring 20.450 + 20.451 +\item [\it DOM0\_PINCPUDOMAIN:] pin domain to a particular CPU 20.452 + 20.453 +\item [\it DOM0\_GETTBUFS:] get information about the size and location of 20.454 + the trace buffers (only on trace-buffer enabled builds) 20.455 + 20.456 +\item [\it DOM0\_PHYSINFO:] get information about the host machine 20.457 + 20.458 +\item [\it DOM0\_PCIDEV\_ACCESS:] modify PCI device access permissions 20.459 + 20.460 +\item [\it DOM0\_SCHED\_ID:] get the ID of the current Xen scheduler 20.461 + 20.462 +\item [\it DOM0\_SHADOW\_CONTROL:] switch between shadow page-table modes 20.463 + 20.464 +\item [\it DOM0\_SETDOMAININITIALMEM:] set initial memory allocation of a domain 20.465 + 20.466 +\item [\it DOM0\_SETDOMAINMAXMEM:] set maximum memory allocation of a domain 20.467 + 20.468 +\item [\it DOM0\_SETDOMAINVMASSIST:] set domain VM assist options 20.469 +\end{description} 20.470 +\end{quote} 20.471 + 20.472 +Most of the above are best understood by looking at the code 20.473 +implementing them (in {\tt xen/common/dom0\_ops.c}) and in 20.474 +the user-space tools that use them (mostly in {\tt tools/libxc}). 20.475 + 20.476 +\section{Debugging Hypercalls} 20.477 + 20.478 +A few additional hypercalls are mainly useful for debugging: 20.479 + 20.480 +\begin{quote} 20.481 +\hypercall{console\_io(int cmd, int count, char *str)} 20.482 + 20.483 +Use Xen to interact with the console; operations are: 20.484 + 20.485 +{\it CONSOLEIO\_write}: Output count characters from buffer str. 20.486 + 20.487 +{\it CONSOLEIO\_read}: Input at most count characters into buffer str. 20.488 +\end{quote} 20.489 + 20.490 +A pair of hypercalls allows access to the underlying debug registers: 20.491 +\begin{quote} 20.492 +\hypercall{set\_debugreg(int reg, unsigned long value)} 20.493 + 20.494 +Set debug register {\tt reg} to {\tt value} 20.495 + 20.496 +\hypercall{get\_debugreg(int reg)} 20.497 + 20.498 +Return the contents of the debug register {\tt reg} 20.499 +\end{quote} 20.500 + 20.501 +And finally: 20.502 +\begin{quote} 20.503 +\hypercall{xen\_version(int cmd)} 20.504 + 20.505 +Request Xen version number. 20.506 +\end{quote} 20.507 + 20.508 +This is useful to ensure that user-space tools are in sync 20.509 +with the underlying hypervisor. 20.510 + 20.511 +\section{Deprecated Hypercalls} 20.512 + 20.513 +Xen is under constant development and refinement; as such there 20.514 +are plans to improve the way in which various pieces of functionality 20.515 +are exposed to guest OSes. 
20.516 + 20.517 +\begin{quote} 20.518 +\hypercall{vm\_assist(unsigned int cmd, unsigned int type)} 20.519 + 20.520 +Toggle various memory management modes (in particular writable page 20.521 +tables and superpage support). 20.522 + 20.523 +\end{quote} 20.524 + 20.525 +This is likely to be replaced with mode values in the shared 20.526 +information page since this is more resilient for resumption 20.527 +after migration or checkpoint.
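As a worked example of combining the scheduling hypercalls described earlier in this chapter, the following sketch implements the `block with timeout' idiom. The {\tt HYPERVISOR\_*} wrapper names and the {\tt SCHEDOP\_block} value are assumptions made for illustration (they follow the XenLinux convention); the underlying operations are {\tt set\_timer\_op()} and {\tt sched\_op()} exactly as described above.

\begin{verbatim}
/* Assumed guest-side hypercall wrappers; names vary between ports. */
extern int HYPERVISOR_set_timer_op(unsigned long long timeout_ns);
extern int HYPERVISOR_sched_op(unsigned long op);

#define SCHEDOP_block 1   /* illustrative value only */

/* Block the calling domain until an event arrives or until the given
 * absolute system time (nanoseconds since boot) is reached. */
static void block_with_timeout(unsigned long long wakeup_time_ns)
{
    HYPERVISOR_set_timer_op(wakeup_time_ns);   /* arm one-shot timer   */
    HYPERVISOR_sched_op(SCHEDOP_block);        /* sleep until an event */
}
\end{verbatim}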
21.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 21.2 +++ b/docs/src/interface/memory.tex Thu Sep 22 11:42:01 2005 -0600 21.3 @@ -0,0 +1,162 @@ 21.4 +\chapter{Memory} 21.5 +\label{c:memory} 21.6 + 21.7 +Xen is responsible for managing the allocation of physical memory to 21.8 +domains, and for ensuring safe use of the paging and segmentation 21.9 +hardware. 21.10 + 21.11 + 21.12 +\section{Memory Allocation} 21.13 + 21.14 +Xen resides within a small fixed portion of physical memory; it also 21.15 +reserves the top 64MB of every virtual address space. The remaining 21.16 +physical memory is available for allocation to domains at a page 21.17 +granularity. Xen tracks the ownership and use of each page, which 21.18 +allows it to enforce secure partitioning between domains. 21.19 + 21.20 +Each domain has a maximum and current physical memory allocation. A 21.21 +guest OS may run a `balloon driver' to dynamically adjust its current 21.22 +memory allocation up to its limit. 21.23 + 21.24 + 21.25 +%% XXX SMH: I use machine and physical in the next section (which is 21.26 +%% kinda required for consistency with code); wonder if this section 21.27 +%% should use same terms? 21.28 +%% 21.29 +%% Probably. 21.30 +%% 21.31 +%% Merging this and below section at some point prob makes sense. 21.32 + 21.33 +\section{Pseudo-Physical Memory} 21.34 + 21.35 +Since physical memory is allocated and freed at a page granularity, 21.36 +there is no guarantee that a domain will receive a contiguous stretch 21.37 +of physical memory. However most operating systems do not have good 21.38 +support for operating in a fragmented physical address space. To aid 21.39 +porting such operating systems to run on top of Xen, we make a 21.40 +distinction between \emph{machine memory} and \emph{pseudo-physical 21.41 + memory}. 21.42 + 21.43 +Put simply, machine memory refers to the entire amount of memory 21.44 +installed in the machine, including that reserved by Xen, in use by 21.45 +various domains, or currently unallocated. We consider machine memory 21.46 +to comprise a set of 4K \emph{machine page frames} numbered 21.47 +consecutively starting from 0. Machine frame numbers mean the same 21.48 +within Xen or any domain. 21.49 + 21.50 +Pseudo-physical memory, on the other hand, is a per-domain 21.51 +abstraction. It allows a guest operating system to consider its memory 21.52 +allocation to consist of a contiguous range of physical page frames 21.53 +starting at physical frame 0, despite the fact that the underlying 21.54 +machine page frames may be sparsely allocated and in any order. 21.55 + 21.56 +To achieve this, Xen maintains a globally readable {\it 21.57 + machine-to-physical} table which records the mapping from machine 21.58 +page frames to pseudo-physical ones. In addition, each domain is 21.59 +supplied with a {\it physical-to-machine} table which performs the 21.60 +inverse mapping. Clearly the machine-to-physical table has size 21.61 +proportional to the amount of RAM installed in the machine, while each 21.62 +physical-to-machine table has size proportional to the memory 21.63 +allocation of the given domain. 21.64 + 21.65 +Architecture-dependent code in guest operating systems can then use 21.66 +the two tables to provide the abstraction of pseudo-physical memory. 21.67 +In general, only certain specialized parts of the operating system 21.68 +(such as page table management) need to understand the difference 21.69 +between machine and pseudo-physical addresses.
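In practice the two translation tables described above are usually wrapped in a pair of trivial helpers by the guest's architecture-dependent code, along the following lines. The table names used here are placeholders rather than the exact symbols exported by Xen or used by any particular guest port.

\begin{verbatim}
/* Globally readable, maintained by Xen:
 * machine frame number -> pseudo-physical frame number.           */
extern unsigned long *machine_to_phys;

/* Per-domain, maintained by the guest:
 * pseudo-physical frame number -> machine frame number.           */
extern unsigned long *phys_to_machine;

static inline unsigned long pfn_to_mfn(unsigned long pfn)
{
    return phys_to_machine[pfn];      /* e.g. when constructing a PTE */
}

static inline unsigned long mfn_to_pfn(unsigned long mfn)
{
    return machine_to_phys[mfn];      /* e.g. when examining a PTE    */
}
\end{verbatim}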
21.70 + 21.71 + 21.72 +\section{Page Table Updates} 21.73 + 21.74 +In the default mode of operation, Xen enforces read-only access to 21.75 +page tables and requires guest operating systems to explicitly request 21.76 +any modifications. Xen validates all such requests and only applies 21.77 +updates that it deems safe. This is necessary to prevent domains from 21.78 +adding arbitrary mappings to their page tables. 21.79 + 21.80 +To aid validation, Xen associates a type and reference count with each 21.81 +memory page. A page has one of the following mutually-exclusive types 21.82 +at any point in time: page directory ({\sf PD}), page table ({\sf 21.83 + PT}), local descriptor table ({\sf LDT}), global descriptor table 21.84 +({\sf GDT}), or writable ({\sf RW}). Note that a guest OS may always 21.85 +create readable mappings of its own memory regardless of its current 21.86 +type. 21.87 + 21.88 +%%% XXX: possibly explain more about ref count 'lifecycle' here? 21.89 +This mechanism is used to maintain the invariants required for safety; 21.90 +for example, a domain cannot have a writable mapping to any part of a 21.91 +page table as this would require the page concerned to simultaneously 21.92 +be of types {\sf PT} and {\sf RW}. 21.93 + 21.94 + 21.95 +% \section{Writable Page Tables} 21.96 + 21.97 +Xen also provides an alternative mode of operation in which guests 21.98 +have the illusion that their page tables are directly writable. Of 21.99 +course this is not really the case, since Xen must still validate 21.100 +modifications to ensure secure partitioning. To this end, Xen traps 21.101 +any write attempt to a memory page of type {\sf PT} (i.e., one that is 21.102 +currently part of a page table). If such an access occurs, Xen 21.103 +temporarily allows write access to that page while at the same time 21.104 +\emph{disconnecting} it from the page table that is currently in use. 21.105 +This allows the guest to safely make updates to the page because the 21.106 +newly-updated entries cannot be used by the MMU until Xen revalidates 21.107 +and reconnects the page. Reconnection occurs automatically in a 21.108 +number of situations: for example, when the guest modifies a different 21.109 +page-table page, when the domain is preempted, or whenever the guest 21.110 +uses Xen's explicit page-table update interfaces. 21.111 + 21.112 +Finally, Xen also supports a form of \emph{shadow page tables} in 21.113 +which the guest OS uses an independent copy of page tables which are 21.114 +unknown to the hardware (i.e.\ which are never pointed to by {\tt 21.115 + cr3}). Instead Xen propagates changes made to the guest's tables to 21.116 +the real ones, and vice versa. This is useful for logging page writes 21.117 +(e.g.\ for live migration or checkpointing). A full version of the shadow 21.118 +page tables also allows guest OS porting with less effort. 21.119 + 21.120 + 21.121 +\section{Segment Descriptor Tables} 21.122 + 21.123 +On boot a guest is supplied with a default GDT, which does not reside 21.124 +within its own memory allocation. If the guest wishes to use segments 21.125 +other than the default `flat' ring-1 and ring-3 segments that this GDT 21.126 +provides, it must register a custom GDT and/or LDT with Xen, allocated 21.127 +from its own memory. Note that a number of GDT entries are reserved by 21.128 +Xen -- any custom GDT must also include sufficient space for these 21.129 +entries.
21.130 + 21.131 +For example, the following hypercall is used to specify a new GDT: 21.132 + 21.133 +\begin{quote} 21.134 + int {\bf set\_gdt}(unsigned long *{\em frame\_list}, int {\em 21.135 + entries}) 21.136 + 21.137 + \emph{frame\_list}: An array of up to 16 machine page frames within 21.138 + which the GDT resides. Any frame registered as a GDT frame may only 21.139 + be mapped read-only within the guest's address space (e.g., no 21.140 + writable mappings, no use as a page-table page, and so on). 21.141 + 21.142 + \emph{entries}: The number of descriptor-entry slots in the GDT. 21.143 + Note that the table must be large enough to contain Xen's reserved 21.144 + entries; thus we must have `{\em entries $>$ 21.145 + LAST\_RESERVED\_GDT\_ENTRY}\ '. Note also that, after registering 21.146 + the GDT, slots \emph{FIRST\_} through 21.147 + \emph{LAST\_RESERVED\_GDT\_ENTRY} are no longer usable by the guest 21.148 + and may be overwritten by Xen. 21.149 +\end{quote} 21.150 + 21.151 +The LDT is updated via the generic MMU update mechanism (i.e., via the 21.152 +{\tt mmu\_update()} hypercall). 21.153 + 21.154 +\section{Start of Day} 21.155 + 21.156 +The start-of-day environment for guest operating systems is rather 21.157 +different to that provided by the underlying hardware. In particular, 21.158 +the processor is already executing in protected mode with paging 21.159 +enabled. 21.160 + 21.161 +{\it Domain 0} is created and booted by Xen itself. For all subsequent 21.162 +domains, the analogue of the boot-loader is the {\it domain builder}, 21.163 +user-space software running in {\it domain 0}. The domain builder is 21.164 +responsible for building the initial page tables for a domain and 21.165 +loading its kernel image at the appropriate virtual address.
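To illustrate the explicit page-table update interface referred to above, the sketch below builds a single request for the {\tt mmu\_update()} hypercall. It assumes the guest-side wrapper HYPERVISOR\_mmu\_update(), the mmu\_update\_t request structure, and the MMU\_NORMAL\_PT\_UPDATE and DOMID\_SELF constants from Xen's public headers; the exact wrapper signature varies between releases, so treat this as a sketch rather than a definitive binding.

\begin{verbatim}
/* Sketch: ask Xen to install one new page-table entry.
 * pte_machine_addr is the machine address of the PTE slot to modify;
 * new_val is the validated entry Xen should write there.            */
#include <asm/hypervisor.h>    /* illustrative header path */

static int set_pte_entry(unsigned long pte_machine_addr,
                         unsigned long new_val)
{
    mmu_update_t req;
    int success_count = 0;

    /* The low bits of 'ptr' encode the request type. */
    req.ptr = pte_machine_addr | MMU_NORMAL_PT_UPDATE;
    req.val = new_val;

    /* Wrapper arguments shown as in the XenLinux port of this era. */
    return HYPERVISOR_mmu_update(&req, 1, &success_count, DOMID_SELF);
}
\end{verbatim}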
22.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 22.2 +++ b/docs/src/interface/scheduling.tex Thu Sep 22 11:42:01 2005 -0600 22.3 @@ -0,0 +1,268 @@ 22.4 +\chapter{Scheduling API} 22.5 + 22.6 +The scheduling API is used by both the schedulers described above and should 22.7 +also be used by any new schedulers. It provides a generic interface and also 22.8 +implements much of the ``boilerplate'' code. 22.9 + 22.10 +Schedulers conforming to this API are described by the following 22.11 +structure: 22.12 + 22.13 +\begin{verbatim} 22.14 +struct scheduler 22.15 +{ 22.16 + char *name; /* full name for this scheduler */ 22.17 + char *opt_name; /* option name for this scheduler */ 22.18 + unsigned int sched_id; /* ID for this scheduler */ 22.19 + 22.20 + int (*init_scheduler) (); 22.21 + int (*alloc_task) (struct task_struct *); 22.22 + void (*add_task) (struct task_struct *); 22.23 + void (*free_task) (struct task_struct *); 22.24 + void (*rem_task) (struct task_struct *); 22.25 + void (*wake_up) (struct task_struct *); 22.26 + void (*do_block) (struct task_struct *); 22.27 + task_slice_t (*do_schedule) (s_time_t); 22.28 + int (*control) (struct sched_ctl_cmd *); 22.29 + int (*adjdom) (struct task_struct *, 22.30 + struct sched_adjdom_cmd *); 22.31 + s32 (*reschedule) (struct task_struct *); 22.32 + void (*dump_settings) (void); 22.33 + void (*dump_cpu_state) (int); 22.34 + void (*dump_runq_el) (struct task_struct *); 22.35 +}; 22.36 +\end{verbatim} 22.37 + 22.38 +The only method that {\em must} be implemented is 22.39 +{\tt do\_schedule()}. However, if the 22.40 +{\tt wake\_up()} method is not implemented then waking tasks will not be put on the runqueue! 22.41 + 22.42 +The fields of the above structure are described in more detail below. 22.43 + 22.44 +\subsubsection{name} 22.45 + 22.46 +The name field should point to a descriptive ASCII string. 22.47 + 22.48 +\subsubsection{opt\_name} 22.49 + 22.50 +This field is the value of the {\tt sched=} boot-time option that will select 22.51 +this scheduler. 22.52 + 22.53 +\subsubsection{sched\_id} 22.54 + 22.55 +This is an integer that uniquely identifies this scheduler. There should be a 22.56 +macro corresponding to this scheduler ID in {\tt <xen/sched-if.h>}. 22.57 + 22.58 +\subsubsection{init\_scheduler} 22.59 + 22.60 +\paragraph*{Purpose} 22.61 + 22.62 +This is a function for performing any scheduler-specific initialisation. For 22.63 +instance, it might allocate memory for per-CPU scheduler data and initialise it 22.64 +appropriately. 22.65 + 22.66 +\paragraph*{Call environment} 22.67 + 22.68 +This function is called after the initialisation performed by the generic 22.69 +layer. The function is called exactly once, for the scheduler that has been 22.70 +selected. 22.71 + 22.72 +\paragraph*{Return values} 22.73 + 22.74 +This should return negative on failure --- this will cause an 22.75 +immediate panic and the system will fail to boot. 22.76 + 22.77 +\subsubsection{alloc\_task} 22.78 + 22.79 +\paragraph*{Purpose} 22.80 +Called when a {\tt task\_struct} is allocated by the generic scheduler 22.81 +layer. A particular scheduler implementation may use this method to 22.82 +allocate per-task data for this task. It may use the {\tt 22.83 +sched\_priv} pointer in the {\tt task\_struct} to point to this data.
22.84 + 22.85 +\paragraph*{Call environment} 22.86 +The generic layer guarantees that the {\tt sched\_priv} field will 22.87 +remain intact from the time this method is called until the task is 22.88 +deallocated (so long as the scheduler implementation does not change 22.89 +it explicitly!). 22.90 + 22.91 +\paragraph*{Return values} 22.92 +Negative on failure. 22.93 + 22.94 +\subsubsection{add\_task} 22.95 + 22.96 +\paragraph*{Purpose} 22.97 + 22.98 +Called when a task is initially added by the generic layer. 22.99 + 22.100 +\paragraph*{Call environment} 22.101 + 22.102 +The fields in the {\tt task\_struct} are now filled out and available for use. 22.103 +Schedulers should implement appropriate initialisation of any per-task private 22.104 +information in this method. 22.105 + 22.106 +\subsubsection{free\_task} 22.107 + 22.108 +\paragraph*{Purpose} 22.109 + 22.110 +Schedulers should free the space used by any associated private data 22.111 +structures. 22.112 + 22.113 +\paragraph*{Call environment} 22.114 + 22.115 +This is called when a {\tt task\_struct} is about to be deallocated. 22.116 +The generic layer will have done generic task removal operations and 22.117 +(if implemented) called the scheduler's {\tt rem\_task} method before 22.118 +this method is called. 22.119 + 22.120 +\subsubsection{rem\_task} 22.121 + 22.122 +\paragraph*{Purpose} 22.123 + 22.124 +This is called when a task is being removed from scheduling (but is 22.125 +not yet being freed). 22.126 + 22.127 +\subsubsection{wake\_up} 22.128 + 22.129 +\paragraph*{Purpose} 22.130 + 22.131 +Called when a task is woken up, this method should put the task on the runqueue 22.132 +(or do the scheduler-specific equivalent action). 22.133 + 22.134 +\paragraph*{Call environment} 22.135 + 22.136 +The task is already set to state RUNNING. 22.137 + 22.138 +\subsubsection{do\_block} 22.139 + 22.140 +\paragraph*{Purpose} 22.141 + 22.142 +This function is called when a task is blocked. This function should 22.143 +not remove the task from the runqueue. 22.144 + 22.145 +\paragraph*{Call environment} 22.146 + 22.147 +The EVENTS\_MASTER\_ENABLE\_BIT is already set and the task state changed to 22.148 +TASK\_INTERRUPTIBLE on entry to this method. A call to the {\tt 22.149 + do\_schedule} method will be made after this method returns, in 22.150 +order to select the next task to run. 22.151 + 22.152 +\subsubsection{do\_schedule} 22.153 + 22.154 +This method must be implemented. 22.155 + 22.156 +\paragraph*{Purpose} 22.157 + 22.158 +The method is called each time a new task must be chosen for scheduling on the 22.159 +current CPU. The current time is passed as the single argument (the current 22.160 +task can be found using the {\tt current} macro). 22.161 + 22.162 +This method should select the next task to run on this CPU and set its minimum 22.163 +time to run, as well as returning the data described below. 22.164 + 22.165 +This method should also take the appropriate action if the previous 22.166 +task has blocked, e.g. removing it from the runqueue. 22.167 + 22.168 +\paragraph*{Call environment} 22.169 + 22.170 +The other fields in the {\tt task\_struct} are updated by the generic layer, 22.171 +which also performs all Xen-specific tasks and performs the actual task switch 22.172 +(unless the previous task has been chosen again). 22.173 + 22.174 +This method is called with the {\tt schedule\_lock} held for the current CPU 22.175 +and local interrupts disabled.
22.176 + 22.177 +\paragraph*{Return values} 22.178 + 22.179 +Must return a {\tt struct task\_slice} describing what task to run and how long 22.180 +for (at maximum). 22.181 + 22.182 +\subsubsection{control} 22.183 + 22.184 +\paragraph*{Purpose} 22.185 + 22.186 +This method is called for global scheduler control operations. It takes a 22.187 +pointer to a {\tt struct sched\_ctl\_cmd}, which it should either 22.188 +source data from or populate with data, depending on the value of the 22.189 +{\tt direction} field. 22.190 + 22.191 +\paragraph*{Call environment} 22.192 + 22.193 +The generic layer guarantees that when this method is called, the 22.194 +caller selected the correct scheduler ID, hence the scheduler's 22.195 +implementation does not need to sanity-check these parts of the call. 22.196 + 22.197 +\paragraph*{Return values} 22.198 + 22.199 +This function should return the value to be passed back to user space, hence it 22.200 +should either be 0 or an appropriate errno value. 22.201 + 22.202 +\subsubsection{sched\_adjdom} 22.203 + 22.204 +\paragraph*{Purpose} 22.205 + 22.206 +This method is called to adjust the scheduling parameters of a particular 22.207 +domain, or to query their current values. The function should check 22.208 +the {\tt direction} field of the {\tt sched\_adjdom\_cmd} it receives in 22.209 +order to determine which of these operations is being performed. 22.210 + 22.211 +\paragraph*{Call environment} 22.212 + 22.213 +The generic layer guarantees that the caller has specified the correct 22.214 +control interface version and scheduler ID and that the supplied {\tt 22.215 +task\_struct} will not be deallocated during the call (hence it is not 22.216 +necessary to {\tt get\_task\_struct}). 22.217 + 22.218 +\paragraph*{Return values} 22.219 + 22.220 +This function should return the value to be passed back to user space, hence it 22.221 +should either be 0 or an appropriate errno value. 22.222 + 22.223 +\subsubsection{reschedule} 22.224 + 22.225 +\paragraph*{Purpose} 22.226 + 22.227 +This method is called to determine if a reschedule is required as a result of a 22.228 +particular task. 22.229 + 22.230 +\paragraph*{Call environment} 22.231 +The generic layer will cause a reschedule if the current domain is the idle 22.232 +task or it has exceeded its minimum time slice before a reschedule. The 22.233 +generic layer guarantees that the task passed is not currently running but is 22.234 +on the runqueue. 22.235 + 22.236 +\paragraph*{Return values} 22.237 + 22.238 +Should return a mask of CPUs to cause a reschedule on. 22.239 + 22.240 +\subsubsection{dump\_settings} 22.241 + 22.242 +\paragraph*{Purpose} 22.243 + 22.244 +If implemented, this should dump any private global settings for this 22.245 +scheduler to the console. 22.246 + 22.247 +\paragraph*{Call environment} 22.248 + 22.249 +This function is called with interrupts enabled. 22.250 + 22.251 +\subsubsection{dump\_cpu\_state} 22.252 + 22.253 +\paragraph*{Purpose} 22.254 + 22.255 +This method should dump any private settings for the specified CPU. 22.256 + 22.257 +\paragraph*{Call environment} 22.258 + 22.259 +This function is called with interrupts disabled and the {\tt schedule\_lock} 22.260 +for the specified CPU held. 22.261 + 22.262 +\subsubsection{dump\_runq\_el} 22.263 + 22.264 +\paragraph*{Purpose} 22.265 + 22.266 +This method should dump any private settings for the specified task. 
22.267 + 22.268 +\paragraph*{Call environment} 22.269 + 22.270 +This function is called with interrupts disabled and the {\tt schedule\_lock} 22.271 +for the task's CPU held.
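To show how the pieces of this API fit together, the following sketch outlines a trivial scheduler that always runs the task at the head of a per-CPU run queue. The task\_slice\_t field names, the MILLISECS() helper and the registration style follow the in-tree schedulers, but are assumptions here; consult {\tt <xen/sched-if.h>} and an existing scheduler implementation for the authoritative definitions. The run-queue helper is purely hypothetical.

\begin{verbatim}
/* Sketch of a minimal scheduler against the API above.  Names such as
 * MILLISECS() and the task_slice_t layout are assumed from the in-tree
 * schedulers and may differ between releases.                          */
static task_slice_t trivial_do_schedule(s_time_t now)
{
    task_slice_t ret;

    /* pick_head_of_local_runqueue() is a hypothetical helper that
     * returns the next runnable task on the current CPU.            */
    ret.task = pick_head_of_local_runqueue();
    ret.time = MILLISECS(10);   /* maximum slice before rescheduling */
    return ret;
}

static struct scheduler sched_trivial_def = {
    .name        = "Trivial Round-Robin Scheduler",
    .opt_name    = "trivial",      /* selected with sched=trivial     */
    .sched_id    = 0xDEAD,         /* illustrative ID only            */
    .do_schedule = trivial_do_schedule,
    /* wake_up should also be provided so that waking tasks are put
     * back on the run queue (see the note above).                    */
};
\end{verbatim}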
23.1 --- a/docs/src/user.tex Thu Sep 22 11:34:14 2005 -0600 23.2 +++ b/docs/src/user.tex Thu Sep 22 11:42:01 2005 -0600 23.3 @@ -59,1803 +59,36 @@ Contributions of material, suggestions a 23.4 \renewcommand{\floatpagefraction}{.8} 23.5 \setstretch{1.1} 23.6 23.7 + 23.8 \part{Introduction and Tutorial} 23.9 -\chapter{Introduction} 23.10 - 23.11 -Xen is a {\em paravirtualising} virtual machine monitor (VMM), or 23.12 -`hypervisor', for the x86 processor architecture. Xen can securely 23.13 -execute multiple virtual machines on a single physical system with 23.14 -close-to-native performance. The virtual machine technology 23.15 -facilitates enterprise-grade functionality, including: 23.16 - 23.17 -\begin{itemize} 23.18 -\item Virtual machines with performance close to native 23.19 - hardware. 23.20 -\item Live migration of running virtual machines between physical hosts. 23.21 -\item Excellent hardware support (supports most Linux device drivers). 23.22 -\item Sandboxed, restartable device drivers. 23.23 -\end{itemize} 23.24 - 23.25 -Paravirtualisation permits very high performance virtualisation, 23.26 -even on architectures like x86 that are traditionally 23.27 -very hard to virtualise. 23.28 -The drawback of this approach is that it requires operating systems to 23.29 -be {\em ported} to run on Xen. Porting an OS to run on Xen is similar 23.30 -to supporting a new hardware platform, however the process 23.31 -is simplified because the paravirtual machine architecture is very 23.32 -similar to the underlying native hardware. Even though operating system 23.33 -kernels must explicitly support Xen, a key feature is that user space 23.34 -applications and libraries {\em do not} require modification. 23.35 - 23.36 -Xen support is available for increasingly many operating systems: 23.37 -right now, Linux 2.4, Linux 2.6 and NetBSD are available for Xen 2.0. 23.38 -A FreeBSD port is undergoing testing and will be incorporated into the 23.39 -release soon. Other OS ports, including Plan 9, are in progress. We 23.40 -hope that that arch-xen patches will be incorporated into the 23.41 -mainstream releases of these operating systems in due course (as has 23.42 -already happened for NetBSD). 23.43 - 23.44 -Possible usage scenarios for Xen include: 23.45 -\begin{description} 23.46 -\item [Kernel development.] Test and debug kernel modifications in a 23.47 - sandboxed virtual machine --- no need for a separate test 23.48 - machine. 23.49 -\item [Multiple OS configurations.] Run multiple operating systems 23.50 - simultaneously, for instance for compatibility or QA purposes. 23.51 -\item [Server consolidation.] Move multiple servers onto a single 23.52 - physical host with performance and fault isolation provided at 23.53 - virtual machine boundaries. 23.54 -\item [Cluster computing.] Management at VM granularity provides more 23.55 - flexibility than separately managing each physical host, but 23.56 - better control and isolation than single-system image solutions, 23.57 - particularly by using live migration for load balancing. 23.58 -\item [Hardware support for custom OSes.] Allow development of new OSes 23.59 - while benefiting from the wide-ranging hardware support of 23.60 - existing OSes such as Linux. 23.61 -\end{description} 23.62 - 23.63 -\section{Structure of a Xen-Based System} 23.64 - 23.65 -A Xen system has multiple layers, the lowest and most privileged of 23.66 -which is Xen itself. 
23.67 -Xen in turn may host multiple {\em guest} operating systems, each of 23.68 -which is executed within a secure virtual machine (in Xen terminology, 23.69 -a {\em domain}). Domains are scheduled by Xen to make effective use of 23.70 -the available physical CPUs. Each guest OS manages its own 23.71 -applications, which includes responsibility for scheduling each 23.72 -application within the time allotted to the VM by Xen. 23.73 - 23.74 -The first domain, {\em domain 0}, is created automatically when the 23.75 -system boots and has special management privileges. Domain 0 builds 23.76 -other domains and manages their virtual devices. It also performs 23.77 -administrative tasks such as suspending, resuming and migrating other 23.78 -virtual machines. 23.79 - 23.80 -Within domain 0, a process called \emph{xend} runs to manage the system. 23.81 -\Xend is responsible for managing virtual machines and providing access 23.82 -to their consoles. Commands are issued to \xend over an HTTP 23.83 -interface, either from a command-line tool or from a web browser. 23.84 - 23.85 -\section{Hardware Support} 23.86 - 23.87 -Xen currently runs only on the x86 architecture, requiring a `P6' or 23.88 -newer processor (e.g. Pentium Pro, Celeron, Pentium II, Pentium III, 23.89 -Pentium IV, Xeon, AMD Athlon, AMD Duron). Multiprocessor machines are 23.90 -supported, and we also have basic support for HyperThreading (SMT), 23.91 -although this remains a topic for ongoing research. A port 23.92 -specifically for x86/64 is in progress, although Xen already runs on 23.93 -such systems in 32-bit legacy mode. In addition a port to the IA64 23.94 -architecture is approaching completion. We hope to add other 23.95 -architectures such as PPC and ARM in due course. 23.96 - 23.97 - 23.98 -Xen can currently use up to 4GB of memory. It is possible for x86 23.99 -machines to address up to 64GB of physical memory but there are no 23.100 -current plans to support these systems: The x86/64 port is the 23.101 -planned route to supporting larger memory sizes. 23.102 - 23.103 -Xen offloads most of the hardware support issues to the guest OS 23.104 -running in Domain~0. Xen itself contains only the code required to 23.105 -detect and start secondary processors, set up interrupt routing, and 23.106 -perform PCI bus enumeration. Device drivers run within a privileged 23.107 -guest OS rather than within Xen itself. This approach provides 23.108 -compatibility with the majority of device hardware supported by Linux. 23.109 -The default XenLinux build contains support for relatively modern 23.110 -server-class network and disk hardware, but you can add support for 23.111 -other hardware by configuring your XenLinux kernel in the normal way. 23.112 - 23.113 -\section{History} 23.114 - 23.115 -Xen was originally developed by the Systems Research Group at the 23.116 -University of Cambridge Computer Laboratory as part of the XenoServers 23.117 -project, funded by the UK-EPSRC. 23.118 -XenoServers aim to provide a `public infrastructure for 23.119 -global distributed computing', and Xen plays a key part in that, 23.120 -allowing us to efficiently partition a single machine to enable 23.121 -multiple independent clients to run their operating systems and 23.122 -applications in an environment providing protection, resource 23.123 -isolation and accounting. 
The project web page contains further 23.124 -information along with pointers to papers and technical reports: 23.125 -\path{http://www.cl.cam.ac.uk/xeno} 23.126 - 23.127 -Xen has since grown into a fully-fledged project in its own right, 23.128 -enabling us to investigate interesting research issues regarding the 23.129 -best techniques for virtualising resources such as the CPU, memory, 23.130 -disk and network. The project has been bolstered by support from 23.131 -Intel Research Cambridge, and HP Labs, who are now working closely 23.132 -with us. 23.133 - 23.134 -Xen was first described in a paper presented at SOSP in 23.135 -2003\footnote{\tt 23.136 -http://www.cl.cam.ac.uk/netos/papers/2003-xensosp.pdf}, and the first 23.137 -public release (1.0) was made that October. Since then, Xen has 23.138 -significantly matured and is now used in production scenarios on 23.139 -many sites. 23.140 - 23.141 -Xen 2.0 features greatly enhanced hardware support, configuration 23.142 -flexibility, usability and a larger complement of supported operating 23.143 -systems. This latest release takes Xen a step closer to becoming the 23.144 -definitive open source solution for virtualisation. 23.145 - 23.146 -\chapter{Installation} 23.147 - 23.148 -The Xen distribution includes three main components: Xen itself, ports 23.149 -of Linux 2.4 and 2.6 and NetBSD to run on Xen, and the user-space 23.150 -tools required to manage a Xen-based system. This chapter describes 23.151 -how to install the Xen 2.0 distribution from source. Alternatively, 23.152 -there may be pre-built packages available as part of your operating 23.153 -system distribution. 23.154 - 23.155 -\section{Prerequisites} 23.156 -\label{sec:prerequisites} 23.157 - 23.158 -The following is a full list of prerequisites. Items marked `$\dag$' 23.159 -are required by the \xend control tools, and hence required if you 23.160 -want to run more than one virtual machine; items marked `$*$' are only 23.161 -required if you wish to build from source. 23.162 -\begin{itemize} 23.163 -\item A working Linux distribution using the GRUB bootloader and 23.164 -running on a P6-class (or newer) CPU. 23.165 -\item [$\dag$] The \path{iproute2} package. 23.166 -\item [$\dag$] The Linux bridge-utils\footnote{Available from 23.167 -{\tt http://bridge.sourceforge.net}} (e.g., \path{/sbin/brctl}) 23.168 -\item [$\dag$] An installation of Twisted v1.3 or 23.169 -above\footnote{Available from {\tt 23.170 -http://www.twistedmatrix.com}}. There may be a binary package 23.171 -available for your distribution; alternatively it can be installed by 23.172 -running `{\sl make install-twisted}' in the root of the Xen source 23.173 -tree. 23.174 -\item [$*$] Build tools (gcc v3.2.x or v3.3.x, binutils, GNU make). 23.175 -\item [$*$] Development installation of libcurl (e.g., libcurl-devel) 23.176 -\item [$*$] Development installation of zlib (e.g., zlib-dev). 23.177 -\item [$*$] Development installation of Python v2.2 or later (e.g., python-dev). 23.178 -\item [$*$] \LaTeX and transfig are required to build the documentation. 23.179 -\end{itemize} 23.180 - 23.181 -Once you have satisfied the relevant prerequisites, you can 23.182 -now install either a binary or source distribution of Xen. 
23.183 - 23.184 -\section{Installing from Binary Tarball} 23.185 - 23.186 -Pre-built tarballs are available for download from the Xen 23.187 -download page 23.188 -\begin{quote} 23.189 -{\tt http://xen.sf.net} 23.190 -\end{quote} 23.191 - 23.192 -Once you've downloaded the tarball, simply unpack and install: 23.193 -\begin{verbatim} 23.194 -# tar zxvf xen-2.0-install.tgz 23.195 -# cd xen-2.0-install 23.196 -# sh ./install.sh 23.197 -\end{verbatim} 23.198 - 23.199 -Once you've installed the binaries you need to configure 23.200 -your system as described in Section~\ref{s:configure}. 23.201 - 23.202 -\section{Installing from Source} 23.203 - 23.204 -This section describes how to obtain, build, and install 23.205 -Xen from source. 23.206 - 23.207 -\subsection{Obtaining the Source} 23.208 - 23.209 -The Xen source tree is available as either a compressed source tar 23.210 -ball or as a clone of our master BitKeeper repository. 23.211 - 23.212 -\begin{description} 23.213 -\item[Obtaining the Source Tarball]\mbox{} \\ 23.214 -Stable versions (and daily snapshots) of the Xen source tree are 23.215 -available as compressed tarballs from the Xen download page 23.216 -\begin{quote} 23.217 -{\tt http://xen.sf.net} 23.218 -\end{quote} 23.219 - 23.220 -\item[Using BitKeeper]\mbox{} \\ 23.221 -If you wish to install Xen from a clone of our latest BitKeeper 23.222 -repository then you will need to install the BitKeeper tools. 23.223 -Download instructions for BitKeeper can be obtained by filling out the 23.224 -form at: 23.225 - 23.226 -\begin{quote} 23.227 -{\tt http://www.bitmover.com/cgi-bin/download.cgi} 23.228 -\end{quote} 23.229 -The public master BK repository for the 2.0 release lives at: 23.230 -\begin{quote} 23.231 -{\tt bk://xen.bkbits.net/xen-2.0.bk} 23.232 -\end{quote} 23.233 -You can use BitKeeper to 23.234 -download it and keep it updated with the latest features and fixes. 23.235 - 23.236 -Change to the directory in which you want to put the source code, then 23.237 -run: 23.238 -\begin{verbatim} 23.239 -# bk clone bk://xen.bkbits.net/xen-2.0.bk 23.240 -\end{verbatim} 23.241 - 23.242 -Under your current directory, a new directory named \path{xen-2.0.bk} 23.243 -has been created, which contains all the source code for Xen, the OS 23.244 -ports, and the control tools. You can update your repository with the 23.245 -latest changes at any time by running: 23.246 -\begin{verbatim} 23.247 -# cd xen-2.0.bk # to change into the local repository 23.248 -# bk pull # to update the repository 23.249 -\end{verbatim} 23.250 -\end{description} 23.251 - 23.252 -%\section{The distribution} 23.253 -% 23.254 -%The Xen source code repository is structured as follows: 23.255 -% 23.256 -%\begin{description} 23.257 -%\item[\path{tools/}] Xen node controller daemon (Xend), command line tools, 23.258 -% control libraries 23.259 -%\item[\path{xen/}] The Xen VMM. 23.260 -%\item[\path{linux-*-xen-sparse/}] Xen support for Linux. 23.261 -%\item[\path{linux-*-patches/}] Experimental patches for Linux. 23.262 -%\item[\path{netbsd-*-xen-sparse/}] Xen support for NetBSD. 23.263 -%\item[\path{docs/}] Various documentation files for users and developers. 23.264 -%\item[\path{extras/}] Bonus extras. 
23.265 -%\end{description} 23.266 - 23.267 -\subsection{Building from Source} 23.268 - 23.269 -The top-level Xen Makefile includes a target `world' that will do the 23.270 -following: 23.271 - 23.272 -\begin{itemize} 23.273 -\item Build Xen 23.274 -\item Build the control tools, including \xend 23.275 -\item Download (if necessary) and unpack the Linux 2.6 source code, 23.276 - and patch it for use with Xen 23.277 -\item Build a Linux kernel to use in domain 0 and a smaller 23.278 - unprivileged kernel, which can optionally be used for 23.279 - unprivileged virtual machines. 23.280 -\end{itemize} 23.281 - 23.282 - 23.283 -After the build has completed you should have a top-level 23.284 -directory called \path{dist/} in which all resulting targets 23.285 -will be placed; of particular interest are the two kernels 23.286 -XenLinux kernel images, one with a `-xen0' extension 23.287 -which contains hardware device drivers and drivers for Xen's virtual 23.288 -devices, and one with a `-xenU' extension that just contains the 23.289 -virtual ones. These are found in \path{dist/install/boot/} along 23.290 -with the image for Xen itself and the configuration files used 23.291 -during the build. 23.292 23.293 -The NetBSD port can be built using: 23.294 -\begin{quote} 23.295 -\begin{verbatim} 23.296 -# make netbsd20 23.297 -\end{verbatim} 23.298 -\end{quote} 23.299 -NetBSD port is built using a snapshot of the netbsd-2-0 cvs branch. 23.300 -The snapshot is downloaded as part of the build process, if it is not 23.301 -yet present in the \path{NETBSD\_SRC\_PATH} search path. The build 23.302 -process also downloads a toolchain which includes all the tools 23.303 -necessary to build the NetBSD kernel under Linux. 23.304 - 23.305 -To customize further the set of kernels built you need to edit 23.306 -the top-level Makefile. Look for the line: 23.307 - 23.308 -\begin{quote} 23.309 -\begin{verbatim} 23.310 -KERNELS ?= mk.linux-2.6-xen0 mk.linux-2.6-xenU 23.311 -\end{verbatim} 23.312 -\end{quote} 23.313 - 23.314 -You can edit this line to include any set of operating system kernels 23.315 -which have configurations in the top-level \path{buildconfigs/} 23.316 -directory, for example \path{mk.linux-2.4-xenU} to build a Linux 2.4 23.317 -kernel containing only virtual device drivers. 23.318 - 23.319 -%% Inspect the Makefile if you want to see what goes on during a build. 23.320 -%% Building Xen and the tools is straightforward, but XenLinux is more 23.321 -%% complicated. The makefile needs a `pristine' Linux kernel tree to which 23.322 -%% it will then add the Xen architecture files. You can tell the 23.323 -%% makefile the location of the appropriate Linux compressed tar file by 23.324 -%% setting the LINUX\_SRC environment variable, e.g. \\ 23.325 -%% \verb!# LINUX_SRC=/tmp/linux-2.6.11.tar.bz2 make world! \\ or by 23.326 -%% placing the tar file somewhere in the search path of {\tt 23.327 -%% LINUX\_SRC\_PATH} which defaults to `{\tt .:..}'. If the makefile 23.328 -%% can't find a suitable kernel tar file it attempts to download it from 23.329 -%% kernel.org (this won't work if you're behind a firewall). 23.330 - 23.331 -%% After untaring the pristine kernel tree, the makefile uses the {\tt 23.332 -%% mkbuildtree} script to add the Xen patches to the kernel. 23.333 - 23.334 - 23.335 -%% The procedure is similar to build the Linux 2.4 port: \\ 23.336 -%% \verb!# LINUX_SRC=/path/to/linux2.4/source make linux24! 
23.337 - 23.338 - 23.339 -%% \framebox{\parbox{5in}{ 23.340 -%% {\bf Distro specific:} \\ 23.341 -%% {\it Gentoo} --- if not using udev (most installations, currently), you'll need 23.342 -%% to enable devfs and devfs mount at boot time in the xen0 config. 23.343 -%% }} 23.344 - 23.345 -\subsection{Custom XenLinux Builds} 23.346 - 23.347 -% If you have an SMP machine you may wish to give the {\tt '-j4'} 23.348 -% argument to make to get a parallel build. 23.349 - 23.350 -If you wish to build a customized XenLinux kernel (e.g. to support 23.351 -additional devices or enable distribution-required features), you can 23.352 -use the standard Linux configuration mechanisms, specifying that the 23.353 -architecture being built for is \path{xen}, e.g: 23.354 -\begin{quote} 23.355 -\begin{verbatim} 23.356 -# cd linux-2.6.11-xen0 23.357 -# make ARCH=xen xconfig 23.358 -# cd .. 23.359 -# make 23.360 -\end{verbatim} 23.361 -\end{quote} 23.362 - 23.363 -You can also copy an existing Linux configuration (\path{.config}) 23.364 -into \path{linux-2.6.11-xen0} and execute: 23.365 -\begin{quote} 23.366 -\begin{verbatim} 23.367 -# make ARCH=xen oldconfig 23.368 -\end{verbatim} 23.369 -\end{quote} 23.370 - 23.371 -You may be prompted with some Xen-specific options; we 23.372 -advise accepting the defaults for these options. 23.373 - 23.374 -Note that the only difference between the two types of Linux kernel 23.375 -that are built is the configuration file used for each. The "U" 23.376 -suffixed (unprivileged) versions don't contain any of the physical 23.377 -hardware device drivers, leading to a 30\% reduction in size; hence 23.378 -you may prefer these for your non-privileged domains. The `0' 23.379 -suffixed privileged versions can be used to boot the system, as well 23.380 -as in driver domains and unprivileged domains. 23.381 - 23.382 - 23.383 -\subsection{Installing the Binaries} 23.384 - 23.385 - 23.386 -The files produced by the build process are stored under the 23.387 -\path{dist/install/} directory. To install them in their default 23.388 -locations, do: 23.389 -\begin{quote} 23.390 -\begin{verbatim} 23.391 -# make install 23.392 -\end{verbatim} 23.393 -\end{quote} 23.394 - 23.395 - 23.396 -Alternatively, users with special installation requirements may wish 23.397 -to install them manually by copying the files to their appropriate 23.398 -destinations. 23.399 - 23.400 -%% Files in \path{install/boot/} include: 23.401 -%% \begin{itemize} 23.402 -%% \item \path{install/boot/xen-2.0.gz} Link to the Xen 'kernel' 23.403 -%% \item \path{install/boot/vmlinuz-2.6-xen0} Link to domain 0 XenLinux kernel 23.404 -%% \item \path{install/boot/vmlinuz-2.6-xenU} Link to unprivileged XenLinux kernel 23.405 -%% \end{itemize} 23.406 - 23.407 -The \path{dist/install/boot} directory will also contain the config files 23.408 -used for building the XenLinux kernels, and also versions of Xen and 23.409 -XenLinux kernels that contain debug symbols (\path{xen-syms-2.0.6} and 23.410 -\path{vmlinux-syms-2.6.11.11-xen0}) which are essential for interpreting crash 23.411 -dumps. Retain these files as the developers may wish to see them if 23.412 -you post on the mailing list. 23.413 - 23.414 - 23.415 - 23.416 - 23.417 - 23.418 -\section{Configuration} 23.419 -\label{s:configure} 23.420 -Once you have built and installed the Xen distribution, it is 23.421 -simple to prepare the machine for booting and running Xen. 
23.422 - 23.423 -\subsection{GRUB Configuration} 23.424 - 23.425 -An entry should be added to \path{grub.conf} (often found under 23.426 -\path{/boot/} or \path{/boot/grub/}) to allow Xen / XenLinux to boot. 23.427 -This file is sometimes called \path{menu.lst}, depending on your 23.428 -distribution. The entry should look something like the following: 23.429 - 23.430 -{\small 23.431 -\begin{verbatim} 23.432 -title Xen 2.0 / XenLinux 2.6 23.433 - kernel /boot/xen-2.0.gz dom0_mem=131072 23.434 - module /boot/vmlinuz-2.6-xen0 root=/dev/sda4 ro console=tty0 23.435 -\end{verbatim} 23.436 -} 23.437 +%% Chapter Introduction moved to introduction.tex 23.438 +\include{src/user/introduction} 23.439 23.440 -The kernel line tells GRUB where to find Xen itself and what boot 23.441 -parameters should be passed to it (in this case, setting domain 0's 23.442 -memory allocation in kilobytes and the settings for the serial port). For more 23.443 -details on the various Xen boot parameters see Section~\ref{s:xboot}. 23.444 - 23.445 -The module line of the configuration describes the location of the 23.446 -XenLinux kernel that Xen should start and the parameters that should 23.447 -be passed to it (these are standard Linux parameters, identifying the 23.448 -root device and specifying it be initially mounted read only and 23.449 -instructing that console output be sent to the screen). Some 23.450 -distributions such as SuSE do not require the \path{ro} parameter. 23.451 - 23.452 -%% \framebox{\parbox{5in}{ 23.453 -%% {\bf Distro specific:} \\ 23.454 -%% {\it SuSE} --- Omit the {\tt ro} option from the XenLinux kernel 23.455 -%% command line, since the partition won't be remounted rw during boot. 23.456 -%% }} 23.457 - 23.458 - 23.459 -If you want to use an initrd, just add another \path{module} line to 23.460 -the configuration, as usual: 23.461 -{\small 23.462 -\begin{verbatim} 23.463 - module /boot/my_initrd.gz 23.464 -\end{verbatim} 23.465 -} 23.466 - 23.467 -As always when installing a new kernel, it is recommended that you do 23.468 -not delete existing menu options from \path{menu.lst} --- you may want 23.469 -to boot your old Linux kernel in future, particularly if you 23.470 -have problems. 23.471 - 23.472 - 23.473 -\subsection{Serial Console (optional)} 23.474 - 23.475 -%% kernel /boot/xen-2.0.gz dom0_mem=131072 com1=115200,8n1 23.476 -%% module /boot/vmlinuz-2.6-xen0 root=/dev/sda4 ro 23.477 - 23.478 - 23.479 -In order to configure Xen serial console output, it is necessary to add 23.480 -an boot option to your GRUB config; e.g. replace the above kernel line 23.481 -with: 23.482 -\begin{quote} 23.483 -{\small 23.484 -\begin{verbatim} 23.485 - kernel /boot/xen.gz dom0_mem=131072 com1=115200,8n1 23.486 -\end{verbatim}} 23.487 -\end{quote} 23.488 - 23.489 -This configures Xen to output on COM1 at 115,200 baud, 8 data bits, 23.490 -1 stop bit and no parity. Modify these parameters for your set up. 23.491 - 23.492 -One can also configure XenLinux to share the serial console; to 23.493 -achieve this append ``\path{console=ttyS0}'' to your 23.494 -module line. 23.495 - 23.496 - 23.497 -If you wish to be able to log in over the XenLinux serial console it 23.498 -is necessary to add a line into \path{/etc/inittab}, just as per 23.499 -regular Linux. Simply add the line: 23.500 -\begin{quote} 23.501 -{\small 23.502 -{\tt c:2345:respawn:/sbin/mingetty ttyS0} 23.503 -} 23.504 -\end{quote} 23.505 - 23.506 -and you should be able to log in. 
Note that to successfully log in 23.507 -as root over the serial line will require adding \path{ttyS0} to 23.508 -\path{/etc/securetty} in most modern distributions. 23.509 - 23.510 -\subsection{TLS Libraries} 23.511 - 23.512 -Users of the XenLinux 2.6 kernel should disable Thread Local Storage 23.513 -(e.g.\ by doing a \path{mv /lib/tls /lib/tls.disabled}) before 23.514 -attempting to run with a XenLinux kernel\footnote{If you boot without first 23.515 -disabling TLS, you will get a warning message during the boot 23.516 -process. In this case, simply perform the rename after the machine is 23.517 -up and then run \texttt{/sbin/ldconfig} to make it take effect.}. You can 23.518 -always reenable it by restoring the directory to its original location 23.519 -(i.e.\ \path{mv /lib/tls.disabled /lib/tls}). 23.520 - 23.521 -The reason for this is that the current TLS implementation uses 23.522 -segmentation in a way that is not permissible under Xen. If TLS is 23.523 -not disabled, an emulation mode is used within Xen which reduces 23.524 -performance substantially. 23.525 - 23.526 -We hope that this issue can be resolved by working with Linux 23.527 -distribution vendors to implement a minor backward-compatible change 23.528 -to the TLS library. 23.529 - 23.530 -\section{Booting Xen} 23.531 - 23.532 -It should now be possible to restart the system and use Xen. Reboot 23.533 -as usual but choose the new Xen option when the Grub screen appears. 23.534 - 23.535 -What follows should look much like a conventional Linux boot. The 23.536 -first portion of the output comes from Xen itself, supplying low level 23.537 -information about itself and the machine it is running on. The 23.538 -following portion of the output comes from XenLinux. 23.539 - 23.540 -You may see some errors during the XenLinux boot. These are not 23.541 -necessarily anything to worry about --- they may result from kernel 23.542 -configuration differences between your XenLinux kernel and the one you 23.543 -usually use. 23.544 - 23.545 -When the boot completes, you should be able to log into your system as 23.546 -usual. If you are unable to log in to your system running Xen, you 23.547 -should still be able to reboot with your normal Linux kernel. 23.548 - 23.549 - 23.550 -\chapter{Starting Additional Domains} 23.551 - 23.552 -The first step in creating a new domain is to prepare a root 23.553 -filesystem for it to boot off. Typically, this might be stored in a 23.554 -normal partition, an LVM or other volume manager partition, a disk 23.555 -file or on an NFS server. A simple way to do this is simply to boot 23.556 -from your standard OS install CD and install the distribution into 23.557 -another partition on your hard drive. 23.558 - 23.559 -To start the \xend control daemon, type 23.560 -\begin{quote} 23.561 -\verb!# xend start! 23.562 -\end{quote} 23.563 -If you 23.564 -wish the daemon to start automatically, see the instructions in 23.565 -Section~\ref{s:xend}. Once the daemon is running, you can use the 23.566 -\path{xm} tool to monitor and maintain the domains running on your 23.567 -system. This chapter provides only a brief tutorial: we provide full 23.568 -details of the \path{xm} tool in the next chapter. 23.569 - 23.570 -%\section{From the web interface} 23.571 -% 23.572 -%Boot the Xen machine and start Xensv (see Chapter~\ref{cha:xensv} for 23.573 -%more details) using the command: \\ 23.574 -%\verb_# xensv start_ \\ 23.575 -%This will also start Xend (see Chapter~\ref{cha:xend} for more information). 
23.576 -% 23.577 -%The domain management interface will then be available at {\tt 23.578 -%http://your\_machine:8080/}. This provides a user friendly wizard for 23.579 -%starting domains and functions for managing running domains. 23.580 -% 23.581 -%\section{From the command line} 23.582 - 23.583 - 23.584 -\section{Creating a Domain Configuration File} 23.585 +%% Chapter Installation moved to installation.tex 23.586 +\include{src/user/installation} 23.587 23.588 -Before you can start an additional domain, you must create a 23.589 -configuration file. We provide two example files which you 23.590 -can use as a starting point: 23.591 -\begin{itemize} 23.592 - \item \path{/etc/xen/xmexample1} is a simple template configuration file 23.593 - for describing a single VM. 23.594 - 23.595 - \item \path{/etc/xen/xmexample2} file is a template description that 23.596 - is intended to be reused for multiple virtual machines. Setting 23.597 - the value of the \path{vmid} variable on the \path{xm} command line 23.598 - fills in parts of this template. 23.599 -\end{itemize} 23.600 - 23.601 -Copy one of these files and edit it as appropriate. 23.602 -Typical values you may wish to edit include: 23.603 - 23.604 -\begin{quote} 23.605 -\begin{description} 23.606 -\item[kernel] Set this to the path of the kernel you compiled for use 23.607 - with Xen (e.g.\ \path{kernel = '/boot/vmlinuz-2.6-xenU'}) 23.608 -\item[memory] Set this to the size of the domain's memory in 23.609 -megabytes (e.g.\ \path{memory = 64}) 23.610 -\item[disk] Set the first entry in this list to calculate the offset 23.611 -of the domain's root partition, based on the domain ID. Set the 23.612 -second to the location of \path{/usr} if you are sharing it between 23.613 -domains (e.g.\ \path{disk = ['phy:your\_hard\_drive\%d,sda1,w' \% 23.614 -(base\_partition\_number + vmid), 'phy:your\_usr\_partition,sda6,r' ]} 23.615 -\item[dhcp] Uncomment the dhcp variable, so that the domain will 23.616 -receive its IP address from a DHCP server (e.g.\ \path{dhcp='dhcp'}) 23.617 -\end{description} 23.618 -\end{quote} 23.619 - 23.620 -You may also want to edit the {\bf vif} variable in order to choose 23.621 -the MAC address of the virtual ethernet interface yourself. For 23.622 -example: 23.623 -\begin{quote} 23.624 -\verb_vif = ['mac=00:06:AA:F6:BB:B3']_ 23.625 -\end{quote} 23.626 -If you do not set this variable, \xend will automatically generate a 23.627 -random MAC address from an unused range. 23.628 - 23.629 - 23.630 -\section{Booting the Domain} 23.631 - 23.632 -The \path{xm} tool provides a variety of commands for managing domains. 23.633 -Use the \path{create} command to start new domains. Assuming you've 23.634 -created a configuration file \path{myvmconf} based around 23.635 -\path{/etc/xen/xmexample2}, to start a domain with virtual 23.636 -machine ID~1 you should type: 23.637 - 23.638 -\begin{quote} 23.639 -\begin{verbatim} 23.640 -# xm create -c myvmconf vmid=1 23.641 -\end{verbatim} 23.642 -\end{quote} 23.643 - 23.644 - 23.645 -The \path{-c} switch causes \path{xm} to turn into the domain's 23.646 -console after creation. The \path{vmid=1} sets the \path{vmid} 23.647 -variable used in the \path{myvmconf} file. 23.648 - 23.649 - 23.650 -You should see the console boot messages from the new domain 23.651 -appearing in the terminal in which you typed the command, 23.652 -culminating in a login prompt. 
23.653 - 23.654 - 23.655 -\section{Example: ttylinux} 23.656 - 23.657 -Ttylinux is a very small Linux distribution, designed to require very 23.658 -few resources. We will use it as a concrete example of how to start a 23.659 -Xen domain. Most users will probably want to install a full-featured 23.660 -distribution once they have mastered the basics\footnote{ttylinux is 23.661 -maintained by Pascal Schmidt. You can download source packages from 23.662 -the distribution's home page: {\tt http://www.minimalinux.org/ttylinux/}}. 23.663 - 23.664 -\begin{enumerate} 23.665 -\item Download and extract the ttylinux disk image from the Files 23.666 -section of the project's SourceForge site (see 23.667 -\path{http://sf.net/projects/xen/}). 23.668 -\item Create a configuration file like the following: 23.669 -\begin{verbatim} 23.670 -kernel = "/boot/vmlinuz-2.6-xenU" 23.671 -memory = 64 23.672 -name = "ttylinux" 23.673 -nics = 1 23.674 -ip = "1.2.3.4" 23.675 -disk = ['file:/path/to/ttylinux/rootfs,sda1,w'] 23.676 -root = "/dev/sda1 ro" 23.677 -\end{verbatim} 23.678 -\item Now start the domain and connect to its console: 23.679 -\begin{verbatim} 23.680 -xm create configfile -c 23.681 -\end{verbatim} 23.682 -\item Login as root, password root. 23.683 -\end{enumerate} 23.684 - 23.685 - 23.686 -\section{Starting / Stopping Domains Automatically} 23.687 - 23.688 -It is possible to have certain domains start automatically at boot 23.689 -time and to have dom0 wait for all running domains to shutdown before 23.690 -it shuts down the system. 23.691 - 23.692 -To specify a domain is to start at boot-time, place its 23.693 -configuration file (or a link to it) under \path{/etc/xen/auto/}. 23.694 - 23.695 -A Sys-V style init script for RedHat and LSB-compliant systems is 23.696 -provided and will be automatically copied to \path{/etc/init.d/} 23.697 -during install. You can then enable it in the appropriate way for 23.698 -your distribution. 23.699 - 23.700 -For instance, on RedHat: 23.701 - 23.702 -\begin{quote} 23.703 -\verb_# chkconfig --add xendomains_ 23.704 -\end{quote} 23.705 - 23.706 -By default, this will start the boot-time domains in runlevels 3, 4 23.707 -and 5. 23.708 - 23.709 -You can also use the \path{service} command to run this script 23.710 -manually, e.g: 23.711 - 23.712 -\begin{quote} 23.713 -\verb_# service xendomains start_ 23.714 - 23.715 -Starts all the domains with config files under /etc/xen/auto/. 23.716 -\end{quote} 23.717 - 23.718 - 23.719 -\begin{quote} 23.720 -\verb_# service xendomains stop_ 23.721 - 23.722 -Shuts down ALL running Xen domains. 23.723 -\end{quote} 23.724 - 23.725 -\chapter{Domain Management Tools} 23.726 - 23.727 -The previous chapter described a simple example of how to configure 23.728 -and start a domain. This chapter summarises the tools available to 23.729 -manage running domains. 23.730 - 23.731 -\section{Command-line Management} 23.732 - 23.733 -Command line management tasks are also performed using the \path{xm} 23.734 -tool. For online help for the commands available, type: 23.735 -\begin{quote} 23.736 -\verb_# xm help_ 23.737 -\end{quote} 23.738 - 23.739 -You can also type \path{xm help $<$command$>$} for more information 23.740 -on a given command. 
23.741 - 23.742 -\subsection{Basic Management Commands} 23.743 - 23.744 -The most important \path{xm} commands are: 23.745 -\begin{quote} 23.746 -\verb_# xm list_: Lists all domains running.\\ 23.747 -\verb_# xm consoles_ : Gives information about the domain consoles.\\ 23.748 -\verb_# xm console_: Opens a console to a domain (e.g.\ 23.749 - \verb_# xm console myVM_ 23.750 -\end{quote} 23.751 - 23.752 -\subsection{\tt xm list} 23.753 - 23.754 -The output of \path{xm list} is in rows of the following format: 23.755 -\begin{center} 23.756 -{\tt name domid memory cpu state cputime console} 23.757 -\end{center} 23.758 - 23.759 -\begin{quote} 23.760 -\begin{description} 23.761 -\item[name] The descriptive name of the virtual machine. 23.762 -\item[domid] The number of the domain ID this virtual machine is running in. 23.763 -\item[memory] Memory size in megabytes. 23.764 -\item[cpu] The CPU this domain is running on. 23.765 -\item[state] Domain state consists of 5 fields: 23.766 - \begin{description} 23.767 - \item[r] running 23.768 - \item[b] blocked 23.769 - \item[p] paused 23.770 - \item[s] shutdown 23.771 - \item[c] crashed 23.772 - \end{description} 23.773 -\item[cputime] How much CPU time (in seconds) the domain has used so far. 23.774 -\item[console] TCP port accepting connections to the domain's console. 23.775 -\end{description} 23.776 -\end{quote} 23.777 - 23.778 -The \path{xm list} command also supports a long output format when the 23.779 -\path{-l} switch is used. This outputs the fulls details of the 23.780 -running domains in \xend's SXP configuration format. 23.781 - 23.782 -For example, suppose the system is running the ttylinux domain as 23.783 -described earlier. The list command should produce output somewhat 23.784 -like the following: 23.785 -\begin{verbatim} 23.786 -# xm list 23.787 -Name Id Mem(MB) CPU State Time(s) Console 23.788 -Domain-0 0 251 0 r---- 172.2 23.789 -ttylinux 5 63 0 -b--- 3.0 9605 23.790 -\end{verbatim} 23.791 - 23.792 -Here we can see the details for the ttylinux domain, as well as for 23.793 -domain 0 (which, of course, is always running). Note that the console 23.794 -port for the ttylinux domain is 9605. This can be connected to by TCP 23.795 -using a terminal program (e.g. \path{telnet} or, better, 23.796 -\path{xencons}). The simplest way to connect is to use the \path{xm console} 23.797 -command, specifying the domain name or ID. To connect to the console 23.798 -of the ttylinux domain, we could use any of the following: 23.799 -\begin{verbatim} 23.800 -# xm console ttylinux 23.801 -# xm console 5 23.802 -# xencons localhost 9605 23.803 -\end{verbatim} 23.804 - 23.805 -\section{Domain Save and Restore} 23.806 - 23.807 -The administrator of a Xen system may suspend a virtual machine's 23.808 -current state into a disk file in domain 0, allowing it to be resumed 23.809 -at a later time. 23.810 - 23.811 -The ttylinux domain described earlier can be suspended to disk using 23.812 -the command: 23.813 -\begin{verbatim} 23.814 -# xm save ttylinux ttylinux.xen 23.815 -\end{verbatim} 23.816 - 23.817 -This will stop the domain named `ttylinux' and save its current state 23.818 -into a file called \path{ttylinux.xen}. 23.819 - 23.820 -To resume execution of this domain, use the \path{xm restore} command: 23.821 -\begin{verbatim} 23.822 -# xm restore ttylinux.xen 23.823 -\end{verbatim} 23.824 - 23.825 -This will restore the state of the domain and restart it. 
The domain 23.826 -will carry on as before and the console may be reconnected using the 23.827 -\path{xm console} command, as above. 23.828 - 23.829 -\section{Live Migration} 23.830 - 23.831 -Live migration is used to transfer a domain between physical hosts 23.832 -whilst that domain continues to perform its usual activities --- from 23.833 -the user's perspective, the migration should be imperceptible. 23.834 - 23.835 -To perform a live migration, both hosts must be running Xen / \xend and 23.836 -the destination host must have sufficient resources (e.g. memory 23.837 -capacity) to accommodate the domain after the move. Furthermore we 23.838 -currently require both source and destination machines to be on the 23.839 -same L2 subnet. 23.840 - 23.841 -Currently, there is no support for providing automatic remote access 23.842 -to filesystems stored on local disk when a domain is migrated. 23.843 -Administrators should choose an appropriate storage solution 23.844 -(i.e. SAN, NAS, etc.) to ensure that domain filesystems are also 23.845 -available on their destination node. GNBD is a good method for 23.846 -exporting a volume from one machine to another. iSCSI can do a similar 23.847 -job, but is more complex to set up. 23.848 - 23.849 -When a domain migrates, it's MAC and IP address move with it, thus it 23.850 -is only possible to migrate VMs within the same layer-2 network and IP 23.851 -subnet. If the destination node is on a different subnet, the 23.852 -administrator would need to manually configure a suitable etherip or 23.853 -IP tunnel in the domain 0 of the remote node. 23.854 - 23.855 -A domain may be migrated using the \path{xm migrate} command. To 23.856 -live migrate a domain to another machine, we would use 23.857 -the command: 23.858 - 23.859 -\begin{verbatim} 23.860 -# xm migrate --live mydomain destination.ournetwork.com 23.861 -\end{verbatim} 23.862 - 23.863 -Without the \path{--live} flag, \xend simply stops the domain and 23.864 -copies the memory image over to the new node and restarts it. Since 23.865 -domains can have large allocations this can be quite time consuming, 23.866 -even on a Gigabit network. With the \path{--live} flag \xend attempts 23.867 -to keep the domain running while the migration is in progress, 23.868 -resulting in typical `downtimes' of just 60--300ms. 23.869 - 23.870 -For now it will be necessary to reconnect to the domain's console on 23.871 -the new machine using the \path{xm console} command. If a migrated 23.872 -domain has any open network connections then they will be preserved, 23.873 -so SSH connections do not have this limitation. 23.874 - 23.875 -\section{Managing Domain Memory} 23.876 - 23.877 -XenLinux domains have the ability to relinquish / reclaim machine 23.878 -memory at the request of the administrator or the user of the domain. 23.879 +%% Chapter Starting Additional Domains moved to start_addl_dom.tex 23.880 +\include{src/user/start_addl_dom} 23.881 23.882 -\subsection{Setting memory footprints from dom0} 23.883 - 23.884 -The machine administrator can request that a domain alter its memory 23.885 -footprint using the \path{xm set-mem} command. For instance, we can 23.886 -request that our example ttylinux domain reduce its memory footprint 23.887 -to 32 megabytes. 
23.888 - 23.889 -\begin{verbatim} 23.890 -# xm set-mem ttylinux 32 23.891 -\end{verbatim} 23.892 - 23.893 -We can now see the result of this in the output of \path{xm list}: 23.894 - 23.895 -\begin{verbatim} 23.896 -# xm list 23.897 -Name Id Mem(MB) CPU State Time(s) Console 23.898 -Domain-0 0 251 0 r---- 172.2 23.899 -ttylinux 5 31 0 -b--- 4.3 9605 23.900 -\end{verbatim} 23.901 - 23.902 -The domain has responded to the request by returning memory to Xen. We 23.903 -can restore the domain to its original size using the command line: 23.904 - 23.905 -\begin{verbatim} 23.906 -# xm set-mem ttylinux 64 23.907 -\end{verbatim} 23.908 - 23.909 -\subsection{Setting memory footprints from within a domain} 23.910 - 23.911 -The virtual file \path{/proc/xen/balloon} allows the owner of a 23.912 -domain to adjust their own memory footprint. Reading the file 23.913 -(e.g. \path{cat /proc/xen/balloon}) prints out the current 23.914 -memory footprint of the domain. Writing the file 23.915 -(e.g. \path{echo new\_target > /proc/xen/balloon}) requests 23.916 -that the kernel adjust the domain's memory footprint to a new value. 23.917 - 23.918 -\subsection{Setting memory limits} 23.919 - 23.920 -Xen associates a memory size limit with each domain. By default, this 23.921 -is the amount of memory the domain is originally started with, 23.922 -preventing the domain from ever growing beyond this size. To permit a 23.923 -domain to grow beyond its original allocation or to prevent a domain 23.924 -you've shrunk from reclaiming the memory it relinquished, use the 23.925 -\path{xm maxmem} command. 23.926 - 23.927 -\chapter{Domain Filesystem Storage} 23.928 - 23.929 -It is possible to directly export any Linux block device in dom0 to 23.930 -another domain, or to export filesystems / devices to virtual machines 23.931 -using standard network protocols (e.g. NBD, iSCSI, NFS, etc). This 23.932 -chapter covers some of the possibilities. 23.933 - 23.934 - 23.935 -\section{Exporting Physical Devices as VBDs} 23.936 -\label{s:exporting-physical-devices-as-vbds} 23.937 - 23.938 -One of the simplest configurations is to directly export 23.939 -individual partitions from domain 0 to other domains. To 23.940 -achieve this use the \path{phy:} specifier in your domain 23.941 -configuration file. For example a line like 23.942 -\begin{quote} 23.943 -\verb_disk = ['phy:hda3,sda1,w']_ 23.944 -\end{quote} 23.945 -specifies that the partition \path{/dev/hda3} in domain 0 23.946 -should be exported read-write to the new domain as \path{/dev/sda1}; 23.947 -one could equally well export it as \path{/dev/hda} or 23.948 -\path{/dev/sdb5} should one wish. 23.949 - 23.950 -In addition to local disks and partitions, it is possible to export 23.951 -any device that Linux considers to be ``a disk'' in the same manner. 23.952 -For example, if you have iSCSI disks or GNBD volumes imported into 23.953 -domain 0 you can export these to other domains using the \path{phy:} 23.954 -disk syntax. 
E.g.: 23.955 -\begin{quote} 23.956 -\verb_disk = ['phy:vg/lvm1,sda2,w']_ 23.957 -\end{quote} 23.958 - 23.959 - 23.960 - 23.961 -\begin{center} 23.962 -\framebox{\bf Warning: Block device sharing} 23.963 -\end{center} 23.964 -\begin{quote} 23.965 -Block devices should typically only be shared between domains in a 23.966 -read-only fashion otherwise the Linux kernel's file systems will get 23.967 -very confused as the file system structure may change underneath them 23.968 -(having the same ext3 partition mounted rw twice is a sure fire way to 23.969 -cause irreparable damage)! \Xend will attempt to prevent you from 23.970 -doing this by checking that the device is not mounted read-write in 23.971 -domain 0, and hasn't already been exported read-write to another 23.972 -domain. 23.973 -If you want read-write sharing, export the directory to other domains 23.974 -via NFS from domain0 (or use a cluster file system such as GFS or 23.975 -ocfs2). 23.976 - 23.977 -\end{quote} 23.978 - 23.979 - 23.980 -\section{Using File-backed VBDs} 23.981 - 23.982 -It is also possible to use a file in Domain 0 as the primary storage 23.983 -for a virtual machine. As well as being convenient, this also has the 23.984 -advantage that the virtual block device will be {\em sparse} --- space 23.985 -will only really be allocated as parts of the file are used. So if a 23.986 -virtual machine uses only half of its disk space then the file really 23.987 -takes up half of the size allocated. 23.988 - 23.989 -For example, to create a 2GB sparse file-backed virtual block device 23.990 -(actually only consumes 1KB of disk): 23.991 -\begin{quote} 23.992 -\verb_# dd if=/dev/zero of=vm1disk bs=1k seek=2048k count=1_ 23.993 -\end{quote} 23.994 - 23.995 -Make a file system in the disk file: 23.996 -\begin{quote} 23.997 -\verb_# mkfs -t ext3 vm1disk_ 23.998 -\end{quote} 23.999 - 23.1000 -(when the tool asks for confirmation, answer `y') 23.1001 - 23.1002 -Populate the file system e.g. by copying from the current root: 23.1003 -\begin{quote} 23.1004 -\begin{verbatim} 23.1005 -# mount -o loop vm1disk /mnt 23.1006 -# cp -ax /{root,dev,var,etc,usr,bin,sbin,lib} /mnt 23.1007 -# mkdir /mnt/{proc,sys,home,tmp} 23.1008 -\end{verbatim} 23.1009 -\end{quote} 23.1010 - 23.1011 -Tailor the file system by editing \path{/etc/fstab}, 23.1012 -\path{/etc/hostname}, etc (don't forget to edit the files in the 23.1013 -mounted file system, instead of your domain 0 filesystem, e.g. you 23.1014 -would edit \path{/mnt/etc/fstab} instead of \path{/etc/fstab} ). For 23.1015 -this example put \path{/dev/sda1} to root in fstab. 23.1016 - 23.1017 -Now unmount (this is important!): 23.1018 -\begin{quote} 23.1019 -\verb_# umount /mnt_ 23.1020 -\end{quote} 23.1021 - 23.1022 -In the configuration file set: 23.1023 -\begin{quote} 23.1024 -\verb_disk = ['file:/full/path/to/vm1disk,sda1,w']_ 23.1025 -\end{quote} 23.1026 +%% Chapter Domain Management Tools moved to domain_mgmt.tex 23.1027 +\include{src/user/domain_mgmt} 23.1028 23.1029 -As the virtual machine writes to its `disk', the sparse file will be 23.1030 -filled in and consume more space up to the original 2GB. 23.1031 - 23.1032 -{\bf Note that file-backed VBDs may not be appropriate for backing 23.1033 -I/O-intensive domains.} File-backed VBDs are known to experience 23.1034 -substantial slowdowns under heavy I/O workloads, due to the I/O handling 23.1035 -by the loopback block device used to support file-backed VBDs in dom0. 
23.1036 -Better I/O performance can be achieved by using either LVM-backed VBDs 23.1037 -(Section~\ref{s:using-lvm-backed-vbds}) or physical devices as VBDs 23.1038 -(Section~\ref{s:exporting-physical-devices-as-vbds}). 23.1039 - 23.1040 -Linux supports a maximum of eight file-backed VBDs across all domains by 23.1041 -default. This limit can be statically increased by using the {\em 23.1042 -max\_loop} module parameter if CONFIG\_BLK\_DEV\_LOOP is compiled as a 23.1043 -module in the dom0 kernel, or by using the {\em max\_loop=n} boot option 23.1044 -if CONFIG\_BLK\_DEV\_LOOP is compiled directly into the dom0 kernel. 23.1045 - 23.1046 - 23.1047 -\section{Using LVM-backed VBDs} 23.1048 -\label{s:using-lvm-backed-vbds} 23.1049 - 23.1050 -A particularly appealing solution is to use LVM volumes 23.1051 -as backing for domain file-systems since this allows dynamic 23.1052 -growing/shrinking of volumes as well as snapshot and other 23.1053 -features. 23.1054 - 23.1055 -To initialise a partition to support LVM volumes: 23.1056 -\begin{quote} 23.1057 -\begin{verbatim} 23.1058 -# pvcreate /dev/sda10 23.1059 -\end{verbatim} 23.1060 -\end{quote} 23.1061 - 23.1062 -Create a volume group named `vg' on the physical partition: 23.1063 -\begin{quote} 23.1064 -\begin{verbatim} 23.1065 -# vgcreate vg /dev/sda10 23.1066 -\end{verbatim} 23.1067 -\end{quote} 23.1068 - 23.1069 -Create a logical volume of size 4GB named `myvmdisk1': 23.1070 -\begin{quote} 23.1071 -\begin{verbatim} 23.1072 -# lvcreate -L4096M -n myvmdisk1 vg 23.1073 -\end{verbatim} 23.1074 -\end{quote} 23.1075 - 23.1076 -You should now see that you have a \path{/dev/vg/myvmdisk1} 23.1077 -Make a filesystem, mount it and populate it, e.g.: 23.1078 -\begin{quote} 23.1079 -\begin{verbatim} 23.1080 -# mkfs -t ext3 /dev/vg/myvmdisk1 23.1081 -# mount /dev/vg/myvmdisk1 /mnt 23.1082 -# cp -ax / /mnt 23.1083 -# umount /mnt 23.1084 -\end{verbatim} 23.1085 -\end{quote} 23.1086 - 23.1087 -Now configure your VM with the following disk configuration: 23.1088 -\begin{quote} 23.1089 -\begin{verbatim} 23.1090 - disk = [ 'phy:vg/myvmdisk1,sda1,w' ] 23.1091 -\end{verbatim} 23.1092 -\end{quote} 23.1093 - 23.1094 -LVM enables you to grow the size of logical volumes, but you'll need 23.1095 -to resize the corresponding file system to make use of the new 23.1096 -space. Some file systems (e.g. ext3) now support on-line resize. See 23.1097 -the LVM manuals for more details. 23.1098 +%% Chapter Domain Filesystem Storage moved to domain_filesystem.tex 23.1099 +\include{src/user/domain_filesystem} 23.1100 23.1101 -You can also use LVM for creating copy-on-write clones of LVM 23.1102 -volumes (known as writable persistent snapshots in LVM 23.1103 -terminology). This facility is new in Linux 2.6.8, so isn't as 23.1104 -stable as one might hope. In particular, using lots of CoW LVM 23.1105 -disks consumes a lot of dom0 memory, and error conditions such as 23.1106 -running out of disk space are not handled well. Hopefully this 23.1107 -will improve in future. 23.1108 - 23.1109 -To create two copy-on-write clone of the above file system you 23.1110 -would use the following commands: 23.1111 - 23.1112 -\begin{quote} 23.1113 -\begin{verbatim} 23.1114 -# lvcreate -s -L1024M -n myclonedisk1 /dev/vg/myvmdisk1 23.1115 -# lvcreate -s -L1024M -n myclonedisk2 /dev/vg/myvmdisk1 23.1116 -\end{verbatim} 23.1117 -\end{quote} 23.1118 - 23.1119 -Each of these can grow to have 1GB of differences from the master 23.1120 -volume. 
You can grow the amount of space for storing the 23.1121 -differences using the lvextend command, e.g.: 23.1122 -\begin{quote} 23.1123 -\begin{verbatim} 23.1124 -# lvextend +100M /dev/vg/myclonedisk1 23.1125 -\end{verbatim} 23.1126 -\end{quote} 23.1127 - 23.1128 -Don't let the `differences volume' ever fill up otherwise LVM gets 23.1129 -rather confused. It may be possible to automate the growing 23.1130 -process by using \path{dmsetup wait} to spot the volume getting full 23.1131 -and then issue an \path{lvextend}. 23.1132 - 23.1133 -In principle, it is possible to continue writing to the volume 23.1134 -that has been cloned (the changes will not be visible to the 23.1135 -clones), but we wouldn't recommend this: have the cloned volume 23.1136 -as a `pristine' file system install that isn't mounted directly 23.1137 -by any of the virtual machines. 23.1138 - 23.1139 - 23.1140 -\section{Using NFS Root} 23.1141 - 23.1142 -First, populate a root filesystem in a directory on the server 23.1143 -machine. This can be on a distinct physical machine, or simply 23.1144 -run within a virtual machine on the same node. 23.1145 - 23.1146 -Now configure the NFS server to export this filesystem over the 23.1147 -network by adding a line to \path{/etc/exports}, for instance: 23.1148 - 23.1149 -\begin{quote} 23.1150 -\begin{small} 23.1151 -\begin{verbatim} 23.1152 -/export/vm1root 1.2.3.4/24 (rw,sync,no_root_squash) 23.1153 -\end{verbatim} 23.1154 -\end{small} 23.1155 -\end{quote} 23.1156 - 23.1157 -Finally, configure the domain to use NFS root. In addition to the 23.1158 -normal variables, you should make sure to set the following values in 23.1159 -the domain's configuration file: 23.1160 - 23.1161 -\begin{quote} 23.1162 -\begin{small} 23.1163 -\begin{verbatim} 23.1164 -root = '/dev/nfs' 23.1165 -nfs_server = '2.3.4.5' # substitute IP address of server 23.1166 -nfs_root = '/path/to/root' # path to root FS on the server 23.1167 -\end{verbatim} 23.1168 -\end{small} 23.1169 -\end{quote} 23.1170 - 23.1171 -The domain will need network access at boot time, so either statically 23.1172 -configure an IP address (Using the config variables \path{ip}, 23.1173 -\path{netmask}, \path{gateway}, \path{hostname}) or enable DHCP ( 23.1174 -\path{dhcp='dhcp'}). 23.1175 - 23.1176 -Note that the Linux NFS root implementation is known to have stability 23.1177 -problems under high load (this is not a Xen-specific problem), so this 23.1178 -configuration may not be appropriate for critical servers. 23.1179 23.1180 23.1181 \part{User Reference Documentation} 23.1182 23.1183 -\chapter{Control Software} 23.1184 - 23.1185 -The Xen control software includes the \xend node control daemon (which 23.1186 -must be running), the xm command line tools, and the prototype 23.1187 -xensv web interface. 23.1188 - 23.1189 -\section{\Xend (node control daemon)} 23.1190 -\label{s:xend} 23.1191 - 23.1192 -The Xen Daemon (\Xend) performs system management functions related to 23.1193 -virtual machines. It forms a central point of control for a machine 23.1194 -and can be controlled using an HTTP-based protocol. \Xend must be 23.1195 -running in order to start and manage virtual machines. 23.1196 - 23.1197 -\Xend must be run as root because it needs access to privileged system 23.1198 -management functions. A small set of commands may be issued on the 23.1199 -\xend command line: 23.1200 - 23.1201 -\begin{tabular}{ll} 23.1202 -\verb!# xend start! & start \xend, if not already running \\ 23.1203 -\verb!# xend stop! 
& stop \xend if already running \\ 23.1204 -\verb!# xend restart! & restart \xend if running, otherwise start it \\ 23.1205 -% \verb!# xend trace_start! & start \xend, with very detailed debug logging \\ 23.1206 -\verb!# xend status! & indicates \xend status by its return code 23.1207 -\end{tabular} 23.1208 - 23.1209 -A SysV init script called {\tt xend} is provided to start \xend at boot 23.1210 -time. {\tt make install} installs this script in {\path{/etc/init.d}. 23.1211 -To enable it, you have to make symbolic links in the appropriate 23.1212 -runlevel directories or use the {\tt chkconfig} tool, where available. 23.1213 - 23.1214 -Once \xend is running, more sophisticated administration can be done 23.1215 -using the xm tool (see Section~\ref{s:xm}) and the experimental 23.1216 -Xensv web interface (see Section~\ref{s:xensv}). 23.1217 - 23.1218 -As \xend runs, events will be logged to \path{/var/log/xend.log} and, 23.1219 -if the migration assistant daemon (\path{xfrd}) has been started, 23.1220 -\path{/var/log/xfrd.log}. These may be of use for troubleshooting 23.1221 -problems. 23.1222 - 23.1223 -\section{Xm (command line interface)} 23.1224 -\label{s:xm} 23.1225 - 23.1226 -The xm tool is the primary tool for managing Xen from the console. 23.1227 -The general format of an xm command line is: 23.1228 - 23.1229 -\begin{verbatim} 23.1230 -# xm command [switches] [arguments] [variables] 23.1231 -\end{verbatim} 23.1232 - 23.1233 -The available {\em switches} and {\em arguments} are dependent on the 23.1234 -{\em command} chosen. The {\em variables} may be set using 23.1235 -declarations of the form {\tt variable=value} and command line 23.1236 -declarations override any of the values in the configuration file 23.1237 -being used, including the standard variables described above and any 23.1238 -custom variables (for instance, the \path{xmdefconfig} file uses a 23.1239 -{\tt vmid} variable). 23.1240 - 23.1241 -The available commands are as follows: 23.1242 - 23.1243 -\begin{description} 23.1244 -\item[set-mem] Request a domain to adjust its memory footprint. 23.1245 -\item[create] Create a new domain. 23.1246 -\item[destroy] Kill a domain immediately. 23.1247 -\item[list] List running domains. 23.1248 -\item[shutdown] Ask a domain to shutdown. 23.1249 -\item[dmesg] Fetch the Xen (not Linux!) boot output. 23.1250 -\item[consoles] Lists the available consoles. 23.1251 -\item[console] Connect to the console for a domain. 23.1252 -\item[help] Get help on xm commands. 23.1253 -\item[save] Suspend a domain to disk. 23.1254 -\item[restore] Restore a domain from disk. 23.1255 -\item[pause] Pause a domain's execution. 23.1256 -\item[unpause] Unpause a domain. 23.1257 -\item[pincpu] Pin a domain to a CPU. 23.1258 -\item[bvt] Set BVT scheduler parameters for a domain. 23.1259 -\item[bvt\_ctxallow] Set the BVT context switching allowance for the system. 23.1260 -\item[atropos] Set the atropos parameters for a domain. 23.1261 -\item[rrobin] Set the round robin time slice for the system. 23.1262 -\item[info] Get information about the Xen host. 23.1263 -\item[call] Call a \xend HTTP API function directly. 
23.1264 -\end{description} 23.1265 - 23.1266 -For a detailed overview of switches, arguments and variables to each command 23.1267 -try 23.1268 -\begin{quote} 23.1269 -\begin{verbatim} 23.1270 -# xm help command 23.1271 -\end{verbatim} 23.1272 -\end{quote} 23.1273 - 23.1274 -\section{Xensv (web control interface)} 23.1275 -\label{s:xensv} 23.1276 - 23.1277 -Xensv is the experimental web control interface for managing a Xen 23.1278 -machine. It can be used to perform some (but not yet all) of the 23.1279 -management tasks that can be done using the xm tool. 23.1280 - 23.1281 -It can be started using: 23.1282 -\begin{quote} 23.1283 -\verb_# xensv start_ 23.1284 -\end{quote} 23.1285 -and stopped using: 23.1286 -\begin{quote} 23.1287 -\verb_# xensv stop_ 23.1288 -\end{quote} 23.1289 - 23.1290 -By default, Xensv will serve out the web interface on port 8080. This 23.1291 -can be changed by editing 23.1292 -\path{/usr/lib/python2.3/site-packages/xen/sv/params.py}. 23.1293 - 23.1294 -Once Xensv is running, the web interface can be used to create and 23.1295 -manage running domains. 23.1296 - 23.1297 - 23.1298 - 23.1299 - 23.1300 -\chapter{Domain Configuration} 23.1301 -\label{cha:config} 23.1302 - 23.1303 -The following contains the syntax of the domain configuration 23.1304 -files and description of how to further specify networking, 23.1305 -driver domain and general scheduling behaviour. 23.1306 - 23.1307 -\section{Configuration Files} 23.1308 -\label{s:cfiles} 23.1309 - 23.1310 -Xen configuration files contain the following standard variables. 23.1311 -Unless otherwise stated, configuration items should be enclosed in 23.1312 -quotes: see \path{/etc/xen/xmexample1} and \path{/etc/xen/xmexample2} 23.1313 -for concrete examples of the syntax. 23.1314 - 23.1315 -\begin{description} 23.1316 -\item[kernel] Path to the kernel image 23.1317 -\item[ramdisk] Path to a ramdisk image (optional). 23.1318 -% \item[builder] The name of the domain build function (e.g. {\tt'linux'} or {\tt'netbsd'}. 23.1319 -\item[memory] Memory size in megabytes. 23.1320 -\item[cpu] CPU to run this domain on, or {\tt -1} for 23.1321 - auto-allocation. 23.1322 -\item[console] Port to export the domain console on (default 9600 + domain ID). 23.1323 -\item[nics] Number of virtual network interfaces. 23.1324 -\item[vif] List of MAC addresses (random addresses are assigned if not 23.1325 - given) and bridges to use for the domain's network interfaces, e.g. 23.1326 -\begin{verbatim} 23.1327 -vif = [ 'mac=aa:00:00:00:00:11, bridge=xen-br0', 23.1328 - 'bridge=xen-br1' ] 23.1329 -\end{verbatim} 23.1330 - to assign a MAC address and bridge to the first interface and assign 23.1331 - a different bridge to the second interface, leaving \xend to choose 23.1332 - the MAC address. 23.1333 -\item[disk] List of block devices to export to the domain, e.g. \\ 23.1334 - \verb_disk = [ 'phy:hda1,sda1,r' ]_ \\ 23.1335 - exports physical device \path{/dev/hda1} to the domain 23.1336 - as \path{/dev/sda1} with read-only access. Exporting a disk read-write 23.1337 - which is currently mounted is dangerous -- if you are \emph{certain} 23.1338 - you wish to do this, you can specify \path{w!} as the mode. 23.1339 -\item[dhcp] Set to {\tt 'dhcp'} if you want to use DHCP to configure 23.1340 - networking. 23.1341 -\item[netmask] Manually configured IP netmask. 23.1342 -\item[gateway] Manually configured IP gateway. 23.1343 -\item[hostname] Set the hostname for the virtual machine. 
23.1344 -\item[root] Specify the root device parameter on the kernel command 23.1345 - line. 23.1346 -\item[nfs\_server] IP address for the NFS server (if any). 23.1347 -\item[nfs\_root] Path of the root filesystem on the NFS server (if any). 23.1348 -\item[extra] Extra string to append to the kernel command line (if 23.1349 - any) 23.1350 -\item[restart] Three possible options: 23.1351 - \begin{description} 23.1352 - \item[always] Always restart the domain, no matter what 23.1353 - its exit code is. 23.1354 - \item[never] Never restart the domain. 23.1355 - \item[onreboot] Restart the domain iff it requests reboot. 23.1356 - \end{description} 23.1357 -\end{description} 23.1358 - 23.1359 -For additional flexibility, it is also possible to include Python 23.1360 -scripting commands in configuration files. An example of this is the 23.1361 -\path{xmexample2} file, which uses Python code to handle the 23.1362 -\path{vmid} variable. 23.1363 - 23.1364 - 23.1365 -%\part{Advanced Topics} 23.1366 - 23.1367 -\section{Network Configuration} 23.1368 - 23.1369 -For many users, the default installation should work `out of the box'. 23.1370 -More complicated network setups, for instance with multiple ethernet 23.1371 -interfaces and/or existing bridging setups will require some 23.1372 -special configuration. 23.1373 - 23.1374 -The purpose of this section is to describe the mechanisms provided by 23.1375 -\xend to allow a flexible configuration for Xen's virtual networking. 23.1376 - 23.1377 -\subsection{Xen virtual network topology} 23.1378 - 23.1379 -Each domain network interface is connected to a virtual network 23.1380 -interface in dom0 by a point to point link (effectively a `virtual 23.1381 -crossover cable'). These devices are named {\tt 23.1382 -vif$<$domid$>$.$<$vifid$>$} (e.g. {\tt vif1.0} for the first interface 23.1383 -in domain 1, {\tt vif3.1} for the second interface in domain 3). 23.1384 - 23.1385 -Traffic on these virtual interfaces is handled in domain 0 using 23.1386 -standard Linux mechanisms for bridging, routing, rate limiting, etc. 23.1387 -Xend calls on two shell scripts to perform initial configuration of 23.1388 -the network and configuration of new virtual interfaces. By default, 23.1389 -these scripts configure a single bridge for all the virtual 23.1390 -interfaces. Arbitrary routing / bridging configurations can be 23.1391 -configured by customising the scripts, as described in the following 23.1392 -section. 23.1393 - 23.1394 -\subsection{Xen networking scripts} 23.1395 - 23.1396 -Xen's virtual networking is configured by two shell scripts (by 23.1397 -default \path{network} and \path{vif-bridge}). These are 23.1398 -called automatically by \xend when certain events occur, with 23.1399 -arguments to the scripts providing further contextual information. 23.1400 -These scripts are found by default in \path{/etc/xen/scripts}. The 23.1401 -names and locations of the scripts can be configured in 23.1402 -\path{/etc/xen/xend-config.sxp}. 23.1403 - 23.1404 -\begin{description} 23.1405 - 23.1406 -\item[network:] This script is called whenever \xend is started or 23.1407 -stopped to respectively initialise or tear down the Xen virtual 23.1408 -network. In the default configuration initialisation creates the 23.1409 -bridge `xen-br0' and moves eth0 onto that bridge, modifying the 23.1410 -routing accordingly. When \xend exits, it deletes the Xen bridge and 23.1411 -removes eth0, restoring the normal IP and routing configuration. 
23.1412 - 23.1413 -%% In configurations where the bridge already exists, this script could 23.1414 -%% be replaced with a link to \path{/bin/true} (for instance). 23.1415 - 23.1416 -\item[vif-bridge:] This script is called for every domain virtual 23.1417 -interface and can configure firewalling rules and add the vif 23.1418 -to the appropriate bridge. By default, this adds and removes 23.1419 -VIFs on the default Xen bridge. 23.1420 - 23.1421 -\end{description} 23.1422 - 23.1423 -For more complex network setups (e.g. where routing is required or 23.1424 -integrate with existing bridges) these scripts may be replaced with 23.1425 -customised variants for your site's preferred configuration. 23.1426 - 23.1427 -%% There are two possible types of privileges: IO privileges and 23.1428 -%% administration privileges. 23.1429 - 23.1430 -\section{Driver Domain Configuration} 23.1431 - 23.1432 -I/O privileges can be assigned to allow a domain to directly access 23.1433 -PCI devices itself. This is used to support driver domains. 23.1434 - 23.1435 -Setting backend privileges is currently only supported in SXP format 23.1436 -config files. To allow a domain to function as a backend for others, 23.1437 -somewhere within the {\tt vm} element of its configuration file must 23.1438 -be a {\tt backend} element of the form {\tt (backend ({\em type}))} 23.1439 -where {\tt \em type} may be either {\tt netif} or {\tt blkif}, 23.1440 -according to the type of virtual device this domain will service. 23.1441 -%% After this domain has been built, \xend will connect all new and 23.1442 -%% existing {\em virtual} devices (of the appropriate type) to that 23.1443 -%% backend. 23.1444 - 23.1445 -Note that a block backend cannot currently import virtual block 23.1446 -devices from other domains, and a network backend cannot import 23.1447 -virtual network devices from other domains. Thus (particularly in the 23.1448 -case of block backends, which cannot import a virtual block device as 23.1449 -their root filesystem), you may need to boot a backend domain from a 23.1450 -ramdisk or a network device. 23.1451 - 23.1452 -Access to PCI devices may be configured on a per-device basis. Xen 23.1453 -will assign the minimal set of hardware privileges to a domain that 23.1454 -are required to control its devices. This can be configured in either 23.1455 -format of configuration file: 23.1456 - 23.1457 -\begin{itemize} 23.1458 -\item SXP Format: Include device elements of the form: \\ 23.1459 -\centerline{ {\tt (device (pci (bus {\em x}) (dev {\em y}) (func {\em z})))}} \\ 23.1460 - inside the top-level {\tt vm} element. Each one specifies the address 23.1461 - of a device this domain is allowed to access --- 23.1462 - the numbers {\em x},{\em y} and {\em z} may be in either decimal or 23.1463 - hexadecimal format. 23.1464 -\item Flat Format: Include a list of PCI device addresses of the 23.1465 - format: \\ 23.1466 -\centerline{{\tt pci = ['x,y,z', ...]}} \\ 23.1467 -where each element in the 23.1468 - list is a string specifying the components of the PCI device 23.1469 - address, separated by commas. The components ({\tt \em x}, {\tt \em 23.1470 - y} and {\tt \em z}) of the list may be formatted as either decimal 23.1471 - or hexadecimal. 23.1472 -\end{itemize} 23.1473 - 23.1474 -%% \section{Administration Domains} 23.1475 - 23.1476 -%% Administration privileges allow a domain to use the `dom0 23.1477 -%% operations' (so called because they are usually available only to 23.1478 -%% domain 0). 
A privileged domain can build other domains, set scheduling 23.1479 -%% parameters, etc. 23.1480 - 23.1481 -% Support for other administrative domains is not yet available... perhaps 23.1482 -% we should plumb it in some time 23.1483 - 23.1484 - 23.1485 - 23.1486 - 23.1487 - 23.1488 -\section{Scheduler Configuration} 23.1489 -\label{s:sched} 23.1490 - 23.1491 - 23.1492 -Xen offers a boot time choice between multiple schedulers. To select 23.1493 -a scheduler, pass the boot parameter {\em sched=sched\_name} to Xen, 23.1494 -substituting the appropriate scheduler name. Details of the schedulers 23.1495 -and their parameters are included below; future versions of the tools 23.1496 -will provide a higher-level interface to these tools. 23.1497 +%% Chapter Control Software moved to control_software.tex 23.1498 +\include{src/user/control_software} 23.1499 23.1500 -It is expected that system administrators configure their system to 23.1501 -use the scheduler most appropriate to their needs. Currently, the BVT 23.1502 -scheduler is the recommended choice. 23.1503 - 23.1504 -\subsection{Borrowed Virtual Time} 23.1505 - 23.1506 -{\tt sched=bvt} (the default) \\ 23.1507 - 23.1508 -BVT provides proportional fair shares of the CPU time. It has been 23.1509 -observed to penalise domains that block frequently (e.g. I/O intensive 23.1510 -domains), but this can be compensated for by using warping. 23.1511 - 23.1512 -\subsubsection{Global Parameters} 23.1513 - 23.1514 -\begin{description} 23.1515 -\item[ctx\_allow] 23.1516 - the context switch allowance is similar to the `quantum' 23.1517 - in traditional schedulers. It is the minimum time that 23.1518 - a scheduled domain will be allowed to run before being 23.1519 - pre-empted. 23.1520 -\end{description} 23.1521 - 23.1522 -\subsubsection{Per-domain parameters} 23.1523 - 23.1524 -\begin{description} 23.1525 -\item[mcuadv] 23.1526 - the MCU (Minimum Charging Unit) advance determines the 23.1527 - proportional share of the CPU that a domain receives. It 23.1528 - is set inversely proportionally to a domain's sharing weight. 23.1529 -\item[warp] 23.1530 - the amount of `virtual time' the domain is allowed to warp 23.1531 - backwards 23.1532 -\item[warpl] 23.1533 - the warp limit is the maximum time a domain can run warped for 23.1534 -\item[warpu] 23.1535 - the unwarp requirement is the minimum time a domain must 23.1536 - run unwarped for before it can warp again 23.1537 -\end{description} 23.1538 - 23.1539 -\subsection{Atropos} 23.1540 - 23.1541 -{\tt sched=atropos} \\ 23.1542 - 23.1543 -Atropos is a soft real time scheduler. It provides guarantees about 23.1544 -absolute shares of the CPU, with a facility for sharing 23.1545 -slack CPU time on a best-effort basis. It can provide timeliness 23.1546 -guarantees for latency-sensitive domains. 23.1547 - 23.1548 -Every domain has an associated period and slice. The domain should 23.1549 -receive `slice' nanoseconds every `period' nanoseconds. This allows 23.1550 -the administrator to configure both the absolute share of the CPU a 23.1551 -domain receives and the frequency with which it is scheduled. 23.1552 - 23.1553 -%% When 23.1554 -%% domains unblock, their period is reduced to the value of the latency 23.1555 -%% hint (the slice is scaled accordingly so that they still get the same 23.1556 -%% proportion of the CPU). For each subsequent period, the slice and 23.1557 -%% period times are doubled until they reach their original values. 
23.1558 - 23.1559 -Note: don't overcommit the CPU when using Atropos (i.e. don't reserve 23.1560 -more CPU than is available --- the utilisation should be kept to 23.1561 -slightly less than 100\% in order to ensure predictable behaviour). 23.1562 - 23.1563 -\subsubsection{Per-domain parameters} 23.1564 - 23.1565 -\begin{description} 23.1566 -\item[period] The regular time interval during which a domain is 23.1567 - guaranteed to receive its allocation of CPU time. 23.1568 -\item[slice] 23.1569 - The length of time per period that a domain is guaranteed to run 23.1570 - for (in the absence of voluntary yielding of the CPU). 23.1571 -\item[latency] 23.1572 - The latency hint is used to control how soon after 23.1573 - waking up a domain it should be scheduled. 23.1574 -\item[xtratime] This is a boolean flag that specifies whether a domain 23.1575 - should be allowed a share of the system slack time. 23.1576 -\end{description} 23.1577 - 23.1578 -\subsection{Round Robin} 23.1579 - 23.1580 -{\tt sched=rrobin} \\ 23.1581 - 23.1582 -The round robin scheduler is included as a simple demonstration of 23.1583 -Xen's internal scheduler API. It is not intended for production use. 23.1584 - 23.1585 -\subsubsection{Global Parameters} 23.1586 - 23.1587 -\begin{description} 23.1588 -\item[rr\_slice] 23.1589 - The maximum time each domain runs before the next 23.1590 - scheduling decision is made. 23.1591 -\end{description} 23.1592 - 23.1593 - 23.1594 - 23.1595 - 23.1596 - 23.1597 - 23.1598 - 23.1599 - 23.1600 - 23.1601 - 23.1602 - 23.1603 - 23.1604 -\chapter{Build, Boot and Debug options} 23.1605 - 23.1606 -This chapter describes the build- and boot-time options 23.1607 -which may be used to tailor your Xen system. 23.1608 - 23.1609 -\section{Xen Build Options} 23.1610 - 23.1611 -Xen provides a number of build-time options which should be 23.1612 -set as environment variables or passed on make's command-line. 23.1613 - 23.1614 -\begin{description} 23.1615 -\item[verbose=y] Enable debugging messages when Xen detects an unexpected condition. 23.1616 -Also enables console output from all domains. 23.1617 -\item[debug=y] 23.1618 -Enable debug assertions. Implies {\bf verbose=y}. 23.1619 -(Primarily useful for tracing bugs in Xen). 23.1620 -\item[debugger=y] 23.1621 -Enable the in-Xen debugger. This can be used to debug 23.1622 -Xen, guest OSes, and applications. 23.1623 -\item[perfc=y] 23.1624 -Enable performance counters for significant events 23.1625 -within Xen. The counts can be reset or displayed 23.1626 -on Xen's console via console control keys. 23.1627 -\item[trace=y] 23.1628 -Enable per-cpu trace buffers which log a range of 23.1629 -events within Xen for collection by control 23.1630 -software. 23.1631 -\end{description} 23.1632 - 23.1633 -\section{Xen Boot Options} 23.1634 -\label{s:xboot} 23.1635 - 23.1636 -These options are used to configure Xen's behaviour at runtime. They 23.1637 -should be appended to Xen's command line, either manually or by 23.1638 -editing \path{grub.conf}. 23.1639 - 23.1640 -\begin{description} 23.1641 -\item [noreboot ] 23.1642 - Don't reboot the machine automatically on errors. This is 23.1643 - useful to catch debug output if you aren't catching console messages 23.1644 - via the serial line. 23.1645 - 23.1646 -\item [nosmp ] 23.1647 - Disable SMP support. 23.1648 - This option is implied by `ignorebiostables'. 23.1649 - 23.1650 -\item [watchdog ] 23.1651 - Enable NMI watchdog which can report certain failures. 
23.1652 - 23.1653 -\item [noirqbalance ] 23.1654 - Disable software IRQ balancing and affinity. This can be used on 23.1655 - systems such as Dell 1850/2850 that have workarounds in hardware for 23.1656 - IRQ-routing issues. 23.1657 +%% Chapter Domain Configuration moved to domain_configuration.tex 23.1658 +\include{src/user/domain_configuration} 23.1659 23.1660 -\item [badpage=$<$page number$>$,$<$page number$>$, \ldots ] 23.1661 - Specify a list of pages not to be allocated for use 23.1662 - because they contain bad bytes. For example, if your 23.1663 - memory tester says that byte 0x12345678 is bad, you would 23.1664 - place `badpage=0x12345' on Xen's command line. 23.1665 - 23.1666 -\item [com1=$<$baud$>$,DPS,$<$io\_base$>$,$<$irq$>$ 23.1667 - com2=$<$baud$>$,DPS,$<$io\_base$>$,$<$irq$>$ ] \mbox{}\\ 23.1668 - Xen supports up to two 16550-compatible serial ports. 23.1669 - For example: `com1=9600, 8n1, 0x408, 5' maps COM1 to a 23.1670 - 9600-baud port, 8 data bits, no parity, 1 stop bit, 23.1671 - I/O port base 0x408, IRQ 5. 23.1672 - If some configuration options are standard (e.g., I/O base and IRQ), 23.1673 - then only a prefix of the full configuration string need be 23.1674 - specified. If the baud rate is pre-configured (e.g., by the 23.1675 - bootloader) then you can specify `auto' in place of a numeric baud 23.1676 - rate. 23.1677 - 23.1678 -\item [console=$<$specifier list$>$ ] 23.1679 - Specify the destination for Xen console I/O. 23.1680 - This is a comma-separated list of, for example: 23.1681 -\begin{description} 23.1682 - \item[vga] use VGA console and allow keyboard input 23.1683 - \item[com1] use serial port com1 23.1684 - \item[com2H] use serial port com2. Transmitted chars will 23.1685 - have the MSB set. Received chars must have 23.1686 - MSB set. 23.1687 - \item[com2L] use serial port com2. Transmitted chars will 23.1688 - have the MSB cleared. Received chars must 23.1689 - have MSB cleared. 23.1690 -\end{description} 23.1691 - The latter two examples allow a single port to be 23.1692 - shared by two subsystems (e.g. console and 23.1693 - debugger). Sharing is controlled by MSB of each 23.1694 - transmitted/received character. 23.1695 - [NB. Default for this option is `com1,vga'] 23.1696 - 23.1697 -\item [sync\_console ] 23.1698 - Force synchronous console output. This is useful if you system fails 23.1699 - unexpectedly before it has sent all available output to the 23.1700 - console. In most cases Xen will automatically enter synchronous mode 23.1701 - when an exceptional event occurs, but this option provides a manual 23.1702 - fallback. 23.1703 - 23.1704 -\item [conswitch=$<$switch-char$><$auto-switch-char$>$ ] 23.1705 - Specify how to switch serial-console input between 23.1706 - Xen and DOM0. The required sequence is CTRL-$<$switch-char$>$ 23.1707 - pressed three times. Specifying the backtick character 23.1708 - disables switching. 23.1709 - The $<$auto-switch-char$>$ specifies whether Xen should 23.1710 - auto-switch input to DOM0 when it boots --- if it is `x' 23.1711 - then auto-switching is disabled. Any other value, or 23.1712 - omitting the character, enables auto-switching. 23.1713 - [NB. default switch-char is `a'] 23.1714 - 23.1715 -\item [nmi=xxx ] 23.1716 - Specify what to do with an NMI parity or I/O error. \\ 23.1717 - `nmi=fatal': Xen prints a diagnostic and then hangs. \\ 23.1718 - `nmi=dom0': Inform DOM0 of the NMI. \\ 23.1719 - `nmi=ignore': Ignore the NMI. 23.1720 - 23.1721 -\item [mem=xxx ] 23.1722 - Set the physical RAM address limit. 
Any RAM appearing beyond this 23.1723 - physical address in the memory map will be ignored. This parameter 23.1724 - may be specified with a B, K, M or G suffix, representing bytes, 23.1725 - kilobytes, megabytes and gigabytes respectively. The 23.1726 - default unit, if no suffix is specified, is kilobytes. 23.1727 - 23.1728 -\item [dom0\_mem=xxx ] 23.1729 - Set the amount of memory to be allocated to domain0. In Xen 3.x the parameter 23.1730 - may be specified with a B, K, M or G suffix, representing bytes, 23.1731 - kilobytes, megabytes and gigabytes respectively; if no suffix is specified, 23.1732 - the parameter defaults to kilobytes. In previous versions of Xen, suffixes 23.1733 - were not supported and the value is always interpreted as kilobytes. 23.1734 - 23.1735 -\item [tbuf\_size=xxx ] 23.1736 - Set the size of the per-cpu trace buffers, in pages 23.1737 - (default 1). Note that the trace buffers are only 23.1738 - enabled in debug builds. Most users can ignore 23.1739 - this feature completely. 23.1740 - 23.1741 -\item [sched=xxx ] 23.1742 - Select the CPU scheduler Xen should use. The current 23.1743 - possibilities are `bvt' (default), `atropos' and `rrobin'. 23.1744 - For more information see Section~\ref{s:sched}. 23.1745 - 23.1746 -\item [apic\_verbosity=debug,verbose ] 23.1747 - Print more detailed information about local APIC and IOAPIC configuration. 23.1748 - 23.1749 -\item [lapic ] 23.1750 - Force use of local APIC even when left disabled by uniprocessor BIOS. 23.1751 - 23.1752 -\item [nolapic ] 23.1753 - Ignore local APIC in a uniprocessor system, even if enabled by the BIOS. 23.1754 - 23.1755 -\item [apic=bigsmp,default,es7000,summit ] 23.1756 - Specify NUMA platform. This can usually be probed automatically. 23.1757 - 23.1758 -\end{description} 23.1759 - 23.1760 -In addition, the following options may be specified on the Xen command 23.1761 -line. Since domain 0 shares responsibility for booting the platform, 23.1762 -Xen will automatically propagate these options to its command 23.1763 -line. These options are taken from Linux's command-line syntax with 23.1764 -unchanged semantics. 23.1765 - 23.1766 -\begin{description} 23.1767 -\item [acpi=off,force,strict,ht,noirq,\ldots ] 23.1768 - Modify how Xen (and domain 0) parses the BIOS ACPI tables. 23.1769 - 23.1770 -\item [acpi\_skip\_timer\_override ] 23.1771 - Instruct Xen (and domain 0) to ignore timer-interrupt override 23.1772 - instructions specified by the BIOS ACPI tables. 23.1773 - 23.1774 -\item [noapic ] 23.1775 - Instruct Xen (and domain 0) to ignore any IOAPICs that are present in 23.1776 - the system, and instead continue to use the legacy PIC. 23.1777 - 23.1778 -\end{description} 23.1779 - 23.1780 -\section{XenLinux Boot Options} 23.1781 - 23.1782 -In addition to the standard Linux kernel boot options, we support: 23.1783 -\begin{description} 23.1784 -\item[xencons=xxx ] Specify the device node to which the Xen virtual 23.1785 -console driver is attached. The following options are supported: 23.1786 -\begin{center} 23.1787 -\begin{tabular}{l} 23.1788 -`xencons=off': disable virtual console \\ 23.1789 -`xencons=tty': attach console to /dev/tty1 (tty0 at boot-time) \\ 23.1790 -`xencons=ttyS': attach console to /dev/ttyS0 23.1791 -\end{tabular} 23.1792 -\end{center} 23.1793 -The default is ttyS for dom0 and tty for all other domains. 
23.1794 -\end{description} 23.1795 - 23.1796 - 23.1797 - 23.1798 -\section{Debugging} 23.1799 -\label{s:keys} 23.1800 - 23.1801 -Xen has a set of debugging features that can be useful to try and 23.1802 -figure out what's going on. Hit 'h' on the serial line (if you 23.1803 -specified a baud rate on the Xen command line) or ScrollLock-h on the 23.1804 -keyboard to get a list of supported commands. 23.1805 - 23.1806 -If you have a crash you'll likely get a crash dump containing an EIP 23.1807 -(PC) which, along with an \path{objdump -d image}, can be useful in 23.1808 -figuring out what's happened. Debug a Xenlinux image just as you 23.1809 -would any other Linux kernel. 23.1810 - 23.1811 -%% We supply a handy debug terminal program which you can find in 23.1812 -%% \path{/usr/local/src/xen-2.0.bk/tools/misc/miniterm/} 23.1813 -%% This should be built and executed on another machine that is connected 23.1814 -%% via a null modem cable. Documentation is included. 23.1815 -%% Alternatively, if the Xen machine is connected to a serial-port server 23.1816 -%% then we supply a dumb TCP terminal client, {\tt xencons}. 23.1817 - 23.1818 - 23.1819 +%% Chapter Build, Boot and Debug Options moved to build.tex 23.1820 +\include{src/user/build} 23.1821 23.1822 23.1823 \chapter{Further Support} 23.1824 @@ -1875,6 +108,7 @@ directory of the Xen source distribution 23.1825 %Various HOWTOs are available in \path{docs/HOWTOS} but this content is 23.1826 %being integrated into this manual. 23.1827 23.1828 + 23.1829 \section{Online References} 23.1830 23.1831 The official Xen web site is found at: 23.1832 @@ -1885,6 +119,7 @@ The official Xen web site is found at: 23.1833 This contains links to the latest versions of all on-line 23.1834 documentation (including the lateset version of the FAQ). 23.1835 23.1836 + 23.1837 \section{Mailing Lists} 23.1838 23.1839 There are currently four official Xen mailing lists: 23.1840 @@ -1905,326 +140,18 @@ from the unstable and 2.0 trees - develo 23.1841 \end{description} 23.1842 23.1843 23.1844 + 23.1845 \appendix 23.1846 23.1847 +%% Chapter Installing Xen / XenLinux on Debian moved to debian.tex 23.1848 +\include{src/user/debian} 23.1849 + 23.1850 +%% Chapter Installing Xen on Red Hat moved to redhat.tex 23.1851 +\include{src/user/redhat} 23.1852 + 23.1853 23.1854 -\chapter{Installing Xen / XenLinux on Debian} 23.1855 - 23.1856 -The Debian project provides a tool called \path{debootstrap} which 23.1857 -allows a base Debian system to be installed into a filesystem without 23.1858 -requiring the host system to have any Debian-specific software (such 23.1859 -as \path{apt}. 23.1860 - 23.1861 -Here's some info how to install Debian 3.1 (Sarge) for an unprivileged 23.1862 -Xen domain: 23.1863 - 23.1864 -\begin{enumerate} 23.1865 -\item Set up Xen 2.0 and test that it's working, as described earlier in 23.1866 - this manual. 23.1867 - 23.1868 -\item Create disk images for root-fs and swap (alternatively, you 23.1869 - might create dedicated partitions, LVM logical volumes, etc. if 23.1870 - that suits your setup). 23.1871 -\begin{small}\begin{verbatim} 23.1872 -dd if=/dev/zero of=/path/diskimage bs=1024k count=size_in_mbytes 23.1873 -dd if=/dev/zero of=/path/swapimage bs=1024k count=size_in_mbytes 23.1874 -\end{verbatim}\end{small} 23.1875 - If you're going to use this filesystem / disk image only as a 23.1876 - `template' for other vm disk images, something like 300 MB should 23.1877 - be enough.. 
(of course it depends what kind of packages you are 23.1878 - planning to install to the template) 23.1879 - 23.1880 -\item Create the filesystem and initialise the swap image 23.1881 -\begin{small}\begin{verbatim} 23.1882 -mkfs.ext3 /path/diskimage 23.1883 -mkswap /path/swapimage 23.1884 -\end{verbatim}\end{small} 23.1885 - 23.1886 -\item Mount the disk image for installation 23.1887 -\begin{small}\begin{verbatim} 23.1888 -mount -o loop /path/diskimage /mnt/disk 23.1889 -\end{verbatim}\end{small} 23.1890 - 23.1891 -\item Install \path{debootstrap} 23.1892 - 23.1893 -Make sure you have debootstrap installed on the host. If you are 23.1894 -running Debian sarge (3.1 / testing) or unstable you can install it by 23.1895 -running \path{apt-get install debootstrap}. Otherwise, it can be 23.1896 -downloaded from the Debian project website. 23.1897 - 23.1898 -\item Install Debian base to the disk image: 23.1899 -\begin{small}\begin{verbatim} 23.1900 -debootstrap --arch i386 sarge /mnt/disk \ 23.1901 - http://ftp.<countrycode>.debian.org/debian 23.1902 -\end{verbatim}\end{small} 23.1903 - 23.1904 -You can use any other Debian http/ftp mirror you want. 23.1905 - 23.1906 -\item When debootstrap completes successfully, modify settings: 23.1907 -\begin{small}\begin{verbatim} 23.1908 -chroot /mnt/disk /bin/bash 23.1909 -\end{verbatim}\end{small} 23.1910 - 23.1911 -Edit the following files using vi or nano and make needed changes: 23.1912 -\begin{small}\begin{verbatim} 23.1913 -/etc/hostname 23.1914 -/etc/hosts 23.1915 -/etc/resolv.conf 23.1916 -/etc/network/interfaces 23.1917 -/etc/networks 23.1918 -\end{verbatim}\end{small} 23.1919 - 23.1920 -Set up access to the services, edit: 23.1921 -\begin{small}\begin{verbatim} 23.1922 -/etc/hosts.deny 23.1923 -/etc/hosts.allow 23.1924 -/etc/inetd.conf 23.1925 -\end{verbatim}\end{small} 23.1926 - 23.1927 -Add Debian mirror to: 23.1928 -\begin{small}\begin{verbatim} 23.1929 -/etc/apt/sources.list 23.1930 -\end{verbatim}\end{small} 23.1931 - 23.1932 -Create fstab like this: 23.1933 -\begin{small}\begin{verbatim} 23.1934 -/dev/sda1 / ext3 errors=remount-ro 0 1 23.1935 -/dev/sda2 none swap sw 0 0 23.1936 -proc /proc proc defaults 0 0 23.1937 -\end{verbatim}\end{small} 23.1938 - 23.1939 -Logout 23.1940 - 23.1941 -\item Unmount the disk image 23.1942 -\begin{small}\begin{verbatim} 23.1943 -umount /mnt/disk 23.1944 -\end{verbatim}\end{small} 23.1945 - 23.1946 -\item Create Xen 2.0 configuration file for the new domain. You can 23.1947 - use the example-configurations coming with Xen as a template. 23.1948 - 23.1949 - Make sure you have the following set up: 23.1950 -\begin{small}\begin{verbatim} 23.1951 -disk = [ 'file:/path/diskimage,sda1,w', 'file:/path/swapimage,sda2,w' ] 23.1952 -root = "/dev/sda1 ro" 23.1953 -\end{verbatim}\end{small} 23.1954 - 23.1955 -\item Start the new domain 23.1956 -\begin{small}\begin{verbatim} 23.1957 -xm create -f domain_config_file 23.1958 -\end{verbatim}\end{small} 23.1959 - 23.1960 -Check that the new domain is running: 23.1961 -\begin{small}\begin{verbatim} 23.1962 -xm list 23.1963 -\end{verbatim}\end{small} 23.1964 - 23.1965 -\item Attach to the console of the new domain. 23.1966 - You should see something like this when starting the new domain: 23.1967 - 23.1968 -\begin{small}\begin{verbatim} 23.1969 -Started domain testdomain2, console on port 9626 23.1970 -\end{verbatim}\end{small} 23.1971 - 23.1972 - There you can see the ID of the console: 26. 
You can also list 23.1973 - the consoles with \path{xm consoles} (ID is the last two 23.1974 - digits of the port number.) 23.1975 - 23.1976 - Attach to the console: 23.1977 - 23.1978 -\begin{small}\begin{verbatim} 23.1979 -xm console 26 23.1980 -\end{verbatim}\end{small} 23.1981 - 23.1982 - or by telnetting to the port 9626 of localhost (the xm console 23.1983 - program works better). 23.1984 - 23.1985 -\item Log in and run base-config 23.1986 - 23.1987 - As a default there's no password for the root. 23.1988 - 23.1989 - Check that everything looks OK, and the system started without 23.1990 - errors. Check that the swap is active, and the network settings are 23.1991 - correct. 23.1992 - 23.1993 - Run \path{/usr/sbin/base-config} to set up the Debian settings. 23.1994 - 23.1995 - Set up the password for root using passwd. 23.1996 - 23.1997 -\item Done. You can exit the console by pressing \path{Ctrl + ]} 23.1998 - 23.1999 -\end{enumerate} 23.2000 - 23.2001 -If you need to create new domains, you can just copy the contents of 23.2002 -the `template'-image to the new disk images, either by mounting the 23.2003 -template and the new image, and using \path{cp -a} or \path{tar} or by 23.2004 -simply copying the image file. Once this is done, modify the 23.2005 -image-specific settings (hostname, network settings, etc). 23.2006 - 23.2007 -\chapter{Installing Xen / XenLinux on Redhat or Fedora Core} 23.2008 - 23.2009 -When using Xen / XenLinux on a standard Linux distribution there are 23.2010 -a couple of things to watch out for: 23.2011 - 23.2012 -Note that, because domains>0 don't have any privileged access at all, 23.2013 -certain commands in the default boot sequence will fail e.g. attempts 23.2014 -to update the hwclock, change the console font, update the keytable 23.2015 -map, start apmd (power management), or gpm (mouse cursor). Either 23.2016 -ignore the errors (they should be harmless), or remove them from the 23.2017 -startup scripts. Deleting the following links are a good start: 23.2018 -{\path{S24pcmcia}}, {\path{S09isdn}}, 23.2019 -{\path{S17keytable}}, {\path{S26apmd}}, 23.2020 -{\path{S85gpm}}. 23.2021 - 23.2022 -If you want to use a single root file system that works cleanly for 23.2023 -both domain 0 and unprivileged domains, a useful trick is to use 23.2024 -different 'init' run levels. For example, use 23.2025 -run level 3 for domain 0, and run level 4 for other domains. This 23.2026 -enables different startup scripts to be run in depending on the run 23.2027 -level number passed on the kernel command line. 23.2028 - 23.2029 -If using NFS root files systems mounted either from an 23.2030 -external server or from domain0 there are a couple of other gotchas. 23.2031 -The default {\path{/etc/sysconfig/iptables}} rules block NFS, so part 23.2032 -way through the boot sequence things will suddenly go dead. 23.2033 - 23.2034 -If you're planning on having a separate NFS {\path{/usr}} partition, the 23.2035 -RH9 boot scripts don't make life easy - they attempt to mount NFS file 23.2036 -systems way to late in the boot process. 
The easiest way I found to do 23.2037 -this was to have a {\path{/linuxrc}} script run ahead of 23.2038 -{\path{/sbin/init}} that mounts {\path{/usr}}: 23.2039 - 23.2040 -\begin{quote} 23.2041 -\begin{small}\begin{verbatim} 23.2042 - #!/bin/bash 23.2043 - /sbin/ipconfig lo 127.0.0.1 23.2044 - /sbin/portmap 23.2045 - /bin/mount /usr 23.2046 - exec /sbin/init "$@" <>/dev/console 2>&1 23.2047 -\end{verbatim}\end{small} 23.2048 -\end{quote} 23.2049 - 23.2050 -%$ XXX SMH: font lock fix :-) 23.2051 - 23.2052 -The one slight complication with the above is that 23.2053 -{\path{/sbin/portmap}} is dynamically linked against 23.2054 -{\path{/usr/lib/libwrap.so.0}} Since this is in 23.2055 -{\path{/usr}}, it won't work. This can be solved by copying the 23.2056 -file (and link) below the /usr mount point, and just let the file be 23.2057 -'covered' when the mount happens. 23.2058 - 23.2059 -In some installations, where a shared read-only {\path{/usr}} is 23.2060 -being used, it may be desirable to move other large directories over 23.2061 -into the read-only {\path{/usr}}. For example, you might replace 23.2062 -{\path{/bin}}, {\path{/lib}} and {\path{/sbin}} with 23.2063 -links into {\path{/usr/root/bin}}, {\path{/usr/root/lib}} 23.2064 -and {\path{/usr/root/sbin}} respectively. This creates other 23.2065 -problems for running the {\path{/linuxrc}} script, requiring 23.2066 -bash, portmap, mount, ifconfig, and a handful of other shared 23.2067 -libraries to be copied below the mount point --- a simple 23.2068 -statically-linked C program would solve this problem. 23.2069 - 23.2070 - 23.2071 - 23.2072 - 23.2073 -\chapter{Glossary of Terms} 23.2074 - 23.2075 -\begin{description} 23.2076 -\item[Atropos] One of the CPU schedulers provided by Xen. 23.2077 - Atropos provides domains with absolute shares 23.2078 - of the CPU, with timeliness guarantees and a 23.2079 - mechanism for sharing out `slack time'. 23.2080 - 23.2081 -\item[BVT] The BVT scheduler is used to give proportional 23.2082 - fair shares of the CPU to domains. 23.2083 - 23.2084 -\item[Exokernel] A minimal piece of privileged code, similar to 23.2085 - a {\bf microkernel} but providing a more 23.2086 - `hardware-like' interface to the tasks it 23.2087 - manages. This is similar to a paravirtualising 23.2088 - VMM like {\bf Xen} but was designed as a new 23.2089 - operating system structure, rather than 23.2090 - specifically to run multiple conventional OSs. 23.2091 - 23.2092 -\item[Domain] A domain is the execution context that 23.2093 - contains a running {\bf virtual machine}. 23.2094 - The relationship between virtual machines 23.2095 - and domains on Xen is similar to that between 23.2096 - programs and processes in an operating 23.2097 - system: a virtual machine is a persistent 23.2098 - entity that resides on disk (somewhat like 23.2099 - a program). When it is loaded for execution, 23.2100 - it runs in a domain. Each domain has a 23.2101 - {\bf domain ID}. 23.2102 - 23.2103 -\item[Domain 0] The first domain to be started on a Xen 23.2104 - machine. Domain 0 is responsible for managing 23.2105 - the system. 23.2106 - 23.2107 -\item[Domain ID] A unique identifier for a {\bf domain}, 23.2108 - analogous to a process ID in an operating 23.2109 - system. 23.2110 - 23.2111 -\item[Full virtualisation] An approach to virtualisation which 23.2112 - requires no modifications to the hosted 23.2113 - operating system, providing the illusion of 23.2114 - a complete system of real hardware devices. 
23.2115 - 23.2116 -\item[Hypervisor] An alternative term for {\bf VMM}, used 23.2117 - because it means `beyond supervisor', 23.2118 - since it is responsible for managing multiple 23.2119 - `supervisor' kernels. 23.2120 - 23.2121 -\item[Live migration] A technique for moving a running virtual 23.2122 - machine to another physical host, without 23.2123 - stopping it or the services running on it. 23.2124 - 23.2125 -\item[Microkernel] A small base of code running at the highest 23.2126 - hardware privilege level. A microkernel is 23.2127 - responsible for sharing CPU and memory (and 23.2128 - sometimes other devices) between less 23.2129 - privileged tasks running on the system. 23.2130 - This is similar to a VMM, particularly a 23.2131 - {\bf paravirtualising} VMM but typically 23.2132 - addressing a different problem space and 23.2133 - providing different kind of interface. 23.2134 - 23.2135 -\item[NetBSD/Xen] A port of NetBSD to the Xen architecture. 23.2136 - 23.2137 -\item[Paravirtualisation] An approach to virtualisation which requires 23.2138 - modifications to the operating system in 23.2139 - order to run in a virtual machine. Xen 23.2140 - uses paravirtualisation but preserves 23.2141 - binary compatibility for user space 23.2142 - applications. 23.2143 - 23.2144 -\item[Shadow pagetables] A technique for hiding the layout of machine 23.2145 - memory from a virtual machine's operating 23.2146 - system. Used in some {\bf VMMs} to provide 23.2147 - the illusion of contiguous physical memory, 23.2148 - in Xen this is used during 23.2149 - {\bf live migration}. 23.2150 - 23.2151 -\item[Virtual Machine] The environment in which a hosted operating 23.2152 - system runs, providing the abstraction of a 23.2153 - dedicated machine. A virtual machine may 23.2154 - be identical to the underlying hardware (as 23.2155 - in {\bf full virtualisation}, or it may 23.2156 - differ, as in {\bf paravirtualisation}. 23.2157 - 23.2158 -\item[VMM] Virtual Machine Monitor - the software that 23.2159 - allows multiple virtual machines to be 23.2160 - multiplexed on a single physical machine. 23.2161 - 23.2162 -\item[Xen] Xen is a paravirtualising virtual machine 23.2163 - monitor, developed primarily by the 23.2164 - Systems Research Group at the University 23.2165 - of Cambridge Computer Laboratory. 23.2166 - 23.2167 -\item[XenLinux] Official name for the port of the Linux kernel 23.2168 - that runs on Xen. 23.2169 - 23.2170 -\end{description} 23.2171 +%% Chapter Glossary of Terms moved to glossary.tex 23.2172 +\include{src/user/glossary} 23.2173 23.2174 23.2175 \end{document}
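The Debian chapter above suggests creating further guests by copying the contents of the `template' image and then adjusting the image-specific settings (hostname, network settings, etc.). A minimal sketch of that workflow, assuming loopback-mounted file images and purely illustrative paths:

\begin{small}\begin{verbatim}
# Copy the template image for a new guest (paths are examples only).
cp /path/template-diskimage /path/newvm-diskimage
mount -o loop /path/newvm-diskimage /mnt/disk
# Adjust the image-specific settings listed in the chapter above.
echo newvm > /mnt/disk/etc/hostname
vi /mnt/disk/etc/hosts /mnt/disk/etc/network/interfaces
umount /mnt/disk
\end{verbatim}\end{small}

A separate swap image and a domain configuration file are still needed for the new guest, as described in the chapter.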
24.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 24.2 +++ b/docs/src/user/build.tex Thu Sep 22 11:42:01 2005 -0600 24.3 @@ -0,0 +1,170 @@ 24.4 +\chapter{Build, Boot and Debug Options} 24.5 + 24.6 +This chapter describes the build- and boot-time options which may be 24.7 +used to tailor your Xen system. 24.8 + 24.9 + 24.10 +\section{Xen Build Options} 24.11 + 24.12 +Xen provides a number of build-time options which should be set as 24.13 +environment variables or passed on make's command-line. 24.14 + 24.15 +\begin{description} 24.16 +\item[verbose=y] Enable debugging messages when Xen detects an 24.17 + unexpected condition. Also enables console output from all domains. 24.18 +\item[debug=y] Enable debug assertions. Implies {\bf verbose=y}. 24.19 + (Primarily useful for tracing bugs in Xen). 24.20 +\item[debugger=y] Enable the in-Xen debugger. This can be used to 24.21 + debug Xen, guest OSes, and applications. 24.22 +\item[perfc=y] Enable performance counters for significant events 24.23 + within Xen. The counts can be reset or displayed on Xen's console 24.24 + via console control keys. 24.25 +\item[trace=y] Enable per-cpu trace buffers which log a range of 24.26 + events within Xen for collection by control software. 24.27 +\end{description} 24.28 + 24.29 + 24.30 +\section{Xen Boot Options} 24.31 +\label{s:xboot} 24.32 + 24.33 +These options are used to configure Xen's behaviour at runtime. They 24.34 +should be appended to Xen's command line, either manually or by 24.35 +editing \path{grub.conf}. 24.36 + 24.37 +\begin{description} 24.38 +\item [ noreboot ] Don't reboot the machine automatically on errors. 24.39 + This is useful to catch debug output if you aren't catching console 24.40 + messages via the serial line. 24.41 +\item [ nosmp ] Disable SMP support. This option is implied by 24.42 + `ignorebiostables'. 24.43 +\item [ watchdog ] Enable NMI watchdog which can report certain 24.44 + failures. 24.45 +\item [ noirqbalance ] Disable software IRQ balancing and affinity. 24.46 + This can be used on systems such as Dell 1850/2850 that have 24.47 + workarounds in hardware for IRQ-routing issues. 24.48 +\item [ badpage=$<$page number$>$,$<$page number$>$, \ldots ] Specify 24.49 + a list of pages not to be allocated for use because they contain bad 24.50 + bytes. For example, if your memory tester says that byte 0x12345678 24.51 + is bad, you would place `badpage=0x12345' on Xen's command line. 24.52 +\item [ com1=$<$baud$>$,DPS,$<$io\_base$>$,$<$irq$>$ 24.53 + com2=$<$baud$>$,DPS,$<$io\_base$>$,$<$irq$>$ ] \mbox{}\\ 24.54 + Xen supports up to two 16550-compatible serial ports. For example: 24.55 + `com1=9600, 8n1, 0x408, 5' maps COM1 to a 9600-baud port, 8 data 24.56 + bits, no parity, 1 stop bit, I/O port base 0x408, IRQ 5. If some 24.57 + configuration options are standard (e.g., I/O base and IRQ), then 24.58 + only a prefix of the full configuration string need be specified. If 24.59 + the baud rate is pre-configured (e.g., by the bootloader) then you 24.60 + can specify `auto' in place of a numeric baud rate. 24.61 +\item [ console=$<$specifier list$>$ ] Specify the destination for Xen 24.62 + console I/O. This is a comma-separated list of, for example: 24.63 + \begin{description} 24.64 + \item[ vga ] Use VGA console and allow keyboard input. 24.65 + \item[ com1 ] Use serial port com1. 24.66 + \item[ com2H ] Use serial port com2. Transmitted chars will have the 24.67 + MSB set. Received chars must have MSB set. 24.68 + \item[ com2L] Use serial port com2. 
Transmitted chars will have the 24.69 + MSB cleared. Received chars must have MSB cleared. 24.70 + \end{description} 24.71 + The latter two examples allow a single port to be shared by two 24.72 + subsystems (e.g.\ console and debugger). Sharing is controlled by 24.73 + MSB of each transmitted/received character. [NB. Default for this 24.74 + option is `com1,vga'] 24.75 +\item [ sync\_console ] Force synchronous console output. This is 24.76 + useful if you system fails unexpectedly before it has sent all 24.77 + available output to the console. In most cases Xen will 24.78 + automatically enter synchronous mode when an exceptional event 24.79 + occurs, but this option provides a manual fallback. 24.80 +\item [ conswitch=$<$switch-char$><$auto-switch-char$>$ ] Specify how 24.81 + to switch serial-console input between Xen and DOM0. The required 24.82 + sequence is CTRL-$<$switch-char$>$ pressed three times. Specifying 24.83 + the backtick character disables switching. The 24.84 + $<$auto-switch-char$>$ specifies whether Xen should auto-switch 24.85 + input to DOM0 when it boots --- if it is `x' then auto-switching is 24.86 + disabled. Any other value, or omitting the character, enables 24.87 + auto-switching. [NB. Default switch-char is `a'.] 24.88 +\item [ nmi=xxx ] 24.89 + Specify what to do with an NMI parity or I/O error. \\ 24.90 + `nmi=fatal': Xen prints a diagnostic and then hangs. \\ 24.91 + `nmi=dom0': Inform DOM0 of the NMI. \\ 24.92 + `nmi=ignore': Ignore the NMI. 24.93 +\item [ mem=xxx ] Set the physical RAM address limit. Any RAM 24.94 + appearing beyond this physical address in the memory map will be 24.95 + ignored. This parameter may be specified with a B, K, M or G suffix, 24.96 + representing bytes, kilobytes, megabytes and gigabytes respectively. 24.97 + The default unit, if no suffix is specified, is kilobytes. 24.98 +\item [ dom0\_mem=xxx ] Set the amount of memory to be allocated to 24.99 + domain0. In Xen 3.x the parameter may be specified with a B, K, M or 24.100 + G suffix, representing bytes, kilobytes, megabytes and gigabytes 24.101 + respectively; if no suffix is specified, the parameter defaults to 24.102 + kilobytes. In previous versions of Xen, suffixes were not supported 24.103 + and the value is always interpreted as kilobytes. 24.104 +\item [ tbuf\_size=xxx ] Set the size of the per-cpu trace buffers, in 24.105 + pages (default 1). Note that the trace buffers are only enabled in 24.106 + debug builds. Most users can ignore this feature completely. 24.107 +\item [ sched=xxx ] Select the CPU scheduler Xen should use. The 24.108 + current possibilities are `bvt' (default), `atropos' and `rrobin'. 24.109 + For more information see Section~\ref{s:sched}. 24.110 +\item [ apic\_verbosity=debug,verbose ] Print more detailed 24.111 + information about local APIC and IOAPIC configuration. 24.112 +\item [ lapic ] Force use of local APIC even when left disabled by 24.113 + uniprocessor BIOS. 24.114 +\item [ nolapic ] Ignore local APIC in a uniprocessor system, even if 24.115 + enabled by the BIOS. 24.116 +\item [ apic=bigsmp,default,es7000,summit ] Specify NUMA platform. 24.117 + This can usually be probed automatically. 24.118 +\end{description} 24.119 + 24.120 +In addition, the following options may be specified on the Xen command 24.121 +line. Since domain 0 shares responsibility for booting the platform, 24.122 +Xen will automatically propagate these options to its command line. 
24.123 +These options are taken from Linux's command-line syntax with 24.124 +unchanged semantics. 24.125 + 24.126 +\begin{description} 24.127 +\item [ acpi=off,force,strict,ht,noirq,\ldots ] Modify how Xen (and 24.128 + domain 0) parses the BIOS ACPI tables. 24.129 +\item [ acpi\_skip\_timer\_override ] Instruct Xen (and domain~0) to 24.130 + ignore timer-interrupt override instructions specified by the BIOS 24.131 + ACPI tables. 24.132 +\item [ noapic ] Instruct Xen (and domain~0) to ignore any IOAPICs 24.133 + that are present in the system, and instead continue to use the 24.134 + legacy PIC. 24.135 +\end{description} 24.136 + 24.137 + 24.138 +\section{XenLinux Boot Options} 24.139 + 24.140 +In addition to the standard Linux kernel boot options, we support: 24.141 +\begin{description} 24.142 +\item[ xencons=xxx ] Specify the device node to which the Xen virtual 24.143 + console driver is attached. The following options are supported: 24.144 + \begin{center} 24.145 + \begin{tabular}{l} 24.146 + `xencons=off': disable virtual console \\ 24.147 + `xencons=tty': attach console to /dev/tty1 (tty0 at boot-time) \\ 24.148 + `xencons=ttyS': attach console to /dev/ttyS0 24.149 + \end{tabular} 24.150 +\end{center} 24.151 +The default is ttyS for dom0 and tty for all other domains. 24.152 +\end{description} 24.153 + 24.154 + 24.155 +\section{Debugging} 24.156 +\label{s:keys} 24.157 + 24.158 +Xen has a set of debugging features that can be useful to try and 24.159 +figure out what's going on. Hit `h' on the serial line (if you 24.160 +specified a baud rate on the Xen command line) or ScrollLock-h on the 24.161 +keyboard to get a list of supported commands. 24.162 + 24.163 +If you have a crash you'll likely get a crash dump containing an EIP 24.164 +(PC) which, along with an \path{objdump -d image}, can be useful in 24.165 +figuring out what's happened. Debug a Xenlinux image just as you 24.166 +would any other Linux kernel. 24.167 + 24.168 +%% We supply a handy debug terminal program which you can find in 24.169 +%% \path{/usr/local/src/xen-2.0.bk/tools/misc/miniterm/} This should 24.170 +%% be built and executed on another machine that is connected via a 24.171 +%% null modem cable. Documentation is included. Alternatively, if the 24.172 +%% Xen machine is connected to a serial-port server then we supply a 24.173 +%% dumb TCP terminal client, {\tt xencons}.
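The boot options above are normally combined on the line of the GRUB entry that loads Xen. A minimal sketch of a \path{grub.conf} stanza follows; the image names, memory size and root device here are illustrative placeholders rather than prescribed values:

\begin{verbatim}
title Xen
    kernel /boot/xen.gz dom0_mem=262144 com1=115200,8n1 console=com1,vga
    module /boot/vmlinuz-2.6-xen0 root=/dev/hda3 ro
\end{verbatim}

Here \path{dom0_mem} is given in kilobytes (the default unit), \path{com1} uses the prefix form described above, and the XenLinux kernel with its own arguments is supplied on the \path{module} line.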
25.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 25.2 +++ b/docs/src/user/control_software.tex Thu Sep 22 11:42:01 2005 -0600 25.3 @@ -0,0 +1,115 @@ 25.4 +\chapter{Control Software} 25.5 + 25.6 +The Xen control software includes the \xend\ node control daemon 25.7 +(which must be running), the xm command line tools, and the prototype 25.8 +xensv web interface. 25.9 + 25.10 +\section{\Xend\ (node control daemon)} 25.11 +\label{s:xend} 25.12 + 25.13 +The Xen Daemon (\Xend) performs system management functions related to 25.14 +virtual machines. It forms a central point of control for a machine 25.15 +and can be controlled using an HTTP-based protocol. \Xend\ must be 25.16 +running in order to start and manage virtual machines. 25.17 + 25.18 +\Xend\ must be run as root because it needs access to privileged 25.19 +system management functions. A small set of commands may be issued on 25.20 +the \xend\ command line: 25.21 + 25.22 +\begin{tabular}{ll} 25.23 + \verb!# xend start! & start \xend, if not already running \\ 25.24 + \verb!# xend stop! & stop \xend\ if already running \\ 25.25 + \verb!# xend restart! & restart \xend\ if running, otherwise start it \\ 25.26 + % \verb!# xend trace_start! & start \xend, with very detailed debug logging \\ 25.27 + \verb!# xend status! & indicates \xend\ status by its return code 25.28 +\end{tabular} 25.29 + 25.30 +A SysV init script called {\tt xend} is provided to start \xend\ at 25.31 +boot time. {\tt make install} installs this script in 25.32 +\path{/etc/init.d}. To enable it, you have to make symbolic links in 25.33 +the appropriate runlevel directories or use the {\tt chkconfig} tool, 25.34 +where available. 25.35 + 25.36 +Once \xend\ is running, more sophisticated administration can be done 25.37 +using the xm tool (see Section~\ref{s:xm}) and the experimental Xensv 25.38 +web interface (see Section~\ref{s:xensv}). 25.39 + 25.40 +As \xend\ runs, events will be logged to \path{/var/log/xend.log} and, 25.41 +if the migration assistant daemon (\path{xfrd}) has been started, 25.42 +\path{/var/log/xfrd.log}. These may be of use for troubleshooting 25.43 +problems. 25.44 + 25.45 +\section{Xm (command line interface)} 25.46 +\label{s:xm} 25.47 + 25.48 +The xm tool is the primary tool for managing Xen from the console. 25.49 +The general format of an xm command line is: 25.50 + 25.51 +\begin{verbatim} 25.52 +# xm command [switches] [arguments] [variables] 25.53 +\end{verbatim} 25.54 + 25.55 +The available \emph{switches} and \emph{arguments} are dependent on 25.56 +the \emph{command} chosen. The \emph{variables} may be set using 25.57 +declarations of the form {\tt variable=value} and command line 25.58 +declarations override any of the values in the configuration file 25.59 +being used, including the standard variables described above and any 25.60 +custom variables (for instance, the \path{xmdefconfig} file uses a 25.61 +{\tt vmid} variable). 25.62 + 25.63 +The available commands are as follows: 25.64 + 25.65 +\begin{description} 25.66 +\item[set-mem] Request a domain to adjust its memory footprint. 25.67 +\item[create] Create a new domain. 25.68 +\item[destroy] Kill a domain immediately. 25.69 +\item[list] List running domains. 25.70 +\item[shutdown] Ask a domain to shutdown. 25.71 +\item[dmesg] Fetch the Xen (not Linux!) boot output. 25.72 +\item[consoles] Lists the available consoles. 25.73 +\item[console] Connect to the console for a domain. 25.74 +\item[help] Get help on xm commands. 25.75 +\item[save] Suspend a domain to disk. 
25.76 +\item[restore] Restore a domain from disk. 25.77 +\item[pause] Pause a domain's execution. 25.78 +\item[unpause] Un-pause a domain. 25.79 +\item[pincpu] Pin a domain to a CPU. 25.80 +\item[bvt] Set BVT scheduler parameters for a domain. 25.81 +\item[bvt\_ctxallow] Set the BVT context switching allowance for the 25.82 + system. 25.83 +\item[atropos] Set the atropos parameters for a domain. 25.84 +\item[rrobin] Set the round robin time slice for the system. 25.85 +\item[info] Get information about the Xen host. 25.86 +\item[call] Call a \xend\ HTTP API function directly. 25.87 +\end{description} 25.88 + 25.89 +For a detailed overview of switches, arguments and variables to each 25.90 +command try 25.91 +\begin{quote} 25.92 +\begin{verbatim} 25.93 +# xm help command 25.94 +\end{verbatim} 25.95 +\end{quote} 25.96 + 25.97 +\section{Xensv (web control interface)} 25.98 +\label{s:xensv} 25.99 + 25.100 +Xensv is the experimental web control interface for managing a Xen 25.101 +machine. It can be used to perform some (but not yet all) of the 25.102 +management tasks that can be done using the xm tool. 25.103 + 25.104 +It can be started using: 25.105 +\begin{quote} 25.106 + \verb_# xensv start_ 25.107 +\end{quote} 25.108 +and stopped using: 25.109 +\begin{quote} 25.110 + \verb_# xensv stop_ 25.111 +\end{quote} 25.112 + 25.113 +By default, Xensv will serve out the web interface on port 8080. This 25.114 +can be changed by editing 25.115 +\path{/usr/lib/python2.3/site-packages/xen/sv/params.py}. 25.116 + 25.117 +Once Xensv is running, the web interface can be used to create and 25.118 +manage running domains.
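As a concrete illustration of the variable handling described above, a command-line declaration overrides the corresponding value in the configuration file. For example, using one of the example configuration files shipped with Xen (the path and the \path{vmid} variable follow the example files; adjust them to your setup):

\begin{verbatim}
# xm create -f /etc/xen/xmexample2 vmid=3
# xm list
\end{verbatim}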
26.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 26.2 +++ b/docs/src/user/debian.tex Thu Sep 22 11:42:01 2005 -0600 26.3 @@ -0,0 +1,154 @@ 26.4 +\chapter{Installing Xen / XenLinux on Debian} 26.5 + 26.6 +The Debian project provides a tool called \path{debootstrap} which 26.7 +allows a base Debian system to be installed into a filesystem without 26.8 +requiring the host system to have any Debian-specific software (such 26.9 +as \path{apt}). 26.10 + 26.11 +Here's some info how to install Debian 3.1 (Sarge) for an unprivileged 26.12 +Xen domain: 26.13 + 26.14 +\begin{enumerate} 26.15 + 26.16 +\item Set up Xen and test that it's working, as described earlier in 26.17 + this manual. 26.18 + 26.19 +\item Create disk images for rootfs and swap. Alternatively, you might 26.20 + create dedicated partitions, LVM logical volumes, etc.\ if that 26.21 + suits your setup. 26.22 +\begin{verbatim} 26.23 +dd if=/dev/zero of=/path/diskimage bs=1024k count=size_in_mbytes 26.24 +dd if=/dev/zero of=/path/swapimage bs=1024k count=size_in_mbytes 26.25 +\end{verbatim} 26.26 + 26.27 + If you're going to use this filesystem / disk image only as a 26.28 + `template' for other vm disk images, something like 300 MB should be 26.29 + enough. (of course it depends what kind of packages you are planning 26.30 + to install to the template) 26.31 + 26.32 +\item Create the filesystem and initialise the swap image 26.33 +\begin{verbatim} 26.34 +mkfs.ext3 /path/diskimage 26.35 +mkswap /path/swapimage 26.36 +\end{verbatim} 26.37 + 26.38 +\item Mount the disk image for installation 26.39 +\begin{verbatim} 26.40 +mount -o loop /path/diskimage /mnt/disk 26.41 +\end{verbatim} 26.42 + 26.43 +\item Install \path{debootstrap}. Make sure you have debootstrap 26.44 + installed on the host. If you are running Debian Sarge (3.1 / 26.45 + testing) or unstable you can install it by running \path{apt-get 26.46 + install debootstrap}. Otherwise, it can be downloaded from the 26.47 + Debian project website. 26.48 + 26.49 +\item Install Debian base to the disk image: 26.50 +\begin{verbatim} 26.51 +debootstrap --arch i386 sarge /mnt/disk \ 26.52 + http://ftp.<countrycode>.debian.org/debian 26.53 +\end{verbatim} 26.54 + 26.55 + You can use any other Debian http/ftp mirror you want. 26.56 + 26.57 +\item When debootstrap completes successfully, modify settings: 26.58 +\begin{verbatim} 26.59 +chroot /mnt/disk /bin/bash 26.60 +\end{verbatim} 26.61 + 26.62 +Edit the following files using vi or nano and make needed changes: 26.63 +\begin{verbatim} 26.64 +/etc/hostname 26.65 +/etc/hosts 26.66 +/etc/resolv.conf 26.67 +/etc/network/interfaces 26.68 +/etc/networks 26.69 +\end{verbatim} 26.70 + 26.71 +Set up access to the services, edit: 26.72 +\begin{verbatim} 26.73 +/etc/hosts.deny 26.74 +/etc/hosts.allow 26.75 +/etc/inetd.conf 26.76 +\end{verbatim} 26.77 + 26.78 +Add Debian mirror to: 26.79 +\begin{verbatim} 26.80 +/etc/apt/sources.list 26.81 +\end{verbatim} 26.82 + 26.83 +Create fstab like this: 26.84 +\begin{verbatim} 26.85 +/dev/sda1 / ext3 errors=remount-ro 0 1 26.86 +/dev/sda2 none swap sw 0 0 26.87 +proc /proc proc defaults 0 0 26.88 +\end{verbatim} 26.89 + 26.90 +Logout 26.91 + 26.92 +\item Unmount the disk image 26.93 +\begin{verbatim} 26.94 +umount /mnt/disk 26.95 +\end{verbatim} 26.96 + 26.97 +\item Create Xen 2.0 configuration file for the new domain. You can 26.98 + use the example-configurations coming with Xen as a template. 
26.99 + 26.100 + Make sure you have the following set up: 26.101 +\begin{verbatim} 26.102 +disk = [ 'file:/path/diskimage,sda1,w', 'file:/path/swapimage,sda2,w' ] 26.103 +root = "/dev/sda1 ro" 26.104 +\end{verbatim} 26.105 + 26.106 +\item Start the new domain 26.107 +\begin{verbatim} 26.108 +xm create -f domain_config_file 26.109 +\end{verbatim} 26.110 + 26.111 +Check that the new domain is running: 26.112 +\begin{verbatim} 26.113 +xm list 26.114 +\end{verbatim} 26.115 + 26.116 +\item Attach to the console of the new domain. You should see 26.117 + something like this when starting the new domain: 26.118 + 26.119 +\begin{verbatim} 26.120 +Started domain testdomain2, console on port 9626 26.121 +\end{verbatim} 26.122 + 26.123 + There you can see the ID of the console: 26. You can also list the 26.124 + consoles with \path{xm consoles} (ID is the last two digits of the 26.125 + port number.) 26.126 + 26.127 + Attach to the console: 26.128 + 26.129 +\begin{verbatim} 26.130 +xm console 26 26.131 +\end{verbatim} 26.132 + 26.133 + or by telnetting to the port 9626 of localhost (the xm console 26.134 + program works better). 26.135 + 26.136 +\item Log in and run base-config 26.137 + 26.138 + As a default there's no password for the root. 26.139 + 26.140 + Check that everything looks OK, and the system started without 26.141 + errors. Check that the swap is active, and the network settings are 26.142 + correct. 26.143 + 26.144 + Run \path{/usr/sbin/base-config} to set up the Debian settings. 26.145 + 26.146 + Set up the password for root using passwd. 26.147 + 26.148 +\item Done. You can exit the console by pressing {\path{Ctrl + ]}} 26.149 + 26.150 +\end{enumerate} 26.151 + 26.152 + 26.153 +If you need to create new domains, you can just copy the contents of 26.154 +the `template'-image to the new disk images, either by mounting the 26.155 +template and the new image, and using \path{cp -a} or \path{tar} or by 26.156 +simply copying the image file. Once this is done, modify the 26.157 +image-specific settings (hostname, network settings, etc).
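For reference, a minimal sketch of the complete configuration file for such a Debian image is shown below. The kernel path and memory size are placeholders to adapt to your installation; the \path{disk} and \path{root} lines are exactly those given above:

\begin{verbatim}
kernel = "/boot/vmlinuz-2.6-xenU"
memory = 64
nics   = 1
disk   = [ 'file:/path/diskimage,sda1,w', 'file:/path/swapimage,sda2,w' ]
root   = "/dev/sda1 ro"
\end{verbatim}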
27.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 27.2 +++ b/docs/src/user/domain_configuration.tex Thu Sep 22 11:42:01 2005 -0600 27.3 @@ -0,0 +1,281 @@ 27.4 +\chapter{Domain Configuration} 27.5 +\label{cha:config} 27.6 + 27.7 +The following contains the syntax of the domain configuration files 27.8 +and description of how to further specify networking, driver domain 27.9 +and general scheduling behavior. 27.10 + 27.11 + 27.12 +\section{Configuration Files} 27.13 +\label{s:cfiles} 27.14 + 27.15 +Xen configuration files contain the following standard variables. 27.16 +Unless otherwise stated, configuration items should be enclosed in 27.17 +quotes: see \path{/etc/xen/xmexample1} and \path{/etc/xen/xmexample2} 27.18 +for concrete examples of the syntax. 27.19 + 27.20 +\begin{description} 27.21 +\item[kernel] Path to the kernel image. 27.22 +\item[ramdisk] Path to a ramdisk image (optional). 27.23 + % \item[builder] The name of the domain build function (e.g. 27.24 + % {\tt'linux'} or {\tt'netbsd'}. 27.25 +\item[memory] Memory size in megabytes. 27.26 +\item[cpu] CPU to run this domain on, or {\tt -1} for auto-allocation. 27.27 +\item[console] Port to export the domain console on (default 9600 + 27.28 + domain ID). 27.29 +\item[nics] Number of virtual network interfaces. 27.30 +\item[vif] List of MAC addresses (random addresses are assigned if not 27.31 + given) and bridges to use for the domain's network interfaces, e.g.\ 27.32 +\begin{verbatim} 27.33 +vif = [ 'mac=aa:00:00:00:00:11, bridge=xen-br0', 27.34 + 'bridge=xen-br1' ] 27.35 +\end{verbatim} 27.36 + to assign a MAC address and bridge to the first interface and assign 27.37 + a different bridge to the second interface, leaving \xend\ to choose 27.38 + the MAC address. 27.39 +\item[disk] List of block devices to export to the domain, e.g.\ \\ 27.40 + \verb_disk = [ 'phy:hda1,sda1,r' ]_ \\ 27.41 + exports physical device \path{/dev/hda1} to the domain as 27.42 + \path{/dev/sda1} with read-only access. Exporting a disk read-write 27.43 + which is currently mounted is dangerous -- if you are \emph{certain} 27.44 + you wish to do this, you can specify \path{w!} as the mode. 27.45 +\item[dhcp] Set to {\tt `dhcp'} if you want to use DHCP to configure 27.46 + networking. 27.47 +\item[netmask] Manually configured IP netmask. 27.48 +\item[gateway] Manually configured IP gateway. 27.49 +\item[hostname] Set the hostname for the virtual machine. 27.50 +\item[root] Specify the root device parameter on the kernel command 27.51 + line. 27.52 +\item[nfs\_server] IP address for the NFS server (if any). 27.53 +\item[nfs\_root] Path of the root filesystem on the NFS server (if 27.54 + any). 27.55 +\item[extra] Extra string to append to the kernel command line (if 27.56 + any) 27.57 +\item[restart] Three possible options: 27.58 + \begin{description} 27.59 + \item[always] Always restart the domain, no matter what its exit 27.60 + code is. 27.61 + \item[never] Never restart the domain. 27.62 + \item[onreboot] Restart the domain iff it requests reboot. 27.63 + \end{description} 27.64 +\end{description} 27.65 + 27.66 +For additional flexibility, it is also possible to include Python 27.67 +scripting commands in configuration files. An example of this is the 27.68 +\path{xmexample2} file, which uses Python code to handle the 27.69 +\path{vmid} variable. 27.70 + 27.71 + 27.72 +%\part{Advanced Topics} 27.73 + 27.74 + 27.75 +\section{Network Configuration} 27.76 + 27.77 +For many users, the default installation should work ``out of the 27.78 +box''. 
More complicated network setups, for instance with multiple 27.79 +Ethernet interfaces and/or existing bridging setups will require some 27.80 +special configuration. 27.81 + 27.82 +The purpose of this section is to describe the mechanisms provided by 27.83 +\xend\ to allow a flexible configuration for Xen's virtual networking. 27.84 + 27.85 +\subsection{Xen virtual network topology} 27.86 + 27.87 +Each domain network interface is connected to a virtual network 27.88 +interface in dom0 by a point to point link (effectively a ``virtual 27.89 +crossover cable''). These devices are named {\tt 27.90 + vif$<$domid$>$.$<$vifid$>$} (e.g.\ {\tt vif1.0} for the first 27.91 +interface in domain~1, {\tt vif3.1} for the second interface in 27.92 +domain~3). 27.93 + 27.94 +Traffic on these virtual interfaces is handled in domain~0 using 27.95 +standard Linux mechanisms for bridging, routing, rate limiting, etc. 27.96 +Xend calls on two shell scripts to perform initial configuration of 27.97 +the network and configuration of new virtual interfaces. By default, 27.98 +these scripts configure a single bridge for all the virtual 27.99 +interfaces. Arbitrary routing / bridging configurations can be 27.100 +configured by customizing the scripts, as described in the following 27.101 +section. 27.102 + 27.103 +\subsection{Xen networking scripts} 27.104 + 27.105 +Xen's virtual networking is configured by two shell scripts (by 27.106 +default \path{network} and \path{vif-bridge}). These are called 27.107 +automatically by \xend\ when certain events occur, with arguments to 27.108 +the scripts providing further contextual information. These scripts 27.109 +are found by default in \path{/etc/xen/scripts}. The names and 27.110 +locations of the scripts can be configured in 27.111 +\path{/etc/xen/xend-config.sxp}. 27.112 + 27.113 +\begin{description} 27.114 +\item[network:] This script is called whenever \xend\ is started or 27.115 + stopped to respectively initialize or tear down the Xen virtual 27.116 + network. In the default configuration initialization creates the 27.117 + bridge `xen-br0' and moves eth0 onto that bridge, modifying the 27.118 + routing accordingly. When \xend\ exits, it deletes the Xen bridge 27.119 + and removes eth0, restoring the normal IP and routing configuration. 27.120 + 27.121 + %% In configurations where the bridge already exists, this script 27.122 + %% could be replaced with a link to \path{/bin/true} (for instance). 27.123 + 27.124 +\item[vif-bridge:] This script is called for every domain virtual 27.125 + interface and can configure firewalling rules and add the vif to the 27.126 + appropriate bridge. By default, this adds and removes VIFs on the 27.127 + default Xen bridge. 27.128 +\end{description} 27.129 + 27.130 +For more complex network setups (e.g.\ where routing is required or 27.131 +integrate with existing bridges) these scripts may be replaced with 27.132 +customized variants for your site's preferred configuration. 27.133 + 27.134 +%% There are two possible types of privileges: IO privileges and 27.135 +%% administration privileges. 27.136 + 27.137 + 27.138 +\section{Driver Domain Configuration} 27.139 + 27.140 +I/O privileges can be assigned to allow a domain to directly access 27.141 +PCI devices itself. This is used to support driver domains. 27.142 + 27.143 +Setting back-end privileges is currently only supported in SXP format 27.144 +config files. 
To allow a domain to function as a back-end for others, 27.145 +somewhere within the {\tt vm} element of its configuration file must 27.146 +be a {\tt back-end} element of the form {\tt (back-end ({\em type}))} 27.147 +where {\tt \em type} may be either {\tt netif} or {\tt blkif}, 27.148 +according to the type of virtual device this domain will service. 27.149 +%% After this domain has been built, \xend will connect all new and 27.150 +%% existing {\em virtual} devices (of the appropriate type) to that 27.151 +%% back-end. 27.152 + 27.153 +Note that a block back-end cannot currently import virtual block 27.154 +devices from other domains, and a network back-end cannot import 27.155 +virtual network devices from other domains. Thus (particularly in the 27.156 +case of block back-ends, which cannot import a virtual block device as 27.157 +their root filesystem), you may need to boot a back-end domain from a 27.158 +ramdisk or a network device. 27.159 + 27.160 +Access to PCI devices may be configured on a per-device basis. Xen 27.161 +will assign the minimal set of hardware privileges to a domain that 27.162 +are required to control its devices. This can be configured in either 27.163 +format of configuration file: 27.164 + 27.165 +\begin{itemize} 27.166 +\item SXP Format: Include device elements of the form: \\ 27.167 + \centerline{ {\tt (device (pci (bus {\em x}) (dev {\em y}) (func {\em z})))}} \\ 27.168 + inside the top-level {\tt vm} element. Each one specifies the 27.169 + address of a device this domain is allowed to access --- the numbers 27.170 + \emph{x},\emph{y} and \emph{z} may be in either decimal or 27.171 + hexadecimal format. 27.172 +\item Flat Format: Include a list of PCI device addresses of the 27.173 + format: \\ 27.174 + \centerline{{\tt pci = ['x,y,z', \ldots]}} \\ 27.175 + where each element in the list is a string specifying the components 27.176 + of the PCI device address, separated by commas. The components 27.177 + ({\tt \em x}, {\tt \em y} and {\tt \em z}) of the list may be 27.178 + formatted as either decimal or hexadecimal. 27.179 +\end{itemize} 27.180 + 27.181 +%% \section{Administration Domains} 27.182 + 27.183 +%% Administration privileges allow a domain to use the `dom0 27.184 +%% operations' (so called because they are usually available only to 27.185 +%% domain 0). A privileged domain can build other domains, set 27.186 +%% scheduling parameters, etc. 27.187 + 27.188 +% Support for other administrative domains is not yet available... 27.189 +% perhaps we should plumb it in some time 27.190 + 27.191 + 27.192 +\section{Scheduler Configuration} 27.193 +\label{s:sched} 27.194 + 27.195 +Xen offers a boot time choice between multiple schedulers. To select 27.196 +a scheduler, pass the boot parameter \emph{sched=sched\_name} to Xen, 27.197 +substituting the appropriate scheduler name. Details of the 27.198 +schedulers and their parameters are included below; future versions of 27.199 +the tools will provide a higher-level interface to these tools. 27.200 + 27.201 +It is expected that system administrators configure their system to 27.202 +use the scheduler most appropriate to their needs. Currently, the BVT 27.203 +scheduler is the recommended choice. 27.204 + 27.205 +\subsection{Borrowed Virtual Time} 27.206 + 27.207 +{\tt sched=bvt} (the default) \\ 27.208 + 27.209 +BVT provides proportional fair shares of the CPU time. 
It has been 27.210 +observed to penalize domains that block frequently (e.g.\ I/O 27.211 +intensive domains), but this can be compensated for by using warping. 27.212 + 27.213 +\subsubsection{Global Parameters} 27.214 + 27.215 +\begin{description} 27.216 +\item[ctx\_allow] The context switch allowance is similar to the 27.217 + ``quantum'' in traditional schedulers. It is the minimum time that 27.218 + a scheduled domain will be allowed to run before being preempted. 27.219 +\end{description} 27.220 + 27.221 +\subsubsection{Per-domain parameters} 27.222 + 27.223 +\begin{description} 27.224 +\item[mcuadv] The MCU (Minimum Charging Unit) advance determines the 27.225 + proportional share of the CPU that a domain receives. It is set 27.226 + inversely proportionally to a domain's sharing weight. 27.227 +\item[warp] The amount of ``virtual time'' the domain is allowed to 27.228 + warp backwards. 27.229 +\item[warpl] The warp limit is the maximum time a domain can run 27.230 + warped for. 27.231 +\item[warpu] The unwarp requirement is the minimum time a domain must 27.232 + run unwarped for before it can warp again. 27.233 +\end{description} 27.234 + 27.235 +\subsection{Atropos} 27.236 + 27.237 +{\tt sched=atropos} \\ 27.238 + 27.239 +Atropos is a soft real time scheduler. It provides guarantees about 27.240 +absolute shares of the CPU, with a facility for sharing slack CPU time 27.241 +on a best-effort basis. It can provide timeliness guarantees for 27.242 +latency-sensitive domains. 27.243 + 27.244 +Every domain has an associated period and slice. The domain should 27.245 +receive `slice' nanoseconds every `period' nanoseconds. This allows 27.246 +the administrator to configure both the absolute share of the CPU a 27.247 +domain receives and the frequency with which it is scheduled. 27.248 + 27.249 +%% When domains unblock, their period is reduced to the value of the 27.250 +%% latency hint (the slice is scaled accordingly so that they still 27.251 +%% get the same proportion of the CPU). For each subsequent period, 27.252 +%% the slice and period times are doubled until they reach their 27.253 +%% original values. 27.254 + 27.255 +Note: don't over-commit the CPU when using Atropos (i.e.\ don't reserve 27.256 +more CPU than is available --- the utilization should be kept to 27.257 +slightly less than 100\% in order to ensure predictable behavior). 27.258 + 27.259 +\subsubsection{Per-domain parameters} 27.260 + 27.261 +\begin{description} 27.262 +\item[period] The regular time interval during which a domain is 27.263 + guaranteed to receive its allocation of CPU time. 27.264 +\item[slice] The length of time per period that a domain is guaranteed 27.265 + to run for (in the absence of voluntary yielding of the CPU). 27.266 +\item[latency] The latency hint is used to control how soon after 27.267 + waking up a domain it should be scheduled. 27.268 +\item[xtratime] This is a boolean flag that specifies whether a domain 27.269 + should be allowed a share of the system slack time. 27.270 +\end{description} 27.271 + 27.272 +\subsection{Round Robin} 27.273 + 27.274 +{\tt sched=rrobin} \\ 27.275 + 27.276 +The round robin scheduler is included as a simple demonstration of 27.277 +Xen's internal scheduler API. It is not intended for production use. 27.278 + 27.279 +\subsubsection{Global Parameters} 27.280 + 27.281 +\begin{description} 27.282 +\item[rr\_slice] The maximum time each domain runs before the next 27.283 + scheduling decision is made. 27.284 +\end{description}
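Returning to the driver-domain settings earlier in this chapter: combining the back-end and PCI device elements, the relevant fragment of an SXP configuration might look like the sketch below (the bus/dev/func numbers are placeholders, and the remainder of the \path{vm} element is elided):

\begin{verbatim}
(vm
  ...
  (back-end (netif))
  (device (pci (bus 0) (dev 3) (func 0)))
  ...
)
\end{verbatim}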
28.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 28.2 +++ b/docs/src/user/domain_filesystem.tex Thu Sep 22 11:42:01 2005 -0600 28.3 @@ -0,0 +1,243 @@ 28.4 +\chapter{Domain Filesystem Storage} 28.5 + 28.6 +It is possible to directly export any Linux block device in dom0 to 28.7 +another domain, or to export filesystems / devices to virtual machines 28.8 +using standard network protocols (e.g.\ NBD, iSCSI, NFS, etc.). This 28.9 +chapter covers some of the possibilities. 28.10 + 28.11 + 28.12 +\section{Exporting Physical Devices as VBDs} 28.13 +\label{s:exporting-physical-devices-as-vbds} 28.14 + 28.15 +One of the simplest configurations is to directly export individual 28.16 +partitions from domain~0 to other domains. To achieve this use the 28.17 +\path{phy:} specifier in your domain configuration file. For example a 28.18 +line like 28.19 +\begin{quote} 28.20 + \verb_disk = ['phy:hda3,sda1,w']_ 28.21 +\end{quote} 28.22 +specifies that the partition \path{/dev/hda3} in domain~0 should be 28.23 +exported read-write to the new domain as \path{/dev/sda1}; one could 28.24 +equally well export it as \path{/dev/hda} or \path{/dev/sdb5} should 28.25 +one wish. 28.26 + 28.27 +In addition to local disks and partitions, it is possible to export 28.28 +any device that Linux considers to be ``a disk'' in the same manner. 28.29 +For example, if you have iSCSI disks or GNBD volumes imported into 28.30 +domain~0 you can export these to other domains using the \path{phy:} 28.31 +disk syntax. E.g.: 28.32 +\begin{quote} 28.33 + \verb_disk = ['phy:vg/lvm1,sda2,w']_ 28.34 +\end{quote} 28.35 + 28.36 +\begin{center} 28.37 + \framebox{\bf Warning: Block device sharing} 28.38 +\end{center} 28.39 +\begin{quote} 28.40 + Block devices should typically only be shared between domains in a 28.41 + read-only fashion otherwise the Linux kernel's file systems will get 28.42 + very confused as the file system structure may change underneath 28.43 + them (having the same ext3 partition mounted \path{rw} twice is a 28.44 + sure fire way to cause irreparable damage)! \Xend\ will attempt to 28.45 + prevent you from doing this by checking that the device is not 28.46 + mounted read-write in domain~0, and hasn't already been exported 28.47 + read-write to another domain. If you want read-write sharing, 28.48 + export the directory to other domains via NFS from domain~0 (or use 28.49 + a cluster file system such as GFS or ocfs2). 28.50 +\end{quote} 28.51 + 28.52 + 28.53 +\section{Using File-backed VBDs} 28.54 + 28.55 +It is also possible to use a file in Domain~0 as the primary storage 28.56 +for a virtual machine. As well as being convenient, this also has the 28.57 +advantage that the virtual block device will be \emph{sparse} --- 28.58 +space will only really be allocated as parts of the file are used. So 28.59 +if a virtual machine uses only half of its disk space then the file 28.60 +really takes up half of the size allocated. 
28.61 + 28.62 +For example, to create a 2GB sparse file-backed virtual block device 28.63 +(actually only consumes 1KB of disk): 28.64 +\begin{quote} 28.65 + \verb_# dd if=/dev/zero of=vm1disk bs=1k seek=2048k count=1_ 28.66 +\end{quote} 28.67 + 28.68 +Make a file system in the disk file: 28.69 +\begin{quote} 28.70 + \verb_# mkfs -t ext3 vm1disk_ 28.71 +\end{quote} 28.72 + 28.73 +(when the tool asks for confirmation, answer `y') 28.74 + 28.75 +Populate the file system e.g.\ by copying from the current root: 28.76 +\begin{quote} 28.77 +\begin{verbatim} 28.78 +# mount -o loop vm1disk /mnt 28.79 +# cp -ax /{root,dev,var,etc,usr,bin,sbin,lib} /mnt 28.80 +# mkdir /mnt/{proc,sys,home,tmp} 28.81 +\end{verbatim} 28.82 +\end{quote} 28.83 + 28.84 +Tailor the file system by editing \path{/etc/fstab}, 28.85 +\path{/etc/hostname}, etc.\ Don't forget to edit the files in the 28.86 +mounted file system, instead of your domain~0 filesystem, e.g.\ you 28.87 +would edit \path{/mnt/etc/fstab} instead of \path{/etc/fstab}. For 28.88 +this example put \path{/dev/sda1} to root in fstab. 28.89 + 28.90 +Now unmount (this is important!): 28.91 +\begin{quote} 28.92 + \verb_# umount /mnt_ 28.93 +\end{quote} 28.94 + 28.95 +In the configuration file set: 28.96 +\begin{quote} 28.97 + \verb_disk = ['file:/full/path/to/vm1disk,sda1,w']_ 28.98 +\end{quote} 28.99 + 28.100 +As the virtual machine writes to its `disk', the sparse file will be 28.101 +filled in and consume more space up to the original 2GB. 28.102 + 28.103 +{\bf Note that file-backed VBDs may not be appropriate for backing 28.104 + I/O-intensive domains.} File-backed VBDs are known to experience 28.105 +substantial slowdowns under heavy I/O workloads, due to the I/O 28.106 +handling by the loopback block device used to support file-backed VBDs 28.107 +in dom0. Better I/O performance can be achieved by using either 28.108 +LVM-backed VBDs (Section~\ref{s:using-lvm-backed-vbds}) or physical 28.109 +devices as VBDs (Section~\ref{s:exporting-physical-devices-as-vbds}). 28.110 + 28.111 +Linux supports a maximum of eight file-backed VBDs across all domains 28.112 +by default. This limit can be statically increased by using the 28.113 +\emph{max\_loop} module parameter if CONFIG\_BLK\_DEV\_LOOP is 28.114 +compiled as a module in the dom0 kernel, or by using the 28.115 +\emph{max\_loop=n} boot option if CONFIG\_BLK\_DEV\_LOOP is compiled 28.116 +directly into the dom0 kernel. 28.117 + 28.118 + 28.119 +\section{Using LVM-backed VBDs} 28.120 +\label{s:using-lvm-backed-vbds} 28.121 + 28.122 +A particularly appealing solution is to use LVM volumes as backing for 28.123 +domain file-systems since this allows dynamic growing/shrinking of 28.124 +volumes as well as snapshot and other features. 
28.125 + 28.126 +To initialize a partition to support LVM volumes: 28.127 +\begin{quote} 28.128 +\begin{verbatim} 28.129 +# pvcreate /dev/sda10 28.130 +\end{verbatim} 28.131 +\end{quote} 28.132 + 28.133 +Create a volume group named `vg' on the physical partition: 28.134 +\begin{quote} 28.135 +\begin{verbatim} 28.136 +# vgcreate vg /dev/sda10 28.137 +\end{verbatim} 28.138 +\end{quote} 28.139 + 28.140 +Create a logical volume of size 4GB named `myvmdisk1': 28.141 +\begin{quote} 28.142 +\begin{verbatim} 28.143 +# lvcreate -L4096M -n myvmdisk1 vg 28.144 +\end{verbatim} 28.145 +\end{quote} 28.146 + 28.147 +You should now see that you have a \path{/dev/vg/myvmdisk1} Make a 28.148 +filesystem, mount it and populate it, e.g.: 28.149 +\begin{quote} 28.150 +\begin{verbatim} 28.151 +# mkfs -t ext3 /dev/vg/myvmdisk1 28.152 +# mount /dev/vg/myvmdisk1 /mnt 28.153 +# cp -ax / /mnt 28.154 +# umount /mnt 28.155 +\end{verbatim} 28.156 +\end{quote} 28.157 + 28.158 +Now configure your VM with the following disk configuration: 28.159 +\begin{quote} 28.160 +\begin{verbatim} 28.161 + disk = [ 'phy:vg/myvmdisk1,sda1,w' ] 28.162 +\end{verbatim} 28.163 +\end{quote} 28.164 + 28.165 +LVM enables you to grow the size of logical volumes, but you'll need 28.166 +to resize the corresponding file system to make use of the new space. 28.167 +Some file systems (e.g.\ ext3) now support online resize. See the LVM 28.168 +manuals for more details. 28.169 + 28.170 +You can also use LVM for creating copy-on-write (CoW) clones of LVM 28.171 +volumes (known as writable persistent snapshots in LVM terminology). 28.172 +This facility is new in Linux 2.6.8, so isn't as stable as one might 28.173 +hope. In particular, using lots of CoW LVM disks consumes a lot of 28.174 +dom0 memory, and error conditions such as running out of disk space 28.175 +are not handled well. Hopefully this will improve in future. 28.176 + 28.177 +To create two copy-on-write clone of the above file system you would 28.178 +use the following commands: 28.179 + 28.180 +\begin{quote} 28.181 +\begin{verbatim} 28.182 +# lvcreate -s -L1024M -n myclonedisk1 /dev/vg/myvmdisk1 28.183 +# lvcreate -s -L1024M -n myclonedisk2 /dev/vg/myvmdisk1 28.184 +\end{verbatim} 28.185 +\end{quote} 28.186 + 28.187 +Each of these can grow to have 1GB of differences from the master 28.188 +volume. You can grow the amount of space for storing the differences 28.189 +using the lvextend command, e.g.: 28.190 +\begin{quote} 28.191 +\begin{verbatim} 28.192 +# lvextend +100M /dev/vg/myclonedisk1 28.193 +\end{verbatim} 28.194 +\end{quote} 28.195 + 28.196 +Don't let the `differences volume' ever fill up otherwise LVM gets 28.197 +rather confused. It may be possible to automate the growing process by 28.198 +using \path{dmsetup wait} to spot the volume getting full and then 28.199 +issue an \path{lvextend}. 28.200 + 28.201 +In principle, it is possible to continue writing to the volume that 28.202 +has been cloned (the changes will not be visible to the clones), but 28.203 +we wouldn't recommend this: have the cloned volume as a `pristine' 28.204 +file system install that isn't mounted directly by any of the virtual 28.205 +machines. 28.206 + 28.207 + 28.208 +\section{Using NFS Root} 28.209 + 28.210 +First, populate a root filesystem in a directory on the server 28.211 +machine. This can be on a distinct physical machine, or simply run 28.212 +within a virtual machine on the same node. 
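One way to populate such a directory (mirroring the copy-from-domain-0 approach used for file-backed VBDs earlier in this chapter; the export path is only an example) is:

\begin{verbatim}
# mkdir -p /export/vm1root
# cp -ax /{root,dev,var,etc,usr,bin,sbin,lib} /export/vm1root
# mkdir /export/vm1root/{proc,sys,home,tmp}
\end{verbatim}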
28.213 + 28.214 +Now configure the NFS server to export this filesystem over the 28.215 +network by adding a line to \path{/etc/exports}, for instance: 28.216 + 28.217 +\begin{quote} 28.218 + \begin{small} 28.219 +\begin{verbatim} 28.220 +/export/vm1root 1.2.3.4/24 (rw,sync,no_root_squash) 28.221 +\end{verbatim} 28.222 + \end{small} 28.223 +\end{quote} 28.224 + 28.225 +Finally, configure the domain to use NFS root. In addition to the 28.226 +normal variables, you should make sure to set the following values in 28.227 +the domain's configuration file: 28.228 + 28.229 +\begin{quote} 28.230 + \begin{small} 28.231 +\begin{verbatim} 28.232 +root = '/dev/nfs' 28.233 +nfs_server = '2.3.4.5' # substitute IP address of server 28.234 +nfs_root = '/path/to/root' # path to root FS on the server 28.235 +\end{verbatim} 28.236 + \end{small} 28.237 +\end{quote} 28.238 + 28.239 +The domain will need network access at boot time, so either statically 28.240 +configure an IP address using the config variables \path{ip}, 28.241 +\path{netmask}, \path{gateway}, \path{hostname}; or enable DHCP 28.242 +(\path{dhcp='dhcp'}). 28.243 + 28.244 +Note that the Linux NFS root implementation is known to have stability 28.245 +problems under high load (this is not a Xen-specific problem), so this 28.246 +configuration may not be appropriate for critical servers.
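Returning briefly to the loop-device limit mentioned in the file-backed VBD section, a sketch of the two ways to raise it (the value 64 is only an example):

\begin{verbatim}
# modprobe loop max_loop=64
\end{verbatim}

covers the case where CONFIG\_BLK\_DEV\_LOOP is built as a module; if it is compiled directly into the dom0 kernel, append \path{max_loop=64} to the dom0 kernel command line (on the GRUB \path{module} line) instead.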
29.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 29.2 +++ b/docs/src/user/domain_mgmt.tex Thu Sep 22 11:42:01 2005 -0600 29.3 @@ -0,0 +1,203 @@ 29.4 +\chapter{Domain Management Tools} 29.5 + 29.6 +The previous chapter described a simple example of how to configure 29.7 +and start a domain. This chapter summarises the tools available to 29.8 +manage running domains. 29.9 + 29.10 + 29.11 +\section{Command-line Management} 29.12 + 29.13 +Command line management tasks are also performed using the \path{xm} 29.14 +tool. For online help for the commands available, type: 29.15 +\begin{quote} 29.16 + \verb_# xm help_ 29.17 +\end{quote} 29.18 + 29.19 +You can also type \path{xm help $<$command$>$} for more information on 29.20 +a given command. 29.21 + 29.22 +\subsection{Basic Management Commands} 29.23 + 29.24 +The most important \path{xm} commands are: 29.25 +\begin{quote} 29.26 + \verb_# xm list_: Lists all domains running.\\ 29.27 + \verb_# xm consoles_: Gives information about the domain consoles.\\ 29.28 + \verb_# xm console_: Opens a console to a domain (e.g.\ 29.29 + \verb_# xm console myVM_) 29.30 +\end{quote} 29.31 + 29.32 +\subsection{\tt xm list} 29.33 + 29.34 +The output of \path{xm list} is in rows of the following format: 29.35 +\begin{center} {\tt name domid memory cpu state cputime console} 29.36 +\end{center} 29.37 + 29.38 +\begin{quote} 29.39 + \begin{description} 29.40 + \item[name] The descriptive name of the virtual machine. 29.41 + \item[domid] The number of the domain ID this virtual machine is 29.42 + running in. 29.43 + \item[memory] Memory size in megabytes. 29.44 + \item[cpu] The CPU this domain is running on. 29.45 + \item[state] Domain state consists of 5 fields: 29.46 + \begin{description} 29.47 + \item[r] running 29.48 + \item[b] blocked 29.49 + \item[p] paused 29.50 + \item[s] shutdown 29.51 + \item[c] crashed 29.52 + \end{description} 29.53 + \item[cputime] How much CPU time (in seconds) the domain has used so 29.54 + far. 29.55 + \item[console] TCP port accepting connections to the domain's 29.56 + console. 29.57 + \end{description} 29.58 +\end{quote} 29.59 + 29.60 +The \path{xm list} command also supports a long output format when the 29.61 +\path{-l} switch is used. This outputs the fulls details of the 29.62 +running domains in \xend's SXP configuration format. 29.63 + 29.64 +For example, suppose the system is running the ttylinux domain as 29.65 +described earlier. The list command should produce output somewhat 29.66 +like the following: 29.67 +\begin{verbatim} 29.68 +# xm list 29.69 +Name Id Mem(MB) CPU State Time(s) Console 29.70 +Domain-0 0 251 0 r---- 172.2 29.71 +ttylinux 5 63 0 -b--- 3.0 9605 29.72 +\end{verbatim} 29.73 + 29.74 +Here we can see the details for the ttylinux domain, as well as for 29.75 +domain~0 (which, of course, is always running). Note that the console 29.76 +port for the ttylinux domain is 9605. This can be connected to by TCP 29.77 +using a terminal program (e.g. \path{telnet} or, better, 29.78 +\path{xencons}). The simplest way to connect is to use the 29.79 +\path{xm~console} command, specifying the domain name or ID. 
To 29.80 +connect to the console of the ttylinux domain, we could use any of the 29.81 +following: 29.82 +\begin{verbatim} 29.83 +# xm console ttylinux 29.84 +# xm console 5 29.85 +# xencons localhost 9605 29.86 +\end{verbatim} 29.87 + 29.88 +\section{Domain Save and Restore} 29.89 + 29.90 +The administrator of a Xen system may suspend a virtual machine's 29.91 +current state into a disk file in domain~0, allowing it to be resumed 29.92 +at a later time. 29.93 + 29.94 +The ttylinux domain described earlier can be suspended to disk using 29.95 +the command: 29.96 +\begin{verbatim} 29.97 +# xm save ttylinux ttylinux.xen 29.98 +\end{verbatim} 29.99 + 29.100 +This will stop the domain named `ttylinux' and save its current state 29.101 +into a file called \path{ttylinux.xen}. 29.102 + 29.103 +To resume execution of this domain, use the \path{xm restore} command: 29.104 +\begin{verbatim} 29.105 +# xm restore ttylinux.xen 29.106 +\end{verbatim} 29.107 + 29.108 +This will restore the state of the domain and restart it. The domain 29.109 +will carry on as before and the console may be reconnected using the 29.110 +\path{xm console} command, as above. 29.111 + 29.112 +\section{Live Migration} 29.113 + 29.114 +Live migration is used to transfer a domain between physical hosts 29.115 +whilst that domain continues to perform its usual activities --- from 29.116 +the user's perspective, the migration should be imperceptible. 29.117 + 29.118 +To perform a live migration, both hosts must be running Xen / \xend\ 29.119 +and the destination host must have sufficient resources (e.g.\ memory 29.120 +capacity) to accommodate the domain after the move. Furthermore we 29.121 +currently require both source and destination machines to be on the 29.122 +same L2 subnet. 29.123 + 29.124 +Currently, there is no support for providing automatic remote access 29.125 +to filesystems stored on local disk when a domain is migrated. 29.126 +Administrators should choose an appropriate storage solution (i.e.\ 29.127 +SAN, NAS, etc.) to ensure that domain filesystems are also available 29.128 +on their destination node. GNBD is a good method for exporting a 29.129 +volume from one machine to another. iSCSI can do a similar job, but is 29.130 +more complex to set up. 29.131 + 29.132 +When a domain migrates, it's MAC and IP address move with it, thus it 29.133 +is only possible to migrate VMs within the same layer-2 network and IP 29.134 +subnet. If the destination node is on a different subnet, the 29.135 +administrator would need to manually configure a suitable etherip or 29.136 +IP tunnel in the domain~0 of the remote node. 29.137 + 29.138 +A domain may be migrated using the \path{xm migrate} command. To live 29.139 +migrate a domain to another machine, we would use the command: 29.140 + 29.141 +\begin{verbatim} 29.142 +# xm migrate --live mydomain destination.ournetwork.com 29.143 +\end{verbatim} 29.144 + 29.145 +Without the \path{--live} flag, \xend\ simply stops the domain and 29.146 +copies the memory image over to the new node and restarts it. Since 29.147 +domains can have large allocations this can be quite time consuming, 29.148 +even on a Gigabit network. With the \path{--live} flag \xend\ attempts 29.149 +to keep the domain running while the migration is in progress, 29.150 +resulting in typical `downtimes' of just 60--300ms. 29.151 + 29.152 +For now it will be necessary to reconnect to the domain's console on 29.153 +the new machine using the \path{xm console} command. 
If a migrated 29.154 +domain has any open network connections then they will be preserved, 29.155 +so SSH connections do not have this limitation. 29.156 + 29.157 + 29.158 +\section{Managing Domain Memory} 29.159 + 29.160 +XenLinux domains have the ability to relinquish / reclaim machine 29.161 +memory at the request of the administrator or the user of the domain. 29.162 + 29.163 +\subsection{Setting memory footprints from dom0} 29.164 + 29.165 +The machine administrator can request that a domain alter its memory 29.166 +footprint using the \path{xm set-mem} command. For instance, we can 29.167 +request that our example ttylinux domain reduce its memory footprint 29.168 +to 32 megabytes. 29.169 + 29.170 +\begin{verbatim} 29.171 +# xm set-mem ttylinux 32 29.172 +\end{verbatim} 29.173 + 29.174 +We can now see the result of this in the output of \path{xm list}: 29.175 + 29.176 +\begin{verbatim} 29.177 +# xm list 29.178 +Name Id Mem(MB) CPU State Time(s) Console 29.179 +Domain-0 0 251 0 r---- 172.2 29.180 +ttylinux 5 31 0 -b--- 4.3 9605 29.181 +\end{verbatim} 29.182 + 29.183 +The domain has responded to the request by returning memory to Xen. We 29.184 +can restore the domain to its original size using the command line: 29.185 + 29.186 +\begin{verbatim} 29.187 +# xm set-mem ttylinux 64 29.188 +\end{verbatim} 29.189 + 29.190 +\subsection{Setting memory footprints from within a domain} 29.191 + 29.192 +The virtual file \path{/proc/xen/balloon} allows the owner of a domain 29.193 +to adjust their own memory footprint. Reading the file (e.g.\ 29.194 +\path{cat /proc/xen/balloon}) prints out the current memory footprint 29.195 +of the domain. Writing the file (e.g.\ \path{echo new\_target > 29.196 + /proc/xen/balloon}) requests that the kernel adjust the domain's 29.197 +memory footprint to a new value. 29.198 + 29.199 +\subsection{Setting memory limits} 29.200 + 29.201 +Xen associates a memory size limit with each domain. By default, this 29.202 +is the amount of memory the domain is originally started with, 29.203 +preventing the domain from ever growing beyond this size. To permit a 29.204 +domain to grow beyond its original allocation or to prevent a domain 29.205 +you've shrunk from reclaiming the memory it relinquished, use the 29.206 +\path{xm maxmem} command.
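To illustrate the balloon interface and memory limit described above (a sketch only: the units accepted by \path{/proc/xen/balloon} and the exact \path{xm maxmem} argument syntax should be confirmed with \path{xm help} and by reading the balloon file on your installation):

\begin{verbatim}
# cat /proc/xen/balloon                    # show the current footprint
# echo new_target > /proc/xen/balloon      # request a new footprint
# xm maxmem ttylinux 128                   # raise the domain's memory limit
\end{verbatim}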
30.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 30.2 +++ b/docs/src/user/glossary.tex Thu Sep 22 11:42:01 2005 -0600 30.3 @@ -0,0 +1,79 @@ 30.4 +\chapter{Glossary of Terms} 30.5 + 30.6 +\begin{description} 30.7 + 30.8 +\item[Atropos] One of the CPU schedulers provided by Xen. Atropos 30.9 + provides domains with absolute shares of the CPU, with timeliness 30.10 + guarantees and a mechanism for sharing out `slack time'. 30.11 + 30.12 +\item[BVT] The BVT scheduler is used to give proportional fair shares 30.13 + of the CPU to domains. 30.14 + 30.15 +\item[Exokernel] A minimal piece of privileged code, similar to a {\bf 30.16 + microkernel} but providing a more `hardware-like' interface to the 30.17 + tasks it manages. This is similar to a paravirtualising VMM like 30.18 + {\bf Xen} but was designed as a new operating system structure, 30.19 + rather than specifically to run multiple conventional OSs. 30.20 + 30.21 +\item[Domain] A domain is the execution context that contains a 30.22 + running {\bf virtual machine}. The relationship between virtual 30.23 + machines and domains on Xen is similar to that between programs and 30.24 + processes in an operating system: a virtual machine is a persistent 30.25 + entity that resides on disk (somewhat like a program). When it is 30.26 + loaded for execution, it runs in a domain. Each domain has a {\bf 30.27 + domain ID}. 30.28 + 30.29 +\item[Domain 0] The first domain to be started on a Xen machine. 30.30 + Domain 0 is responsible for managing the system. 30.31 + 30.32 +\item[Domain ID] A unique identifier for a {\bf domain}, analogous to 30.33 + a process ID in an operating system. 30.34 + 30.35 +\item[Full virtualisation] An approach to virtualisation which 30.36 + requires no modifications to the hosted operating system, providing 30.37 + the illusion of a complete system of real hardware devices. 30.38 + 30.39 +\item[Hypervisor] An alternative term for {\bf VMM}, used because it 30.40 + means `beyond supervisor', since it is responsible for managing 30.41 + multiple `supervisor' kernels. 30.42 + 30.43 +\item[Live migration] A technique for moving a running virtual machine 30.44 + to another physical host, without stopping it or the services 30.45 + running on it. 30.46 + 30.47 +\item[Microkernel] A small base of code running at the highest 30.48 + hardware privilege level. A microkernel is responsible for sharing 30.49 + CPU and memory (and sometimes other devices) between less privileged 30.50 + tasks running on the system. This is similar to a VMM, particularly 30.51 + a {\bf paravirtualising} VMM but typically addressing a different 30.52 + problem space and providing different kind of interface. 30.53 + 30.54 +\item[NetBSD/Xen] A port of NetBSD to the Xen architecture. 30.55 + 30.56 +\item[Paravirtualisation] An approach to virtualisation which requires 30.57 + modifications to the operating system in order to run in a virtual 30.58 + machine. Xen uses paravirtualisation but preserves binary 30.59 + compatibility for user space applications. 30.60 + 30.61 +\item[Shadow pagetables] A technique for hiding the layout of machine 30.62 + memory from a virtual machine's operating system. Used in some {\bf 30.63 + VMMs} to provide the illusion of contiguous physical memory, in 30.64 + Xen this is used during {\bf live migration}. 30.65 + 30.66 +\item[Virtual Machine] The environment in which a hosted operating 30.67 + system runs, providing the abstraction of a dedicated machine. 
A 30.68 + virtual machine may be identical to the underlying hardware (as in 30.69 + {\bf full virtualisation}, or it may differ, as in {\bf 30.70 + paravirtualisation}). 30.71 + 30.72 +\item[VMM] Virtual Machine Monitor - the software that allows multiple 30.73 + virtual machines to be multiplexed on a single physical machine. 30.74 + 30.75 +\item[Xen] Xen is a paravirtualising virtual machine monitor, 30.76 + developed primarily by the Systems Research Group at the University 30.77 + of Cambridge Computer Laboratory. 30.78 + 30.79 +\item[XenLinux] Official name for the port of the Linux kernel that 30.80 + runs on Xen. 30.81 + 30.82 +\end{description}
31.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 31.2 +++ b/docs/src/user/installation.tex Thu Sep 22 11:42:01 2005 -0600 31.3 @@ -0,0 +1,394 @@ 31.4 +\chapter{Installation} 31.5 + 31.6 +The Xen distribution includes three main components: Xen itself, ports 31.7 +of Linux 2.4 and 2.6 and NetBSD to run on Xen, and the userspace 31.8 +tools required to manage a Xen-based system. This chapter describes 31.9 +how to install the Xen~2.0 distribution from source. Alternatively, 31.10 +there may be pre-built packages available as part of your operating 31.11 +system distribution. 31.12 + 31.13 + 31.14 +\section{Prerequisites} 31.15 +\label{sec:prerequisites} 31.16 + 31.17 +The following is a full list of prerequisites. Items marked `$\dag$' 31.18 +are required by the \xend\ control tools, and hence required if you 31.19 +want to run more than one virtual machine; items marked `$*$' are only 31.20 +required if you wish to build from source. 31.21 +\begin{itemize} 31.22 +\item A working Linux distribution using the GRUB bootloader and 31.23 + running on a P6-class (or newer) CPU. 31.24 +\item [$\dag$] The \path{iproute2} package. 31.25 +\item [$\dag$] The Linux bridge-utils\footnote{Available from {\tt 31.26 + http://bridge.sourceforge.net}} (e.g., \path{/sbin/brctl}) 31.27 +\item [$\dag$] An installation of Twisted~v1.3 or 31.28 + above\footnote{Available from {\tt http://www.twistedmatrix.com}}. 31.29 + There may be a binary package available for your distribution; 31.30 + alternatively it can be installed by running `{\sl make 31.31 + install-twisted}' in the root of the Xen source tree. 31.32 +\item [$*$] Build tools (gcc v3.2.x or v3.3.x, binutils, GNU make). 31.33 +\item [$*$] Development installation of libcurl (e.g., libcurl-devel) 31.34 +\item [$*$] Development installation of zlib (e.g., zlib-dev). 31.35 +\item [$*$] Development installation of Python v2.2 or later (e.g., 31.36 + python-dev). 31.37 +\item [$*$] \LaTeX\ and transfig are required to build the 31.38 + documentation. 31.39 +\end{itemize} 31.40 + 31.41 +Once you have satisfied the relevant prerequisites, you can now 31.42 +install either a binary or source distribution of Xen. 31.43 + 31.44 + 31.45 +\section{Installing from Binary Tarball} 31.46 + 31.47 +Pre-built tarballs are available for download from the Xen download 31.48 +page 31.49 +\begin{quote} {\tt http://xen.sf.net} 31.50 +\end{quote} 31.51 + 31.52 +Once you've downloaded the tarball, simply unpack and install: 31.53 +\begin{verbatim} 31.54 +# tar zxvf xen-2.0-install.tgz 31.55 +# cd xen-2.0-install 31.56 +# sh ./install.sh 31.57 +\end{verbatim} 31.58 + 31.59 +Once you've installed the binaries you need to configure your system 31.60 +as described in Section~\ref{s:configure}. 31.61 + 31.62 + 31.63 +\section{Installing from Source} 31.64 + 31.65 +This section describes how to obtain, build, and install Xen from 31.66 +source. 31.67 + 31.68 +\subsection{Obtaining the Source} 31.69 + 31.70 +The Xen source tree is available as either a compressed source tar 31.71 +ball or as a clone of our master BitKeeper repository. 
31.72 + 31.73 +\begin{description} 31.74 +\item[Obtaining the Source Tarball]\mbox{} \\ 31.75 + Stable versions (and daily snapshots) of the Xen source tree are 31.76 + available as compressed tarballs from the Xen download page 31.77 + \begin{quote} {\tt http://xen.sf.net} 31.78 + \end{quote} 31.79 + 31.80 +\item[Using BitKeeper]\mbox{} \\ 31.81 + If you wish to install Xen from a clone of our latest BitKeeper 31.82 + repository, then you will need to install the BitKeeper tools. 31.83 + Download instructions for BitKeeper can be obtained by filling out 31.84 + the form at: 31.85 + \begin{quote} {\tt http://www.bitmover.com/cgi-bin/download.cgi} 31.86 +\end{quote} 31.87 +The public master BK repository for the 2.0 release lives at: 31.88 +\begin{quote} {\tt bk://xen.bkbits.net/xen-2.0.bk} 31.89 +\end{quote} 31.90 +You can use BitKeeper to download it and keep it updated with the 31.91 +latest features and fixes. 31.92 + 31.93 +Change to the directory in which you want to put the source code, then 31.94 +run: 31.95 +\begin{verbatim} 31.96 +# bk clone bk://xen.bkbits.net/xen-2.0.bk 31.97 +\end{verbatim} 31.98 + 31.99 +Under your current directory, a new directory named \path{xen-2.0.bk} 31.100 +has been created, which contains all the source code for Xen, the OS 31.101 +ports, and the control tools. You can update your repository with the 31.102 +latest changes at any time by running: 31.103 +\begin{verbatim} 31.104 +# cd xen-2.0.bk # to change into the local repository 31.105 +# bk pull # to update the repository 31.106 +\end{verbatim} 31.107 +\end{description} 31.108 + 31.109 +% \section{The distribution} 31.110 +% 31.111 +% The Xen source code repository is structured as follows: 31.112 +% 31.113 +% \begin{description} 31.114 +% \item[\path{tools/}] Xen node controller daemon (Xend), command line 31.115 +% tools, control libraries 31.116 +% \item[\path{xen/}] The Xen VMM. 31.117 +% \item[\path{linux-*-xen-sparse/}] Xen support for Linux. 31.118 +% \item[\path{linux-*-patches/}] Experimental patches for Linux. 31.119 +% \item[\path{netbsd-*-xen-sparse/}] Xen support for NetBSD. 31.120 +% \item[\path{docs/}] Various documentation files for users and 31.121 +% developers. 31.122 +% \item[\path{extras/}] Bonus extras. 31.123 +% \end{description} 31.124 + 31.125 +\subsection{Building from Source} 31.126 + 31.127 +The top-level Xen Makefile includes a target `world' that will do the 31.128 +following: 31.129 + 31.130 +\begin{itemize} 31.131 +\item Build Xen. 31.132 +\item Build the control tools, including \xend. 31.133 +\item Download (if necessary) and unpack the Linux 2.6 source code, 31.134 + and patch it for use with Xen. 31.135 +\item Build a Linux kernel to use in domain 0 and a smaller 31.136 + unprivileged kernel, which can optionally be used for unprivileged 31.137 + virtual machines. 31.138 +\end{itemize} 31.139 + 31.140 +After the build has completed you should have a top-level directory 31.141 +called \path{dist/} in which all resulting targets will be placed; of 31.142 +particular interest are the two XenLinux kernel images, one 31.143 +with a `-xen0' extension which contains hardware device drivers and 31.144 +drivers for Xen's virtual devices, and one with a `-xenU' extension 31.145 +that just contains the virtual ones. These are found in 31.146 +\path{dist/install/boot/} along with the image for Xen itself and the 31.147 +configuration files used during the build.
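For reference, the `world' target described above is invoked from the top
level of the source tree as follows (on an SMP build host you may wish to
add, e.g., `-j4' to parallelise the build):
\begin{quote}
\begin{verbatim}
# make world
\end{verbatim}
\end{quote}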
31.148 + 31.149 +The NetBSD port can be built using: 31.150 +\begin{quote} 31.151 +\begin{verbatim} 31.152 +# make netbsd20 31.153 +\end{verbatim} 31.154 +\end{quote} 31.155 +NetBSD port is built using a snapshot of the netbsd-2-0 cvs branch. 31.156 +The snapshot is downloaded as part of the build process, if it is not 31.157 +yet present in the \path{NETBSD\_SRC\_PATH} search path. The build 31.158 +process also downloads a toolchain which includes all the tools 31.159 +necessary to build the NetBSD kernel under Linux. 31.160 + 31.161 +To customize further the set of kernels built you need to edit the 31.162 +top-level Makefile. Look for the line: 31.163 + 31.164 +\begin{quote} 31.165 +\begin{verbatim} 31.166 +KERNELS ?= mk.linux-2.6-xen0 mk.linux-2.6-xenU 31.167 +\end{verbatim} 31.168 +\end{quote} 31.169 + 31.170 +You can edit this line to include any set of operating system kernels 31.171 +which have configurations in the top-level \path{buildconfigs/} 31.172 +directory, for example \path{mk.linux-2.4-xenU} to build a Linux 2.4 31.173 +kernel containing only virtual device drivers. 31.174 + 31.175 +%% Inspect the Makefile if you want to see what goes on during a 31.176 +%% build. Building Xen and the tools is straightforward, but XenLinux 31.177 +%% is more complicated. The makefile needs a `pristine' Linux kernel 31.178 +%% tree to which it will then add the Xen architecture files. You can 31.179 +%% tell the makefile the location of the appropriate Linux compressed 31.180 +%% tar file by 31.181 +%% setting the LINUX\_SRC environment variable, e.g. \\ 31.182 +%% \verb!# LINUX_SRC=/tmp/linux-2.6.11.tar.bz2 make world! \\ or by 31.183 +%% placing the tar file somewhere in the search path of {\tt 31.184 +%% LINUX\_SRC\_PATH} which defaults to `{\tt .:..}'. If the 31.185 +%% makefile can't find a suitable kernel tar file it attempts to 31.186 +%% download it from kernel.org (this won't work if you're behind a 31.187 +%% firewall). 31.188 + 31.189 +%% After untaring the pristine kernel tree, the makefile uses the {\tt 31.190 +%% mkbuildtree} script to add the Xen patches to the kernel. 31.191 + 31.192 + 31.193 +%% The procedure is similar to build the Linux 2.4 port: \\ 31.194 +%% \verb!# LINUX_SRC=/path/to/linux2.4/source make linux24! 31.195 + 31.196 + 31.197 +%% \framebox{\parbox{5in}{ 31.198 +%% {\bf Distro specific:} \\ 31.199 +%% {\it Gentoo} --- if not using udev (most installations, 31.200 +%% currently), you'll need to enable devfs and devfs mount at boot 31.201 +%% time in the xen0 config. }} 31.202 + 31.203 +\subsection{Custom XenLinux Builds} 31.204 + 31.205 +% If you have an SMP machine you may wish to give the {\tt '-j4'} 31.206 +% argument to make to get a parallel build. 31.207 + 31.208 +If you wish to build a customized XenLinux kernel (e.g. to support 31.209 +additional devices or enable distribution-required features), you can 31.210 +use the standard Linux configuration mechanisms, specifying that the 31.211 +architecture being built for is \path{xen}, e.g: 31.212 +\begin{quote} 31.213 +\begin{verbatim} 31.214 +# cd linux-2.6.11-xen0 31.215 +# make ARCH=xen xconfig 31.216 +# cd .. 
31.217 +# make 31.218 +\end{verbatim} 31.219 +\end{quote} 31.220 + 31.221 +You can also copy an existing Linux configuration (\path{.config}) 31.222 +into \path{linux-2.6.11-xen0} and execute: 31.223 +\begin{quote} 31.224 +\begin{verbatim} 31.225 +# make ARCH=xen oldconfig 31.226 +\end{verbatim} 31.227 +\end{quote} 31.228 + 31.229 +You may be prompted with some Xen-specific options; we advise 31.230 +accepting the defaults for these options. 31.231 + 31.232 +Note that the only difference between the two types of Linux kernel 31.233 +that are built is the configuration file used for each. The `U' 31.234 +suffixed (unprivileged) versions don't contain any of the physical 31.235 +hardware device drivers, leading to a 30\% reduction in size; hence 31.236 +you may prefer these for your non-privileged domains. The `0' 31.237 +suffixed privileged versions can be used to boot the system, as well 31.238 +as in driver domains and unprivileged domains. 31.239 + 31.240 +\subsection{Installing the Binaries} 31.241 + 31.242 +The files produced by the build process are stored under the 31.243 +\path{dist/install/} directory. To install them in their default 31.244 +locations, do: 31.245 +\begin{quote} 31.246 +\begin{verbatim} 31.247 +# make install 31.248 +\end{verbatim} 31.249 +\end{quote} 31.250 + 31.251 +Alternatively, users with special installation requirements may wish 31.252 +to install them manually by copying the files to their appropriate 31.253 +destinations. 31.254 + 31.255 +%% Files in \path{install/boot/} include: 31.256 +%% \begin{itemize} 31.257 +%% \item \path{install/boot/xen-2.0.gz} Link to the Xen 'kernel' 31.258 +%% \item \path{install/boot/vmlinuz-2.6-xen0} Link to domain 0 31.259 +%% XenLinux kernel 31.260 +%% \item \path{install/boot/vmlinuz-2.6-xenU} Link to unprivileged 31.261 +%% XenLinux kernel 31.262 +%% \end{itemize} 31.263 + 31.264 +The \path{dist/install/boot} directory will also contain the config 31.265 +files used for building the XenLinux kernels, and also versions of Xen 31.266 +and XenLinux kernels that contain debug symbols (\path{xen-syms-2.0.6} 31.267 +and \path{vmlinux-syms-2.6.11.11-xen0}) which are essential for 31.268 +interpreting crash dumps. Retain these files as the developers may 31.269 +wish to see them if you post on the mailing list. 31.270 + 31.271 + 31.272 +\section{Configuration} 31.273 +\label{s:configure} 31.274 + 31.275 +Once you have built and installed the Xen distribution, it is simple 31.276 +to prepare the machine for booting and running Xen. 31.277 + 31.278 +\subsection{GRUB Configuration} 31.279 + 31.280 +An entry should be added to \path{grub.conf} (often found under 31.281 +\path{/boot/} or \path{/boot/grub/}) to allow Xen / XenLinux to boot. 31.282 +This file is sometimes called \path{menu.lst}, depending on your 31.283 +distribution. The entry should look something like the following: 31.284 + 31.285 +{\small 31.286 +\begin{verbatim} 31.287 +title Xen 2.0 / XenLinux 2.6 31.288 + kernel /boot/xen-2.0.gz dom0_mem=131072 31.289 + module /boot/vmlinuz-2.6-xen0 root=/dev/sda4 ro console=tty0 31.290 +\end{verbatim} 31.291 +} 31.292 + 31.293 +The kernel line tells GRUB where to find Xen itself and what boot 31.294 +parameters should be passed to it (in this case, setting domain 0's 31.295 +memory allocation in kilobytes and the settings for the serial port). 31.296 +For more details on the various Xen boot parameters see 31.297 +Section~\ref{s:xboot}. 
31.298 + 31.299 +The module line of the configuration describes the location of the 31.300 +XenLinux kernel that Xen should start and the parameters that should 31.301 +be passed to it (these are standard Linux parameters, identifying the 31.302 +root device and specifying that it be initially mounted read-only and 31.303 +instructing that console output be sent to the screen). Some 31.304 +distributions such as SuSE do not require the \path{ro} parameter. 31.305 + 31.306 +%% \framebox{\parbox{5in}{ 31.307 +%% {\bf Distro specific:} \\ 31.308 +%% {\it SuSE} --- Omit the {\tt ro} option from the XenLinux 31.309 +%% kernel command line, since the partition won't be remounted rw 31.310 +%% during boot. }} 31.311 + 31.312 + 31.313 +If you want to use an initrd, just add another \path{module} line to 31.314 +the configuration, as usual: 31.315 + 31.316 +{\small 31.317 +\begin{verbatim} 31.318 + module /boot/my_initrd.gz 31.319 +\end{verbatim} 31.320 +} 31.321 + 31.322 +As always when installing a new kernel, it is recommended that you do 31.323 +not delete existing menu options from \path{menu.lst} --- you may want 31.324 +to boot your old Linux kernel in future, particularly if you have 31.325 +problems. 31.326 + 31.327 +\subsection{Serial Console (optional)} 31.328 + 31.329 +%% kernel /boot/xen-2.0.gz dom0_mem=131072 com1=115200,8n1 31.330 +%% module /boot/vmlinuz-2.6-xen0 root=/dev/sda4 ro 31.331 + 31.332 + 31.333 +In order to configure Xen serial console output, it is necessary to 31.334 +add a boot option to your GRUB config; e.g.\ replace the above kernel 31.335 +line with: 31.336 +\begin{quote} 31.337 +{\small 31.338 +\begin{verbatim} 31.339 + kernel /boot/xen.gz dom0_mem=131072 com1=115200,8n1 31.340 +\end{verbatim}} 31.341 +\end{quote} 31.342 + 31.343 +This configures Xen to output on COM1 at 115,200 baud, 8 data bits, 1 31.344 +stop bit and no parity. Modify these parameters for your setup. 31.345 + 31.346 +One can also configure XenLinux to share the serial console; to 31.347 +achieve this append ``\path{console=ttyS0}'' to your module line. 31.348 + 31.349 +If you wish to be able to log in over the XenLinux serial console it 31.350 +is necessary to add a line into \path{/etc/inittab}, just as per 31.351 +regular Linux. Simply add the line: 31.352 +\begin{quote} {\small {\tt c:2345:respawn:/sbin/mingetty ttyS0}} 31.353 +\end{quote} 31.354 + 31.355 +and you should be able to log in. Note that successfully logging in as 31.356 +root over the serial line will require adding \path{ttyS0} to 31.357 +\path{/etc/securetty} in most modern distributions. 31.358 + 31.359 +\subsection{TLS Libraries} 31.360 + 31.361 +Users of the XenLinux 2.6 kernel should disable Thread Local Storage 31.362 +(e.g.\ by doing a \path{mv /lib/tls /lib/tls.disabled}) before 31.363 +attempting to run with a XenLinux kernel\footnote{If you boot without 31.364 + first disabling TLS, you will get a warning message during the boot 31.365 + process. In this case, simply perform the rename after the machine 31.366 + is up and then run \texttt{/sbin/ldconfig} to make it take effect.}. 31.367 +You can always reenable it by restoring the directory to its original 31.368 +location (i.e.\ \path{mv /lib/tls.disabled /lib/tls}). 31.369 + 31.370 +The reason for this is that the current TLS implementation uses 31.371 +segmentation in a way that is not permissible under Xen. If TLS is 31.372 +not disabled, an emulation mode is used within Xen which reduces 31.373 +performance substantially.
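For reference, the rename described above and its inverse are simply:
\begin{quote}
\begin{verbatim}
# mv /lib/tls /lib/tls.disabled
# mv /lib/tls.disabled /lib/tls
\end{verbatim}
\end{quote}
If you perform the rename on a machine that is already running, remember to
run \path{/sbin/ldconfig} afterwards, as described in the footnote above.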
31.374 + 31.375 +We hope that this issue can be resolved by working with Linux 31.376 +distribution vendors to implement a minor backward-compatible change 31.377 +to the TLS library. 31.378 + 31.379 + 31.380 +\section{Booting Xen} 31.381 + 31.382 +It should now be possible to restart the system and use Xen. Reboot 31.383 +as usual but choose the new Xen option when the Grub screen appears. 31.384 + 31.385 +What follows should look much like a conventional Linux boot. The 31.386 +first portion of the output comes from Xen itself, supplying low level 31.387 +information about itself and the machine it is running on. The 31.388 +following portion of the output comes from XenLinux. 31.389 + 31.390 +You may see some errors during the XenLinux boot. These are not 31.391 +necessarily anything to worry about --- they may result from kernel 31.392 +configuration differences between your XenLinux kernel and the one you 31.393 +usually use. 31.394 + 31.395 +When the boot completes, you should be able to log into your system as 31.396 +usual. If you are unable to log in to your system running Xen, you 31.397 +should still be able to reboot with your normal Linux kernel.
32.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 32.2 +++ b/docs/src/user/introduction.tex Thu Sep 22 11:42:01 2005 -0600 32.3 @@ -0,0 +1,143 @@ 32.4 +\chapter{Introduction} 32.5 + 32.6 + 32.7 +Xen is a \emph{paravirtualising} virtual machine monitor (VMM), or 32.8 +`hypervisor', for the x86 processor architecture. Xen can securely 32.9 +execute multiple virtual machines on a single physical system with 32.10 +close-to-native performance. The virtual machine technology 32.11 +facilitates enterprise-grade functionality, including: 32.12 + 32.13 +\begin{itemize} 32.14 +\item Virtual machines with performance close to native hardware. 32.15 +\item Live migration of running virtual machines between physical 32.16 + hosts. 32.17 +\item Excellent hardware support (supports most Linux device drivers). 32.18 +\item Sandboxed, re-startable device drivers. 32.19 +\end{itemize} 32.20 + 32.21 +Paravirtualisation permits very high performance virtualisation, even 32.22 +on architectures like x86 that are traditionally very hard to 32.23 +virtualise. 32.24 + 32.25 +The drawback of this approach is that it requires operating systems to 32.26 +be \emph{ported} to run on Xen. Porting an OS to run on Xen is 32.27 +similar to supporting a new hardware platform; however, the process is 32.28 +simplified because the paravirtual machine architecture is very 32.29 +similar to the underlying native hardware. Even though operating 32.30 +system kernels must explicitly support Xen, a key feature is that user 32.31 +space applications and libraries \emph{do not} require modification. 32.32 + 32.33 +Xen support is available for increasingly many operating systems: 32.34 +right now, Linux 2.4, Linux 2.6 and NetBSD are available for Xen 2.0. 32.35 +A FreeBSD port is undergoing testing and will be incorporated into the 32.36 +release soon. Other OS ports, including Plan 9, are in progress. We 32.37 +hope that the arch-xen patches will be incorporated into the 32.38 +mainstream releases of these operating systems in due course (as has 32.39 +already happened for NetBSD). 32.40 + 32.41 +Possible usage scenarios for Xen include: 32.42 + 32.43 +\begin{description} 32.44 +\item [Kernel development.] Test and debug kernel modifications in a 32.45 + sandboxed virtual machine --- no need for a separate test machine. 32.46 +\item [Multiple OS configurations.] Run multiple operating systems 32.47 + simultaneously, for instance for compatibility or QA purposes. 32.48 +\item [Server consolidation.] Move multiple servers onto a single 32.49 + physical host with performance and fault isolation provided at 32.50 + virtual machine boundaries. 32.51 +\item [Cluster computing.] Management at VM granularity provides more 32.52 + flexibility than separately managing each physical host, but better 32.53 + control and isolation than single-system image solutions, 32.54 + particularly by using live migration for load balancing. 32.55 +\item [Hardware support for custom OSes.] Allow development of new 32.56 + OSes while benefiting from the wide-ranging hardware support of 32.57 + existing OSes such as Linux. 32.58 +\end{description} 32.59 + 32.60 + 32.61 +\section{Structure of a Xen-Based System} 32.62 + 32.63 +A Xen system has multiple layers, the lowest and most privileged of 32.64 +which is Xen itself. 32.65 + 32.66 +Xen in turn may host multiple \emph{guest} operating systems, each of 32.67 +which is executed within a secure virtual machine (in Xen terminology, 32.68 +a \emph{domain}).
Domains are scheduled by Xen to make effective use 32.69 +of the available physical CPUs. Each guest OS manages its own 32.70 +applications, which includes responsibility for scheduling each 32.71 +application within the time allotted to the VM by Xen. 32.72 + 32.73 +The first domain, \emph{domain 0}, is created automatically when the 32.74 +system boots and has special management privileges. Domain 0 builds 32.75 +other domains and manages their virtual devices. It also performs 32.76 +administrative tasks such as suspending, resuming and migrating other 32.77 +virtual machines. 32.78 + 32.79 +Within domain 0, a process called \emph{xend} runs to manage the 32.80 +system. \Xend is responsible for managing virtual machines and 32.81 +providing access to their consoles. Commands are issued to \xend over 32.82 +an HTTP interface, either from a command-line tool or from a web 32.83 +browser. 32.84 + 32.85 + 32.86 +\section{Hardware Support} 32.87 + 32.88 +Xen currently runs only on the x86 architecture, requiring a `P6' or 32.89 +newer processor (e.g. Pentium Pro, Celeron, Pentium II, Pentium III, 32.90 +Pentium IV, Xeon, AMD Athlon, AMD Duron). Multiprocessor machines are 32.91 +supported, and we also have basic support for HyperThreading (SMT), 32.92 +although this remains a topic for ongoing research. A port 32.93 +specifically for x86/64 is in progress, although Xen already runs on 32.94 +such systems in 32-bit legacy mode. In addition a port to the IA64 32.95 +architecture is approaching completion. We hope to add other 32.96 +architectures such as PPC and ARM in due course. 32.97 + 32.98 +Xen can currently use up to 4GB of memory. It is possible for x86 32.99 +machines to address up to 64GB of physical memory but there are no 32.100 +current plans to support these systems: The x86/64 port is the planned 32.101 +route to supporting larger memory sizes. 32.102 + 32.103 +Xen offloads most of the hardware support issues to the guest OS 32.104 +running in Domain~0. Xen itself contains only the code required to 32.105 +detect and start secondary processors, set up interrupt routing, and 32.106 +perform PCI bus enumeration. Device drivers run within a privileged 32.107 +guest OS rather than within Xen itself. This approach provides 32.108 +compatibility with the majority of device hardware supported by Linux. 32.109 +The default XenLinux build contains support for relatively modern 32.110 +server-class network and disk hardware, but you can add support for 32.111 +other hardware by configuring your XenLinux kernel in the normal way. 32.112 + 32.113 + 32.114 +\section{History} 32.115 + 32.116 +Xen was originally developed by the Systems Research Group at the 32.117 +University of Cambridge Computer Laboratory as part of the XenoServers 32.118 +project, funded by the UK-EPSRC. 32.119 + 32.120 +XenoServers aim to provide a `public infrastructure for global 32.121 +distributed computing', and Xen plays a key part in that, allowing us 32.122 +to efficiently partition a single machine to enable multiple 32.123 +independent clients to run their operating systems and applications in 32.124 +an environment providing protection, resource isolation and 32.125 +accounting. 
The project web page contains further information along 32.126 +with pointers to papers and technical reports: 32.127 +\path{http://www.cl.cam.ac.uk/xeno} 32.128 + 32.129 +Xen has since grown into a fully-fledged project in its own right, 32.130 +enabling us to investigate interesting research issues regarding the 32.131 +best techniques for virtualising resources such as the CPU, memory, 32.132 +disk and network. The project has been bolstered by support from 32.133 +Intel Research Cambridge, and HP Labs, who are now working closely 32.134 +with us. 32.135 + 32.136 +Xen was first described in a paper presented at SOSP in 32.137 +2003\footnote{\tt 32.138 + http://www.cl.cam.ac.uk/netos/papers/2003-xensosp.pdf}, and the 32.139 +first public release (1.0) was made that October. Since then, Xen has 32.140 +significantly matured and is now used in production scenarios on many 32.141 +sites. 32.142 + 32.143 +Xen 2.0 features greatly enhanced hardware support, configuration 32.144 +flexibility, usability and a larger complement of supported operating 32.145 +systems. This latest release takes Xen a step closer to becoming the 32.146 +definitive open source solution for virtualisation.
33.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 33.2 +++ b/docs/src/user/redhat.tex Thu Sep 22 11:42:01 2005 -0600 33.3 @@ -0,0 +1,61 @@ 33.4 +\chapter{Installing Xen / XenLinux on Red~Hat or Fedora Core} 33.5 + 33.6 +When using Xen / XenLinux on a standard Linux distribution there are a 33.7 +couple of things to watch out for: 33.8 + 33.9 +Note that, because domains greater than 0 don't have any privileged 33.10 +access at all, certain commands in the default boot sequence will fail, 33.11 +e.g.\ attempts to update the hwclock, change the console font, update 33.12 +the keytable map, start apmd (power management), or gpm (mouse 33.13 +cursor). Either ignore the errors (they should be harmless), or 33.14 +remove them from the startup scripts. Deleting the following links 33.15 +is a good start: {\path{S24pcmcia}}, {\path{S09isdn}}, 33.16 +{\path{S17keytable}}, {\path{S26apmd}}, {\path{S85gpm}}. 33.17 + 33.18 +If you want to use a single root file system that works cleanly for 33.19 +both domain~0 and unprivileged domains, a useful trick is to use 33.20 +different `init' run levels. For example, use run level 3 for 33.21 +domain~0, and run level 4 for other domains. This enables different 33.22 +startup scripts to be run depending on the run level number passed 33.23 +on the kernel command line. 33.24 + 33.25 +If using NFS root file systems mounted either from an external server 33.26 +or from domain~0, there are a couple of other gotchas. The default 33.27 +{\path{/etc/sysconfig/iptables}} rules block NFS, so part way through 33.28 +the boot sequence things will suddenly go dead. 33.29 + 33.30 +If you're planning on having a separate NFS {\path{/usr}} partition, 33.31 +the RH9 boot scripts don't make life easy; they attempt to mount NFS 33.32 +file systems way too late in the boot process. The easiest way I found 33.33 +to work around this was to have a {\path{/linuxrc}} script run ahead of 33.34 +{\path{/sbin/init}} that mounts {\path{/usr}}: 33.35 + 33.36 +\begin{quote} 33.37 + \begin{small}\begin{verbatim} 33.38 + #!/bin/bash 33.39 + /sbin/ifconfig lo 127.0.0.1 33.40 + /sbin/portmap 33.41 + /bin/mount /usr 33.42 + exec /sbin/init "$@" <>/dev/console 2>&1 33.43 +\end{verbatim}\end{small} 33.44 +\end{quote} 33.45 + 33.46 +%% $ XXX SMH: font lock fix :-) 33.47 + 33.48 +The one slight complication with the above is that 33.49 +{\path{/sbin/portmap}} is dynamically linked against 33.50 +{\path{/usr/lib/libwrap.so.0}}. Since this is in {\path{/usr}}, it 33.51 +won't work. This can be solved by copying the file (and link) below 33.52 +the {\path{/usr}} mount point, and just letting the file be `covered' when 33.53 +the mount happens. 33.54 + 33.55 +In some installations, where a shared read-only {\path{/usr}} is being 33.56 +used, it may be desirable to move other large directories over into 33.57 +the read-only {\path{/usr}}. For example, you might replace 33.58 +{\path{/bin}}, {\path{/lib}} and {\path{/sbin}} with links into 33.59 +{\path{/usr/root/bin}}, {\path{/usr/root/lib}} and 33.60 +{\path{/usr/root/sbin}} respectively. This creates other problems for 33.61 +running the {\path{/linuxrc}} script, requiring bash, portmap, mount, 33.62 +ifconfig, and a handful of other shared libraries to be copied below 33.63 +the mount point --- a simple statically-linked C program would solve 33.64 +this problem.
34.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 34.2 +++ b/docs/src/user/start_addl_dom.tex Thu Sep 22 11:42:01 2005 -0600 34.3 @@ -0,0 +1,172 @@ 34.4 +\chapter{Starting Additional Domains} 34.5 + 34.6 +The first step in creating a new domain is to prepare a root 34.7 +filesystem for it to boot from. Typically, this might be stored in a 34.8 +normal partition, an LVM or other volume manager partition, a disk 34.9 +file or on an NFS server. A simple way to do this is simply to boot 34.10 +from your standard OS install CD and install the distribution into 34.11 +another partition on your hard drive. 34.12 + 34.13 +To start the \xend\ control daemon, type 34.14 +\begin{quote} 34.15 + \verb!# xend start! 34.16 +\end{quote} 34.17 + 34.18 +If you wish the daemon to start automatically, see the instructions in 34.19 +Section~\ref{s:xend}. Once the daemon is running, you can use the 34.20 +\path{xm} tool to monitor and maintain the domains running on your 34.21 +system. This chapter provides only a brief tutorial. We provide full 34.22 +details of the \path{xm} tool in the next chapter. 34.23 + 34.24 +% \section{From the web interface} 34.25 +% 34.26 +% Boot the Xen machine and start Xensv (see Chapter~\ref{cha:xensv} 34.27 +% for more details) using the command: \\ 34.28 +% \verb_# xensv start_ \\ 34.29 +% This will also start Xend (see Chapter~\ref{cha:xend} for more 34.30 +% information). 34.31 +% 34.32 +% The domain management interface will then be available at {\tt 34.33 +% http://your\_machine:8080/}. This provides a user friendly wizard 34.34 +% for starting domains and functions for managing running domains. 34.35 +% 34.36 +% \section{From the command line} 34.37 + 34.38 + 34.39 +\section{Creating a Domain Configuration File} 34.40 + 34.41 +Before you can start an additional domain, you must create a 34.42 +configuration file. We provide two example files which you can use as 34.43 +a starting point: 34.44 +\begin{itemize} 34.45 +\item \path{/etc/xen/xmexample1} is a simple template configuration 34.46 + file for describing a single VM. 34.47 + 34.48 +\item \path{/etc/xen/xmexample2} file is a template description that 34.49 + is intended to be reused for multiple virtual machines. Setting the 34.50 + value of the \path{vmid} variable on the \path{xm} command line 34.51 + fills in parts of this template. 34.52 +\end{itemize} 34.53 + 34.54 +Copy one of these files and edit it as appropriate. Typical values 34.55 +you may wish to edit include: 34.56 + 34.57 +\begin{quote} 34.58 +\begin{description} 34.59 +\item[kernel] Set this to the path of the kernel you compiled for use 34.60 + with Xen (e.g.\ \path{kernel = `/boot/vmlinuz-2.6-xenU'}) 34.61 +\item[memory] Set this to the size of the domain's memory in megabytes 34.62 + (e.g.\ \path{memory = 64}) 34.63 +\item[disk] Set the first entry in this list to calculate the offset 34.64 + of the domain's root partition, based on the domain ID. Set the 34.65 + second to the location of \path{/usr} if you are sharing it between 34.66 + domains (e.g.\ \path{disk = [`phy:your\_hard\_drive\%d,sda1,w' \% 34.67 + (base\_partition\_number + vmid), 34.68 + `phy:your\_usr\_partition,sda6,r' ]} 34.69 +\item[dhcp] Uncomment the dhcp variable, so that the domain will 34.70 + receive its IP address from a DHCP server (e.g.\ \path{dhcp=`dhcp'}) 34.71 +\end{description} 34.72 +\end{quote} 34.73 + 34.74 +You may also want to edit the {\bf vif} variable in order to choose 34.75 +the MAC address of the virtual ethernet interface yourself. 
For 34.76 +example: 34.77 +\begin{quote} 34.78 +\verb_vif = [`mac=00:06:AA:F6:BB:B3']_ 34.79 +\end{quote} 34.80 +If you do not set this variable, \xend\ will automatically generate a 34.81 +random MAC address from an unused range. 34.82 + 34.83 + 34.84 +\section{Booting the Domain} 34.85 + 34.86 +The \path{xm} tool provides a variety of commands for managing 34.87 +domains. Use the \path{create} command to start new domains. Assuming 34.88 +you've created a configuration file \path{myvmconf} based around 34.89 +\path{/etc/xen/xmexample2}, to start a domain with virtual machine 34.90 +ID~1 you should type: 34.91 + 34.92 +\begin{quote} 34.93 +\begin{verbatim} 34.94 +# xm create -c myvmconf vmid=1 34.95 +\end{verbatim} 34.96 +\end{quote} 34.97 + 34.98 +The \path{-c} switch causes \path{xm} to connect to the domain's 34.99 +console after creation. The \path{vmid=1} sets the \path{vmid} 34.100 +variable used in the \path{myvmconf} file. 34.101 + 34.102 +You should see the console boot messages from the new domain appearing 34.103 +in the terminal in which you typed the command, culminating in a login 34.104 +prompt. 34.105 + 34.106 + 34.107 +\section{Example: ttylinux} 34.108 + 34.109 +Ttylinux is a very small Linux distribution, designed to require very 34.110 +few resources. We will use it as a concrete example of how to start a 34.111 +Xen domain. Most users will probably want to install a full-featured 34.112 +distribution once they have mastered the basics\footnote{ttylinux is 34.113 + maintained by Pascal Schmidt. You can download source packages from 34.114 + the distribution's home page: {\tt 34.115 + http://www.minimalinux.org/ttylinux/}}. 34.116 + 34.117 +\begin{enumerate} 34.118 +\item Download and extract the ttylinux disk image from the Files 34.119 + section of the project's SourceForge site (see 34.120 + \path{http://sf.net/projects/xen/}). 34.121 +\item Create a configuration file like the following: 34.122 +\begin{verbatim} 34.123 +kernel = "/boot/vmlinuz-2.6-xenU" 34.124 +memory = 64 34.125 +name = "ttylinux" 34.126 +nics = 1 34.127 +ip = "1.2.3.4" 34.128 +disk = ['file:/path/to/ttylinux/rootfs,sda1,w'] 34.129 +root = "/dev/sda1 ro" 34.130 +\end{verbatim} 34.131 +\item Now start the domain and connect to its console: 34.132 +\begin{verbatim} 34.133 +xm create configfile -c 34.134 +\end{verbatim} 34.135 +\item Log in as root, password root. 34.136 +\end{enumerate} 34.137 + 34.138 + 34.139 +\section{Starting / Stopping Domains Automatically} 34.140 + 34.141 +It is possible to have certain domains start automatically at boot 34.142 +time and to have dom0 wait for all running domains to shut down before 34.143 +it shuts down the system. 34.144 + 34.145 +To specify that a domain should start at boot time, place its configuration 34.146 +file (or a link to it) under \path{/etc/xen/auto/}. 34.147 + 34.148 +A Sys-V style init script for Red Hat and LSB-compliant systems is 34.149 +provided and will be automatically copied to \path{/etc/init.d/} 34.150 +during install. You can then enable it in the appropriate way for 34.151 +your distribution. 34.152 + 34.153 +For instance, on Red Hat: 34.154 + 34.155 +\begin{quote} 34.156 + \verb_# chkconfig --add xendomains_ 34.157 +\end{quote} 34.158 + 34.159 +By default, this will start the boot-time domains in runlevels 3, 4 34.160 +and 5.
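For example, to have the ttylinux domain from the previous section started at
boot time, you could link its configuration file into the \path{auto}
directory (the path \path{/etc/xen/ttylinux} is merely illustrative; use
wherever you saved the file):
\begin{quote}
\begin{verbatim}
# ln -s /etc/xen/ttylinux /etc/xen/auto/ttylinux
\end{verbatim}
\end{quote}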
34.161 + 34.162 +You can also use the \path{service} command to run this script 34.163 +manually, e.g: 34.164 + 34.165 +\begin{quote} 34.166 + \verb_# service xendomains start_ 34.167 + 34.168 + Starts all the domains with config files under /etc/xen/auto/. 34.169 +\end{quote} 34.170 + 34.171 +\begin{quote} 34.172 + \verb_# service xendomains stop_ 34.173 + 34.174 + Shuts down ALL running Xen domains. 34.175 +\end{quote}
69.1 --- a/extras/mini-os/xenbus/xenbus_xs.c Thu Sep 22 11:34:14 2005 -0600 69.2 +++ b/extras/mini-os/xenbus/xenbus_xs.c Thu Sep 22 11:42:01 2005 -0600 69.3 @@ -127,7 +127,7 @@ static void *xs_talkv(enum xsd_sockmsg_t 69.4 return ERR_PTR(err); 69.5 69.6 for (i = 0; i < num_vecs; i++) { 69.7 - err = xb_write(iovec[i].iov_base, iovec[i].iov_len);; 69.8 + err = xb_write(iovec[i].iov_base, iovec[i].iov_len); 69.9 if (err) 69.10 return ERR_PTR(err); 69.11 }
150.1 --- a/linux-2.6-xen-sparse/arch/xen/Kconfig Thu Sep 22 11:34:14 2005 -0600 150.2 +++ b/linux-2.6-xen-sparse/arch/xen/Kconfig Thu Sep 22 11:42:01 2005 -0600 150.3 @@ -73,6 +73,8 @@ config XEN_NETDEV_BACKEND 150.4 config XEN_TPMDEV_FRONTEND 150.5 bool "TPM-device frontend driver" 150.6 default n 150.7 + select TCG_TPM 150.8 + select TCG_XEN 150.9 help 150.10 The TPM-device frontend driver. 150.11 150.12 @@ -109,13 +111,6 @@ config XEN_NETDEV_FRONTEND 150.13 dedicated device-driver domain, or your master control domain 150.14 (domain 0), then you almost certainly want to say Y here. 150.15 150.16 -config XEN_NETDEV_GRANT 150.17 - bool "Grant table substrate for network drivers (DANGEROUS)" 150.18 - default n 150.19 - help 150.20 - This introduces the use of grant tables as a data exhange mechanism 150.21 - between the frontend and backend network drivers. 150.22 - 150.23 config XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER 150.24 bool "Pipelined transmitter (DANGEROUS)" 150.25 depends on XEN_NETDEV_FRONTEND
154.1 --- a/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_32 Thu Sep 22 11:34:14 2005 -0600 154.2 +++ b/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_32 Thu Sep 22 11:42:01 2005 -0600 154.3 @@ -19,7 +19,6 @@ CONFIG_XEN_NETDEV_BACKEND=y 154.4 # CONFIG_XEN_TPMDEV_BACKEND is not set 154.5 CONFIG_XEN_BLKDEV_FRONTEND=y 154.6 CONFIG_XEN_NETDEV_FRONTEND=y 154.7 -CONFIG_XEN_NETDEV_GRANT=y 154.8 # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set 154.9 # CONFIG_XEN_BLKDEV_TAP is not set 154.10 # CONFIG_XEN_SHADOW_MODE is not set
155.1 --- a/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_64 Thu Sep 22 11:34:14 2005 -0600 155.2 +++ b/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_64 Thu Sep 22 11:42:01 2005 -0600 155.3 @@ -19,7 +19,6 @@ CONFIG_XEN_NETDEV_BACKEND=y 155.4 # CONFIG_XEN_TPMDEV_BACKEND is not set 155.5 CONFIG_XEN_BLKDEV_FRONTEND=y 155.6 CONFIG_XEN_NETDEV_FRONTEND=y 155.7 -CONFIG_XEN_NETDEV_GRANT=y 155.8 # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set 155.9 # CONFIG_XEN_BLKDEV_TAP is not set 155.10 # CONFIG_XEN_SHADOW_MODE is not set
156.1 --- a/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_32 Thu Sep 22 11:34:14 2005 -0600 156.2 +++ b/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_32 Thu Sep 22 11:42:01 2005 -0600 156.3 @@ -16,7 +16,6 @@ CONFIG_NO_IDLE_HZ=y 156.4 # CONFIG_XEN_TPMDEV_BACKEND is not set 156.5 CONFIG_XEN_BLKDEV_FRONTEND=y 156.6 CONFIG_XEN_NETDEV_FRONTEND=y 156.7 -CONFIG_XEN_NETDEV_GRANT=y 156.8 # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set 156.9 # CONFIG_XEN_BLKDEV_TAP is not set 156.10 # CONFIG_XEN_SHADOW_MODE is not set
157.1 --- a/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_64 Thu Sep 22 11:34:14 2005 -0600 157.2 +++ b/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_64 Thu Sep 22 11:42:01 2005 -0600 157.3 @@ -16,7 +16,6 @@ CONFIG_NO_IDLE_HZ=y 157.4 # CONFIG_XEN_TPMDEV_BACKEND is not set 157.5 CONFIG_XEN_BLKDEV_FRONTEND=y 157.6 CONFIG_XEN_NETDEV_FRONTEND=y 157.7 -CONFIG_XEN_NETDEV_GRANT=y 157.8 # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set 157.9 # CONFIG_XEN_BLKDEV_TAP is not set 157.10 # CONFIG_XEN_SHADOW_MODE is not set
158.1 --- a/linux-2.6-xen-sparse/arch/xen/configs/xen_defconfig_x86_32 Thu Sep 22 11:34:14 2005 -0600 158.2 +++ b/linux-2.6-xen-sparse/arch/xen/configs/xen_defconfig_x86_32 Thu Sep 22 11:42:01 2005 -0600 158.3 @@ -19,7 +19,6 @@ CONFIG_XEN_NETDEV_BACKEND=y 158.4 # CONFIG_XEN_TPMDEV_BACKEND is not set 158.5 CONFIG_XEN_BLKDEV_FRONTEND=y 158.6 CONFIG_XEN_NETDEV_FRONTEND=y 158.7 -CONFIG_XEN_NETDEV_GRANT=y 158.8 # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set 158.9 # CONFIG_XEN_BLKDEV_TAP is not set 158.10 # CONFIG_XEN_SHADOW_MODE is not set 158.11 @@ -372,7 +371,7 @@ CONFIG_PNP=y 158.12 # 158.13 CONFIG_ISAPNP=y 158.14 # CONFIG_PNPBIOS is not set 158.15 -CONFIG_PNPACPI=y 158.16 +# CONFIG_PNPACPI is not set 158.17 158.18 # 158.19 # Block devices
159.1 --- a/linux-2.6-xen-sparse/arch/xen/configs/xen_defconfig_x86_64 Thu Sep 22 11:34:14 2005 -0600 159.2 +++ b/linux-2.6-xen-sparse/arch/xen/configs/xen_defconfig_x86_64 Thu Sep 22 11:42:01 2005 -0600 159.3 @@ -19,7 +19,6 @@ CONFIG_XEN_NETDEV_BACKEND=y 159.4 # CONFIG_XEN_TPMDEV_BACKEND is not set 159.5 CONFIG_XEN_BLKDEV_FRONTEND=y 159.6 CONFIG_XEN_NETDEV_FRONTEND=y 159.7 -CONFIG_XEN_NETDEV_GRANT=y 159.8 # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set 159.9 # CONFIG_XEN_BLKDEV_TAP is not set 159.10 # CONFIG_XEN_SHADOW_MODE is not set
196.1 --- a/linux-2.6-xen-sparse/arch/xen/i386/mm/ioremap.c Thu Sep 22 11:34:14 2005 -0600 196.2 +++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/ioremap.c Thu Sep 22 11:42:01 2005 -0600 196.3 @@ -45,12 +45,12 @@ static int direct_remap_area_pte_fn(pte_ 196.4 return 0; 196.5 } 196.6 196.7 -int direct_remap_pfn_range(struct mm_struct *mm, 196.8 - unsigned long address, 196.9 - unsigned long mfn, 196.10 - unsigned long size, 196.11 - pgprot_t prot, 196.12 - domid_t domid) 196.13 +static int __direct_remap_pfn_range(struct mm_struct *mm, 196.14 + unsigned long address, 196.15 + unsigned long mfn, 196.16 + unsigned long size, 196.17 + pgprot_t prot, 196.18 + domid_t domid) 196.19 { 196.20 int i; 196.21 unsigned long start_address; 196.22 @@ -98,6 +98,20 @@ int direct_remap_pfn_range(struct mm_str 196.23 return 0; 196.24 } 196.25 196.26 +int direct_remap_pfn_range(struct vm_area_struct *vma, 196.27 + unsigned long address, 196.28 + unsigned long mfn, 196.29 + unsigned long size, 196.30 + pgprot_t prot, 196.31 + domid_t domid) 196.32 +{ 196.33 + /* Same as remap_pfn_range(). */ 196.34 + vma->vm_flags |= VM_IO | VM_RESERVED; 196.35 + 196.36 + return __direct_remap_pfn_range( 196.37 + vma->vm_mm, address, mfn, size, prot, domid); 196.38 +} 196.39 + 196.40 EXPORT_SYMBOL(direct_remap_pfn_range); 196.41 196.42 196.43 @@ -221,8 +235,9 @@ void __iomem * __ioremap(unsigned long p 196.44 #ifdef __x86_64__ 196.45 flags |= _PAGE_USER; 196.46 #endif 196.47 - if (direct_remap_pfn_range(&init_mm, (unsigned long) addr, phys_addr>>PAGE_SHIFT, 196.48 - size, __pgprot(flags), domid)) { 196.49 + if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr, 196.50 + phys_addr>>PAGE_SHIFT, 196.51 + size, __pgprot(flags), domid)) { 196.52 vunmap((void __force *) addr); 196.53 return NULL; 196.54 }
199.1 --- a/linux-2.6-xen-sparse/arch/xen/i386/pci/i386.c Thu Sep 22 11:34:14 2005 -0600 199.2 +++ b/linux-2.6-xen-sparse/arch/xen/i386/pci/i386.c Thu Sep 22 11:42:01 2005 -0600 199.3 @@ -295,7 +295,7 @@ int pci_mmap_page_range(struct pci_dev * 199.4 /* Write-combine setting is ignored, it is changed via the mtrr 199.5 * interfaces on this platform. 199.6 */ 199.7 - if (direct_remap_pfn_range(vma->vm_mm, vma->vm_start, vma->vm_pgoff, 199.8 + if (direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, 199.9 vma->vm_end - vma->vm_start, 199.10 vma->vm_page_prot, DOMID_IO)) 199.11 return -EAGAIN;
202.1 --- a/linux-2.6-xen-sparse/arch/xen/kernel/devmem.c Thu Sep 22 11:34:14 2005 -0600 202.2 +++ b/linux-2.6-xen-sparse/arch/xen/kernel/devmem.c Thu Sep 22 11:42:01 2005 -0600 202.3 @@ -90,22 +90,10 @@ out: 202.4 202.5 static int mmap_mem(struct file * file, struct vm_area_struct * vma) 202.6 { 202.7 - int uncached; 202.8 - 202.9 - uncached = uncached_access(file); 202.10 - if (uncached) 202.11 + if (uncached_access(file)) 202.12 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 202.13 202.14 - /* Don't try to swap out physical pages.. */ 202.15 - vma->vm_flags |= VM_RESERVED; 202.16 - 202.17 - /* 202.18 - * Don't dump addresses that are not real memory to a core file. 202.19 - */ 202.20 - if (uncached) 202.21 - vma->vm_flags |= VM_IO; 202.22 - 202.23 - if (direct_remap_pfn_range(vma->vm_mm, vma->vm_start, vma->vm_pgoff, 202.24 + if (direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, 202.25 vma->vm_end - vma->vm_start, 202.26 vma->vm_page_prot, DOMID_IO)) 202.27 return -EAGAIN;
205.1 --- a/linux-2.6-xen-sparse/arch/xen/kernel/gnttab.c Thu Sep 22 11:34:14 2005 -0600 205.2 +++ b/linux-2.6-xen-sparse/arch/xen/kernel/gnttab.c Thu Sep 22 11:42:01 2005 -0600 205.3 @@ -182,14 +182,14 @@ gnttab_end_foreign_access(grant_ref_t re 205.4 } 205.5 205.6 int 205.7 -gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn) 205.8 +gnttab_grant_foreign_transfer(domid_t domid) 205.9 { 205.10 int ref; 205.11 205.12 if ( unlikely((ref = get_free_entry()) == -1) ) 205.13 return -ENOSPC; 205.14 205.15 - shared[ref].frame = pfn; 205.16 + shared[ref].frame = 0; 205.17 shared[ref].domid = domid; 205.18 wmb(); 205.19 shared[ref].flags = GTF_accept_transfer; 205.20 @@ -198,10 +198,9 @@ gnttab_grant_foreign_transfer(domid_t do 205.21 } 205.22 205.23 void 205.24 -gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid, 205.25 - unsigned long pfn) 205.26 +gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid) 205.27 { 205.28 - shared[ref].frame = pfn; 205.29 + shared[ref].frame = 0; 205.30 shared[ref].domid = domid; 205.31 wmb(); 205.32 shared[ref].flags = GTF_accept_transfer;
206.1 --- a/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c Thu Sep 22 11:34:14 2005 -0600 206.2 +++ b/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c Thu Sep 22 11:42:01 2005 -0600 206.3 @@ -334,7 +334,7 @@ static void shutdown_handler(struct xenb 206.4 return; 206.5 } 206.6 206.7 - xenbus_write("control", "shutdown", "", O_CREAT); 206.8 + xenbus_write("control", "shutdown", ""); 206.9 206.10 err = xenbus_transaction_end(0); 206.11 if (err == -ETIMEDOUT) {
253.1 --- a/linux-2.6-xen-sparse/drivers/char/tpm/tpm.c Thu Sep 22 11:34:14 2005 -0600 253.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 253.3 @@ -1,627 +0,0 @@ 253.4 -/* 253.5 - * Copyright (C) 2004 IBM Corporation 253.6 - * 253.7 - * Authors: 253.8 - * Leendert van Doorn <leendert@watson.ibm.com> 253.9 - * Dave Safford <safford@watson.ibm.com> 253.10 - * Reiner Sailer <sailer@watson.ibm.com> 253.11 - * Kylene Hall <kjhall@us.ibm.com> 253.12 - * 253.13 - * Maintained by: <tpmdd_devel@lists.sourceforge.net> 253.14 - * 253.15 - * Device driver for TCG/TCPA TPM (trusted platform module). 253.16 - * Specifications at www.trustedcomputinggroup.org 253.17 - * 253.18 - * This program is free software; you can redistribute it and/or 253.19 - * modify it under the terms of the GNU General Public License as 253.20 - * published by the Free Software Foundation, version 2 of the 253.21 - * License. 253.22 - * 253.23 - * Note, the TPM chip is not interrupt driven (only polling) 253.24 - * and can have very long timeouts (minutes!). Hence the unusual 253.25 - * calls to schedule_timeout. 253.26 - * 253.27 - */ 253.28 - 253.29 -#include <linux/sched.h> 253.30 -#include <linux/poll.h> 253.31 -#include <linux/spinlock.h> 253.32 -#include "tpm.h" 253.33 - 253.34 -#define TPM_MINOR 224 /* officially assigned */ 253.35 - 253.36 -#define TPM_BUFSIZE 2048 253.37 - 253.38 -static LIST_HEAD(tpm_chip_list); 253.39 -static DEFINE_SPINLOCK(driver_lock); 253.40 -static int dev_mask[32]; 253.41 - 253.42 -static void user_reader_timeout(unsigned long ptr) 253.43 -{ 253.44 - struct tpm_chip *chip = (struct tpm_chip *) ptr; 253.45 - 253.46 - down(&chip->buffer_mutex); 253.47 - atomic_set(&chip->data_pending, 0); 253.48 - memset(chip->data_buffer, 0, TPM_BUFSIZE); 253.49 - up(&chip->buffer_mutex); 253.50 -} 253.51 - 253.52 -void tpm_time_expired(unsigned long ptr) 253.53 -{ 253.54 - int *exp = (int *) ptr; 253.55 - *exp = 1; 253.56 -} 253.57 - 253.58 -EXPORT_SYMBOL_GPL(tpm_time_expired); 253.59 - 253.60 -/* 253.61 - * Internal kernel interface to transmit TPM commands 253.62 - */ 253.63 -static ssize_t tpm_transmit(struct tpm_chip *chip, const char *buf, 253.64 - size_t bufsiz) 253.65 -{ 253.66 - ssize_t len; 253.67 - u32 count; 253.68 - __be32 *native_size; 253.69 - 253.70 - native_size = (__force __be32 *) (buf + 2); 253.71 - count = be32_to_cpu(*native_size); 253.72 - 253.73 - if (count == 0) 253.74 - return -ENODATA; 253.75 - if (count > bufsiz) { 253.76 - dev_err(&chip->pci_dev->dev, 253.77 - "invalid count value %x %zx \n", count, bufsiz); 253.78 - return -E2BIG; 253.79 - } 253.80 - 253.81 - down(&chip->tpm_mutex); 253.82 - 253.83 - if ((len = chip->vendor->send(chip, (u8 *) buf, count)) < 0) { 253.84 - dev_err(&chip->pci_dev->dev, 253.85 - "tpm_transmit: tpm_send: error %zd\n", len); 253.86 - return len; 253.87 - } 253.88 - 253.89 - down(&chip->timer_manipulation_mutex); 253.90 - chip->time_expired = 0; 253.91 - init_timer(&chip->device_timer); 253.92 - chip->device_timer.function = tpm_time_expired; 253.93 - chip->device_timer.expires = jiffies + 2 * 60 * HZ; 253.94 - chip->device_timer.data = (unsigned long) &chip->time_expired; 253.95 - add_timer(&chip->device_timer); 253.96 - up(&chip->timer_manipulation_mutex); 253.97 - 253.98 - do { 253.99 - u8 status = inb(chip->vendor->base + 1); 253.100 - if ((status & chip->vendor->req_complete_mask) == 253.101 - chip->vendor->req_complete_val) { 253.102 - down(&chip->timer_manipulation_mutex); 253.103 - del_singleshot_timer_sync(&chip->device_timer); 253.104 - 
up(&chip->timer_manipulation_mutex); 253.105 - goto out_recv; 253.106 - } 253.107 - set_current_state(TASK_UNINTERRUPTIBLE); 253.108 - schedule_timeout(TPM_TIMEOUT); 253.109 - rmb(); 253.110 - } while (!chip->time_expired); 253.111 - 253.112 - 253.113 - chip->vendor->cancel(chip); 253.114 - dev_err(&chip->pci_dev->dev, "Time expired\n"); 253.115 - up(&chip->tpm_mutex); 253.116 - return -EIO; 253.117 - 253.118 -out_recv: 253.119 - len = chip->vendor->recv(chip, (u8 *) buf, bufsiz); 253.120 - if (len < 0) 253.121 - dev_err(&chip->pci_dev->dev, 253.122 - "tpm_transmit: tpm_recv: error %zd\n", len); 253.123 - up(&chip->tpm_mutex); 253.124 - return len; 253.125 -} 253.126 - 253.127 -#define TPM_DIGEST_SIZE 20 253.128 -#define CAP_PCR_RESULT_SIZE 18 253.129 -static u8 cap_pcr[] = { 253.130 - 0, 193, /* TPM_TAG_RQU_COMMAND */ 253.131 - 0, 0, 0, 22, /* length */ 253.132 - 0, 0, 0, 101, /* TPM_ORD_GetCapability */ 253.133 - 0, 0, 0, 5, 253.134 - 0, 0, 0, 4, 253.135 - 0, 0, 1, 1 253.136 -}; 253.137 - 253.138 -#define READ_PCR_RESULT_SIZE 30 253.139 -static u8 pcrread[] = { 253.140 - 0, 193, /* TPM_TAG_RQU_COMMAND */ 253.141 - 0, 0, 0, 14, /* length */ 253.142 - 0, 0, 0, 21, /* TPM_ORD_PcrRead */ 253.143 - 0, 0, 0, 0 /* PCR index */ 253.144 -}; 253.145 - 253.146 -static ssize_t show_pcrs(struct device *dev, char *buf) 253.147 -{ 253.148 - u8 data[READ_PCR_RESULT_SIZE]; 253.149 - ssize_t len; 253.150 - int i, j, index, num_pcrs; 253.151 - char *str = buf; 253.152 - 253.153 - struct tpm_chip *chip = 253.154 - pci_get_drvdata(container_of(dev, struct pci_dev, dev)); 253.155 - if (chip == NULL) 253.156 - return -ENODEV; 253.157 - 253.158 - memcpy(data, cap_pcr, sizeof(cap_pcr)); 253.159 - if ((len = tpm_transmit(chip, data, sizeof(data))) 253.160 - < CAP_PCR_RESULT_SIZE) 253.161 - return len; 253.162 - 253.163 - num_pcrs = be32_to_cpu(*((__force __be32 *) (data + 14))); 253.164 - 253.165 - for (i = 0; i < num_pcrs; i++) { 253.166 - memcpy(data, pcrread, sizeof(pcrread)); 253.167 - index = cpu_to_be32(i); 253.168 - memcpy(data + 10, &index, 4); 253.169 - if ((len = tpm_transmit(chip, data, sizeof(data))) 253.170 - < READ_PCR_RESULT_SIZE) 253.171 - return len; 253.172 - str += sprintf(str, "PCR-%02d: ", i); 253.173 - for (j = 0; j < TPM_DIGEST_SIZE; j++) 253.174 - str += sprintf(str, "%02X ", *(data + 10 + j)); 253.175 - str += sprintf(str, "\n"); 253.176 - } 253.177 - return str - buf; 253.178 -} 253.179 - 253.180 -static DEVICE_ATTR(pcrs, S_IRUGO, show_pcrs, NULL); 253.181 - 253.182 -#define READ_PUBEK_RESULT_SIZE 314 253.183 -static u8 readpubek[] = { 253.184 - 0, 193, /* TPM_TAG_RQU_COMMAND */ 253.185 - 0, 0, 0, 30, /* length */ 253.186 - 0, 0, 0, 124, /* TPM_ORD_ReadPubek */ 253.187 -}; 253.188 - 253.189 -static ssize_t show_pubek(struct device *dev, char *buf) 253.190 -{ 253.191 - u8 data[READ_PUBEK_RESULT_SIZE]; 253.192 - ssize_t len; 253.193 - __be32 *native_val; 253.194 - int i; 253.195 - char *str = buf; 253.196 - 253.197 - struct tpm_chip *chip = 253.198 - pci_get_drvdata(container_of(dev, struct pci_dev, dev)); 253.199 - if (chip == NULL) 253.200 - return -ENODEV; 253.201 - 253.202 - memcpy(data, readpubek, sizeof(readpubek)); 253.203 - memset(data + sizeof(readpubek), 0, 20); /* zero nonce */ 253.204 - 253.205 - if ((len = tpm_transmit(chip, data, sizeof(data))) < 253.206 - READ_PUBEK_RESULT_SIZE) 253.207 - return len; 253.208 - 253.209 - /* 253.210 - ignore header 10 bytes 253.211 - algorithm 32 bits (1 == RSA ) 253.212 - encscheme 16 bits 253.213 - sigscheme 16 bits 253.214 - parameters (RSA 
12->bytes: keybit, #primes, expbit) 253.215 - keylenbytes 32 bits 253.216 - 256 byte modulus 253.217 - ignore checksum 20 bytes 253.218 - */ 253.219 - 253.220 - native_val = (__force __be32 *) (data + 34); 253.221 - 253.222 - str += 253.223 - sprintf(str, 253.224 - "Algorithm: %02X %02X %02X %02X\nEncscheme: %02X %02X\n" 253.225 - "Sigscheme: %02X %02X\nParameters: %02X %02X %02X %02X" 253.226 - " %02X %02X %02X %02X %02X %02X %02X %02X\n" 253.227 - "Modulus length: %d\nModulus: \n", 253.228 - data[10], data[11], data[12], data[13], data[14], 253.229 - data[15], data[16], data[17], data[22], data[23], 253.230 - data[24], data[25], data[26], data[27], data[28], 253.231 - data[29], data[30], data[31], data[32], data[33], 253.232 - be32_to_cpu(*native_val) 253.233 - ); 253.234 - 253.235 - for (i = 0; i < 256; i++) { 253.236 - str += sprintf(str, "%02X ", data[i + 39]); 253.237 - if ((i + 1) % 16 == 0) 253.238 - str += sprintf(str, "\n"); 253.239 - } 253.240 - return str - buf; 253.241 -} 253.242 - 253.243 -static DEVICE_ATTR(pubek, S_IRUGO, show_pubek, NULL); 253.244 - 253.245 -#define CAP_VER_RESULT_SIZE 18 253.246 -static u8 cap_version[] = { 253.247 - 0, 193, /* TPM_TAG_RQU_COMMAND */ 253.248 - 0, 0, 0, 18, /* length */ 253.249 - 0, 0, 0, 101, /* TPM_ORD_GetCapability */ 253.250 - 0, 0, 0, 6, 253.251 - 0, 0, 0, 0 253.252 -}; 253.253 - 253.254 -#define CAP_MANUFACTURER_RESULT_SIZE 18 253.255 -static u8 cap_manufacturer[] = { 253.256 - 0, 193, /* TPM_TAG_RQU_COMMAND */ 253.257 - 0, 0, 0, 22, /* length */ 253.258 - 0, 0, 0, 101, /* TPM_ORD_GetCapability */ 253.259 - 0, 0, 0, 5, 253.260 - 0, 0, 0, 4, 253.261 - 0, 0, 1, 3 253.262 -}; 253.263 - 253.264 -static ssize_t show_caps(struct device *dev, char *buf) 253.265 -{ 253.266 - u8 data[READ_PUBEK_RESULT_SIZE]; 253.267 - ssize_t len; 253.268 - char *str = buf; 253.269 - 253.270 - struct tpm_chip *chip = 253.271 - pci_get_drvdata(container_of(dev, struct pci_dev, dev)); 253.272 - if (chip == NULL) 253.273 - return -ENODEV; 253.274 - 253.275 - memcpy(data, cap_manufacturer, sizeof(cap_manufacturer)); 253.276 - 253.277 - if ((len = tpm_transmit(chip, data, sizeof(data))) < 253.278 - CAP_MANUFACTURER_RESULT_SIZE) 253.279 - return len; 253.280 - 253.281 - str += sprintf(str, "Manufacturer: 0x%x\n", 253.282 - be32_to_cpu(*(data + 14))); 253.283 - 253.284 - memcpy(data, cap_version, sizeof(cap_version)); 253.285 - 253.286 - if ((len = tpm_transmit(chip, data, sizeof(data))) < 253.287 - CAP_VER_RESULT_SIZE) 253.288 - return len; 253.289 - 253.290 - str += 253.291 - sprintf(str, "TCG version: %d.%d\nFirmware version: %d.%d\n", 253.292 - (int) data[14], (int) data[15], (int) data[16], 253.293 - (int) data[17]); 253.294 - 253.295 - return str - buf; 253.296 -} 253.297 - 253.298 -static DEVICE_ATTR(caps, S_IRUGO, show_caps, NULL); 253.299 - 253.300 -/* 253.301 - * Device file system interface to the TPM 253.302 - */ 253.303 -int tpm_open(struct inode *inode, struct file *file) 253.304 -{ 253.305 - int rc = 0, minor = iminor(inode); 253.306 - struct tpm_chip *chip = NULL, *pos; 253.307 - 253.308 - spin_lock(&driver_lock); 253.309 - 253.310 - list_for_each_entry(pos, &tpm_chip_list, list) { 253.311 - if (pos->vendor->miscdev.minor == minor) { 253.312 - chip = pos; 253.313 - break; 253.314 - } 253.315 - } 253.316 - 253.317 - if (chip == NULL) { 253.318 - rc = -ENODEV; 253.319 - goto err_out; 253.320 - } 253.321 - 253.322 - if (chip->num_opens) { 253.323 - dev_dbg(&chip->pci_dev->dev, 253.324 - "Another process owns this TPM\n"); 253.325 - rc = -EBUSY; 253.326 
- goto err_out; 253.327 - } 253.328 - 253.329 - chip->num_opens++; 253.330 - pci_dev_get(chip->pci_dev); 253.331 - 253.332 - spin_unlock(&driver_lock); 253.333 - 253.334 - chip->data_buffer = kmalloc(TPM_BUFSIZE * sizeof(u8), GFP_KERNEL); 253.335 - if (chip->data_buffer == NULL) { 253.336 - chip->num_opens--; 253.337 - pci_dev_put(chip->pci_dev); 253.338 - return -ENOMEM; 253.339 - } 253.340 - 253.341 - atomic_set(&chip->data_pending, 0); 253.342 - 253.343 - file->private_data = chip; 253.344 - return 0; 253.345 - 253.346 -err_out: 253.347 - spin_unlock(&driver_lock); 253.348 - return rc; 253.349 -} 253.350 - 253.351 -EXPORT_SYMBOL_GPL(tpm_open); 253.352 - 253.353 -int tpm_release(struct inode *inode, struct file *file) 253.354 -{ 253.355 - struct tpm_chip *chip = file->private_data; 253.356 - 253.357 - file->private_data = NULL; 253.358 - 253.359 - spin_lock(&driver_lock); 253.360 - chip->num_opens--; 253.361 - spin_unlock(&driver_lock); 253.362 - 253.363 - down(&chip->timer_manipulation_mutex); 253.364 - if (timer_pending(&chip->user_read_timer)) 253.365 - del_singleshot_timer_sync(&chip->user_read_timer); 253.366 - else if (timer_pending(&chip->device_timer)) 253.367 - del_singleshot_timer_sync(&chip->device_timer); 253.368 - up(&chip->timer_manipulation_mutex); 253.369 - 253.370 - kfree(chip->data_buffer); 253.371 - atomic_set(&chip->data_pending, 0); 253.372 - 253.373 - pci_dev_put(chip->pci_dev); 253.374 - return 0; 253.375 -} 253.376 - 253.377 -EXPORT_SYMBOL_GPL(tpm_release); 253.378 - 253.379 -ssize_t tpm_write(struct file * file, const char __user * buf, 253.380 - size_t size, loff_t * off) 253.381 -{ 253.382 - struct tpm_chip *chip = file->private_data; 253.383 - int in_size = size, out_size; 253.384 - 253.385 - /* cannot perform a write until the read has cleared 253.386 - either via tpm_read or a user_read_timer timeout */ 253.387 - while (atomic_read(&chip->data_pending) != 0) { 253.388 - set_current_state(TASK_UNINTERRUPTIBLE); 253.389 - schedule_timeout(TPM_TIMEOUT); 253.390 - } 253.391 - 253.392 - down(&chip->buffer_mutex); 253.393 - 253.394 - if (in_size > TPM_BUFSIZE) 253.395 - in_size = TPM_BUFSIZE; 253.396 - 253.397 - if (copy_from_user 253.398 - (chip->data_buffer, (void __user *) buf, in_size)) { 253.399 - up(&chip->buffer_mutex); 253.400 - return -EFAULT; 253.401 - } 253.402 - 253.403 - /* atomic tpm command send and result receive */ 253.404 - out_size = tpm_transmit(chip, chip->data_buffer, TPM_BUFSIZE); 253.405 - 253.406 - atomic_set(&chip->data_pending, out_size); 253.407 - atomic_set(&chip->data_position, 0); 253.408 - up(&chip->buffer_mutex); 253.409 - 253.410 - /* Set a timeout by which the reader must come claim the result */ 253.411 - down(&chip->timer_manipulation_mutex); 253.412 - init_timer(&chip->user_read_timer); 253.413 - chip->user_read_timer.function = user_reader_timeout; 253.414 - chip->user_read_timer.data = (unsigned long) chip; 253.415 - chip->user_read_timer.expires = jiffies + (60 * HZ); 253.416 - add_timer(&chip->user_read_timer); 253.417 - up(&chip->timer_manipulation_mutex); 253.418 - 253.419 - return in_size; 253.420 -} 253.421 - 253.422 -EXPORT_SYMBOL_GPL(tpm_write); 253.423 - 253.424 -ssize_t tpm_read(struct file * file, char __user * buf, 253.425 - size_t size, loff_t * off) 253.426 -{ 253.427 - struct tpm_chip *chip = file->private_data; 253.428 - int ret_size = -ENODATA; 253.429 - int pos, pending = 0; 253.430 - 253.431 - down(&chip->buffer_mutex); 253.432 - ret_size = atomic_read(&chip->data_pending); 253.433 - if ( ret_size > 0 ) { 
/* Result available */ 253.434 - if (size < ret_size) 253.435 - ret_size = size; 253.436 - 253.437 - pos = atomic_read(&chip->data_position); 253.438 - 253.439 - if (copy_to_user((void __user *) buf, 253.440 - &chip->data_buffer[pos], ret_size)) { 253.441 - ret_size = -EFAULT; 253.442 - } else { 253.443 - pending = atomic_read(&chip->data_pending) - ret_size; 253.444 - if ( pending ) { 253.445 - atomic_set( &chip->data_pending, pending ); 253.446 - atomic_set( &chip->data_position, pos+ret_size ); 253.447 - } 253.448 - } 253.449 - } 253.450 - up(&chip->buffer_mutex); 253.451 - 253.452 - if ( ret_size <= 0 || pending == 0 ) { 253.453 - atomic_set( &chip->data_pending, 0 ); 253.454 - down(&chip->timer_manipulation_mutex); 253.455 - del_singleshot_timer_sync(&chip->user_read_timer); 253.456 - up(&chip->timer_manipulation_mutex); 253.457 - } 253.458 - 253.459 - return ret_size; 253.460 -} 253.461 - 253.462 -EXPORT_SYMBOL_GPL(tpm_read); 253.463 - 253.464 -void __devexit tpm_remove(struct pci_dev *pci_dev) 253.465 -{ 253.466 - struct tpm_chip *chip = pci_get_drvdata(pci_dev); 253.467 - 253.468 - if (chip == NULL) { 253.469 - dev_err(&pci_dev->dev, "No device data found\n"); 253.470 - return; 253.471 - } 253.472 - 253.473 - spin_lock(&driver_lock); 253.474 - 253.475 - list_del(&chip->list); 253.476 - 253.477 - spin_unlock(&driver_lock); 253.478 - 253.479 - pci_set_drvdata(pci_dev, NULL); 253.480 - misc_deregister(&chip->vendor->miscdev); 253.481 - 253.482 - device_remove_file(&pci_dev->dev, &dev_attr_pubek); 253.483 - device_remove_file(&pci_dev->dev, &dev_attr_pcrs); 253.484 - device_remove_file(&pci_dev->dev, &dev_attr_caps); 253.485 - 253.486 - pci_disable_device(pci_dev); 253.487 - 253.488 - dev_mask[chip->dev_num / 32] &= !(1 << (chip->dev_num % 32)); 253.489 - 253.490 - kfree(chip); 253.491 - 253.492 - pci_dev_put(pci_dev); 253.493 -} 253.494 - 253.495 -EXPORT_SYMBOL_GPL(tpm_remove); 253.496 - 253.497 -static u8 savestate[] = { 253.498 - 0, 193, /* TPM_TAG_RQU_COMMAND */ 253.499 - 0, 0, 0, 10, /* blob length (in bytes) */ 253.500 - 0, 0, 0, 152 /* TPM_ORD_SaveState */ 253.501 -}; 253.502 - 253.503 -/* 253.504 - * We are about to suspend. Save the TPM state 253.505 - * so that it can be restored. 253.506 - */ 253.507 -int tpm_pm_suspend(struct pci_dev *pci_dev, pm_message_t pm_state) 253.508 -{ 253.509 - struct tpm_chip *chip = pci_get_drvdata(pci_dev); 253.510 - if (chip == NULL) 253.511 - return -ENODEV; 253.512 - 253.513 - tpm_transmit(chip, savestate, sizeof(savestate)); 253.514 - return 0; 253.515 -} 253.516 - 253.517 -EXPORT_SYMBOL_GPL(tpm_pm_suspend); 253.518 - 253.519 -/* 253.520 - * Resume from a power safe. The BIOS already restored 253.521 - * the TPM state. 253.522 - */ 253.523 -int tpm_pm_resume(struct pci_dev *pci_dev) 253.524 -{ 253.525 - struct tpm_chip *chip = pci_get_drvdata(pci_dev); 253.526 - 253.527 - if (chip == NULL) 253.528 - return -ENODEV; 253.529 - 253.530 - return 0; 253.531 -} 253.532 - 253.533 -EXPORT_SYMBOL_GPL(tpm_pm_resume); 253.534 - 253.535 -/* 253.536 - * Called from tpm_<specific>.c probe function only for devices 253.537