direct-io.hg

view xen/include/public/xen.h @ 8686:c0a0f4db5ab1

Create a block of reserved PFNs in shadow translate mode guests, and
move the shared info and grant table pfns into that block. This
allows us to remove the get_gnttablist dom0 op, and simplifies the
domain creation code slightly. Having the reserved block managed by
Xen may also make it slightly easier to handle the case where the
grant table needs to be extended at run time.

Suggested-by: kaf24
Signed-off-by: Steven Smith, sos22@cam.ac.uk
author sos22@douglas.cl.cam.ac.uk
date Thu Jan 26 19:40:13 2006 +0100 (2006-01-26)
parents 06ab200a9e23
children 5a9efc35feb2
line source
1 /******************************************************************************
2 * xen.h
3 *
4 * Guest OS interface to Xen.
5 *
6 * Copyright (c) 2004, K A Fraser
7 */
9 #ifndef __XEN_PUBLIC_XEN_H__
10 #define __XEN_PUBLIC_XEN_H__
12 #if defined(__i386__)
13 #include "arch-x86_32.h"
14 #elif defined(__x86_64__)
15 #include "arch-x86_64.h"
16 #elif defined(__ia64__)
17 #include "arch-ia64.h"
18 #else
19 #error "Unsupported architecture"
20 #endif
22 /*
23 * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS).
24 */
26 /*
27 * x86_32: EAX = vector; EBX, ECX, EDX, ESI, EDI = args 1, 2, 3, 4, 5.
28 * EAX = return value
29 * (argument registers may be clobbered on return)
30 * x86_64: RAX = vector; RDI, RSI, RDX, R10, R8, R9 = args 1, 2, 3, 4, 5, 6.
31 * RAX = return value
32 * (argument registers not clobbered on return; RCX, R11 are)
33 */
34 #define __HYPERVISOR_set_trap_table 0
35 #define __HYPERVISOR_mmu_update 1
36 #define __HYPERVISOR_set_gdt 2
37 #define __HYPERVISOR_stack_switch 3
38 #define __HYPERVISOR_set_callbacks 4
39 #define __HYPERVISOR_fpu_taskswitch 5
40 #define __HYPERVISOR_sched_op 6
41 #define __HYPERVISOR_dom0_op 7
42 #define __HYPERVISOR_set_debugreg 8
43 #define __HYPERVISOR_get_debugreg 9
44 #define __HYPERVISOR_update_descriptor 10
45 #define __HYPERVISOR_memory_op 12
46 #define __HYPERVISOR_multicall 13
47 #define __HYPERVISOR_update_va_mapping 14
48 #define __HYPERVISOR_set_timer_op 15
49 #define __HYPERVISOR_event_channel_op 16
50 #define __HYPERVISOR_xen_version 17
51 #define __HYPERVISOR_console_io 18
52 #define __HYPERVISOR_physdev_op 19
53 #define __HYPERVISOR_grant_table_op 20
54 #define __HYPERVISOR_vm_assist 21
55 #define __HYPERVISOR_update_va_mapping_otherdomain 22
56 #define __HYPERVISOR_iret 23 /* x86 only */
57 #define __HYPERVISOR_switch_vm86 23 /* x86/32 only (obsolete name) */
58 #define __HYPERVISOR_switch_to_user 23 /* x86/64 only (obsolete name) */
59 #define __HYPERVISOR_vcpu_op 24
60 #define __HYPERVISOR_set_segment_base 25 /* x86/64 only */
61 #define __HYPERVISOR_mmuext_op 26
62 #define __HYPERVISOR_acm_op 27
63 #define __HYPERVISOR_nmi_op 28
65 /*
66 * VIRTUAL INTERRUPTS
67 *
68 * Virtual interrupts that a guest OS may receive from Xen.
69 */
70 #define VIRQ_TIMER 0 /* Timebase update, and/or requested timeout. */
71 #define VIRQ_DEBUG 1 /* Request guest to dump debug info. */
72 #define VIRQ_CONSOLE 2 /* (DOM0) Bytes received on emergency console. */
73 #define VIRQ_DOM_EXC 3 /* (DOM0) Exceptional event for some domain. */
74 #define VIRQ_DEBUGGER 6 /* (DOM0) A domain has paused for debugging. */
75 #define NR_VIRQS 8
77 /*
78 * MMU-UPDATE REQUESTS
79 *
80 * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs.
81 * A foreigndom (FD) can be specified (or DOMID_SELF for none).
82 * Where the FD has some effect, it is described below.
83 * ptr[1:0] specifies the appropriate MMU_* command.
84 *
85 * ptr[1:0] == MMU_NORMAL_PT_UPDATE:
86 * Updates an entry in a page table. If updating an L1 table, and the new
87 * table entry is valid/present, the mapped frame must belong to the FD, if
88 * an FD has been specified. If attempting to map an I/O page then the
89 * caller assumes the privilege of the FD.
90 * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller.
91 * FD == DOMID_XEN: Map restricted areas of Xen's heap space.
92 * ptr[:2] -- Machine address of the page-table entry to modify.
93 * val -- Value to write.
94 *
95 * ptr[1:0] == MMU_MACHPHYS_UPDATE:
96 * Updates an entry in the machine->pseudo-physical mapping table.
97 * ptr[:2] -- Machine address within the frame whose mapping to modify.
98 * The frame must belong to the FD, if one is specified.
99 * val -- Value to write into the mapping entry.
100 */
101 #define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */
102 #define MMU_MACHPHYS_UPDATE 1 /* ptr = MA of frame to modify entry for */
104 /*
105 * MMU EXTENDED OPERATIONS
106 *
107 * HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures.
108 * A foreigndom (FD) can be specified (or DOMID_SELF for none).
109 * Where the FD has some effect, it is described below.
110 *
111 * cmd: MMUEXT_(UN)PIN_*_TABLE
112 * mfn: Machine frame number to be (un)pinned as a p.t. page.
113 * The frame must belong to the FD, if one is specified.
114 *
115 * cmd: MMUEXT_NEW_BASEPTR
116 * mfn: Machine frame number of new page-table base to install in MMU.
117 *
118 * cmd: MMUEXT_NEW_USER_BASEPTR [x86/64 only]
119 * mfn: Machine frame number of new page-table base to install in MMU
120 * when in user space.
121 *
122 * cmd: MMUEXT_TLB_FLUSH_LOCAL
123 * No additional arguments. Flushes local TLB.
124 *
125 * cmd: MMUEXT_INVLPG_LOCAL
126 * linear_addr: Linear address to be flushed from the local TLB.
127 *
128 * cmd: MMUEXT_TLB_FLUSH_MULTI
129 * vcpumask: Pointer to bitmap of VCPUs to be flushed.
130 *
131 * cmd: MMUEXT_INVLPG_MULTI
132 * linear_addr: Linear address to be flushed.
133 * vcpumask: Pointer to bitmap of VCPUs to be flushed.
134 *
135 * cmd: MMUEXT_TLB_FLUSH_ALL
136 * No additional arguments. Flushes all VCPUs' TLBs.
137 *
138 * cmd: MMUEXT_INVLPG_ALL
139 * linear_addr: Linear address to be flushed from all VCPUs' TLBs.
140 *
141 * cmd: MMUEXT_FLUSH_CACHE
142 * No additional arguments. Writes back and flushes cache contents.
143 *
144 * cmd: MMUEXT_SET_LDT
145 * linear_addr: Linear address of LDT base (NB. must be page-aligned).
146 * nr_ents: Number of entries in LDT.
147 *
148 * cmd: MMUEXT_PFN_HOLE_BASE
149 * No additional arguments. Returns the first pfn in the Xen-reserved
150 * pfn hole. Note that we delay allocating the hole until the first
151 * time this is called.
152 *
153 * cmd: MMUEXT_PFN_HOLE_SIZE
154 * No additional arguments. Returns the number of pfns in the
155 * Xen-reserved pfn hole.
156 */
157 #define MMUEXT_PIN_L1_TABLE 0
158 #define MMUEXT_PIN_L2_TABLE 1
159 #define MMUEXT_PIN_L3_TABLE 2
160 #define MMUEXT_PIN_L4_TABLE 3
161 #define MMUEXT_UNPIN_TABLE 4
162 #define MMUEXT_NEW_BASEPTR 5
163 #define MMUEXT_TLB_FLUSH_LOCAL 6
164 #define MMUEXT_INVLPG_LOCAL 7
165 #define MMUEXT_TLB_FLUSH_MULTI 8
166 #define MMUEXT_INVLPG_MULTI 9
167 #define MMUEXT_TLB_FLUSH_ALL 10
168 #define MMUEXT_INVLPG_ALL 11
169 #define MMUEXT_FLUSH_CACHE 12
170 #define MMUEXT_SET_LDT 13
171 #define MMUEXT_NEW_USER_BASEPTR 15
172 #define MMUEXT_PFN_HOLE_BASE 16
173 #define MMUEXT_PFN_HOLE_SIZE 17
175 #ifndef __ASSEMBLY__
176 struct mmuext_op {
177 unsigned int cmd; /* => MMUEXT_* command codes above. */
178 union {
179 /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
180 unsigned long mfn;
181 /* INVLPG_LOCAL, INVLPG_MULTI, INVLPG_ALL, SET_LDT */
182 unsigned long linear_addr;
183 } arg1;
184 union {
185 /* SET_LDT */
186 unsigned int nr_ents;
187 /* TLB_FLUSH_MULTI, INVLPG_MULTI */
188 void *vcpumask;
189 } arg2;
190 };
191 #endif
193 /* These are passed as 'flags' to update_va_mapping. They can be ORed. */
194 /* When specifying UVMF_MULTI, also OR in a pointer to a CPU bitmap. */
195 /* UVMF_LOCAL is merely UVMF_MULTI with a NULL bitmap pointer. */
196 #define UVMF_NONE (0UL<<0) /* No flushing at all. */
197 #define UVMF_TLB_FLUSH (1UL<<0) /* Flush entire TLB(s). */
198 #define UVMF_INVLPG (2UL<<0) /* Flush only one entry. */
199 #define UVMF_FLUSHTYPE_MASK (3UL<<0)
200 #define UVMF_MULTI (0UL<<2) /* Flush subset of TLBs. */
201 #define UVMF_LOCAL (0UL<<2) /* Flush local TLB. */
202 #define UVMF_ALL (1UL<<2) /* Flush all TLBs. */
204 /*
205 * Commands to HYPERVISOR_console_io().
206 */
207 #define CONSOLEIO_write 0
208 #define CONSOLEIO_read 1
210 /*
211 * Commands to HYPERVISOR_vm_assist().
212 */
213 #define VMASST_CMD_enable 0
214 #define VMASST_CMD_disable 1
215 #define VMASST_TYPE_4gb_segments 0
216 #define VMASST_TYPE_4gb_segments_notify 1
217 #define VMASST_TYPE_writable_pagetables 2
218 #define MAX_VMASST_TYPE 2
220 #ifndef __ASSEMBLY__
222 typedef uint16_t domid_t;
224 /* Domain ids >= DOMID_FIRST_RESERVED cannot be used for ordinary domains. */
225 #define DOMID_FIRST_RESERVED (0x7FF0U)
227 /* DOMID_SELF is used in certain contexts to refer to oneself. */
228 #define DOMID_SELF (0x7FF0U)
230 /*
231 * DOMID_IO is used to restrict page-table updates to mapping I/O memory.
232 * Although no Foreign Domain need be specified to map I/O pages, DOMID_IO
233 * is useful to ensure that no mappings to the OS's own heap are accidentally
234 * installed. (e.g., in Linux this could cause havoc as reference counts
235 * aren't adjusted on the I/O-mapping code path).
236 * This only makes sense in MMUEXT_SET_FOREIGNDOM, but in that context can
237 * be specified by any calling domain.
238 */
239 #define DOMID_IO (0x7FF1U)
241 /*
242 * DOMID_XEN is used to allow privileged domains to map restricted parts of
243 * Xen's heap space (e.g., the machine_to_phys table).
244 * This only makes sense in MMUEXT_SET_FOREIGNDOM, and is only permitted if
245 * the caller is privileged.
246 */
247 #define DOMID_XEN (0x7FF2U)
249 /*
250 * Send an array of these to HYPERVISOR_mmu_update().
251 * NB. The fields are natural pointer/address size for this architecture.
252 */
253 typedef struct
254 {
255 uint64_t ptr; /* Machine address of PTE; ptr[1:0] encodes the MMU_* command. */
256 uint64_t val; /* New contents of PTE. */
257 } mmu_update_t;
259 /*
260 * Send an array of these to HYPERVISOR_multicall().
261 * NB. The fields are natural register size for this architecture.
262 */
263 typedef struct
264 {
265 unsigned long op, result; /* op: __HYPERVISOR_* vector; result: its return value. */
266 unsigned long args[6]; /* Hypercall arguments (up to 6; see calling convention above). */
267 } multicall_entry_t;
269 /*
270 * Event channel endpoints per domain:
271 * 1024 if a long is 32 bits; 4096 if a long is 64 bits.
272 */
273 #define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
275 typedef struct vcpu_time_info {
276 /*
277 * Updates to the following values are preceded and followed by an
278 * increment of 'version'. The guest can therefore detect updates by
279 * looking for changes to 'version'. If the least-significant bit of
280 * the version number is set then an update is in progress and the guest
281 * must wait to read a consistent set of values.
282 * The correct way to interact with the version number is similar to
283 * Linux's seqlock: see the implementations of read_seqbegin/read_seqretry.
284 */
285 uint32_t version;
286 uint32_t pad0; /* Explicit padding: keeps tsc_timestamp 8-byte aligned. */
287 uint64_t tsc_timestamp; /* TSC at last update of time vals. */
288 uint64_t system_time; /* Time, in nanosecs, since boot. */
289 /*
290 * Current system time:
291 * system_time + ((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul
292 * CPU frequency (Hz):
293 * ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift
294 */
295 uint32_t tsc_to_system_mul;
296 int8_t tsc_shift;
297 int8_t pad1[3]; /* Explicit padding: rounds structure to 32 bytes. */
298 } vcpu_time_info_t; /* 32 bytes */
300 typedef struct vcpu_info {
301 /*
302 * 'evtchn_upcall_pending' is written non-zero by Xen to indicate
303 * a pending notification for a particular VCPU. It is then cleared
304 * by the guest OS /before/ checking for pending work, thus avoiding
305 * a set-and-check race. Note that the mask is only accessed by Xen
306 * on the CPU that is currently hosting the VCPU. This means that the
307 * pending and mask flags can be updated by the guest without special
308 * synchronisation (i.e., no need for the x86 LOCK prefix).
309 * This may seem suboptimal because if the pending flag is set by
310 * a different CPU then an IPI may be scheduled even when the mask
311 * is set. However, note:
312 * 1. The task of 'interrupt holdoff' is covered by the per-event-
313 * channel mask bits. A 'noisy' event that is continually being
314 * triggered can be masked at source at this very precise
315 * granularity.
316 * 2. The main purpose of the per-VCPU mask is therefore to restrict
317 * reentrant execution: whether for concurrency control, or to
318 * prevent unbounded stack usage. Whatever the purpose, we expect
319 * that the mask will be asserted only for short periods at a time,
320 * and so the likelihood of a 'spurious' IPI is suitably small.
321 * The mask is read before making an event upcall to the guest: a
322 * non-zero mask therefore guarantees that the VCPU will not receive
323 * an upcall activation. The mask is cleared when the VCPU requests
324 * to block: this avoids wakeup-waiting races.
325 */
326 uint8_t evtchn_upcall_pending;
327 uint8_t evtchn_upcall_mask;
328 unsigned long evtchn_pending_sel; /* Selector: each bit covers one 'C long' of shared_info's evtchn_pending[]. */
329 arch_vcpu_info_t arch;
330 vcpu_time_info_t time;
331 } vcpu_info_t; /* 64 bytes (x86) */
333 /*
334 * Xen/kernel shared data -- pointer provided in start_info.
335 * NB. We expect that this struct is smaller than a page.
336 */
337 typedef struct shared_info {
338 vcpu_info_t vcpu_info[MAX_VIRT_CPUS];
340 /*
341 * A domain can create "event channels" on which it can send and receive
342 * asynchronous event notifications. There are three classes of event that
343 * are delivered by this mechanism:
344 * 1. Bi-directional inter- and intra-domain connections. Domains must
345 * arrange out-of-band to set up a connection (usually by allocating
346 * an unbound 'listener' port and advertising that via a storage service
347 * such as xenstore).
348 * 2. Physical interrupts. A domain with suitable hardware-access
349 * privileges can bind an event-channel port to a physical interrupt
350 * source.
351 * 3. Virtual interrupts ('events'). A domain can bind an event-channel
352 * port to a virtual interrupt source, such as the virtual-timer
353 * device or the emergency console.
354 *
355 * Event channels are addressed by a "port index". Each channel is
356 * associated with two bits of information:
357 * 1. PENDING -- notifies the domain that there is a pending notification
358 * to be processed. This bit is cleared by the guest.
359 * 2. MASK -- if this bit is clear then a 0->1 transition of PENDING
360 * will cause an asynchronous upcall to be scheduled. This bit is only
361 * updated by the guest. It is read-only within Xen. If a channel
362 * becomes pending while the channel is masked then the 'edge' is lost
363 * (i.e., when the channel is unmasked, the guest must manually handle
364 * pending notifications as no upcall will be scheduled by Xen).
365 *
366 * To expedite scanning of pending notifications, any 0->1 pending
367 * transition on an unmasked channel causes a corresponding bit in a
368 * per-vcpu selector word to be set. Each bit in the selector covers a
369 * 'C long' in the PENDING bitfield array.
370 */
371 unsigned long evtchn_pending[sizeof(unsigned long) * 8];
372 unsigned long evtchn_mask[sizeof(unsigned long) * 8];
374 /*
375 * Wallclock time: updated only by control software. Guests should base
376 * their gettimeofday() syscall on this wallclock-base value.
377 */
378 uint32_t wc_version; /* Version counter: see vcpu_time_info_t. */
379 uint32_t wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */
380 uint32_t wc_nsec; /* Nsecs 00:00:00 UTC, Jan 1, 1970. */
382 arch_shared_info_t arch;
384 } shared_info_t;
386 /*
387 * Start-of-day memory layout for the initial domain (DOM0):
388 * 1. The domain is started within contiguous virtual-memory region.
389 * 2. The contiguous region begins and ends on an aligned 4MB boundary.
390 * 3. The region start corresponds to the load address of the OS image.
391 * If the load address is not 4MB aligned then the address is rounded down.
392 * 4. This is the order of bootstrap elements in the initial virtual region:
393 * a. relocated kernel image
394 * b. initial ram disk [mod_start, mod_len]
395 * c. list of allocated page frames [mfn_list, nr_pages]
396 * d. bootstrap page tables [pt_base, CR3 (x86)]
397 * e. start_info_t structure [register ESI (x86)]
398 * f. bootstrap stack [register ESP (x86)]
399 * 5. Bootstrap elements are packed together, but each is 4kB-aligned.
400 * 6. The initial ram disk may be omitted.
401 * 7. The list of page frames forms a contiguous 'pseudo-physical' memory
402 * layout for the domain. In particular, the bootstrap virtual-memory
403 * region is a 1:1 mapping to the first section of the pseudo-physical map.
404 * 8. All bootstrap elements are mapped read-writable for the guest OS. The
405 * only exception is the bootstrap page table, which is mapped read-only.
406 * 9. There is guaranteed to be at least 512kB padding after the final
407 * bootstrap element. If necessary, the bootstrap virtual region is
408 * extended by an extra 4MB to ensure this.
409 */
411 #define MAX_GUEST_CMDLINE 1024
412 typedef struct start_info {
413 /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME. */
414 char magic[32]; /* "xen-<version>-<platform>". */
415 unsigned long nr_pages; /* Total pages allocated to this domain. */
416 unsigned long shared_info; /* MACHINE address of shared info struct. */
417 uint32_t flags; /* SIF_xxx flags. */
418 unsigned long store_mfn; /* MACHINE page number of shared page. */
419 uint32_t store_evtchn; /* Event channel for store communication. */
420 unsigned long console_mfn; /* MACHINE page number of console page. */
421 uint32_t console_evtchn; /* Event channel for console messages. */
422 /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME). */
423 unsigned long pt_base; /* VIRTUAL address of page directory. */
424 unsigned long nr_pt_frames; /* Number of bootstrap p.t. frames. */
425 unsigned long mfn_list; /* VIRTUAL address of page-frame list. */
426 unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */
427 unsigned long mod_len; /* Size (bytes) of pre-loaded module. */
428 int8_t cmd_line[MAX_GUEST_CMDLINE];
429 } start_info_t;
431 /* These flags are passed in the 'flags' field of start_info_t. */
432 #define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */
433 #define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */
435 typedef uint64_t cpumap_t;
437 typedef uint8_t xen_domain_handle_t[16];
439 /* Turn a plain number into a C unsigned long constant. */
440 #define __mk_unsigned_long(x) x ## UL
441 #define mk_unsigned_long(x) __mk_unsigned_long(x)
443 #else /* __ASSEMBLY__ */
445 /* In assembly code we cannot use C numeric constant suffixes. */
446 #define mk_unsigned_long(x) x
448 #endif /* !__ASSEMBLY__ */
450 #endif /* __XEN_PUBLIC_XEN_H__ */
452 /*
453 * Local variables:
454 * mode: C
455 * c-set-style: "BSD"
456 * c-basic-offset: 4
457 * tab-width: 4
458 * indent-tabs-mode: nil
459 * End:
460 */