direct-io.hg

view tools/libxc/xc_linux_build.c @ 8686:c0a0f4db5ab1

Create a block of reserved PFNs in shadow translate mode guests, and
move the shared info and grant table PFNs into that block. This
allows us to remove the get_gnttablist dom0 op, and simplifies the
domain creation code slightly. Having the reserved block managed by
Xen may also make it easier to handle the case where the grant table
needs to be extended at run time.

Suggested-by: kaf24
Signed-off-by: Steven Smith <sos22@cam.ac.uk>
author sos22@douglas.cl.cam.ac.uk
date Thu Jan 26 19:40:13 2006 +0100 (2006-01-26)
parents edf1fab86618
children a47b7a464f09
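
A sketch of what the reserved block means for callers: instead of a
dedicated dom0 op, the tools can now locate the block with
xc_get_pfn_hole_start(), as setup_guest() does below. The helper here is
hypothetical, and the layout it assumes (shared info at the start of the
hole, with the NR_GRANT_FRAMES grant frames taken to sit directly after
it) is an illustration of the idea, not something this changeset
guarantees:

    /* Hypothetical sketch, not part of this changeset. */
    static int find_reserved_block(int xc_handle, uint32_t dom,
                                   unsigned long *shared_info_pfn,
                                   unsigned long grant_pfns[NR_GRANT_FRAMES])
    {
        unsigned long hole_start, i;

        hole_start = xc_get_pfn_hole_start(xc_handle, dom);
        if ( hole_start == 0 )
            return -1;  /* no PFN hole: not a translated guest */

        *shared_info_pfn = hole_start;          /* start of the hole */
        for ( i = 0; i < NR_GRANT_FRAMES; i++ ) /* assumed to follow it */
            grant_pfns[i] = hole_start + 1 + i;
        return 0;
    }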
line source
/******************************************************************************
 * xc_linux_build.c
 */

#include "xg_private.h"
#include "xc_private.h"
#include <xenctrl.h>

#if defined(__i386__)
#define ELFSIZE 32
#endif

#if defined(__x86_64__) || defined(__ia64__)
#define ELFSIZE 64
#endif

#include "xc_elf.h"
#include "xc_aout9.h"
#include <stdlib.h>
#include <unistd.h>
#include <zlib.h>

#if defined(__i386__)
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT)
#endif

#if defined(__x86_64__)
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#endif

#define NR_GRANT_FRAMES 4

#ifdef __ia64__
#define get_tot_pages xc_get_max_pages
#else
#define get_tot_pages xc_get_tot_pages
#endif

#define round_pgup(_p)   (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
#define round_pgdown(_p) ((_p)&PAGE_MASK)

#ifdef __ia64__
#define probe_aout9(image,image_size,load_funcs) 1
#endif

static int probeimageformat(char *image,
                            unsigned long image_size,
                            struct load_funcs *load_funcs)
{
    if ( probe_elf(image, image_size, load_funcs) &&
         probe_bin(image, image_size, load_funcs) &&
         probe_aout9(image, image_size, load_funcs) )
    {
        ERROR( "Unrecognized image format" );
        return -EINVAL;
    }

    return 0;
}
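
/*
 * alloc_pt() hands back the next unused guest page as a page-table page:
 * it takes the next pseudo-physical frame from the bootstrap region,
 * maps the corresponding machine frame into our address space (unmapping
 * the previously mapped table page, if any), and zeroes it.  ltab
 * receives the machine address of the page, pltab the pseudo-physical
 * address.
 */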
#define alloc_pt(ltab, vltab, pltab)                                    \
do {                                                                    \
    pltab = ppt_alloc++;                                                \
    ltab = (uint64_t)page_array[pltab] << PAGE_SHIFT;                   \
    pltab <<= PAGE_SHIFT;                                               \
    if ( vltab != NULL )                                                \
        munmap(vltab, PAGE_SIZE);                                       \
    if ( (vltab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,       \
                                       PROT_READ|PROT_WRITE,            \
                                       ltab >> PAGE_SHIFT)) == NULL )   \
        goto error_out;                                                 \
    memset(vltab, 0x0, PAGE_SIZE);                                      \
} while ( 0 )
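
/*
 * In shadow translate mode the guest runs on pseudo-physical frame
 * numbers, so the setup_pg_tables* variants below write PFNs (pl1tab,
 * count) into the new page tables and load CR3 with a pseudo-physical
 * address; otherwise they write machine frames.
 */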
#if defined(__i386__)

static int setup_pg_tables(int xc_handle, uint32_t dom,
                           vcpu_guest_context_t *ctxt,
                           unsigned long dsi_v_start,
                           unsigned long v_end,
                           unsigned long *page_array,
                           unsigned long vpt_start,
                           unsigned long vpt_end,
                           unsigned shadow_mode_enabled)
{
    l1_pgentry_t *vl1tab=NULL, *vl1e=NULL;
    l2_pgentry_t *vl2tab=NULL, *vl2e=NULL;
    unsigned long l1tab = 0, pl1tab;
    unsigned long l2tab = 0, pl2tab;
    unsigned long ppt_alloc;
    unsigned long count;

    ppt_alloc = (vpt_start - dsi_v_start) >> PAGE_SHIFT;
    alloc_pt(l2tab, vl2tab, pl2tab);
    vl2e = &vl2tab[l2_table_offset(dsi_v_start)];
    if (shadow_mode_enabled)
        ctxt->ctrlreg[3] = pl2tab;
    else
        ctxt->ctrlreg[3] = l2tab;

    for ( count = 0; count < ((v_end - dsi_v_start) >> PAGE_SHIFT); count++ )
    {
        if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 )
        {
            alloc_pt(l1tab, vl1tab, pl1tab);
            vl1e = &vl1tab[l1_table_offset(dsi_v_start + (count<<PAGE_SHIFT))];
            if (shadow_mode_enabled)
                *vl2e = pl1tab | L2_PROT;
            else
                *vl2e = l1tab | L2_PROT;
            vl2e++;
        }

        if (shadow_mode_enabled) {
            *vl1e = (count << PAGE_SHIFT) | L1_PROT;
        } else {
            *vl1e = (page_array[count] << PAGE_SHIFT) | L1_PROT;
            if ( (count >= ((vpt_start-dsi_v_start)>>PAGE_SHIFT)) &&
                 (count < ((vpt_end -dsi_v_start)>>PAGE_SHIFT)) )
                *vl1e &= ~_PAGE_RW;
        }
        vl1e++;
    }
    munmap(vl1tab, PAGE_SIZE);
    munmap(vl2tab, PAGE_SIZE);
    return 0;

 error_out:
    if (vl1tab)
        munmap(vl1tab, PAGE_SIZE);
    if (vl2tab)
        munmap(vl2tab, PAGE_SIZE);
    return -1;
}

static int setup_pg_tables_pae(int xc_handle, uint32_t dom,
                               vcpu_guest_context_t *ctxt,
                               unsigned long dsi_v_start,
                               unsigned long v_end,
                               unsigned long *page_array,
                               unsigned long vpt_start,
                               unsigned long vpt_end,
                               unsigned shadow_mode_enabled)
{
    l1_pgentry_64_t *vl1tab = NULL, *vl1e = NULL;
    l2_pgentry_64_t *vl2tab = NULL, *vl2e = NULL;
    l3_pgentry_64_t *vl3tab = NULL, *vl3e = NULL;
    uint64_t l1tab, l2tab, l3tab, pl1tab, pl2tab, pl3tab;
    unsigned long ppt_alloc, count, nmfn;

    /* First allocate page for page dir. */
    ppt_alloc = (vpt_start - dsi_v_start) >> PAGE_SHIFT;

    if ( page_array[ppt_alloc] > 0xfffff )
    {
        nmfn = xc_make_page_below_4G(xc_handle, dom, page_array[ppt_alloc]);
        if ( nmfn == 0 )
        {
            fprintf(stderr, "Couldn't get a page below 4GB :-(\n");
            goto error_out;
        }
        page_array[ppt_alloc] = nmfn;
    }

    alloc_pt(l3tab, vl3tab, pl3tab);
    vl3e = &vl3tab[l3_table_offset_pae(dsi_v_start)];
    if (shadow_mode_enabled)
        ctxt->ctrlreg[3] = pl3tab;
    else
        ctxt->ctrlreg[3] = l3tab;

    for ( count = 0; count < ((v_end - dsi_v_start) >> PAGE_SHIFT); count++)
    {
        if ( !((unsigned long)vl1e & (PAGE_SIZE-1)) )
        {
            if ( !((unsigned long)vl2e & (PAGE_SIZE-1)) )
            {
                alloc_pt(l2tab, vl2tab, pl2tab);
                vl2e = &vl2tab[l2_table_offset_pae(
                    dsi_v_start + (count << PAGE_SHIFT))];
                if (shadow_mode_enabled)
                    *vl3e = pl2tab | L3_PROT;
                else
                    *vl3e = l2tab | L3_PROT;
                vl3e++; /* advance to the next L3 slot in both modes */
            }

            alloc_pt(l1tab, vl1tab, pl1tab);
            vl1e = &vl1tab[l1_table_offset_pae(
                dsi_v_start + (count << PAGE_SHIFT))];
            if (shadow_mode_enabled)
                *vl2e = pl1tab | L2_PROT;
            else
                *vl2e = l1tab | L2_PROT;
            vl2e++; /* likewise for the L2 slot */
        }

        if (shadow_mode_enabled) {
            *vl1e = (count << PAGE_SHIFT) | L1_PROT;
        } else {
            *vl1e = ((uint64_t)page_array[count] << PAGE_SHIFT) | L1_PROT;
            if ( (count >= ((vpt_start-dsi_v_start)>>PAGE_SHIFT)) &&
                 (count < ((vpt_end -dsi_v_start)>>PAGE_SHIFT)) )
                *vl1e &= ~_PAGE_RW;
        }
        vl1e++;
    }

    munmap(vl1tab, PAGE_SIZE);
    munmap(vl2tab, PAGE_SIZE);
    munmap(vl3tab, PAGE_SIZE);
    return 0;

 error_out:
    if (vl1tab)
        munmap(vl1tab, PAGE_SIZE);
    if (vl2tab)
        munmap(vl2tab, PAGE_SIZE);
    if (vl3tab)
        munmap(vl3tab, PAGE_SIZE);
    return -1;
}

#endif

#if defined(__x86_64__)

static int setup_pg_tables_64(int xc_handle, uint32_t dom,
                              vcpu_guest_context_t *ctxt,
                              unsigned long dsi_v_start,
                              unsigned long v_end,
                              unsigned long *page_array,
                              unsigned long vpt_start,
                              unsigned long vpt_end,
                              int shadow_mode_enabled)
{
    l1_pgentry_t *vl1tab=NULL, *vl1e=NULL;
    l2_pgentry_t *vl2tab=NULL, *vl2e=NULL;
    l3_pgentry_t *vl3tab=NULL, *vl3e=NULL;
    l4_pgentry_t *vl4tab=NULL, *vl4e=NULL;
    unsigned long l2tab = 0, pl2tab;
    unsigned long l1tab = 0, pl1tab;
    unsigned long l3tab = 0, pl3tab;
    unsigned long l4tab = 0, pl4tab;
    unsigned long ppt_alloc;
    unsigned long count;

    /* First allocate page for page dir. */
    ppt_alloc = (vpt_start - dsi_v_start) >> PAGE_SHIFT;
    alloc_pt(l4tab, vl4tab, pl4tab);
    vl4e = &vl4tab[l4_table_offset(dsi_v_start)];
    if (shadow_mode_enabled)
        ctxt->ctrlreg[3] = pl4tab;
    else
        ctxt->ctrlreg[3] = l4tab;

    for ( count = 0; count < ((v_end-dsi_v_start)>>PAGE_SHIFT); count++)
    {
        if ( !((unsigned long)vl1e & (PAGE_SIZE-1)) )
        {
            alloc_pt(l1tab, vl1tab, pl1tab);

            if ( !((unsigned long)vl2e & (PAGE_SIZE-1)) )
            {
                alloc_pt(l2tab, vl2tab, pl2tab);
                if ( !((unsigned long)vl3e & (PAGE_SIZE-1)) )
                {
                    alloc_pt(l3tab, vl3tab, pl3tab);
                    vl3e = &vl3tab[l3_table_offset(dsi_v_start + (count<<PAGE_SHIFT))];
                    if (shadow_mode_enabled)
                        *vl4e = pl3tab | L4_PROT;
                    else
                        *vl4e = l3tab | L4_PROT;
                    vl4e++;
                }
                vl2e = &vl2tab[l2_table_offset(dsi_v_start + (count<<PAGE_SHIFT))];
                if (shadow_mode_enabled)
                    *vl3e = pl2tab | L3_PROT;
                else
                    *vl3e = l2tab | L3_PROT;
                vl3e++;
            }
            vl1e = &vl1tab[l1_table_offset(dsi_v_start + (count<<PAGE_SHIFT))];
            if (shadow_mode_enabled)
                *vl2e = pl1tab | L2_PROT;
            else
                *vl2e = l1tab | L2_PROT;
            vl2e++;
        }

        if (shadow_mode_enabled) {
            *vl1e = (count << PAGE_SHIFT) | L1_PROT;
        } else {
            *vl1e = (page_array[count] << PAGE_SHIFT) | L1_PROT;
            if ( (count >= ((vpt_start-dsi_v_start)>>PAGE_SHIFT)) &&
                 (count < ((vpt_end -dsi_v_start)>>PAGE_SHIFT)) )
            {
                *vl1e &= ~_PAGE_RW;
            }
        }
        vl1e++;
    }

    munmap(vl1tab, PAGE_SIZE);
    munmap(vl2tab, PAGE_SIZE);
    munmap(vl3tab, PAGE_SIZE);
    munmap(vl4tab, PAGE_SIZE);
    return 0;

 error_out:
    if (vl1tab)
        munmap(vl1tab, PAGE_SIZE);
    if (vl2tab)
        munmap(vl2tab, PAGE_SIZE);
    if (vl3tab)
        munmap(vl3tab, PAGE_SIZE);
    if (vl4tab)
        munmap(vl4tab, PAGE_SIZE);
    return -1;
}
#endif

#ifdef __ia64__
extern unsigned long xc_ia64_fpsr_default(void);

static int setup_guest(int xc_handle,
                       uint32_t dom,
                       char *image, unsigned long image_size,
                       gzFile initrd_gfd, unsigned long initrd_len,
                       unsigned long nr_pages,
                       unsigned long *pvsi, unsigned long *pvke,
                       unsigned long *pvss, vcpu_guest_context_t *ctxt,
                       const char *cmdline,
                       unsigned long shared_info_frame,
                       unsigned long flags,
                       unsigned int store_evtchn, unsigned long *store_mfn,
                       unsigned int console_evtchn, unsigned long *console_mfn)
{
    unsigned long *page_array = NULL;
    struct load_funcs load_funcs;
    struct domain_setup_info dsi;
    unsigned long vinitrd_start;
    unsigned long vinitrd_end;
    unsigned long v_end;
    unsigned long start_page, pgnr;
    start_info_t *start_info;
    int rc;
    unsigned long i;

    rc = probeimageformat(image, image_size, &load_funcs);
    if ( rc != 0 )
        goto error_out;

    memset(&dsi, 0, sizeof(struct domain_setup_info));

    rc = (load_funcs.parseimage)(image, image_size, &dsi);
    if ( rc != 0 )
        goto error_out;

    dsi.v_start = round_pgdown(dsi.v_start);
    vinitrd_start = round_pgup(dsi.v_end);
    vinitrd_end = vinitrd_start + initrd_len;
    v_end = round_pgup(vinitrd_end);

    start_page = dsi.v_start >> PAGE_SHIFT;
    pgnr = (v_end - dsi.v_start) >> PAGE_SHIFT;
    if ( (page_array = malloc(pgnr * sizeof(unsigned long))) == NULL )
    {
        PERROR("Could not allocate memory");
        goto error_out;
    }

    if ( xc_ia64_get_pfn_list(xc_handle, dom, page_array, start_page, pgnr) != pgnr )
    {
        PERROR("Could not get the page frame list");
        goto error_out;
    }

#define _p(a) ((void *) (a))

    printf("VIRTUAL MEMORY ARRANGEMENT:\n"
           " Loaded kernel: %p->%p\n"
           " Init. ramdisk: %p->%p\n"
           " TOTAL: %p->%p\n",
           _p(dsi.v_kernstart), _p(dsi.v_kernend),
           _p(vinitrd_start), _p(vinitrd_end),
           _p(dsi.v_start), _p(v_end));
    printf(" ENTRY ADDRESS: %p\n", _p(dsi.v_kernentry));

    (load_funcs.loadimage)(image, image_size, xc_handle, dom, page_array,
                           &dsi);

    /* Load the initial ramdisk image. */
    if ( initrd_len != 0 )
    {
        for ( i = (vinitrd_start - dsi.v_start);
              i < (vinitrd_end - dsi.v_start); i += PAGE_SIZE )
        {
            char page[PAGE_SIZE];
            if ( gzread(initrd_gfd, page, PAGE_SIZE) == -1 )
            {
                PERROR("Error reading initrd image");
                goto error_out;
            }
            xc_copy_to_domain_page(xc_handle, dom,
                                   page_array[i>>PAGE_SHIFT], page);
        }
    }

    *pvke = dsi.v_kernentry;

    /* Now need to retrieve machine pfn for system pages:
     * start_info/store/console
     */
    pgnr = 3;
    if ( xc_ia64_get_pfn_list(xc_handle, dom, page_array,
                              nr_pages - 3, pgnr) != pgnr )
    {
        PERROR("Could not get page frame for xenstore");
        goto error_out;
    }

    *store_mfn = page_array[1];
    *console_mfn = page_array[2];
    printf("store_mfn: 0x%lx, console_mfn: 0x%lx\n",
           *store_mfn, *console_mfn);

    start_info = xc_map_foreign_range(
        xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, page_array[0]);
    memset(start_info, 0, sizeof(*start_info));
    rc = xc_version(xc_handle, XENVER_version, NULL);
    sprintf(start_info->magic, "xen-%i.%i-ia64", rc >> 16, rc & (0xFFFF));
    start_info->flags = flags;
    start_info->store_mfn = nr_pages - 2;
    start_info->store_evtchn = store_evtchn;
    start_info->console_mfn = nr_pages - 1;
    start_info->console_evtchn = console_evtchn;
    start_info->nr_pages = nr_pages; /* FIXME: nr_pages - 2? */
    if ( initrd_len != 0 )
    {
        ctxt->initrd.start = vinitrd_start;
        ctxt->initrd.size = initrd_len;
    } else {
        ctxt->initrd.start = 0;
        ctxt->initrd.size = 0;
    }
    if ( cmdline != NULL )
    {
        strncpy((char *)ctxt->cmdline, cmdline, IA64_COMMAND_LINE_SIZE);
        ctxt->cmdline[IA64_COMMAND_LINE_SIZE-1] = '\0';
    }
    munmap(start_info, PAGE_SIZE);

    free(page_array);
    return 0;

 error_out:
    free(page_array);
    return -1;
}
#else /* x86 */
static int setup_guest(int xc_handle,
                       uint32_t dom,
                       char *image, unsigned long image_size,
                       gzFile initrd_gfd, unsigned long initrd_len,
                       unsigned long nr_pages,
                       unsigned long *pvsi, unsigned long *pvke,
                       unsigned long *pvss, vcpu_guest_context_t *ctxt,
                       const char *cmdline,
                       unsigned long shared_info_frame,
                       unsigned long flags,
                       unsigned int store_evtchn, unsigned long *store_mfn,
                       unsigned int console_evtchn, unsigned long *console_mfn)
{
    unsigned long *page_array = NULL;
    unsigned long count, i, hypercall_pfn;
    start_info_t *start_info;
    shared_info_t *shared_info;
    xc_mmu_t *mmu = NULL;
    char *p;
    DECLARE_DOM0_OP;
    int rc;

    unsigned long nr_pt_pages;
    unsigned long physmap_pfn;
    unsigned long *physmap, *physmap_e;

    struct load_funcs load_funcs;
    struct domain_setup_info dsi;
    unsigned long vinitrd_start;
    unsigned long vinitrd_end;
    unsigned long vphysmap_start;
    unsigned long vphysmap_end;
    unsigned long vstartinfo_start;
    unsigned long vstartinfo_end;
    unsigned long vstoreinfo_start;
    unsigned long vstoreinfo_end;
    unsigned long vconsole_start;
    unsigned long vconsole_end;
    unsigned long vstack_start;
    unsigned long vstack_end;
    unsigned long vpt_start;
    unsigned long vpt_end;
    unsigned long v_end;
    unsigned shadow_mode_enabled;
    unsigned long guest_store_mfn, guest_console_mfn, guest_shared_info_mfn;

    rc = probeimageformat(image, image_size, &load_funcs);
    if ( rc != 0 )
        goto error_out;

    memset(&dsi, 0, sizeof(struct domain_setup_info));

    rc = (load_funcs.parseimage)(image, image_size, &dsi);
    if ( rc != 0 )
        goto error_out;

    if ( (dsi.v_start & (PAGE_SIZE-1)) != 0 )
    {
        PERROR("Guest OS must load to a page boundary.\n");
        goto error_out;
    }

    shadow_mode_enabled = !!strstr(dsi.xen_guest_string,
                                   "SHADOW=translate");
    /*
     * Why do we need this? The number of page-table frames depends on the
     * size of the bootstrap address space. But the size of the address space
     * depends on the number of page-table frames (since each one is mapped
     * read-only). We have a pair of simultaneous equations in two unknowns,
     * which we solve by exhaustive search.
     */
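    /*
     * Worked example (i386, non-PAE): each L1 table maps 4MB, so a 24MB
     * bootstrap region needs 6 L1 pages plus 1 L2 page, i.e.
     * nr_pt_pages = 7.  But mapping those page-table pages read-only
     * enlarges the region, which can in turn demand another table; hence
     * the search below.
     */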
    vinitrd_start = round_pgup(dsi.v_end);
    vinitrd_end = vinitrd_start + initrd_len;
    vphysmap_start = round_pgup(vinitrd_end);
    vphysmap_end = vphysmap_start + (nr_pages * sizeof(unsigned long));
    vstartinfo_start = round_pgup(vphysmap_end);
    vstartinfo_end = vstartinfo_start + PAGE_SIZE;
    vstoreinfo_start = vstartinfo_end;
    vstoreinfo_end = vstoreinfo_start + PAGE_SIZE;
    vconsole_start = vstoreinfo_end;
    vconsole_end = vconsole_start + PAGE_SIZE;
    vpt_start = vconsole_end;

    for ( nr_pt_pages = 2; ; nr_pt_pages++ )
    {
        vpt_end = vpt_start + (nr_pt_pages * PAGE_SIZE);
        vstack_start = vpt_end;
        vstack_end = vstack_start + PAGE_SIZE;
        v_end = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
        if ( (v_end - vstack_end) < (512UL << 10) )
            v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
#if defined(__i386__)
        if (dsi.pae_kernel) {
            /* FIXME: assumes one L2 pgtable @ 0xc0000000 */
            if ( (((v_end - dsi.v_start + ((1<<L2_PAGETABLE_SHIFT_PAE)-1)) >>
                   L2_PAGETABLE_SHIFT_PAE) + 2) <= nr_pt_pages )
                break;
        } else {
            if ( (((v_end - dsi.v_start + ((1<<L2_PAGETABLE_SHIFT)-1)) >>
                   L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
                break;
        }
#endif
#if defined(__x86_64__)
#define NR(_l,_h,_s) \
    (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
      ((_l) & ~((1UL<<(_s))-1))) >> (_s))
        if ( (1 + /* # L4 */
              NR(dsi.v_start, v_end, L4_PAGETABLE_SHIFT) + /* # L3 */
              NR(dsi.v_start, v_end, L3_PAGETABLE_SHIFT) + /* # L2 */
              NR(dsi.v_start, v_end, L2_PAGETABLE_SHIFT))  /* # L1 */
             <= nr_pt_pages )
            break;
#endif
    }

#define _p(a) ((void *) (a))

    printf("VIRTUAL MEMORY ARRANGEMENT:\n"
           " Loaded kernel: %p->%p\n"
           " Init. ramdisk: %p->%p\n"
           " Phys-Mach map: %p->%p\n"
           " Start info: %p->%p\n"
           " Store page: %p->%p\n"
           " Console page: %p->%p\n"
           " Page tables: %p->%p\n"
           " Boot stack: %p->%p\n"
           " TOTAL: %p->%p\n",
           _p(dsi.v_kernstart), _p(dsi.v_kernend),
           _p(vinitrd_start), _p(vinitrd_end),
           _p(vphysmap_start), _p(vphysmap_end),
           _p(vstartinfo_start), _p(vstartinfo_end),
           _p(vstoreinfo_start), _p(vstoreinfo_end),
           _p(vconsole_start), _p(vconsole_end),
           _p(vpt_start), _p(vpt_end),
           _p(vstack_start), _p(vstack_end),
           _p(dsi.v_start), _p(v_end));
    printf(" ENTRY ADDRESS: %p\n", _p(dsi.v_kernentry));

    if ( ((v_end - dsi.v_start)>>PAGE_SHIFT) > nr_pages )
    {
        PERROR("Initial guest OS requires too much space\n"
               "(%luMB is greater than %luMB limit)\n",
               (v_end-dsi.v_start)>>20, nr_pages>>(20-PAGE_SHIFT));
        goto error_out;
    }

    if ( (page_array = malloc(nr_pages * sizeof(unsigned long))) == NULL )
    {
        PERROR("Could not allocate memory");
        goto error_out;
    }

    if ( xc_get_pfn_list(xc_handle, dom, page_array, nr_pages) != nr_pages )
    {
        PERROR("Could not get the page frame list");
        goto error_out;
    }

    (load_funcs.loadimage)(image, image_size, xc_handle, dom, page_array,
                           &dsi);

    /* Load the initial ramdisk image. */
    if ( initrd_len != 0 )
    {
        for ( i = (vinitrd_start - dsi.v_start);
              i < (vinitrd_end - dsi.v_start); i += PAGE_SIZE )
        {
            char page[PAGE_SIZE];
            if ( gzread(initrd_gfd, page, PAGE_SIZE) == -1 )
            {
                PERROR("Error reading initrd image");
                goto error_out;
            }
            xc_copy_to_domain_page(xc_handle, dom,
                                   page_array[i>>PAGE_SHIFT], page);
        }
    }

    if ( (mmu = xc_init_mmu_updates(xc_handle, dom)) == NULL )
        goto error_out;

    /* Write the phys->machine and machine->phys table entries. */
    physmap_pfn = (vphysmap_start - dsi.v_start) >> PAGE_SHIFT;
    physmap = physmap_e = xc_map_foreign_range(
        xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE,
        page_array[physmap_pfn++]);

    for ( count = 0; count < nr_pages; count++ )
    {
        if ( xc_add_mmu_update(
                 xc_handle, mmu,
                 ((uint64_t)page_array[count] << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
                 count) )
        {
            fprintf(stderr,"m2p update failure p=%lx m=%lx\n",
                    count, page_array[count]);
            munmap(physmap, PAGE_SIZE);
            goto error_out;
        }
        *physmap_e++ = page_array[count];
        if ( ((unsigned long)physmap_e & (PAGE_SIZE-1)) == 0 )
        {
            munmap(physmap, PAGE_SIZE);
            physmap = physmap_e = xc_map_foreign_range(
                xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE,
                page_array[physmap_pfn++]);
        }
    }
    munmap(physmap, PAGE_SIZE);

    /* Send the page update requests down to the hypervisor. */
    if ( xc_finish_mmu_updates(xc_handle, mmu) )
        goto error_out;

    if (shadow_mode_enabled) {
        /* Enable shadow translate mode */
        if (xc_shadow_control(xc_handle, dom,
                              DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE,
                              NULL, 0, NULL) < 0) {
            PERROR("Could not enable translation mode");
            goto error_out;
        }

        /* Find the shared info frame. It's guaranteed to be at the
           start of the PFN hole. */
        guest_shared_info_mfn = xc_get_pfn_hole_start(xc_handle, dom);
        if (guest_shared_info_mfn == 0) {
            PERROR("Cannot find shared info pfn");
            goto error_out;
        }
    } else {
        guest_shared_info_mfn = shared_info_frame;
    }

    /* setup page tables */
#if defined(__i386__)
    if (dsi.pae_kernel)
        rc = setup_pg_tables_pae(xc_handle, dom, ctxt,
                                 dsi.v_start, v_end,
                                 page_array, vpt_start, vpt_end,
                                 shadow_mode_enabled);
    else
        rc = setup_pg_tables(xc_handle, dom, ctxt,
                             dsi.v_start, v_end,
                             page_array, vpt_start, vpt_end,
                             shadow_mode_enabled);
#endif
#if defined(__x86_64__)
    rc = setup_pg_tables_64(xc_handle, dom, ctxt,
                            dsi.v_start, v_end,
                            page_array, vpt_start, vpt_end,
                            shadow_mode_enabled);
#endif
    if (0 != rc)
        goto error_out;

#if defined(__i386__)
    /*
     * Pin down l2tab addr as page dir page - causes hypervisor to provide
     * correct protection for the page
     */
    if (!shadow_mode_enabled) {
        if (dsi.pae_kernel) {
            if ( pin_table(xc_handle, MMUEXT_PIN_L3_TABLE,
                           ctxt->ctrlreg[3] >> PAGE_SHIFT, dom) )
                goto error_out;
        } else {
            if ( pin_table(xc_handle, MMUEXT_PIN_L2_TABLE,
                           ctxt->ctrlreg[3] >> PAGE_SHIFT, dom) )
                goto error_out;
        }
    }
#endif

#if defined(__x86_64__)
    /*
     * Pin down l4tab addr as page dir page - causes hypervisor to provide
     * correct protection for the page
     */
    if ( pin_table(xc_handle, MMUEXT_PIN_L4_TABLE,
                   ctxt->ctrlreg[3] >> PAGE_SHIFT, dom) )
        goto error_out;
#endif

    *store_mfn = page_array[(vstoreinfo_start-dsi.v_start) >> PAGE_SHIFT];
    *console_mfn = page_array[(vconsole_start-dsi.v_start) >> PAGE_SHIFT];
    if ( xc_clear_domain_page(xc_handle, dom, *store_mfn) ||
         xc_clear_domain_page(xc_handle, dom, *console_mfn) )
        goto error_out;
    if (shadow_mode_enabled) {
        guest_store_mfn = (vstoreinfo_start-dsi.v_start) >> PAGE_SHIFT;
        guest_console_mfn = (vconsole_start-dsi.v_start) >> PAGE_SHIFT;
    } else {
        guest_store_mfn = *store_mfn;
        guest_console_mfn = *console_mfn;
    }
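    /* Translated guests see pseudo-physical frame numbers, so the values
     * advertised in start_info must be PFNs; the machine frames are still
     * returned to the caller via *store_mfn and *console_mfn so that dom0
     * can map the rings. */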

    start_info = xc_map_foreign_range(
        xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE,
        page_array[(vstartinfo_start-dsi.v_start)>>PAGE_SHIFT]);
    /* shared_info, start_info */
    memset(start_info, 0, sizeof(*start_info));
    rc = xc_version(xc_handle, XENVER_version, NULL);
    sprintf(start_info->magic, "xen-%i.%i-x86_%d%s",
            rc >> 16, rc & (0xFFFF), (unsigned int)sizeof(long)*8,
            dsi.pae_kernel ? "p" : "");
    start_info->nr_pages = nr_pages;
    start_info->shared_info = guest_shared_info_mfn << PAGE_SHIFT;
    start_info->flags = flags;
    start_info->pt_base = vpt_start;
    start_info->nr_pt_frames = nr_pt_pages;
    start_info->mfn_list = vphysmap_start;
    start_info->store_mfn = guest_store_mfn;
    start_info->store_evtchn = store_evtchn;
    start_info->console_mfn = guest_console_mfn;
    start_info->console_evtchn = console_evtchn;
    if ( initrd_len != 0 )
    {
        start_info->mod_start = vinitrd_start;
        start_info->mod_len = initrd_len;
    }
    if ( cmdline != NULL )
    {
        strncpy((char *)start_info->cmd_line, cmdline, MAX_GUEST_CMDLINE);
        start_info->cmd_line[MAX_GUEST_CMDLINE-1] = '\0';
    }
    munmap(start_info, PAGE_SIZE);

    /* shared_info page starts its life empty. */
    shared_info = xc_map_foreign_range(
        xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, shared_info_frame);
    memset(shared_info, 0, sizeof(shared_info_t));
    /* Mask all upcalls... */
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        shared_info->vcpu_info[i].evtchn_upcall_mask = 1;

    munmap(shared_info, PAGE_SIZE);

    /* Send the page update requests down to the hypervisor. */
    if ( xc_finish_mmu_updates(xc_handle, mmu) )
        goto error_out;

    p = strstr(dsi.xen_guest_string, "HYPERCALL_PAGE=");
    if ( p != NULL )
    {
        p += strlen("HYPERCALL_PAGE=");
        hypercall_pfn = strtoul(p, NULL, 16);
        if ( hypercall_pfn >= nr_pages )
            goto error_out;
        op.u.hypercall_init.domain = (domid_t)dom;
        op.u.hypercall_init.mfn = page_array[hypercall_pfn];
        op.cmd = DOM0_HYPERCALL_INIT;
        if ( xc_dom0_op(xc_handle, &op) )
            goto error_out;
    }

    free(mmu);
    free(page_array);

    *pvsi = vstartinfo_start;
    *pvss = vstack_start;
    *pvke = dsi.v_kernentry;

    return 0;

 error_out:
    free(mmu);
    free(page_array);
    return -1;
}
#endif

int xc_linux_build(int xc_handle,
                   uint32_t domid,
                   const char *image_name,
                   const char *ramdisk_name,
                   const char *cmdline,
                   unsigned long flags,
                   unsigned int store_evtchn,
                   unsigned long *store_mfn,
                   unsigned int console_evtchn,
                   unsigned long *console_mfn)
{
    dom0_op_t launch_op;
    DECLARE_DOM0_OP;
    int initrd_fd = -1;
    gzFile initrd_gfd = NULL;
    int rc, i;
    vcpu_guest_context_t st_ctxt, *ctxt = &st_ctxt;
    unsigned long nr_pages;
    char *image = NULL;
    unsigned long image_size, initrd_size=0;
    unsigned long vstartinfo_start, vkern_entry, vstack_start;

    if ( (long)(nr_pages = get_tot_pages(xc_handle, domid)) < 0 )
    {
        PERROR("Could not find total pages for domain");
        goto error_out;
    }

    if ( (image = xc_read_kernel_image(image_name, &image_size)) == NULL )
        goto error_out;

    if ( (ramdisk_name != NULL) && (strlen(ramdisk_name) != 0) )
    {
        if ( (initrd_fd = open(ramdisk_name, O_RDONLY)) < 0 )
        {
            PERROR("Could not open the initial ramdisk image");
            goto error_out;
        }

        initrd_size = xc_get_filesz(initrd_fd);

        if ( (initrd_gfd = gzdopen(initrd_fd, "rb")) == NULL )
        {
            PERROR("Could not allocate decompression state for initrd");
            goto error_out;
        }
    }

#ifdef VALGRIND
    memset(&st_ctxt, 0, sizeof(st_ctxt));
#endif

    if ( mlock(&st_ctxt, sizeof(st_ctxt)) )
    {
        PERROR("%s: ctxt mlock failed", __func__);
        return 1;
    }

    op.cmd = DOM0_GETDOMAININFO;
    op.u.getdomaininfo.domain = (domid_t)domid;
    if ( (xc_dom0_op(xc_handle, &op) < 0) ||
         ((uint16_t)op.u.getdomaininfo.domain != domid) )
    {
        PERROR("Could not get info on domain");
        goto error_out;
    }

    memset(ctxt, 0, sizeof(*ctxt));

    if ( setup_guest(xc_handle, domid, image, image_size,
                     initrd_gfd, initrd_size, nr_pages,
                     &vstartinfo_start, &vkern_entry,
                     &vstack_start, ctxt, cmdline,
                     op.u.getdomaininfo.shared_info_frame,
                     flags, store_evtchn, store_mfn,
                     console_evtchn, console_mfn) < 0 )
    {
        ERROR("Error constructing guest OS");
        goto error_out;
    }

    if ( initrd_fd >= 0 )
        close(initrd_fd);
    if ( initrd_gfd )
        gzclose(initrd_gfd);
    free(image);

#ifdef __ia64__
    /* based on new_thread in xen/arch/ia64/domain.c */
    ctxt->flags = 0;
    ctxt->shared.flags = flags;
    ctxt->shared.start_info_pfn = nr_pages - 3; /* metaphysical */
    ctxt->regs.cr_ipsr = 0; /* all necessary bits filled by hypervisor */
    ctxt->regs.cr_iip = vkern_entry;
    ctxt->regs.cr_ifs = 1UL << 63;
    ctxt->regs.ar_fpsr = xc_ia64_fpsr_default();
    /* currently done by hypervisor, should move here */
    /* ctxt->regs.r28 = dom_fw_setup(); */
    ctxt->vcpu.privregs = 0;
    ctxt->sys_pgnr = 3;
    i = 0; /* silence unused variable warning */
#else /* x86 */
    /*
     * Initial register values:
     *  DS,ES,FS,GS = FLAT_KERNEL_DS
     *  CS:EIP      = FLAT_KERNEL_CS:start_pc
     *  SS:ESP      = FLAT_KERNEL_DS:start_stack
     *  ESI         = start_info
     *  [EAX,EBX,ECX,EDX,EDI,EBP are zero]
     *  EFLAGS      = IF | 2 (bit 1 is reserved and should always be 1)
     */
    ctxt->user_regs.ds = FLAT_KERNEL_DS;
    ctxt->user_regs.es = FLAT_KERNEL_DS;
    ctxt->user_regs.fs = FLAT_KERNEL_DS;
    ctxt->user_regs.gs = FLAT_KERNEL_DS;
    ctxt->user_regs.ss = FLAT_KERNEL_SS;
    ctxt->user_regs.cs = FLAT_KERNEL_CS;
    ctxt->user_regs.eip = vkern_entry;
    ctxt->user_regs.esp = vstack_start + PAGE_SIZE;
    ctxt->user_regs.esi = vstartinfo_start;
    ctxt->user_regs.eflags = 1 << 9; /* Interrupt Enable */

    ctxt->flags = VGCF_IN_KERNEL;

    /* FPU is set up to default initial state. */
    memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));

    /* Virtual IDT is empty at start-of-day. */
    for ( i = 0; i < 256; i++ )
    {
        ctxt->trap_ctxt[i].vector = i;
        ctxt->trap_ctxt[i].cs = FLAT_KERNEL_CS;
    }

    /* No LDT. */
    ctxt->ldt_ents = 0;

    /* Use the default Xen-provided GDT. */
    ctxt->gdt_ents = 0;

    /* Ring 1 stack is the initial stack. */
    ctxt->kernel_ss = FLAT_KERNEL_SS;
    ctxt->kernel_sp = vstack_start + PAGE_SIZE;

    /* No debugging. */
    memset(ctxt->debugreg, 0, sizeof(ctxt->debugreg));

    /* No callback handlers. */
#if defined(__i386__)
    ctxt->event_callback_cs = FLAT_KERNEL_CS;
    ctxt->event_callback_eip = 0;
    ctxt->failsafe_callback_cs = FLAT_KERNEL_CS;
    ctxt->failsafe_callback_eip = 0;
#elif defined(__x86_64__)
    ctxt->event_callback_eip = 0;
    ctxt->failsafe_callback_eip = 0;
    ctxt->syscall_callback_eip = 0;
#endif
#endif /* x86 */

    memset( &launch_op, 0, sizeof(launch_op) );

    launch_op.u.setvcpucontext.domain = (domid_t)domid;
    launch_op.u.setvcpucontext.vcpu = 0;
    launch_op.u.setvcpucontext.ctxt = ctxt;

    launch_op.cmd = DOM0_SETVCPUCONTEXT;
    rc = xc_dom0_op(xc_handle, &launch_op);

    return rc;

 error_out:
    if ( initrd_gfd != NULL )
        gzclose(initrd_gfd);
    else if ( initrd_fd >= 0 )
        close(initrd_fd);
    free(image);
    return -1;
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */