ia64/xen-unstable

view tools/libxc/xc_linux_restore.c @ 7238:971e7c7411b3

Raise an exception if an error appears on the pipes to our children, and make
sure that the child's pipes are closed even under that exception. Move the
handling of POLLHUP to the end of the loop, so that we guarantee to read any
remaining data from the child if POLLHUP and POLLIN appear at the same time.

Signed-off-by: Ewan Mellor <ewan@xensource.com>
author emellor@ewan
date Thu Oct 06 10:13:11 2005 +0100 (2005-10-06)
parents 540d17fe32ce
children b3a255e88810
line source
1 /******************************************************************************
2 * xc_linux_restore.c
3 *
4 * Restore the state of a Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <stdlib.h>
10 #include <unistd.h>
11 #include "xg_private.h"
12 #include <xenctrl.h>
13 #include <xen/memory.h>
15 #define MAX_BATCH_SIZE 1024
17 #define DEBUG 0
19 #if 1
20 #define ERR(_f, _a...) do { fprintf ( stderr, _f , ## _a ); fflush(stderr); } while(0)
21 #else
22 #define ERR(_f, _a...) ((void)0)
23 #endif
25 #if DEBUG
26 #define DPRINTF(_f, _a...) do { fprintf ( stdout, _f , ## _a ); fflush(stdout); } while (0)
27 #else
28 #define DPRINTF(_f, _a...) ((void)0)
29 #endif
31 #define PROGRESS 0
32 #if PROGRESS
33 #define PPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a ); fflush(stderr)
34 #else
35 #define PPRINTF(_f, _a...)
36 #endif
/*
 * Read exactly @count bytes from @fd into @buf, retrying across short
 * reads.  Returns the number of bytes actually read: a return value
 * different from @count indicates EOF or a read error.
 *
 * Note: @r and @s use size_t/ssize_t so that the comparison against
 * @count is not a signed/unsigned mismatch and read()'s return value
 * is not truncated.
 */
static ssize_t
read_exact(int fd, void *buf, size_t count)
{
    size_t r = 0;
    ssize_t s;
    unsigned char *b = buf;

    while (r < count) {
        s = read(fd, &b[r], count - r);
        if (s <= 0)
            break;              /* 0 == EOF, <0 == error: give up */
        r += s;
    }

    return r;
}
54 int xc_linux_restore(int xc_handle, int io_fd, u32 dom, unsigned long nr_pfns,
55 unsigned int store_evtchn, unsigned long *store_mfn,
56 unsigned int console_evtchn, unsigned long *console_mfn)
57 {
58 dom0_op_t op;
59 int rc = 1, i, n, k;
60 unsigned long mfn, pfn, xpfn;
61 unsigned int prev_pc, this_pc;
62 int verify = 0;
63 int err;
65 /* The new domain's shared-info frame number. */
66 unsigned long shared_info_frame;
67 unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
68 shared_info_t *shared_info = (shared_info_t *)shared_info_page;
70 /* A copy of the CPU context of the guest. */
71 vcpu_guest_context_t ctxt;
73 /* A table containg the type of each PFN (/not/ MFN!). */
74 unsigned long *pfn_type = NULL;
76 /* A table of MFNs to map in the current region */
77 unsigned long *region_mfn = NULL;
79 /* A temporary mapping, and a copy, of one frame of guest memory. */
80 unsigned long *ppage = NULL;
82 /* A copy of the pfn-to-mfn table frame list. */
83 unsigned long pfn_to_mfn_frame_list[1024];
85 /* A table mapping each PFN to its new MFN. */
86 unsigned long *pfn_to_mfn_table = NULL;
88 /* used by mapper for updating the domain's copy of the table */
89 unsigned long *live_pfn_to_mfn_table = NULL;
91 /* A temporary mapping of the guest's start_info page. */
92 start_info_t *start_info;
94 int pt_levels = 2; /* XXX auto-detect this */
96 char *region_base;
98 xc_mmu_t *mmu = NULL;
100 /* used by debug verify code */
101 unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
103 #define MAX_PIN_BATCH 1024
104 struct mmuext_op pin[MAX_PIN_BATCH];
105 unsigned int nr_pins = 0;
107 DPRINTF("xc_linux_restore start: nr_pfns = %lx\n", nr_pfns);
109 if (mlock(&ctxt, sizeof(ctxt))) {
110 /* needed for when we do the build dom0 op,
111 but might as well do early */
112 ERR("Unable to mlock ctxt");
113 return 1;
114 }
116 if (read_exact(io_fd, pfn_to_mfn_frame_list, PAGE_SIZE) != PAGE_SIZE) {
117 ERR("read pfn_to_mfn_frame_list failed");
118 goto out;
119 }
121 /* We want zeroed memory so use calloc rather than malloc. */
122 pfn_to_mfn_table = calloc(4, nr_pfns);
123 pfn_type = calloc(4, nr_pfns);
124 region_mfn = calloc(4, MAX_BATCH_SIZE);
126 if ((pfn_to_mfn_table == NULL) || (pfn_type == NULL) ||
127 (region_mfn == NULL)) {
128 ERR("memory alloc failed");
129 errno = ENOMEM;
130 goto out;
131 }
133 if (mlock(region_mfn, 4 * MAX_BATCH_SIZE)) {
134 ERR("Could not mlock region_mfn");
135 goto out;
136 }
138 /* Get the domain's shared-info frame. */
139 op.cmd = DOM0_GETDOMAININFO;
140 op.u.getdomaininfo.domain = (domid_t)dom;
141 if (xc_dom0_op(xc_handle, &op) < 0) {
142 ERR("Could not get information on new domain");
143 goto out;
144 }
145 shared_info_frame = op.u.getdomaininfo.shared_info_frame;
147 err = xc_domain_setmaxmem(xc_handle, dom, nr_pfns * PAGE_SIZE / 1024);
148 if (err != 0) {
149 errno = ENOMEM;
150 goto out;
151 }
153 err = xc_domain_memory_increase_reservation(xc_handle, dom,
154 nr_pfns, 0, 0, NULL);
155 if (err != 0) {
156 ERR("Failed to increase reservation by %lx\n",
157 nr_pfns * PAGE_SIZE / 1024);
158 errno = ENOMEM;
159 goto out;
160 }
162 /* Build the pfn-to-mfn table. We choose MFN ordering returned by Xen. */
163 if (xc_get_pfn_list(xc_handle, dom, pfn_to_mfn_table, nr_pfns) !=
164 nr_pfns) {
165 ERR("Did not read correct number of frame numbers for new dom");
166 goto out;
167 }
169 mmu = xc_init_mmu_updates(xc_handle, dom);
170 if (mmu == NULL) {
171 ERR("Could not initialise for MMU updates");
172 goto out;
173 }
175 DPRINTF("Reloading memory pages: 0%%");
177 /*
178 * Now simply read each saved frame into its new machine frame.
179 * We uncanonicalise page tables as we go.
180 */
181 prev_pc = 0;
183 n = 0;
184 while ( 1 )
185 {
186 int j;
187 unsigned long region_pfn_type[MAX_BATCH_SIZE];
189 this_pc = (n * 100) / nr_pfns;
190 if ( (this_pc - prev_pc) >= 5 )
191 {
192 PPRINTF("\b\b\b\b%3d%%", this_pc);
193 prev_pc = this_pc;
194 }
196 if ( read_exact(io_fd, &j, sizeof(int)) != sizeof(int) )
197 {
198 ERR("Error when reading batch size");
199 goto out;
200 }
202 PPRINTF("batch %d\n",j);
204 if ( j == -1 )
205 {
206 verify = 1;
207 printf("Entering page verify mode\n");
208 continue;
209 }
211 if ( j == 0 )
212 break; /* our work here is done */
214 if ( j > MAX_BATCH_SIZE )
215 {
216 ERR("Max batch size exceeded. Giving up.");
217 goto out;
218 }
220 if ( read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long)) !=
221 j*sizeof(unsigned long) ) {
222 ERR("Error when reading region pfn types");
223 goto out;
224 }
226 for ( i = 0; i < j; i++ )
227 {
228 if ( (region_pfn_type[i] & LTAB_MASK) == XTAB)
229 {
230 region_mfn[i] = 0; /* we know map will fail, but don't care */
231 }
232 else
233 {
234 pfn = region_pfn_type[i] & ~LTAB_MASK;
235 region_mfn[i] = pfn_to_mfn_table[pfn];
236 }
237 }
239 if ( (region_base = xc_map_foreign_batch( xc_handle, dom,
240 PROT_WRITE,
241 region_mfn,
242 j )) == 0 )
243 {
244 ERR("map batch failed");
245 goto out;
246 }
248 for ( i = 0; i < j; i++ )
249 {
250 unsigned long *ppage;
252 pfn = region_pfn_type[i] & ~LTAB_MASK;
254 if ( (region_pfn_type[i] & LTAB_MASK) == XTAB) continue;
256 if (pfn>nr_pfns)
257 {
258 ERR("pfn out of range");
259 goto out;
260 }
262 region_pfn_type[i] &= LTAB_MASK;
264 pfn_type[pfn] = region_pfn_type[i];
266 mfn = pfn_to_mfn_table[pfn];
268 if ( verify )
269 ppage = (unsigned long*) buf; /* debug case */
270 else
271 ppage = (unsigned long*) (region_base + i*PAGE_SIZE);
273 if ( read_exact(io_fd, ppage, PAGE_SIZE) != PAGE_SIZE )
274 {
275 ERR("Error when reading pagetable page");
276 goto out;
277 }
279 switch( region_pfn_type[i] & LTABTYPE_MASK )
280 {
281 case 0:
282 break;
284 case L1TAB:
285 {
286 for ( k = 0; k < 1024; k++ )
287 {
288 if ( ppage[k] & _PAGE_PRESENT )
289 {
290 xpfn = ppage[k] >> PAGE_SHIFT;
291 if ( xpfn >= nr_pfns )
292 {
293 ERR("Frame number in type %lu page "
294 "table is out of range. i=%d k=%d "
295 "pfn=0x%lx nr_pfns=%lu",
296 region_pfn_type[i]>>28, i,
297 k, xpfn, nr_pfns);
298 goto out;
299 }
301 ppage[k] &= (PAGE_SIZE - 1) &
302 ~(_PAGE_GLOBAL | _PAGE_PAT);
303 ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT;
304 }
305 }
306 }
307 break;
309 case L2TAB:
310 {
311 for ( k = 0;
312 k < (HYPERVISOR_VIRT_START>>L2_PAGETABLE_SHIFT);
313 k++ )
314 {
315 if ( ppage[k] & _PAGE_PRESENT )
316 {
317 xpfn = ppage[k] >> PAGE_SHIFT;
319 if ( xpfn >= nr_pfns )
320 {
321 ERR("Frame number in type %lu page"
322 " table is out of range. i=%d k=%d "
323 "pfn=%lu nr_pfns=%lu",
324 region_pfn_type[i]>>28, i, k,
325 xpfn, nr_pfns);
326 goto out;
327 }
329 ppage[k] &= (PAGE_SIZE - 1) &
330 ~(_PAGE_GLOBAL | _PAGE_PSE);
331 ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT;
332 }
333 }
334 }
335 break;
337 default:
338 ERR("Bogus page type %lx page table is "
339 "out of range. i=%d nr_pfns=%lu",
340 region_pfn_type[i], i, nr_pfns);
341 goto out;
343 } /* end of page type switch statement */
345 if ( verify )
346 {
347 int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE );
348 if ( res )
349 {
350 int v;
351 printf("************** pfn=%lx type=%lx gotcs=%08lx "
352 "actualcs=%08lx\n", pfn, pfn_type[pfn],
353 csum_page(region_base + i*PAGE_SIZE),
354 csum_page(buf));
355 for ( v = 0; v < 4; v++ )
356 {
357 unsigned long *p = (unsigned long *)
358 (region_base + i*PAGE_SIZE);
359 if ( buf[v] != p[v] )
360 printf(" %d: %08lx %08lx\n",
361 v, buf[v], p[v] );
362 }
363 }
364 }
366 if ( xc_add_mmu_update(xc_handle, mmu,
367 (mfn<<PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
368 pfn) )
369 {
370 printf("machpys mfn=%ld pfn=%ld\n",mfn,pfn);
371 goto out;
372 }
374 } /* end of 'batch' for loop */
376 munmap( region_base, j*PAGE_SIZE );
377 n+=j; /* crude stats */
378 }
380 DPRINTF("Received all pages\n");
382 if ( pt_levels == 3 )
383 {
384 /* Get all PGDs below 4GB. */
385 for ( i = 0; i < nr_pfns; i++ )
386 {
387 if ( ((pfn_type[i] & LTABTYPE_MASK) == L3TAB) &&
388 (pfn_to_mfn_table[i] > 0xfffffUL) )
389 {
390 unsigned long new_mfn = xc_make_page_below_4G(
391 xc_handle, dom, pfn_to_mfn_table[i]);
392 if ( new_mfn == 0 )
393 {
394 fprintf(stderr, "Couldn't get a page below 4GB :-(\n");
395 goto out;
396 }
397 pfn_to_mfn_table[i] = new_mfn;
398 if ( xc_add_mmu_update(
399 xc_handle, mmu, (new_mfn << PAGE_SHIFT) |
400 MMU_MACHPHYS_UPDATE, i) )
401 {
402 fprintf(stderr, "Couldn't m2p on PAE root pgdir\n");
403 goto out;
404 }
405 }
406 }
407 }
409 if ( xc_finish_mmu_updates(xc_handle, mmu) )
410 goto out;
412 /*
413 * Pin page tables. Do this after writing to them as otherwise Xen
414 * will barf when doing the type-checking.
415 */
416 for ( i = 0; i < nr_pfns; i++ )
417 {
418 if ( (pfn_type[i] & LPINTAB) == 0 )
419 continue;
420 if ( pfn_type[i] == (L1TAB|LPINTAB) )
421 pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
422 else /* pfn_type[i] == (L2TAB|LPINTAB) */
423 pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
424 pin[nr_pins].arg1.mfn = pfn_to_mfn_table[i];
425 if ( ++nr_pins == MAX_PIN_BATCH )
426 {
427 if ( xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0 )
428 goto out;
429 nr_pins = 0;
430 }
431 }
433 if ( (nr_pins != 0) &&
434 (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) )
435 goto out;
437 DPRINTF("\b\b\b\b100%%\n");
438 DPRINTF("Memory reloaded.\n");
440 /* Get the list of PFNs that are not in the psuedo-phys map */
441 {
442 unsigned int count;
443 unsigned long *pfntab;
444 int rc;
446 if ( read_exact(io_fd, &count, sizeof(count)) != sizeof(count) )
447 {
448 ERR("Error when reading pfn count");
449 goto out;
450 }
452 pfntab = malloc( sizeof(unsigned int) * count );
453 if ( pfntab == NULL )
454 {
455 ERR("Out of memory");
456 goto out;
457 }
459 if ( read_exact(io_fd, pfntab, sizeof(unsigned int)*count) !=
460 sizeof(unsigned int)*count )
461 {
462 ERR("Error when reading pfntab");
463 goto out;
464 }
466 for ( i = 0; i < count; i++ )
467 {
468 unsigned long pfn = pfntab[i];
469 pfntab[i]=pfn_to_mfn_table[pfn];
470 pfn_to_mfn_table[pfn] = 0x80000001; // not in pmap
471 }
473 if ( count > 0 )
474 {
475 struct xen_memory_reservation reservation = {
476 .extent_start = pfntab,
477 .nr_extents = count,
478 .extent_order = 0,
479 .domid = dom
480 };
481 if ( (rc = xc_memory_op(xc_handle,
482 XENMEM_decrease_reservation,
483 &reservation)) != count )
484 {
485 ERR("Could not decrease reservation : %d",rc);
486 goto out;
487 }
488 else
489 {
490 printf("Decreased reservation by %d pages\n", count);
491 }
492 }
493 }
495 if ( read_exact(io_fd, &ctxt, sizeof(ctxt)) != sizeof(ctxt) ||
496 read_exact(io_fd, shared_info_page, PAGE_SIZE) != PAGE_SIZE )
497 {
498 ERR("Error when reading ctxt or shared info page");
499 goto out;
500 }
502 /* Uncanonicalise the suspend-record frame number and poke resume rec. */
503 pfn = ctxt.user_regs.edx;
504 if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
505 {
506 ERR("Suspend record frame number is bad");
507 goto out;
508 }
509 ctxt.user_regs.edx = mfn = pfn_to_mfn_table[pfn];
510 start_info = xc_map_foreign_range(
511 xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
512 start_info->nr_pages = nr_pfns;
513 start_info->shared_info = shared_info_frame << PAGE_SHIFT;
514 start_info->flags = 0;
515 *store_mfn = start_info->store_mfn =
516 pfn_to_mfn_table[start_info->store_mfn];
517 start_info->store_evtchn = store_evtchn;
518 *console_mfn = start_info->console_mfn =
519 pfn_to_mfn_table[start_info->console_mfn];
520 start_info->console_evtchn = console_evtchn;
521 munmap(start_info, PAGE_SIZE);
523 /* Uncanonicalise each GDT frame number. */
524 if ( ctxt.gdt_ents > 8192 )
525 {
526 ERR("GDT entry count out of range");
527 goto out;
528 }
530 for ( i = 0; i < ctxt.gdt_ents; i += 512 )
531 {
532 pfn = ctxt.gdt_frames[i];
533 if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
534 {
535 ERR("GDT frame number is bad");
536 goto out;
537 }
538 ctxt.gdt_frames[i] = pfn_to_mfn_table[pfn];
539 }
541 /* Uncanonicalise the page table base pointer. */
542 pfn = ctxt.ctrlreg[3] >> PAGE_SHIFT;
543 if ( (pfn >= nr_pfns) || ((pfn_type[pfn]&LTABTYPE_MASK) != L2TAB) )
544 {
545 printf("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
546 pfn, nr_pfns, pfn_type[pfn], (unsigned long)L2TAB);
547 ERR("PT base is bad.");
548 goto out;
549 }
550 ctxt.ctrlreg[3] = pfn_to_mfn_table[pfn] << PAGE_SHIFT;
552 /* clear any pending events and the selector */
553 memset(&(shared_info->evtchn_pending[0]), 0,
554 sizeof (shared_info->evtchn_pending));
555 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
556 shared_info->vcpu_data[i].evtchn_pending_sel = 0;
558 /* Copy saved contents of shared-info page. No checking needed. */
559 ppage = xc_map_foreign_range(
560 xc_handle, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);
561 memcpy(ppage, shared_info, sizeof(shared_info_t));
562 munmap(ppage, PAGE_SIZE);
564 /* Uncanonicalise the pfn-to-mfn table frame-number list. */
565 for ( i = 0; i < (nr_pfns+1023)/1024; i++ )
566 {
567 unsigned long pfn, mfn;
569 pfn = pfn_to_mfn_frame_list[i];
570 if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
571 {
572 ERR("PFN-to-MFN frame number is bad");
573 goto out;
574 }
575 mfn = pfn_to_mfn_table[pfn];
576 pfn_to_mfn_frame_list[i] = mfn;
577 }
579 if ( (live_pfn_to_mfn_table =
580 xc_map_foreign_batch(xc_handle, dom,
581 PROT_WRITE,
582 pfn_to_mfn_frame_list,
583 (nr_pfns+1023)/1024 )) == 0 )
584 {
585 ERR("Couldn't map pfn_to_mfn table");
586 goto out;
587 }
589 memcpy(live_pfn_to_mfn_table, pfn_to_mfn_table,
590 nr_pfns*sizeof(unsigned long) );
592 munmap(live_pfn_to_mfn_table, ((nr_pfns+1023)/1024)*PAGE_SIZE);
594 /*
595 * Safety checking of saved context:
596 * 1. user_regs is fine, as Xen checks that on context switch.
597 * 2. fpu_ctxt is fine, as it can't hurt Xen.
598 * 3. trap_ctxt needs the code selectors checked.
599 * 4. ldt base must be page-aligned, no more than 8192 ents, ...
600 * 5. gdt already done, and further checking is done by Xen.
601 * 6. check that kernel_ss is safe.
602 * 7. pt_base is already done.
603 * 8. debugregs are checked by Xen.
604 * 9. callback code selectors need checking.
605 */
606 for ( i = 0; i < 256; i++ )
607 {
608 ctxt.trap_ctxt[i].vector = i;
609 if ( (ctxt.trap_ctxt[i].cs & 3) == 0 )
610 ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS;
611 }
612 if ( (ctxt.kernel_ss & 3) == 0 )
613 ctxt.kernel_ss = FLAT_KERNEL_DS;
614 #if defined(__i386__)
615 if ( (ctxt.event_callback_cs & 3) == 0 )
616 ctxt.event_callback_cs = FLAT_KERNEL_CS;
617 if ( (ctxt.failsafe_callback_cs & 3) == 0 )
618 ctxt.failsafe_callback_cs = FLAT_KERNEL_CS;
619 #endif
620 if ( ((ctxt.ldt_base & (PAGE_SIZE - 1)) != 0) ||
621 (ctxt.ldt_ents > 8192) ||
622 (ctxt.ldt_base > HYPERVISOR_VIRT_START) ||
623 ((ctxt.ldt_base + ctxt.ldt_ents*8) > HYPERVISOR_VIRT_START) )
624 {
625 ERR("Bad LDT base or size");
626 goto out;
627 }
629 DPRINTF("Domain ready to be built.\n");
631 op.cmd = DOM0_SETDOMAININFO;
632 op.u.setdomaininfo.domain = (domid_t)dom;
633 op.u.setdomaininfo.vcpu = 0;
634 op.u.setdomaininfo.ctxt = &ctxt;
635 rc = xc_dom0_op(xc_handle, &op);
637 if ( rc != 0 )
638 {
639 ERR("Couldn't build the domain");
640 goto out;
641 }
643 DPRINTF("Domain ready to be unpaused\n");
644 op.cmd = DOM0_UNPAUSEDOMAIN;
645 op.u.unpausedomain.domain = (domid_t)dom;
646 rc = xc_dom0_op(xc_handle, &op);
647 if (rc == 0) {
648 /* Success: print the domain id. */
649 DPRINTF("DOM=%u\n", dom);
650 return 0;
651 }
653 out:
654 if ( (rc != 0) && (dom != 0) )
655 xc_domain_destroy(xc_handle, dom);
656 free(mmu);
657 free(pfn_to_mfn_table);
658 free(pfn_type);
660 DPRINTF("Restore exit with rc=%d\n", rc);
661 return rc;
662 }