ia64/xen-unstable

view tools/libxc/xc_linux_restore.c @ 7800:5aad7e145e50

If /sbin/ isn't in the path, udev rules will erroneously not get
installed.

Signed-off-by: Nivedita Singhvi <niv@us.ibm.com>
author kaf24@firebug.cl.cam.ac.uk
date Mon Nov 14 11:05:34 2005 +0100 (2005-11-14)
parents 5066d2aa2fb0
children 8ee7df2c18d1
line source
1 /******************************************************************************
2 * xc_linux_restore.c
3 *
4 * Restore the state of a Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <stdlib.h>
10 #include <unistd.h>
12 #include "xg_private.h"
13 #include "xg_save_restore.h"
15 /* max mfn of the whole machine; filled in by get_platform_info() */
16 static uint32_t max_mfn;
18 /* virtual starting address of the hypervisor; filled in by get_platform_info() */
19 static uint32_t hvirt_start;
21 /* #levels of page tables used by the current guest; filled in by get_platform_info() */
22 static uint32_t pt_levels;
24 /* total number of pages used by the current guest; set from nr_pfns on restore */
25 static unsigned long max_pfn;
27 /* Live mapping of the table mapping each PFN to its current MFN. */
28 static unsigned long *live_p2m = NULL;
30 /* A table mapping each PFN to its new MFN (max_pfn entries, heap allocated). */
31 static unsigned long *p2m = NULL;
/*
 * Read exactly 'count' bytes from 'fd' into 'buf'.
 *
 * Short reads are continued and reads interrupted by a signal (EINTR) are
 * retried.  The byte counter is a size_t and the result of read() is kept
 * in an ssize_t so that neither is truncated for large transfers.
 *
 * Returns 1 if all 'count' bytes were read (trivially so for count == 0),
 * or 0 if end-of-file or a read error was hit first.
 */
static ssize_t
read_exact(int fd, void *buf, size_t count)
{
    size_t done = 0;
    unsigned char *b = buf;

    while (done < count) {
        ssize_t s = read(fd, &b[done], count - done);

        if ((s == -1) && (errno == EINTR))
            continue;           /* interrupted before any data: retry */
        if (s <= 0)
            break;              /* EOF or hard error */

        done += (size_t)s;
    }

    return (done == count) ? 1 : 0;
}
54 /*
55 ** In the state file (or during transfer), all page-table pages are
56 ** converted into a 'canonical' form where references to actual mfns
57 ** are replaced with references to the corresponding pfns.
58 ** This function inverts that operation, replacing the pfn values with
59 ** the (now known) appropriate mfn values.
60 */
61 int uncanonicalize_pagetable(unsigned long type, void *page)
62 {
63 int i, pte_last, xen_start, xen_end;
64 unsigned long pfn;
65 uint64_t pte;
67 /*
68 ** We need to determine which entries in this page table hold
69 ** reserved hypervisor mappings. This depends on the current
70 ** page table type as well as the number of paging levels.
71 */
72 xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8);
74 if (pt_levels == 2 && type == L2TAB)
75 xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);
77 if (pt_levels == 3 && type == L3TAB)
78 xen_start = L3_PAGETABLE_ENTRIES_PAE;
81 /* Now iterate through the page table, uncanonicalizing each PTE */
82 for(i = 0; i < pte_last; i++) {
84 if(pt_levels == 2)
85 pte = ((uint32_t *)page)[i];
86 else
87 pte = ((uint64_t *)page)[i];
89 if(i >= xen_start && i < xen_end)
90 pte = 0;
92 if(pte & _PAGE_PRESENT) {
94 pfn = pte >> PAGE_SHIFT;
96 if(pfn >= max_pfn) {
97 ERR("Frame number in type %lu page table is out of range: "
98 "i=%d pfn=0x%lx max_pfn=%lu",
99 type >> 28, i, pfn, max_pfn);
100 return 0;
101 }
104 if(type == L1TAB)
105 pte &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PAT);
106 else
107 pte &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PSE);
109 pte |= p2m[pfn] << PAGE_SHIFT;
111 if(pt_levels == 2)
112 ((uint32_t *)page)[i] = (uint32_t)pte;
113 else
114 ((uint64_t *)page)[i] = (uint64_t)pte;
115 }
116 }
118 return 1;
119 }
121 int xc_linux_restore(int xc_handle, int io_fd,
122 uint32_t dom, unsigned long nr_pfns,
123 unsigned int store_evtchn, unsigned long *store_mfn,
124 unsigned int console_evtchn, unsigned long *console_mfn)
125 {
126 dom0_op_t op;
127 int rc = 1, i, n;
128 unsigned long mfn, pfn;
129 unsigned int prev_pc, this_pc;
130 int verify = 0;
132 /* The new domain's shared-info frame number. */
133 unsigned long shared_info_frame;
134 unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
135 shared_info_t *shared_info = (shared_info_t *)shared_info_page;
137 /* A copy of the CPU context of the guest. */
138 vcpu_guest_context_t ctxt;
140 /* A table containing the type of each PFN (/not/ MFN!). */
141 unsigned long *pfn_type = NULL;
143 /* A table of MFNs to map in the current region */
144 unsigned long *region_mfn = NULL;
146 /* A temporary mapping, and a copy, of one frame of guest memory. */
147 unsigned long *page = NULL;
149 /* A copy of the pfn-to-mfn table frame list. */
150 unsigned long *p2m_frame_list = NULL;
152 /* A temporary mapping of the guest's start_info page. */
153 start_info_t *start_info;
155 char *region_base;
157 xc_mmu_t *mmu = NULL;
159 /* used by debug verify code */
160 unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
162 struct mmuext_op pin[MAX_PIN_BATCH];
163 unsigned int nr_pins = 0;
166 max_pfn = nr_pfns;
168 DPRINTF("xc_linux_restore start: max_pfn = %lx\n", max_pfn);
171 if(!get_platform_info(xc_handle, dom,
172 &max_mfn, &hvirt_start, &pt_levels)) {
173 ERR("Unable to get platform info.");
174 return 1;
175 }
178 if (mlock(&ctxt, sizeof(ctxt))) {
179 /* needed for build dom0 op, but might as well do early */
180 ERR("Unable to mlock ctxt");
181 return 1;
182 }
185 /* Only have to worry about vcpu 0 even for SMP */
186 if (xc_domain_get_vcpu_context( xc_handle, dom, 0, &ctxt)) {
187 ERR("Could not get vcpu context");
188 goto out;
189 }
192 /* Read the saved P2M frame list */
193 if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
194 ERR("Couldn't allocate p2m_frame_list array");
195 goto out;
196 }
198 if (!read_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) {
199 ERR("read p2m_frame_list failed");
200 goto out;
201 }
204 /* We want zeroed memory so use calloc rather than malloc. */
205 p2m = calloc(sizeof(unsigned long), max_pfn);
206 pfn_type = calloc(sizeof(unsigned long), max_pfn);
207 region_mfn = calloc(sizeof(unsigned long), MAX_BATCH_SIZE);
209 if ((p2m == NULL) || (pfn_type == NULL) || (region_mfn == NULL)) {
210 ERR("memory alloc failed");
211 errno = ENOMEM;
212 goto out;
213 }
215 if (mlock(region_mfn, sizeof(unsigned long) * MAX_BATCH_SIZE)) {
216 ERR("Could not mlock region_mfn");
217 goto out;
218 }
220 /* Get the domain's shared-info frame. */
221 op.cmd = DOM0_GETDOMAININFO;
222 op.u.getdomaininfo.domain = (domid_t)dom;
223 if (xc_dom0_op(xc_handle, &op) < 0) {
224 ERR("Could not get information on new domain");
225 goto out;
226 }
227 shared_info_frame = op.u.getdomaininfo.shared_info_frame;
229 if(xc_domain_setmaxmem(xc_handle, dom, PFN_TO_KB(max_pfn)) != 0) {
230 errno = ENOMEM;
231 goto out;
232 }
234 if(xc_domain_memory_increase_reservation(
235 xc_handle, dom, max_pfn, 0, 0, NULL) != 0) {
236 ERR("Failed to increase reservation by %lx KB\n", max_pfn);
237 errno = ENOMEM;
238 goto out;
239 }
241 /* Build the pfn-to-mfn table. We choose MFN ordering returned by Xen. */
242 if (xc_get_pfn_list(xc_handle, dom, p2m, max_pfn) != max_pfn) {
243 ERR("Did not read correct number of frame numbers for new dom");
244 goto out;
245 }
247 if(!(mmu = xc_init_mmu_updates(xc_handle, dom))) {
248 ERR("Could not initialise for MMU updates");
249 goto out;
250 }
252 DPRINTF("Reloading memory pages: 0%%\n");
254 /*
255 * Now simply read each saved frame into its new machine frame.
256 * We uncanonicalise page tables as we go.
257 */
258 prev_pc = 0;
260 n = 0;
261 while (1) {
263 int j;
264 unsigned long region_pfn_type[MAX_BATCH_SIZE];
266 this_pc = (n * 100) / max_pfn;
267 if ( (this_pc - prev_pc) >= 5 )
268 {
269 PPRINTF("\b\b\b\b%3d%%", this_pc);
270 prev_pc = this_pc;
271 }
273 if (!read_exact(io_fd, &j, sizeof(int))) {
274 ERR("Error when reading batch size");
275 goto out;
276 }
278 PPRINTF("batch %d\n",j);
280 if (j == -1) {
281 verify = 1;
282 fprintf(stderr, "Entering page verify mode\n");
283 continue;
284 }
286 if (j == 0)
287 break; /* our work here is done */
289 if (j > MAX_BATCH_SIZE) {
290 ERR("Max batch size exceeded. Giving up.");
291 goto out;
292 }
294 if (!read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long))) {
295 ERR("Error when reading region pfn types");
296 goto out;
297 }
299 for (i = 0; i < j; i++) {
301 if ((region_pfn_type[i] & LTAB_MASK) == XTAB)
302 region_mfn[i] = 0; /* we know map will fail, but don't care */
303 else
304 region_mfn[i] = p2m[region_pfn_type[i] & ~LTAB_MASK];
306 }
308 if (!(region_base = xc_map_foreign_batch(
309 xc_handle, dom, PROT_WRITE, region_mfn, j))) {
310 ERR("map batch failed");
311 goto out;
312 }
314 for ( i = 0; i < j; i++ )
315 {
316 void *page;
317 unsigned long pagetype;
319 pfn = region_pfn_type[i] & ~LTAB_MASK;
320 pagetype = region_pfn_type[i] & LTAB_MASK;
322 if (pagetype == XTAB)
323 /* a bogus/unmapped page: skip it */
324 continue;
326 if (pfn > max_pfn) {
327 ERR("pfn out of range");
328 goto out;
329 }
331 pfn_type[pfn] = pagetype;
333 mfn = p2m[pfn];
335 /* In verify mode, we use a copy; otherwise we work in place */
336 page = verify ? (void *)buf : (region_base + i*PAGE_SIZE);
338 if (!read_exact(io_fd, page, PAGE_SIZE)) {
339 ERR("Error when reading page (type was %lx)", pagetype);
340 goto out;
341 }
343 pagetype &= LTABTYPE_MASK;
345 if(pagetype >= L1TAB && pagetype <= L4TAB) {
347 /*
348 ** A page table page - need to 'uncanonicalize' it, i.e.
349 ** replace all the references to pfns with the corresponding
350 ** mfns for the new domain.
351 */
352 if(!uncanonicalize_pagetable(pagetype, page))
353 goto out;
355 } else if(pagetype != NOTAB) {
357 ERR("Bogus page type %lx page table is out of range: "
358 "i=%d max_pfn=%lu", pagetype, i, max_pfn);
359 goto out;
361 }
365 if (verify) {
367 int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
369 if (res) {
371 int v;
373 DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
374 "actualcs=%08lx\n", pfn, pfn_type[pfn],
375 csum_page(region_base + i*PAGE_SIZE),
376 csum_page(buf));
378 for (v = 0; v < 4; v++) {
380 unsigned long *p = (unsigned long *)
381 (region_base + i*PAGE_SIZE);
382 if (buf[v] != p[v])
383 DPRINTF(" %d: %08lx %08lx\n", v, buf[v], p[v]);
384 }
385 }
386 }
388 if (xc_add_mmu_update(xc_handle, mmu,
389 (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
390 pfn)) {
391 ERR("machpys mfn=%ld pfn=%ld", mfn, pfn);
392 goto out;
393 }
394 } /* end of 'batch' for loop */
396 munmap(region_base, j*PAGE_SIZE);
397 n+= j; /* crude stats */
398 }
400 DPRINTF("Received all pages\n");
402 if (pt_levels == 3) {
404 /* Get all PGDs below 4GB. */
405 for (i = 0; i < max_pfn; i++) {
407 if (((pfn_type[i] & LTABTYPE_MASK)==L3TAB) && (p2m[i]>0xfffffUL)) {
409 unsigned long new_mfn;
411 if (!(new_mfn=xc_make_page_below_4G(xc_handle, dom, p2m[i]))) {
412 ERR("Couldn't get a page below 4GB :-(");
413 goto out;
414 }
416 p2m[i] = new_mfn;
417 if (xc_add_mmu_update(
418 xc_handle, mmu,
419 (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, i)) {
420 ERR("Couldn't m2p on PAE root pgdir");
421 goto out;
422 }
423 }
424 }
426 }
429 if (xc_finish_mmu_updates(xc_handle, mmu)) {
430 ERR("Error doing finish_mmu_updates()");
431 goto out;
432 }
434 /*
435 * Pin page tables. Do this after writing to them as otherwise Xen
436 * will barf when doing the type-checking.
437 */
438 for (i = 0; i < max_pfn; i++) {
440 if ( (pfn_type[i] & LPINTAB) == 0 )
441 continue;
443 switch(pfn_type[i]) {
445 case (L1TAB|LPINTAB):
446 pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
447 break;
449 case (L2TAB|LPINTAB):
450 pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
451 break;
453 case (L3TAB|LPINTAB):
454 pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
455 break;
457 case (L4TAB|LPINTAB):
458 pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
459 break;
461 default:
462 continue;
463 }
465 pin[nr_pins].arg1.mfn = p2m[i];
467 if (++nr_pins == MAX_PIN_BATCH) {
468 if (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) {
469 ERR("Failed to pin batch of %d page tables", nr_pins);
470 goto out;
471 }
472 DPRINTF("successfully pinned batch of %d page tables", nr_pins);
473 nr_pins = 0;
474 }
475 }
477 if (nr_pins != 0) {
478 if((rc = xc_mmuext_op(xc_handle, pin, nr_pins, dom)) < 0) {
479 ERR("Failed (2) to pin batch of %d page tables", nr_pins);
480 DPRINTF("rc is %d\n", rc);
481 goto out;
482 }
483 }
485 DPRINTF("\b\b\b\b100%%\n");
486 DPRINTF("Memory reloaded.\n");
488 /* Get the list of PFNs that are not in the psuedo-phys map */
489 {
490 unsigned int count;
491 unsigned long *pfntab;
492 int rc;
494 if (!read_exact(io_fd, &count, sizeof(count))) {
495 ERR("Error when reading pfn count");
496 goto out;
497 }
499 if(!(pfntab = malloc(sizeof(unsigned long) * count))) {
500 ERR("Out of memory");
501 goto out;
502 }
504 if (!read_exact(io_fd, pfntab, sizeof(unsigned long)*count)) {
505 ERR("Error when reading pfntab");
506 goto out;
507 }
509 for (i = 0; i < count; i++) {
511 unsigned long pfn = pfntab[i];
513 if(pfn > max_pfn)
514 /* shouldn't happen - continue optimistically */
515 continue;
517 pfntab[i] = p2m[pfn];
518 p2m[pfn] = INVALID_P2M_ENTRY; // not in pseudo-physical map
519 }
521 if (count > 0) {
523 struct xen_memory_reservation reservation = {
524 .extent_start = pfntab,
525 .nr_extents = count,
526 .extent_order = 0,
527 .domid = dom
528 };
530 if ((rc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
531 &reservation)) != count) {
532 ERR("Could not decrease reservation : %d", rc);
533 goto out;
534 } else
535 DPRINTF("Decreased reservation by %d pages\n", count);
536 }
537 }
539 if (!read_exact(io_fd, &ctxt, sizeof(ctxt)) ||
540 !read_exact(io_fd, shared_info_page, PAGE_SIZE)) {
541 ERR("Error when reading ctxt or shared info page");
542 goto out;
543 }
545 /* Uncanonicalise the suspend-record frame number and poke resume rec. */
546 pfn = ctxt.user_regs.edx;
547 if ((pfn >= max_pfn) || (pfn_type[pfn] != NOTAB)) {
548 ERR("Suspend record frame number is bad");
549 goto out;
550 }
551 ctxt.user_regs.edx = mfn = p2m[pfn];
552 start_info = xc_map_foreign_range(
553 xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
554 start_info->nr_pages = max_pfn;
555 start_info->shared_info = shared_info_frame << PAGE_SHIFT;
556 start_info->flags = 0;
557 *store_mfn = start_info->store_mfn = p2m[start_info->store_mfn];
558 start_info->store_evtchn = store_evtchn;
559 *console_mfn = start_info->console_mfn = p2m[start_info->console_mfn];
560 start_info->console_evtchn = console_evtchn;
561 munmap(start_info, PAGE_SIZE);
563 /* Uncanonicalise each GDT frame number. */
564 if (ctxt.gdt_ents > 8192) {
565 ERR("GDT entry count out of range");
566 goto out;
567 }
569 for (i = 0; i < ctxt.gdt_ents; i += 512) {
570 pfn = ctxt.gdt_frames[i];
571 if ((pfn >= max_pfn) || (pfn_type[pfn] != NOTAB)) {
572 ERR("GDT frame number is bad");
573 goto out;
574 }
575 ctxt.gdt_frames[i] = p2m[pfn];
576 }
578 /* Uncanonicalise the page table base pointer. */
579 pfn = ctxt.ctrlreg[3] >> PAGE_SHIFT;
581 if (pfn >= max_pfn) {
582 DPRINTF("PT base is bad: pfn=%lu max_pfn=%lu type=%08lx\n",
583 pfn, max_pfn, pfn_type[pfn]);
584 ERR("PT base is bad.");
585 goto out;
586 }
588 if ((pt_levels == 2) && ((pfn_type[pfn]&LTABTYPE_MASK) != L2TAB)) {
589 DPRINTF("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
590 pfn, max_pfn, pfn_type[pfn], (unsigned long)L2TAB);
591 ERR("PT base is bad.");
592 goto out;
593 }
595 if ((pt_levels == 3) && ((pfn_type[pfn]&LTABTYPE_MASK) != L3TAB)) {
596 DPRINTF("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
597 pfn, max_pfn, pfn_type[pfn], (unsigned long)L3TAB);
598 ERR("PT base is bad.");
599 goto out;
600 }
602 ctxt.ctrlreg[3] = p2m[pfn] << PAGE_SHIFT;
604 /* clear any pending events and the selector */
605 memset(&(shared_info->evtchn_pending[0]), 0,
606 sizeof (shared_info->evtchn_pending));
607 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
608 shared_info->vcpu_data[i].evtchn_pending_sel = 0;
610 /* Copy saved contents of shared-info page. No checking needed. */
611 page = xc_map_foreign_range(
612 xc_handle, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);
613 memcpy(page, shared_info, sizeof(shared_info_t));
614 munmap(page, PAGE_SIZE);
616 /* Uncanonicalise the pfn-to-mfn table frame-number list. */
617 for (i = 0; i < P2M_FL_ENTRIES; i++) {
618 pfn = p2m_frame_list[i];
619 if ((pfn >= max_pfn) || (pfn_type[pfn] != NOTAB)) {
620 ERR("PFN-to-MFN frame number is bad");
621 goto out;
622 }
624 p2m_frame_list[i] = p2m[pfn];
625 }
627 /* Copy the P2M we've constructed to the 'live' P2M */
628 if (!(live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_WRITE,
629 p2m_frame_list, P2M_FL_ENTRIES))) {
630 ERR("Couldn't map p2m table");
631 goto out;
632 }
634 memcpy(live_p2m, p2m, P2M_SIZE);
635 munmap(live_p2m, P2M_SIZE);
637 /*
638 * Safety checking of saved context:
639 * 1. user_regs is fine, as Xen checks that on context switch.
640 * 2. fpu_ctxt is fine, as it can't hurt Xen.
641 * 3. trap_ctxt needs the code selectors checked.
642 * 4. ldt base must be page-aligned, no more than 8192 ents, ...
643 * 5. gdt already done, and further checking is done by Xen.
644 * 6. check that kernel_ss is safe.
645 * 7. pt_base is already done.
646 * 8. debugregs are checked by Xen.
647 * 9. callback code selectors need checking.
648 */
649 for ( i = 0; i < 256; i++ ) {
650 ctxt.trap_ctxt[i].vector = i;
651 if ((ctxt.trap_ctxt[i].cs & 3) == 0)
652 ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS;
653 }
654 if ((ctxt.kernel_ss & 3) == 0)
655 ctxt.kernel_ss = FLAT_KERNEL_DS;
656 #if defined(__i386__)
657 if ((ctxt.event_callback_cs & 3) == 0)
658 ctxt.event_callback_cs = FLAT_KERNEL_CS;
659 if ((ctxt.failsafe_callback_cs & 3) == 0)
660 ctxt.failsafe_callback_cs = FLAT_KERNEL_CS;
661 #endif
662 if (((ctxt.ldt_base & (PAGE_SIZE - 1)) != 0) ||
663 (ctxt.ldt_ents > 8192) ||
664 (ctxt.ldt_base > hvirt_start) ||
665 ((ctxt.ldt_base + ctxt.ldt_ents*8) > hvirt_start)) {
666 ERR("Bad LDT base or size");
667 goto out;
668 }
670 DPRINTF("Domain ready to be built.\n");
672 op.cmd = DOM0_SETDOMAININFO;
673 op.u.setdomaininfo.domain = (domid_t)dom;
674 op.u.setdomaininfo.vcpu = 0;
675 op.u.setdomaininfo.ctxt = &ctxt;
676 rc = xc_dom0_op(xc_handle, &op);
678 if (rc != 0) {
679 ERR("Couldn't build the domain");
680 goto out;
681 }
683 out:
684 if ( (rc != 0) && (dom != 0) )
685 xc_domain_destroy(xc_handle, dom);
686 free(mmu);
687 free(p2m);
688 free(pfn_type);
690 DPRINTF("Restore exit with rc=%d\n", rc);
692 return rc;
693 }