direct-io.hg

tools/libxc/xc_linux_restore.c @ 7785:8ee7df2c18d1

Revert accidentally applied changes from changeset
7783:5aad7e145e501fbfb346954950a33b042a963633.

Signed-off-by: Keir Fraser <keir@xensource.com>

author:   kaf24@firebug.cl.cam.ac.uk
date:     Mon Nov 14 11:35:50 2005 +0100 (2005-11-14)
parents:  5aad7e145e50
children: 20bd6f55b813
/******************************************************************************
 * xc_linux_restore.c
 *
 * Restore the state of a Linux session.
 *
 * Copyright (c) 2003, K A Fraser.
 */

#include <stdlib.h>
#include <unistd.h>

#include "xg_private.h"
#include "xg_save_restore.h"

/* max mfn of the whole machine */
static uint32_t max_mfn;

/* virtual starting address of the hypervisor */
static uint32_t hvirt_start;

/* #levels of page tables used by the current guest */
static uint32_t pt_levels;

/* total number of pages used by the current guest */
static unsigned long max_pfn;

/* Live mapping of the table mapping each PFN to its current MFN. */
static unsigned long *live_p2m = NULL;

/* A table mapping each PFN to its new MFN. */
static unsigned long *p2m = NULL;
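
/*
 * Read exactly 'count' bytes from 'fd' into 'buf', retrying on EINTR
 * and short reads. Returns 1 if the full count was read, 0 on error
 * or end-of-file.
 */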
static ssize_t
read_exact(int fd, void *buf, size_t count)
{
    int r = 0, s;
    unsigned char *b = buf;

    while (r < count) {
        s = read(fd, &b[r], count - r);
        if ((s == -1) && (errno == EINTR))
            continue;
        if (s <= 0) {
            break;
        }
        r += s;
    }

    return (r == count) ? 1 : 0;
}

/*
** In the state file (or during transfer), all page-table pages are
** converted into a 'canonical' form where references to actual mfns
** are replaced with references to the corresponding pfns.
** This function inverts that operation, replacing the pfn values with
** the (now known) appropriate mfn values.
*/
int uncanonicalize_pagetable(unsigned long type, void *page)
{
    int i, pte_last, xen_start, xen_end;
    unsigned long pfn;
    uint64_t pte;

    /*
    ** We need to determine which entries in this page table hold
    ** reserved hypervisor mappings. This depends on the current
    ** page table type as well as the number of paging levels.
    */
    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);

    if (pt_levels == 2 && type == L2TAB)
        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);

    if (pt_levels == 3 && type == L3TAB)
        xen_start = L3_PAGETABLE_ENTRIES_PAE;
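
    /*
     * With 2-level tables there are 1024 4-byte PTEs per page; with 3
     * levels, 512 8-byte PTEs. For a 2-level L2 table, everything from
     * the hypervisor's virtual start upwards is reserved; a PAE L3
     * holds only L3_PAGETABLE_ENTRIES_PAE valid entries, so everything
     * beyond those is cleared in the loop below.
     */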
    /* Now iterate through the page table, uncanonicalizing each PTE */
    for (i = 0; i < pte_last; i++) {

        if (pt_levels == 2)
            pte = ((uint32_t *)page)[i];
        else
            pte = ((uint64_t *)page)[i];

        if (i >= xen_start && i < xen_end)
            pte = 0;

        if (pte & _PAGE_PRESENT) {

            pfn = pte >> PAGE_SHIFT;

            if (pfn >= max_pfn) {
                ERR("Frame number in type %lu page table is out of range: "
                    "i=%d pfn=0x%lx max_pfn=%lu",
                    type >> 28, i, pfn, max_pfn);
                return 0;
            }

            if (type == L1TAB)
                pte &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PAT);
            else
                pte &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PSE);

            pte |= p2m[pfn] << PAGE_SHIFT;

            if (pt_levels == 2)
                ((uint32_t *)page)[i] = (uint32_t)pte;
            else
                ((uint64_t *)page)[i] = (uint64_t)pte;
        }
    }

    return 1;
}
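
/*
 * A worked example (values assumed purely for illustration): if
 * p2m[0] == 0x1234, an L1 PTE of 0x00000027 (pfn 0, with
 * _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED set) keeps its low
 * flag bits and is rewritten above to
 * (0x1234 << PAGE_SHIFT) | 0x027 == 0x01234027.
 */
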
int xc_linux_restore(int xc_handle, int io_fd,
                     uint32_t dom, unsigned long nr_pfns,
                     unsigned int store_evtchn, unsigned long *store_mfn,
                     unsigned int console_evtchn, unsigned long *console_mfn)
{
    dom0_op_t op;
    int rc = 1, i, n;
    unsigned long mfn, pfn;
    unsigned int prev_pc, this_pc;
    int verify = 0;

    /* The new domain's shared-info frame number. */
    unsigned long shared_info_frame;
    unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
    shared_info_t *shared_info = (shared_info_t *)shared_info_page;

    /* A copy of the CPU context of the guest. */
    vcpu_guest_context_t ctxt;

    /* A table containing the type of each PFN (/not/ MFN!). */
    unsigned long *pfn_type = NULL;

    /* A table of MFNs to map in the current region */
    unsigned long *region_mfn = NULL;

    /* A temporary mapping, and a copy, of one frame of guest memory. */
    unsigned long *page = NULL;

    /* A copy of the pfn-to-mfn table frame list. */
    unsigned long *p2m_frame_list = NULL;

    /* A temporary mapping of the guest's start_info page. */
    start_info_t *start_info;

    char *region_base;

    xc_mmu_t *mmu = NULL;

    /* used by debug verify code */
    unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];

    struct mmuext_op pin[MAX_PIN_BATCH];
    unsigned int nr_pins = 0;
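
    /*
     * Restore proceeds in stages: read the saved P2M frame list,
     * allocate and populate the new domain's memory, replay each batch
     * of saved pages (uncanonicalizing page tables as we go), pin the
     * page tables, fix up the start_info and shared_info pages,
     * rewrite the live P2M, and finally build the domain from the
     * saved vcpu context.
     */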
    max_pfn = nr_pfns;

    DPRINTF("xc_linux_restore start: max_pfn = %lx\n", max_pfn);

    if (!get_platform_info(xc_handle, dom,
                           &max_mfn, &hvirt_start, &pt_levels)) {
        ERR("Unable to get platform info.");
        return 1;
    }

    if (mlock(&ctxt, sizeof(ctxt))) {
        /* needed for build dom0 op, but might as well do early */
        ERR("Unable to mlock ctxt");
        return 1;
    }

    /* Only have to worry about vcpu 0 even for SMP */
    if (xc_domain_get_vcpu_context(xc_handle, dom, 0, &ctxt)) {
        ERR("Could not get vcpu context");
        goto out;
    }

    /* Read the saved P2M frame list */
    if (!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
        ERR("Couldn't allocate p2m_frame_list array");
        goto out;
    }

    if (!read_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) {
        ERR("read p2m_frame_list failed");
        goto out;
    }

    /* We want zeroed memory so use calloc rather than malloc. */
    p2m        = calloc(max_pfn, sizeof(unsigned long));
    pfn_type   = calloc(max_pfn, sizeof(unsigned long));
    region_mfn = calloc(MAX_BATCH_SIZE, sizeof(unsigned long));

    if ((p2m == NULL) || (pfn_type == NULL) || (region_mfn == NULL)) {
        ERR("memory alloc failed");
        errno = ENOMEM;
        goto out;
    }

    if (mlock(region_mfn, sizeof(unsigned long) * MAX_BATCH_SIZE)) {
        ERR("Could not mlock region_mfn");
        goto out;
    }

    /* Get the domain's shared-info frame. */
    op.cmd = DOM0_GETDOMAININFO;
    op.u.getdomaininfo.domain = (domid_t)dom;
    if (xc_dom0_op(xc_handle, &op) < 0) {
        ERR("Could not get information on new domain");
        goto out;
    }
    shared_info_frame = op.u.getdomaininfo.shared_info_frame;

    if (xc_domain_setmaxmem(xc_handle, dom, PFN_TO_KB(max_pfn)) != 0) {
        errno = ENOMEM;
        goto out;
    }

    if (xc_domain_memory_increase_reservation(
            xc_handle, dom, max_pfn, 0, 0, NULL) != 0) {
        ERR("Failed to increase reservation by %lx pages\n", max_pfn);
        errno = ENOMEM;
        goto out;
    }

    /* Build the pfn-to-mfn table. We choose MFN ordering returned by Xen. */
    if (xc_get_pfn_list(xc_handle, dom, p2m, max_pfn) != max_pfn) {
        ERR("Did not read correct number of frame numbers for new dom");
        goto out;
    }

    if (!(mmu = xc_init_mmu_updates(xc_handle, dom))) {
        ERR("Could not initialise for MMU updates");
        goto out;
    }

    DPRINTF("Reloading memory pages: 0%%\n");

    /*
     * Now simply read each saved frame into its new machine frame.
     * We uncanonicalise page tables as we go.
     */
    prev_pc = 0;

    n = 0;
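
    /*
     * Each batch on the wire is: an int count j, then j words of
     * pfn-plus-type information, then one raw page image for each
     * entry that is actually mapped (XTAB entries carry no page data).
     * j == 0 marks the end of the page stream; j == -1 switches the
     * receiver into verify mode.
     */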
    while (1) {

        int j;
        unsigned long region_pfn_type[MAX_BATCH_SIZE];

        this_pc = (n * 100) / max_pfn;
        if ((this_pc - prev_pc) >= 5) {
            PPRINTF("\b\b\b\b%3d%%", this_pc);
            prev_pc = this_pc;
        }

        if (!read_exact(io_fd, &j, sizeof(int))) {
            ERR("Error when reading batch size");
            goto out;
        }

        PPRINTF("batch %d\n", j);

        if (j == -1) {
            verify = 1;
            fprintf(stderr, "Entering page verify mode\n");
            continue;
        }

        if (j == 0)
            break; /* our work here is done */

        if (j > MAX_BATCH_SIZE) {
            ERR("Max batch size exceeded. Giving up.");
            goto out;
        }

        if (!read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long))) {
            ERR("Error when reading region pfn types");
            goto out;
        }

        for (i = 0; i < j; i++) {

            if ((region_pfn_type[i] & LTAB_MASK) == XTAB)
                region_mfn[i] = 0; /* we know map will fail, but don't care */
            else
                region_mfn[i] = p2m[region_pfn_type[i] & ~LTAB_MASK];
        }

        if (!(region_base = xc_map_foreign_batch(
                  xc_handle, dom, PROT_WRITE, region_mfn, j))) {
            ERR("map batch failed");
            goto out;
        }

        for (i = 0; i < j; i++) {

            void *page;
            unsigned long pagetype;

            pfn = region_pfn_type[i] & ~LTAB_MASK;
            pagetype = region_pfn_type[i] & LTAB_MASK;

            if (pagetype == XTAB)
                /* a bogus/unmapped page: skip it */
                continue;

            if (pfn >= max_pfn) {
                ERR("pfn out of range");
                goto out;
            }

            pfn_type[pfn] = pagetype;

            mfn = p2m[pfn];

            /* In verify mode, we use a copy; otherwise we work in place */
            page = verify ? (void *)buf : (region_base + i*PAGE_SIZE);

            if (!read_exact(io_fd, page, PAGE_SIZE)) {
                ERR("Error when reading page (type was %lx)", pagetype);
                goto out;
            }

            pagetype &= LTABTYPE_MASK;

            if (pagetype >= L1TAB && pagetype <= L4TAB) {

                /*
                ** A page table page - need to 'uncanonicalize' it, i.e.
                ** replace all the references to pfns with the corresponding
                ** mfns for the new domain.
                */
                if (!uncanonicalize_pagetable(pagetype, page))
                    goto out;

            } else if (pagetype != NOTAB) {

                ERR("Bogus page type %lx: i=%d max_pfn=%lu",
                    pagetype, i, max_pfn);
                goto out;

            }

            if (verify) {

                int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);

                if (res) {

                    int v;

                    DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
                            "actualcs=%08lx\n", pfn, pfn_type[pfn],
                            csum_page(region_base + i*PAGE_SIZE),
                            csum_page(buf));

                    for (v = 0; v < 4; v++) {

                        unsigned long *p = (unsigned long *)
                            (region_base + i*PAGE_SIZE);
                        if (buf[v] != p[v])
                            DPRINTF("    %d: %08lx %08lx\n", v, buf[v], p[v]);
                    }
                }
            }

            if (xc_add_mmu_update(xc_handle, mmu,
                                  (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
                                  pfn)) {
                ERR("machphys mfn=%ld pfn=%ld", mfn, pfn);
                goto out;
            }
        } /* end of 'batch' for loop */

        munmap(region_base, j*PAGE_SIZE);
        n += j; /* crude stats */
    }

    DPRINTF("Received all pages\n");
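
    /*
     * On PAE, the L3 page directory must live below 4GB: CR3 is a
     * 32-bit register, so the top-level page table has to be
     * addressable with a 32-bit physical address.
     */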
    if (pt_levels == 3) {

        /* Get all PGDs below 4GB. */
        for (i = 0; i < max_pfn; i++) {

            if (((pfn_type[i] & LTABTYPE_MASK) == L3TAB) && (p2m[i] > 0xfffffUL)) {

                unsigned long new_mfn;

                if (!(new_mfn = xc_make_page_below_4G(xc_handle, dom, p2m[i]))) {
                    ERR("Couldn't get a page below 4GB :-(");
                    goto out;
                }

                p2m[i] = new_mfn;
                if (xc_add_mmu_update(
                        xc_handle, mmu,
                        (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, i)) {
                    ERR("Couldn't m2p on PAE root pgdir");
                    goto out;
                }
            }
        }
    }

    if (xc_finish_mmu_updates(xc_handle, mmu)) {
        ERR("Error doing finish_mmu_updates()");
        goto out;
    }
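
    /*
     * Pinning takes out a persistent type reference on each page
     * table, so Xen validates it once and keeps it validated even
     * while no vcpu has it loaded.
     */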
    /*
     * Pin page tables. Do this after writing to them as otherwise Xen
     * will barf when doing the type-checking.
     */
    for (i = 0; i < max_pfn; i++) {

        if ((pfn_type[i] & LPINTAB) == 0)
            continue;

        switch (pfn_type[i]) {

        case (L1TAB|LPINTAB):
            pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
            break;

        case (L2TAB|LPINTAB):
            pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
            break;

        case (L3TAB|LPINTAB):
            pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
            break;

        case (L4TAB|LPINTAB):
            pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
            break;

        default:
            continue;
        }

        pin[nr_pins].arg1.mfn = p2m[i];

        if (++nr_pins == MAX_PIN_BATCH) {
            if (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) {
                ERR("Failed to pin batch of %d page tables", nr_pins);
                goto out;
            }
            DPRINTF("successfully pinned batch of %d page tables", nr_pins);
            nr_pins = 0;
        }
    }

    if (nr_pins != 0) {
        if ((rc = xc_mmuext_op(xc_handle, pin, nr_pins, dom)) < 0) {
            ERR("Failed (2) to pin batch of %d page tables", nr_pins);
            DPRINTF("rc is %d\n", rc);
            goto out;
        }
    }

    DPRINTF("\b\b\b\b100%%\n");
    DPRINTF("Memory reloaded.\n");
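
    /*
     * PFNs that were absent from the guest's pseudo-physical map at
     * save time (e.g. pages ballooned out) are handed back to Xen
     * below by shrinking the new domain's reservation.
     */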
    /* Get the list of PFNs that are not in the pseudo-phys map */
    {
        unsigned int count;
        unsigned long *pfntab;
        int rc;

        if (!read_exact(io_fd, &count, sizeof(count))) {
            ERR("Error when reading pfn count");
            goto out;
        }

        if (!(pfntab = malloc(sizeof(unsigned long) * count))) {
            ERR("Out of memory");
            goto out;
        }

        if (!read_exact(io_fd, pfntab, sizeof(unsigned long)*count)) {
            ERR("Error when reading pfntab");
            goto out;
        }

        for (i = 0; i < count; i++) {

            unsigned long pfn = pfntab[i];

            if (pfn >= max_pfn)
                /* shouldn't happen - continue optimistically */
                continue;

            pfntab[i] = p2m[pfn];
            p2m[pfn] = INVALID_P2M_ENTRY; /* not in pseudo-physical map */
        }

        if (count > 0) {

            struct xen_memory_reservation reservation = {
                .extent_start = pfntab,
                .nr_extents   = count,
                .extent_order = 0,
                .domid        = dom
            };

            if ((rc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
                                   &reservation)) != count) {
                ERR("Could not decrease reservation : %d", rc);
                goto out;
            } else
                DPRINTF("Decreased reservation by %d pages\n", count);
        }
    }

    if (!read_exact(io_fd, &ctxt, sizeof(ctxt)) ||
        !read_exact(io_fd, shared_info_page, PAGE_SIZE)) {
        ERR("Error when reading ctxt or shared info page");
        goto out;
    }

    /* Uncanonicalise the suspend-record frame number and poke resume rec. */
    pfn = ctxt.user_regs.edx;
    if ((pfn >= max_pfn) || (pfn_type[pfn] != NOTAB)) {
        ERR("Suspend record frame number is bad");
        goto out;
    }
    ctxt.user_regs.edx = mfn = p2m[pfn];
    start_info = xc_map_foreign_range(
        xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
    if (start_info == NULL) {
        ERR("Couldn't map start_info page");
        goto out;
    }
    start_info->nr_pages = max_pfn;
    start_info->shared_info = shared_info_frame << PAGE_SHIFT;
    start_info->flags = 0;
    *store_mfn = start_info->store_mfn = p2m[start_info->store_mfn];
    start_info->store_evtchn = store_evtchn;
    *console_mfn = start_info->console_mfn = p2m[start_info->console_mfn];
    start_info->console_evtchn = console_evtchn;
    munmap(start_info, PAGE_SIZE);

    /* Uncanonicalise each GDT frame number. */
    if (ctxt.gdt_ents > 8192) {
        ERR("GDT entry count out of range");
        goto out;
    }
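
    /*
     * Each GDT frame holds 512 eight-byte descriptors, so frame i
     * covers entries [512*i, 512*i + 511].
     */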
    for (i = 0; (512*i) < ctxt.gdt_ents; i++) {
        pfn = ctxt.gdt_frames[i];
        if ((pfn >= max_pfn) || (pfn_type[pfn] != NOTAB)) {
            ERR("GDT frame number is bad");
            goto out;
        }
        ctxt.gdt_frames[i] = p2m[pfn];
    }

    /* Uncanonicalise the page table base pointer. */
    pfn = ctxt.ctrlreg[3] >> PAGE_SHIFT;

    if (pfn >= max_pfn) {
        DPRINTF("PT base is bad: pfn=%lu max_pfn=%lu\n", pfn, max_pfn);
        ERR("PT base is bad.");
        goto out;
    }

    if ((pt_levels == 2) && ((pfn_type[pfn] & LTABTYPE_MASK) != L2TAB)) {
        DPRINTF("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
                pfn, max_pfn, pfn_type[pfn], (unsigned long)L2TAB);
        ERR("PT base is bad.");
        goto out;
    }

    if ((pt_levels == 3) && ((pfn_type[pfn] & LTABTYPE_MASK) != L3TAB)) {
        DPRINTF("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
                pfn, max_pfn, pfn_type[pfn], (unsigned long)L3TAB);
        ERR("PT base is bad.");
        goto out;
    }

    ctxt.ctrlreg[3] = p2m[pfn] << PAGE_SHIFT;

    /* clear any pending events and the selector */
    memset(&(shared_info->evtchn_pending[0]), 0,
           sizeof(shared_info->evtchn_pending));
    for (i = 0; i < MAX_VIRT_CPUS; i++)
        shared_info->vcpu_data[i].evtchn_pending_sel = 0;

    /* Copy saved contents of shared-info page. No checking needed. */
    page = xc_map_foreign_range(
        xc_handle, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);
    memcpy(page, shared_info, sizeof(shared_info_t));
    munmap(page, PAGE_SIZE);

    /* Uncanonicalise the pfn-to-mfn table frame-number list. */
    for (i = 0; i < P2M_FL_ENTRIES; i++) {
        pfn = p2m_frame_list[i];
        if ((pfn >= max_pfn) || (pfn_type[pfn] != NOTAB)) {
            ERR("PFN-to-MFN frame number is bad");
            goto out;
        }

        p2m_frame_list[i] = p2m[pfn];
    }

    /* Copy the P2M we've constructed to the 'live' P2M */
    if (!(live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_WRITE,
                                          p2m_frame_list, P2M_FL_ENTRIES))) {
        ERR("Couldn't map p2m table");
        goto out;
    }

    memcpy(live_p2m, p2m, P2M_SIZE);
    munmap(live_p2m, P2M_SIZE);
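
    /*
     * The guest translates its pseudo-physical frame numbers through
     * the table just copied, so it now sees the new MFN layout when
     * it is resumed.
     */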
    /*
     * Safety checking of saved context:
     *  1. user_regs is fine, as Xen checks that on context switch.
     *  2. fpu_ctxt is fine, as it can't hurt Xen.
     *  3. trap_ctxt needs the code selectors checked.
     *  4. ldt base must be page-aligned, no more than 8192 ents, ...
     *  5. gdt already done, and further checking is done by Xen.
     *  6. check that kernel_ss is safe.
     *  7. pt_base is already done.
     *  8. debugregs are checked by Xen.
     *  9. callback code selectors need checking.
     */
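    /*
     * A selector with RPL 0 would put the guest in ring 0, which Xen
     * never allows, so any such selector is replaced with the flat
     * kernel one here and below.
     */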
    for (i = 0; i < 256; i++) {
        ctxt.trap_ctxt[i].vector = i;
        if ((ctxt.trap_ctxt[i].cs & 3) == 0)
            ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS;
    }
    if ((ctxt.kernel_ss & 3) == 0)
        ctxt.kernel_ss = FLAT_KERNEL_DS;
#if defined(__i386__)
    if ((ctxt.event_callback_cs & 3) == 0)
        ctxt.event_callback_cs = FLAT_KERNEL_CS;
    if ((ctxt.failsafe_callback_cs & 3) == 0)
        ctxt.failsafe_callback_cs = FLAT_KERNEL_CS;
#endif
    if (((ctxt.ldt_base & (PAGE_SIZE - 1)) != 0) ||
        (ctxt.ldt_ents > 8192) ||
        (ctxt.ldt_base > hvirt_start) ||
        ((ctxt.ldt_base + ctxt.ldt_ents*8) > hvirt_start)) {
        ERR("Bad LDT base or size");
        goto out;
    }

    DPRINTF("Domain ready to be built.\n");

    op.cmd = DOM0_SETDOMAININFO;
    op.u.setdomaininfo.domain = (domid_t)dom;
    op.u.setdomaininfo.vcpu = 0;
    op.u.setdomaininfo.ctxt = &ctxt;
    rc = xc_dom0_op(xc_handle, &op);

    if (rc != 0) {
        ERR("Couldn't build the domain");
        goto out;
    }

 out:
    if ((rc != 0) && (dom != 0))
        xc_domain_destroy(xc_handle, dom);
    free(mmu);
    free(p2m);
    free(pfn_type);
    free(region_mfn);
    free(p2m_frame_list);

    DPRINTF("Restore exit with rc=%d\n", rc);

    return rc;
}