direct-io.hg

view tools/libxc/xc_linux_restore.c @ 14350:f3f5f2756d75

x86: Add VGCF_online flag to vcpu_guest_context.
Change common Xen code to start all VCPUs (except idle ones)
offline. Change arch code to deal with this.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Mon Mar 12 13:53:43 2007 +0000
parents d5ca4c37b3c5
children d05a3220ea05
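
The restore path below compensates for VCPUs now starting offline: VCPU contexts read in the old save format have VGCF_online set explicitly (see the new_ctxt_format handling in xc_linux_restore), while new-format streams carry an explicit vcpumap of the VCPUs whose contexts are restored.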
line source
/******************************************************************************
 * xc_linux_restore.c
 *
 * Restore the state of a Linux session.
 *
 * Copyright (c) 2003, K A Fraser.
 */

#include <stdlib.h>
#include <unistd.h>

#include "xg_private.h"
#include "xg_save_restore.h"
#include "xc_dom.h"
/* max mfn of the current host machine */
static unsigned long max_mfn;

/* virtual starting address of the hypervisor */
static unsigned long hvirt_start;

/* #levels of page tables used by the current guest */
static unsigned int pt_levels;

/* total number of pages used by the current guest */
static unsigned long max_pfn;

/* Live mapping of the table mapping each PFN to its current MFN. */
static xen_pfn_t *live_p2m = NULL;

/* A table mapping each PFN to its new MFN. */
static xen_pfn_t *p2m = NULL;

/* A table of P2M mappings in the current region */
static xen_pfn_t *p2m_batch = NULL;
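
/*
** Read exactly 'count' bytes from 'fd' into 'buf', retrying after EINTR.
** Returns 1 if all 'count' bytes were read, 0 on error or premature EOF.
*/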
static ssize_t
read_exact(int fd, void *buf, size_t count)
{
    int r = 0, s;
    unsigned char *b = buf;

    while (r < count) {
        s = read(fd, &b[r], count - r);
        if ((s == -1) && (errno == EINTR))
            continue;
        if (s <= 0) {
            break;
        }
        r += s;
    }

    return (r == count) ? 1 : 0;
}
/*
** In the state file (or during transfer), all page-table pages are
** converted into a 'canonical' form where references to actual mfns
** are replaced with references to the corresponding pfns.
** This function inverts that operation, replacing the pfn values with
** the (now known) appropriate mfn values.
*/
static int uncanonicalize_pagetable(int xc_handle, uint32_t dom,
                                    unsigned long type, void *page)
{
    int i, pte_last;
    unsigned long pfn;
    uint64_t pte;
    int nr_mfns = 0;

    pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);

    /* First pass: work out how many (if any) MFNs we need to alloc */
    for (i = 0; i < pte_last; i++) {

        if (pt_levels == 2)
            pte = ((uint32_t *)page)[i];
        else
            pte = ((uint64_t *)page)[i];

        /* XXX SMH: below needs fixing for PROT_NONE etc */
        if (!(pte & _PAGE_PRESENT))
            continue;

        pfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;

        if (pfn >= max_pfn) {
            /* This "page table page" is probably not one; bail. */
            ERROR("Frame number in type %lu page table is out of range: "
                  "i=%d pfn=0x%lx max_pfn=%lu",
                  type >> 28, i, pfn, max_pfn);
            return 0;
        }

        if (p2m[pfn] == INVALID_P2M_ENTRY) {
            /* Have a 'valid' PFN without a matching MFN - need to alloc */
            p2m_batch[nr_mfns++] = pfn;
        }
    }

    /* Allocate the requisite number of mfns. */
    if (nr_mfns && xc_domain_memory_populate_physmap(
            xc_handle, dom, nr_mfns, 0, 0, p2m_batch) != 0) {
        ERROR("Failed to allocate memory for batch!\n");
        errno = ENOMEM;
        return 0;
    }

    /* Second pass: uncanonicalize each present PTE */
    nr_mfns = 0;
    for (i = 0; i < pte_last; i++) {

        if (pt_levels == 2)
            pte = ((uint32_t *)page)[i];
        else
            pte = ((uint64_t *)page)[i];

        /* XXX SMH: below needs fixing for PROT_NONE etc */
        if (!(pte & _PAGE_PRESENT))
            continue;

        pfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;

        if (p2m[pfn] == INVALID_P2M_ENTRY)
            p2m[pfn] = p2m_batch[nr_mfns++];

        pte &= ~MADDR_MASK_X86;
        pte |= (uint64_t)p2m[pfn] << PAGE_SHIFT;

        if (pt_levels == 2)
            ((uint32_t *)page)[i] = (uint32_t)pte;
        else
            ((uint64_t *)page)[i] = (uint64_t)pte;
    }

    return 1;
}
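
/*
** Restore the state of a Linux session from io_fd into domain 'dom'.
** On success the domain's memory, page tables, VCPU contexts, shared-info
** page and P2M table are fully populated; on failure the partially-built
** domain is destroyed before returning.
*/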
int xc_linux_restore(int xc_handle, int io_fd,
                     uint32_t dom, unsigned long nr_pfns,
                     unsigned int store_evtchn, unsigned long *store_mfn,
                     unsigned int console_evtchn, unsigned long *console_mfn)
{
    DECLARE_DOMCTL;
    int rc = 1, i, j, n, m, pae_extended_cr3 = 0;
    unsigned long mfn, pfn;
    unsigned int prev_pc, this_pc;
    int verify = 0;
    int nraces = 0;

    /* The new domain's shared-info frame number. */
    unsigned long shared_info_frame;
    unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
    shared_info_t *shared_info = (shared_info_t *)shared_info_page;

    /* A copy of the CPU context of the guest. */
    vcpu_guest_context_t ctxt;

    /* A table containing the type of each PFN (/not/ MFN!). */
    unsigned long *pfn_type = NULL;

    /* A table of MFNs to map in the current region */
    xen_pfn_t *region_mfn = NULL;

    /* Types of the pfns in the current region */
    unsigned long region_pfn_type[MAX_BATCH_SIZE];

    /* A temporary mapping, and a copy, of one frame of guest memory. */
    unsigned long *page = NULL;

    /* A copy of the pfn-to-mfn table frame list. */
    xen_pfn_t *p2m_frame_list = NULL;

    /* A temporary mapping of the guest's start_info page. */
    start_info_t *start_info;

    /* Our mapping of the current region (batch) */
    char *region_base;

    xc_mmu_t *mmu = NULL;

    /* used by debug verify code */
    unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];

    struct mmuext_op pin[MAX_PIN_BATCH];
    unsigned int nr_pins;

    uint64_t vcpumap = 1ULL;
    unsigned int max_vcpu_id = 0;
    int new_ctxt_format = 0;

    max_pfn = nr_pfns;

    DPRINTF("xc_linux_restore start: max_pfn = %lx\n", max_pfn);

    /*
     * XXX For now, 32bit dom0's can only save/restore 32bit domUs
     * on 64bit hypervisors.
     */
    memset(&domctl, 0, sizeof(domctl));
    domctl.domain = dom;
    domctl.cmd = XEN_DOMCTL_set_address_size;
    domctl.u.address_size.size = sizeof(unsigned long) * 8;
    rc = do_domctl(xc_handle, &domctl);
    if ( rc != 0 ) {
        ERROR("Unable to set guest address size.");
        goto out;
    }

    if (!get_platform_info(xc_handle, dom,
                           &max_mfn, &hvirt_start, &pt_levels)) {
        ERROR("Unable to get platform info.");
        return 1;
    }

    if (lock_pages(&ctxt, sizeof(ctxt))) {
        /* needed for build domctl, but might as well do early */
        ERROR("Unable to lock ctxt");
        return 1;
    }

    if (!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
        ERROR("Couldn't allocate p2m_frame_list array");
        goto out;
    }

    /* Read first entry of P2M list, or extended-info signature (~0UL). */
    if (!read_exact(io_fd, p2m_frame_list, sizeof(long))) {
        ERROR("read extended-info signature failed");
        goto out;
    }
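
    /*
     * Extended info, when present, is flagged by an all-ones first entry
     * and laid out as: a 4-byte total size, then chunks of { 4-character
     * signature, 4-byte chunk size, payload }. Only the "vcpu" chunk (a
     * saved vcpu_guest_context) is interpreted; other chunks are skipped.
     */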
    if (p2m_frame_list[0] == ~0UL) {

        uint32_t tot_bytes;

        /* Next 4 bytes: total size of following extended info. */
        if (!read_exact(io_fd, &tot_bytes, sizeof(tot_bytes))) {
            ERROR("read extended-info size failed");
            goto out;
        }

        while (tot_bytes) {

            uint32_t chunk_bytes;
            char     chunk_sig[4];

            /* 4-character chunk signature + 4-byte remaining chunk size. */
            if (!read_exact(io_fd, chunk_sig, sizeof(chunk_sig)) ||
                !read_exact(io_fd, &chunk_bytes, sizeof(chunk_bytes))) {
                ERROR("read extended-info chunk signature failed");
                goto out;
            }
            tot_bytes -= 8;

            /* VCPU context structure? */
            if (!strncmp(chunk_sig, "vcpu", 4)) {
                if (!read_exact(io_fd, &ctxt, sizeof(ctxt))) {
                    ERROR("read extended-info vcpu context failed");
                    goto out;
                }
                tot_bytes   -= sizeof(struct vcpu_guest_context);
                chunk_bytes -= sizeof(struct vcpu_guest_context);

                if (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3))
                    pae_extended_cr3 = 1;
            }

            /* Any remaining bytes of this chunk: read and discard. */
            while (chunk_bytes) {
                unsigned long sz = chunk_bytes;
                if ( sz > P2M_FL_SIZE )
                    sz = P2M_FL_SIZE;
                if (!read_exact(io_fd, p2m_frame_list, sz)) {
                    ERROR("read-and-discard extended-info chunk bytes failed");
                    goto out;
                }
                chunk_bytes -= sz;
                tot_bytes   -= sz;
            }
        }

        /* Now read the real first entry of P2M list. */
        if (!read_exact(io_fd, p2m_frame_list, sizeof(long))) {
            ERROR("read first entry of p2m_frame_list failed");
            goto out;
        }
    }
    /* First entry is already read into the p2m array. */
    if (!read_exact(io_fd, &p2m_frame_list[1], P2M_FL_SIZE - sizeof(long))) {
        ERROR("read p2m_frame_list failed");
        goto out;
    }

    /* We want zeroed memory so use calloc rather than malloc. */
    p2m        = calloc(max_pfn, sizeof(xen_pfn_t));
    pfn_type   = calloc(max_pfn, sizeof(unsigned long));
    region_mfn = calloc(MAX_BATCH_SIZE, sizeof(xen_pfn_t));
    p2m_batch  = calloc(MAX_BATCH_SIZE, sizeof(xen_pfn_t));

    if ((p2m == NULL) || (pfn_type == NULL) ||
        (region_mfn == NULL) || (p2m_batch == NULL)) {
        ERROR("memory alloc failed");
        errno = ENOMEM;
        goto out;
    }

    if (lock_pages(region_mfn, sizeof(xen_pfn_t) * MAX_BATCH_SIZE)) {
        ERROR("Could not lock region_mfn");
        goto out;
    }

    if (lock_pages(p2m_batch, sizeof(xen_pfn_t) * MAX_BATCH_SIZE)) {
        ERROR("Could not lock p2m_batch");
        goto out;
    }

    /* Get the domain's shared-info frame. */
    domctl.cmd = XEN_DOMCTL_getdomaininfo;
    domctl.domain = (domid_t)dom;
    if (xc_domctl(xc_handle, &domctl) < 0) {
        ERROR("Could not get information on new domain");
        goto out;
    }
    shared_info_frame = domctl.u.getdomaininfo.shared_info_frame;

    if (xc_domain_setmaxmem(xc_handle, dom, PFN_TO_KB(max_pfn)) != 0) {
        errno = ENOMEM;
        goto out;
    }

    /* Mark all PFNs as invalid; we allocate on demand */
    for ( pfn = 0; pfn < max_pfn; pfn++ )
        p2m[pfn] = INVALID_P2M_ENTRY;

    if (!(mmu = xc_init_mmu_updates(xc_handle, dom))) {
        ERROR("Could not initialise for MMU updates");
        goto out;
    }

    DPRINTF("Reloading memory pages: 0%%\n");

    /*
     * Now simply read each saved frame into its new machine frame.
     * We uncanonicalise page tables as we go.
     */
    prev_pc = 0;

    n = m = 0;
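
    /*
     * Each batch begins with an int: -1 enables page-verify mode, -2 is
     * followed by max_vcpu_id and a vcpumap, 0 marks the end of the page
     * stream, and any other positive value is the number of pages in the
     * batch that follows.
     */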
    while (1) {

        int j, nr_mfns = 0;

        this_pc = (n * 100) / max_pfn;
        if ( (this_pc - prev_pc) >= 5 )
        {
            PPRINTF("\b\b\b\b%3d%%", this_pc);
            prev_pc = this_pc;
        }

        if (!read_exact(io_fd, &j, sizeof(int))) {
            ERROR("Error when reading batch size");
            goto out;
        }

        PPRINTF("batch %d\n", j);

        if (j == -1) {
            verify = 1;
            DPRINTF("Entering page verify mode\n");
            continue;
        }

        if (j == -2) {
            new_ctxt_format = 1;
            if (!read_exact(io_fd, &max_vcpu_id, sizeof(int)) ||
                (max_vcpu_id >= 64) ||
                !read_exact(io_fd, &vcpumap, sizeof(uint64_t))) {
                ERROR("Error when reading max_vcpu_id");
                goto out;
            }
            continue;
        }

        if (j == 0)
            break;  /* our work here is done */

        if (j > MAX_BATCH_SIZE) {
            ERROR("Max batch size exceeded. Giving up.");
            goto out;
        }

        if (!read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long))) {
            ERROR("Error when reading region pfn types");
            goto out;
        }
        /* First pass for this batch: work out how much memory to alloc */
        nr_mfns = 0;
        for ( i = 0; i < j; i++ )
        {
            unsigned long pfn, pagetype;
            pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
            pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;

            if ( (pagetype != XEN_DOMCTL_PFINFO_XTAB) &&
                 (p2m[pfn] == INVALID_P2M_ENTRY) )
            {
                /* Have a live PFN which hasn't had an MFN allocated */
                p2m_batch[nr_mfns++] = pfn;
            }
        }

        /* Now allocate a bunch of mfns for this batch */
        if (nr_mfns && xc_domain_memory_populate_physmap(
                xc_handle, dom, nr_mfns, 0, 0, p2m_batch) != 0) {
            ERROR("Failed to allocate memory for batch!\n");
            errno = ENOMEM;
            goto out;
        }

        /* Second pass for this batch: update p2m[] and region_mfn[] */
        nr_mfns = 0;
        for ( i = 0; i < j; i++ )
        {
            unsigned long pfn, pagetype;
            pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
            pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;

            if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
                region_mfn[i] = ~0UL; /* map will fail but we don't care */
            else
            {
                if (p2m[pfn] == INVALID_P2M_ENTRY) {
                    /* We just allocated a new mfn above; update p2m */
                    p2m[pfn] = p2m_batch[nr_mfns++];
                }

                /* setup region_mfn[] for batch map */
                region_mfn[i] = p2m[pfn];
            }
        }

        /* Map relevant mfns */
        region_base = xc_map_foreign_batch(
            xc_handle, dom, PROT_WRITE, region_mfn, j);

        if ( region_base == NULL )
        {
            ERROR("map batch failed");
            goto out;
        }

        for ( i = 0; i < j; i++ )
        {
            void *page;
            unsigned long pagetype;

            pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
            pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;

            if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
                /* a bogus/unmapped page: skip it */
                continue;

            if ( pfn >= max_pfn )
            {
                ERROR("pfn out of range");
                goto out;
            }

            pfn_type[pfn] = pagetype;

            mfn = p2m[pfn];

            /* In verify mode, we use a copy; otherwise we work in place */
            page = verify ? (void *)buf : (region_base + i*PAGE_SIZE);

            if (!read_exact(io_fd, page, PAGE_SIZE)) {
                ERROR("Error when reading page (type was %lx)", pagetype);
                goto out;
            }

            pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;

            if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
                 (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
            {
                /*
                ** A page table page - need to 'uncanonicalize' it, i.e.
                ** replace all the references to pfns with the corresponding
                ** mfns for the new domain.
                **
                ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
                ** so we may need to update the p2m after the main loop.
                ** Hence we defer canonicalization of L1s until then.
                */
                if ((pt_levels != 3) ||
                    pae_extended_cr3 ||
                    (pagetype != XEN_DOMCTL_PFINFO_L1TAB)) {

                    if (!uncanonicalize_pagetable(xc_handle, dom,
                                                  pagetype, page)) {
                        /*
                        ** Failing to uncanonicalize a page table can be ok
                        ** under live migration since the page's type may have
                        ** changed by now (and we'll get an update later).
                        */
                        DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
                                pagetype >> 28, pfn, mfn);
                        nraces++;
                        continue;
                    }
                }
            }
            else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
            {
                ERROR("Bogus page type %lx page table is out of range: "
                      "i=%d max_pfn=%lu", pagetype, i, max_pfn);
                goto out;
            }

            if (verify) {

                int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);

                if (res) {

                    int v;

                    DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
                            "actualcs=%08lx\n", pfn, pfn_type[pfn],
                            csum_page(region_base + i*PAGE_SIZE),
                            csum_page(buf));

                    for (v = 0; v < 4; v++) {

                        unsigned long *p = (unsigned long *)
                            (region_base + i*PAGE_SIZE);
                        if (buf[v] != p[v])
                            DPRINTF("    %d: %08lx %08lx\n", v, buf[v], p[v]);
                    }
                }
            }

            if (xc_add_mmu_update(xc_handle, mmu,
                                  (((unsigned long long)mfn) << PAGE_SHIFT)
                                  | MMU_MACHPHYS_UPDATE, pfn)) {
                ERROR("failed machphys update mfn=%lx pfn=%lx", mfn, pfn);
                goto out;
            }
        } /* end of 'batch' for loop */

        munmap(region_base, j*PAGE_SIZE);
        n += j; /* crude stats */

        /*
         * Discard cache for portion of file read so far up to last
         * page boundary every 16MB or so.
         */
        m += j;
        if ( m > MAX_PAGECACHE_USAGE )
        {
            discard_file_cache(io_fd, 0 /* no flush */);
            m = 0;
        }
    }
    /*
     * Ensure we flush all machphys updates before potential PAE-specific
     * reallocations below.
     */
    if (xc_finish_mmu_updates(xc_handle, mmu)) {
        ERROR("Error doing finish_mmu_updates()");
        goto out;
    }

    DPRINTF("Received all pages (%d races)\n", nraces);

    if ((pt_levels == 3) && !pae_extended_cr3) {

        /*
        ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This
        ** is a little awkward and involves (a) finding all such PGDs and
        ** replacing them with 'lowmem' versions; (b) updating the p2m[]
        ** with the new info; and (c) canonicalizing all the L1s using the
        ** (potentially updated) p2m[].
        **
        ** This is relatively slow (and currently involves two passes through
        ** the pfn_type[] array), but at least seems to be correct. May wish
        ** to consider more complex approaches to optimize this later.
        */

        int j, k;

        /* First pass: find all L3TABs currently in > 4G mfns and get new mfns */
        for ( i = 0; i < max_pfn; i++ )
        {
            if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
                  XEN_DOMCTL_PFINFO_L3TAB) &&
                 (p2m[i] > 0xfffffUL) )
            {
                unsigned long new_mfn;
                uint64_t l3ptes[4];
                uint64_t *l3tab;

                l3tab = (uint64_t *)
                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
                                         PROT_READ, p2m[i]);

                for (j = 0; j < 4; j++)
                    l3ptes[j] = l3tab[j];

                munmap(l3tab, PAGE_SIZE);

                if (!(new_mfn = xc_make_page_below_4G(xc_handle, dom, p2m[i]))) {
                    ERROR("Couldn't get a page below 4GB :-(");
                    goto out;
                }

                p2m[i] = new_mfn;
                if (xc_add_mmu_update(xc_handle, mmu,
                                      (((unsigned long long)new_mfn)
                                       << PAGE_SHIFT) |
                                      MMU_MACHPHYS_UPDATE, i)) {
                    ERROR("Couldn't m2p on PAE root pgdir");
                    goto out;
                }

                l3tab = (uint64_t *)
                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
                                         PROT_READ | PROT_WRITE, p2m[i]);

                for (j = 0; j < 4; j++)
                    l3tab[j] = l3ptes[j];

                munmap(l3tab, PAGE_SIZE);
            }
        }

        /* Second pass: find all L1TABs and uncanonicalize them */
        j = 0;

        for ( i = 0; i < max_pfn; i++ )
        {
            if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
                  XEN_DOMCTL_PFINFO_L1TAB) )
            {
                region_mfn[j] = p2m[i];
                j++;
            }

            if (i == (max_pfn-1) || j == MAX_BATCH_SIZE) {

                if (!(region_base = xc_map_foreign_batch(
                          xc_handle, dom, PROT_READ | PROT_WRITE,
                          region_mfn, j))) {
                    ERROR("map batch failed");
                    goto out;
                }

                for (k = 0; k < j; k++) {
                    if (!uncanonicalize_pagetable(xc_handle, dom,
                                                  XEN_DOMCTL_PFINFO_L1TAB,
                                                  region_base + k*PAGE_SIZE)) {
                        ERROR("failed uncanonicalize pt!");
                        goto out;
                    }
                }

                munmap(region_base, j*PAGE_SIZE);
                j = 0;
            }
        }

        if (xc_finish_mmu_updates(xc_handle, mmu)) {
            ERROR("Error doing finish_mmu_updates()");
            goto out;
        }
    }
    /*
     * Pin page tables. Do this after writing to them as otherwise Xen
     * will barf when doing the type-checking.
     */
    nr_pins = 0;
    for ( i = 0; i < max_pfn; i++ )
    {
        if ( (pfn_type[i] & XEN_DOMCTL_PFINFO_LPINTAB) == 0 )
            continue;

        switch ( pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
        {
        case XEN_DOMCTL_PFINFO_L1TAB:
            pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
            break;

        case XEN_DOMCTL_PFINFO_L2TAB:
            pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
            break;

        case XEN_DOMCTL_PFINFO_L3TAB:
            pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
            break;

        case XEN_DOMCTL_PFINFO_L4TAB:
            pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
            break;

        default:
            continue;
        }

        pin[nr_pins].arg1.mfn = p2m[i];
        nr_pins++;

        /* Batch full? Then flush. */
        if (nr_pins == MAX_PIN_BATCH) {
            if (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) {
                ERROR("Failed to pin batch of %d page tables", nr_pins);
                goto out;
            }
            nr_pins = 0;
        }
    }

    /* Flush final partial batch. */
    if ((nr_pins != 0) && (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0)) {
        ERROR("Failed to pin batch of %d page tables", nr_pins);
        goto out;
    }
    DPRINTF("\b\b\b\b100%%\n");
    DPRINTF("Memory reloaded.\n");

    /* Get the list of PFNs that are not in the pseudo-phys map */
    {
        unsigned int count;
        unsigned long *pfntab;
        int nr_frees, rc;

        if (!read_exact(io_fd, &count, sizeof(count))) {
            ERROR("Error when reading pfn count");
            goto out;
        }

        if (!(pfntab = malloc(sizeof(unsigned long) * count))) {
            ERROR("Out of memory");
            goto out;
        }

        if (!read_exact(io_fd, pfntab, sizeof(unsigned long)*count)) {
            ERROR("Error when reading pfntab");
            goto out;
        }

        nr_frees = 0;
        for (i = 0; i < count; i++) {

            unsigned long pfn = pfntab[i];

            if (p2m[pfn] != INVALID_P2M_ENTRY) {
                /* pfn is not in physmap now, but was at some point during
                   the save/migration process - need to free it */
                pfntab[nr_frees++] = p2m[pfn];
                p2m[pfn] = INVALID_P2M_ENTRY; /* not in pseudo-physical map */
            }
        }

        if (nr_frees > 0) {

            struct xen_memory_reservation reservation = {
                .nr_extents   = nr_frees,
                .extent_order = 0,
                .domid        = dom
            };
            set_xen_guest_handle(reservation.extent_start, pfntab);

            if ((rc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
                                   &reservation)) != nr_frees) {
                ERROR("Could not decrease reservation : %d", rc);
                goto out;
            } else
                DPRINTF("Decreased reservation by %d pages\n", nr_frees);
        }
    }
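
    /* Restore the context of each VCPU marked present in vcpumap. */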
    for (i = 0; i <= max_vcpu_id; i++) {

        if (!(vcpumap & (1ULL << i)))
            continue;

        if (!read_exact(io_fd, &ctxt, sizeof(ctxt))) {
            ERROR("Error when reading ctxt %d", i);
            goto out;
        }

        if ( !new_ctxt_format )
            ctxt.flags |= VGCF_online;

        if (i == 0) {
            /*
             * Uncanonicalise the suspend-record frame number and poke
             * resume record.
             */
            pfn = ctxt.user_regs.edx;
            if ((pfn >= max_pfn) ||
                (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB)) {
                ERROR("Suspend record frame number is bad");
                goto out;
            }
            ctxt.user_regs.edx = mfn = p2m[pfn];
            start_info = xc_map_foreign_range(
                xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
            start_info->nr_pages = max_pfn;
            start_info->shared_info = shared_info_frame << PAGE_SHIFT;
            start_info->flags = 0;
            *store_mfn = start_info->store_mfn = p2m[start_info->store_mfn];
            start_info->store_evtchn = store_evtchn;
            start_info->console.domU.mfn = p2m[start_info->console.domU.mfn];
            start_info->console.domU.evtchn = console_evtchn;
            *console_mfn = start_info->console.domU.mfn;
            munmap(start_info, PAGE_SIZE);
        }

        /* Uncanonicalise each GDT frame number. */
        if (ctxt.gdt_ents > 8192) {
            ERROR("GDT entry count out of range");
            goto out;
        }

        for (j = 0; (512*j) < ctxt.gdt_ents; j++) {
            pfn = ctxt.gdt_frames[j];
            if ((pfn >= max_pfn) ||
                (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB)) {
                ERROR("GDT frame number is bad");
                goto out;
            }
            ctxt.gdt_frames[j] = p2m[pfn];
        }

        /* Uncanonicalise the page table base pointer. */
        pfn = xen_cr3_to_pfn(ctxt.ctrlreg[3]);

        if (pfn >= max_pfn) {
            ERROR("PT base is bad: pfn=%lu max_pfn=%lu type=%08lx",
                  pfn, max_pfn, pfn_type[pfn]);
            goto out;
        }

        if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
             ((unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) ) {
            ERROR("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
                  pfn, max_pfn, pfn_type[pfn],
                  (unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT);
            goto out;
        }

        ctxt.ctrlreg[3] = xen_pfn_to_cr3(p2m[pfn]);

        /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
        if ( (pt_levels == 4) && ctxt.ctrlreg[1] )
        {
            pfn = xen_cr3_to_pfn(ctxt.ctrlreg[1]);

            if (pfn >= max_pfn) {
                ERROR("User PT base is bad: pfn=%lu max_pfn=%lu type=%08lx",
                      pfn, max_pfn, pfn_type[pfn]);
                goto out;
            }

            if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
                 ((unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) ) {
                ERROR("User PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
                      pfn, max_pfn, pfn_type[pfn],
                      (unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT);
                goto out;
            }

            ctxt.ctrlreg[1] = xen_pfn_to_cr3(p2m[pfn]);
        }

        domctl.cmd = XEN_DOMCTL_setvcpucontext;
        domctl.domain = (domid_t)dom;
        domctl.u.vcpucontext.vcpu = i;
        set_xen_guest_handle(domctl.u.vcpucontext.ctxt, &ctxt);
        rc = xc_domctl(xc_handle, &domctl);
        if (rc != 0) {
            ERROR("Couldn't build vcpu%d", i);
            goto out;
        }
    }
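
    /* Read the saved shared-info page contents from the stream. */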
    if (!read_exact(io_fd, shared_info_page, PAGE_SIZE)) {
        ERROR("Error when reading shared info page");
        goto out;
    }

    /* clear any pending events and the selector */
    memset(&(shared_info->evtchn_pending[0]), 0,
           sizeof (shared_info->evtchn_pending));
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        shared_info->vcpu_info[i].evtchn_pending_sel = 0;

    /* Copy saved contents of shared-info page. No checking needed. */
    page = xc_map_foreign_range(
        xc_handle, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);
    memcpy(page, shared_info, PAGE_SIZE);
    munmap(page, PAGE_SIZE);

    /* Uncanonicalise the pfn-to-mfn table frame-number list. */
    for (i = 0; i < P2M_FL_ENTRIES; i++) {
        pfn = p2m_frame_list[i];
        if ((pfn >= max_pfn) || (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB)) {
            ERROR("PFN-to-MFN frame number is bad");
            goto out;
        }

        p2m_frame_list[i] = p2m[pfn];
    }

    /* Copy the P2M we've constructed to the 'live' P2M */
    if (!(live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_WRITE,
                                          p2m_frame_list, P2M_FL_ENTRIES))) {
        ERROR("Couldn't map p2m table");
        goto out;
    }

    memcpy(live_p2m, p2m, P2M_SIZE);
    munmap(live_p2m, P2M_SIZE);
    DPRINTF("Domain ready to be built.\n");

 out:
    if ( (rc != 0) && (dom != 0) )
        xc_domain_destroy(xc_handle, dom);
    free(mmu);
    free(p2m);
    free(pfn_type);

    /* discard cache for save file */
    discard_file_cache(io_fd, 1 /*flush*/);

    DPRINTF("Restore exit with rc=%d\n", rc);

    return rc;
}