ia64/xen-unstable

tools/libxc/xc_domain_restore.c @ 14783:099593da38cb

Fix PV guest restore.
Signed-off-by: Keir Fraser <keir@xensource.com>

author    kfraser@localhost.localdomain
date      Tue Apr 10 13:46:28 2007 +0100 (2007-04-10)
parents   7cc1d532f9ee
children  1fa9b5f1df8f
/******************************************************************************
 * xc_domain_restore.c
 *
 * Restore the state of a guest session.
 *
 * Copyright (c) 2003, K A Fraser.
 * Copyright (c) 2006, Intel Corporation
 * Copyright (c) 2007, XenSource Inc.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 */
#include <stdlib.h>
#include <unistd.h>

#include "xg_private.h"
#include "xg_save_restore.h"
#include "xc_dom.h"

#include <xen/hvm/ioreq.h>
#include <xen/hvm/params.h>

/* Max MFN of the current host machine. */
static unsigned long max_mfn;

/* Virtual starting address of the hypervisor. */
static unsigned long hvirt_start;

/* Number of levels of page tables used by the current guest. */
static unsigned int pt_levels;

/* Number of pfns this guest has (i.e. number of entries in the P2M). */
static unsigned long p2m_size;

/* Number of 'in use' pfns in the guest (i.e. #P2M entries with a valid mfn). */
static unsigned long nr_pfns;

/* Live mapping of the table mapping each PFN to its current MFN. */
static xen_pfn_t *live_p2m = NULL;

/* A table mapping each PFN to its new MFN. */
static xen_pfn_t *p2m = NULL;

/* A table of P2M mappings in the current region. */
static xen_pfn_t *p2m_batch = NULL;
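
/*
 * Note on demand allocation (used by both uncanonicalize_pagetable() and
 * the main restore loop below): a first pass over each batch marks every
 * PFN that still lacks an MFN by decrementing its p2m[] slot from
 * INVALID_P2M_ENTRY to INVALID_P2M_ENTRY-1 and queueing the PFN in
 * p2m_batch[]; a single xc_domain_memory_populate_physmap() call then
 * allocates all the frames at once, and a second pass replaces each
 * 'pending' slot with its freshly allocated MFN.
 */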
/* Read exactly 'count' bytes from 'fd' into 'buf'; returns 1 on success,
 * 0 on error or premature EOF (reads interrupted by EINTR are retried). */
static int
read_exact(int fd, void *buf, size_t count)
{
    size_t r = 0;
    ssize_t s;
    unsigned char *b = buf;

    while ( r < count )
    {
        s = read(fd, &b[r], count - r);
        if ( (s == -1) && (errno == EINTR) )
            continue;
        if ( s <= 0 )
            break;
        r += s;
    }

    return (r == count);
}
/*
** In the state file (or during transfer), all page-table pages are
** converted into a 'canonical' form where references to actual mfns
** are replaced with references to the corresponding pfns.
** This function inverts that operation, replacing the pfn values with
** the (now known) appropriate mfn values.
*/
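/*
** For example (illustrative values only): a canonical 64-bit PTE of
** 0x0000000000005067 names pfn 5 with flag bits 0x067; if the restore
** allocates mfn 0x1abcd for pfn 5, the uncanonicalized PTE becomes
** 0x000000001abcd067 -- same low flag bits, new frame number.
*/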
static int uncanonicalize_pagetable(int xc_handle, uint32_t dom,
                                    unsigned long type, void *page)
{
    int i, pte_last;
    unsigned long pfn;
    uint64_t pte;
    int nr_mfns = 0;

    pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8);

    /* First pass: work out how many (if any) MFNs we need to alloc */
    for ( i = 0; i < pte_last; i++ )
    {
        if ( pt_levels == 2 )
            pte = ((uint32_t *)page)[i];
        else
            pte = ((uint64_t *)page)[i];

        /* XXX SMH: below needs fixing for PROT_NONE etc */
        if ( !(pte & _PAGE_PRESENT) )
            continue;

        pfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;

        if ( pfn >= p2m_size )
        {
            /* This "page table page" is probably not one; bail. */
            ERROR("Frame number in type %lu page table is out of range: "
                  "i=%d pfn=0x%lx p2m_size=%lu",
                  type >> 28, i, pfn, p2m_size);
            return 0;
        }

        if ( p2m[pfn] == INVALID_P2M_ENTRY )
        {
            /* Have a 'valid' PFN without a matching MFN - need to alloc.
             * The slot becomes INVALID_P2M_ENTRY-1, i.e. 'pending'. */
            p2m_batch[nr_mfns++] = pfn;
            p2m[pfn]--;
        }
    }

    /* Allocate the requisite number of mfns. */
    if ( nr_mfns &&
         (xc_domain_memory_populate_physmap(xc_handle, dom, nr_mfns, 0, 0,
                                            p2m_batch) != 0) )
    {
        ERROR("Failed to allocate memory for batch!");
        errno = ENOMEM;
        return 0;
    }

    /* Second pass: uncanonicalize each present PTE */
    nr_mfns = 0;
    for ( i = 0; i < pte_last; i++ )
    {
        if ( pt_levels == 2 )
            pte = ((uint32_t *)page)[i];
        else
            pte = ((uint64_t *)page)[i];

        /* XXX SMH: below needs fixing for PROT_NONE etc */
        if ( !(pte & _PAGE_PRESENT) )
            continue;

        pfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;

        if ( p2m[pfn] == (INVALID_P2M_ENTRY-1) )
            p2m[pfn] = p2m_batch[nr_mfns++];

        pte &= ~MADDR_MASK_X86;
        pte |= (uint64_t)p2m[pfn] << PAGE_SHIFT;

        if ( pt_levels == 2 )
            ((uint32_t *)page)[i] = (uint32_t)pte;
        else
            ((uint64_t *)page)[i] = (uint64_t)pte;
    }

    return 1;
}
/* Load the p2m frame list, plus potential extended info chunk */
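/*
 * Extended-info layout, as parsed below: if the first P2M word is the
 * signature ~0UL, a 4-byte total size follows, then a sequence of chunks,
 * each [4-char signature][4-byte remaining length][payload].  The only
 * chunk interpreted here is "vcpu" (a vcpu_guest_context_t, probed for
 * VMASST_TYPE_pae_extended_cr3); unrecognized chunks are read and discarded.
 */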
static xen_pfn_t *load_p2m_frame_list(int io_fd, int *pae_extended_cr3)
{
    xen_pfn_t *p2m_frame_list;
    vcpu_guest_context_t ctxt;

    if ( (p2m_frame_list = malloc(P2M_FL_SIZE)) == NULL )
    {
        ERROR("Couldn't allocate p2m_frame_list array");
        return NULL;
    }

    /* Read first entry of P2M list, or extended-info signature (~0UL). */
    if ( !read_exact(io_fd, p2m_frame_list, sizeof(long)) )
    {
        ERROR("read extended-info signature failed");
        free(p2m_frame_list);
        return NULL;
    }

    if ( p2m_frame_list[0] == ~0UL )
    {
        uint32_t tot_bytes;

        /* Next 4 bytes: total size of following extended info. */
        if ( !read_exact(io_fd, &tot_bytes, sizeof(tot_bytes)) )
        {
            ERROR("read extended-info size failed");
            free(p2m_frame_list);
            return NULL;
        }

        while ( tot_bytes )
        {
            uint32_t chunk_bytes;
            char     chunk_sig[4];

            /* 4-character chunk signature + 4-byte remaining chunk size. */
            if ( !read_exact(io_fd, chunk_sig, sizeof(chunk_sig)) ||
                 !read_exact(io_fd, &chunk_bytes, sizeof(chunk_bytes)) )
            {
                ERROR("read extended-info chunk signature failed");
                free(p2m_frame_list);
                return NULL;
            }
            tot_bytes -= 8;

            /* VCPU context structure? */
            if ( !strncmp(chunk_sig, "vcpu", 4) )
            {
                if ( !read_exact(io_fd, &ctxt, sizeof(ctxt)) )
                {
                    ERROR("read extended-info vcpu context failed");
                    free(p2m_frame_list);
                    return NULL;
                }
                tot_bytes   -= sizeof(struct vcpu_guest_context);
                chunk_bytes -= sizeof(struct vcpu_guest_context);

                if ( ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3) )
                    *pae_extended_cr3 = 1;
            }

            /* Any remaining bytes of this chunk: read and discard. */
            while ( chunk_bytes )
            {
                unsigned long sz = chunk_bytes;
                if ( sz > P2M_FL_SIZE )
                    sz = P2M_FL_SIZE;
                if ( !read_exact(io_fd, p2m_frame_list, sz) )
                {
                    ERROR("read-and-discard extended-info chunk bytes failed");
                    free(p2m_frame_list);
                    return NULL;
                }
                chunk_bytes -= sz;
                tot_bytes   -= sz;
            }
        }

        /* Now read the real first entry of P2M list. */
        if ( !read_exact(io_fd, p2m_frame_list, sizeof(long)) )
        {
            ERROR("read first entry of p2m_frame_list failed");
            free(p2m_frame_list);
            return NULL;
        }
    }

    /* First entry is already read into the p2m array. */
    if ( !read_exact(io_fd, &p2m_frame_list[1], P2M_FL_SIZE - sizeof(long)) )
    {
        ERROR("read p2m_frame_list failed");
        free(p2m_frame_list);
        return NULL;
    }

    return p2m_frame_list;
}
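
/*
 * A minimal sketch of how this routine is driven (hypothetical plumbing;
 * the real caller is the xc_restore helper launched by xend, which has
 * already created the empty target domain and connected io_fd to the
 * saved image or migration stream):
 *
 *     int xc = xc_interface_open();
 *     unsigned long store_mfn, console_mfn;
 *     int rc = xc_domain_restore(xc, io_fd, dom,
 *                                store_evtchn, &store_mfn,
 *                                console_evtchn, &console_mfn,
 *                                0, pae);      hvm=0 selects the PV path
 *
 * On success (rc == 0) the caller introduces store_mfn/console_mfn to
 * xenstored/xenconsoled and unpauses the domain.
 */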
int xc_domain_restore(int xc_handle, int io_fd, uint32_t dom,
                      unsigned int store_evtchn, unsigned long *store_mfn,
                      unsigned int console_evtchn, unsigned long *console_mfn,
                      unsigned int hvm, unsigned int pae)
{
    DECLARE_DOMCTL;
    int rc = 1, i, j, n, m, pae_extended_cr3 = 0;
    unsigned long mfn, pfn;
    unsigned int prev_pc, this_pc;
    int verify = 0;
    int nraces = 0;

    /* The new domain's shared-info frame number. */
    unsigned long shared_info_frame;
    unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
    shared_info_t *shared_info = (shared_info_t *)shared_info_page;

    /* A copy of the CPU context of the guest. */
    vcpu_guest_context_t ctxt;

    /* A table containing the type of each PFN (/not/ MFN!). */
    unsigned long *pfn_type = NULL;

    /* A table of MFNs to map in the current region */
    xen_pfn_t *region_mfn = NULL;

    /* Types of the pfns in the current region */
    unsigned long region_pfn_type[MAX_BATCH_SIZE];

    /* A temporary mapping, and a copy, of one frame of guest memory. */
    unsigned long *page = NULL;

    /* A copy of the pfn-to-mfn table frame list. */
    xen_pfn_t *p2m_frame_list = NULL;

    /* A temporary mapping of the guest's start_info page. */
    start_info_t *start_info;

    /* Our mapping of the current region (batch) */
    char *region_base;

    struct xc_mmu *mmu = NULL;

    /* used by debug verify code */
    unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];

    struct mmuext_op pin[MAX_PIN_BATCH];
    unsigned int nr_pins;

    uint64_t vcpumap = 1ULL;
    unsigned int max_vcpu_id = 0;
    int new_ctxt_format = 0;

    /* Magic frames in HVM guests: ioreqs and xenstore comms. */
    uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */

    /* Buffer for holding HVM context */
    uint8_t *hvm_buf = NULL;
    /* For info only */
    nr_pfns = 0;

    if ( !read_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
    {
        ERROR("read: p2m_size");
        goto out;
    }
    DPRINTF("xc_domain_restore start: p2m_size = %lx\n", p2m_size);

    if ( !hvm )
    {
        /*
         * XXX For now, 32-bit dom0s can only save/restore 32-bit domUs
         * on 64-bit hypervisors.
         */
        memset(&domctl, 0, sizeof(domctl));
        domctl.domain = dom;
        domctl.cmd    = XEN_DOMCTL_set_address_size;
        domctl.u.address_size.size = sizeof(unsigned long) * 8;
        rc = do_domctl(xc_handle, &domctl);
        if ( rc != 0 )
        {
            ERROR("Unable to set guest address size.");
            goto out;
        }
        rc = 1;
    }

    if ( !get_platform_info(xc_handle, dom,
                            &max_mfn, &hvirt_start, &pt_levels) )
    {
        ERROR("Unable to get platform info.");
        return 1;
    }

    if ( lock_pages(&ctxt, sizeof(ctxt)) )
    {
        /* needed for build domctl, but might as well do early */
        ERROR("Unable to lock ctxt");
        return 1;
    }

    /* Load the p2m frame list, plus potential extended info chunk */
    if ( !hvm )
    {
        p2m_frame_list = load_p2m_frame_list(io_fd, &pae_extended_cr3);
        if ( !p2m_frame_list )
            goto out;
    }

    /* We want zeroed memory so use calloc rather than malloc. */
    p2m        = calloc(p2m_size, sizeof(xen_pfn_t));
    pfn_type   = calloc(p2m_size, sizeof(unsigned long));
    region_mfn = calloc(MAX_BATCH_SIZE, sizeof(xen_pfn_t));
    p2m_batch  = calloc(MAX_BATCH_SIZE, sizeof(xen_pfn_t));

    if ( (p2m == NULL) || (pfn_type == NULL) ||
         (region_mfn == NULL) || (p2m_batch == NULL) )
    {
        ERROR("memory alloc failed");
        errno = ENOMEM;
        goto out;
    }

    if ( lock_pages(region_mfn, sizeof(xen_pfn_t) * MAX_BATCH_SIZE) )
    {
        ERROR("Could not lock region_mfn");
        goto out;
    }

    if ( lock_pages(p2m_batch, sizeof(xen_pfn_t) * MAX_BATCH_SIZE) )
    {
        ERROR("Could not lock p2m_batch");
        goto out;
    }

    /* Get the domain's shared-info frame. */
    domctl.cmd = XEN_DOMCTL_getdomaininfo;
    domctl.domain = (domid_t)dom;
    if ( xc_domctl(xc_handle, &domctl) < 0 )
    {
        ERROR("Could not get information on new domain");
        goto out;
    }
    shared_info_frame = domctl.u.getdomaininfo.shared_info_frame;

    /* Mark all PFNs as invalid; we allocate on demand */
    for ( pfn = 0; pfn < p2m_size; pfn++ )
        p2m[pfn] = INVALID_P2M_ENTRY;

    mmu = xc_alloc_mmu_updates(xc_handle, dom);
    if ( mmu == NULL )
    {
        ERROR("Could not initialise for MMU updates");
        goto out;
    }

    DPRINTF("Reloading memory pages:   0%%\n");

    /*
     * Now simply read each saved frame into its new machine frame.
     * We uncanonicalise page tables as we go.
     */
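
    /*
     * On-the-wire format for this phase, as consumed below: a sequence of
     * batches, each introduced by a signed 32-bit word:
     *     > 0  -- batch size: that many pfn/type words, then page images
     *     -1   -- switch into page-verify mode
     *     -2   -- extended info: max_vcpu_id plus a 64-bit vcpu bitmap
     *      0   -- end of page data
     */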
    prev_pc = 0;

    n = m = 0;
    for ( ; ; )
    {
        int j, nr_mfns = 0;

        this_pc = (n * 100) / p2m_size;
        if ( (this_pc - prev_pc) >= 5 )
        {
            PPRINTF("\b\b\b\b%3d%%", this_pc);
            prev_pc = this_pc;
        }

        if ( !read_exact(io_fd, &j, sizeof(int)) )
        {
            ERROR("Error when reading batch size");
            goto out;
        }

        PPRINTF("batch %d\n", j);

        if ( j == -1 )
        {
            verify = 1;
            DPRINTF("Entering page verify mode\n");
            continue;
        }

        if ( j == -2 )
        {
            new_ctxt_format = 1;
            if ( !read_exact(io_fd, &max_vcpu_id, sizeof(int)) ||
                 (max_vcpu_id >= 64) ||
                 !read_exact(io_fd, &vcpumap, sizeof(uint64_t)) )
            {
                ERROR("Error when reading max_vcpu_id");
                goto out;
            }
            continue;
        }

        if ( j == 0 )
            break;  /* our work here is done */

        if ( j > MAX_BATCH_SIZE )
        {
            ERROR("Max batch size exceeded. Giving up.");
            goto out;
        }

        if ( !read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long)) )
        {
            ERROR("Error when reading region pfn types");
            goto out;
        }

        /* First pass for this batch: work out how much memory to alloc */
        nr_mfns = 0;
        for ( i = 0; i < j; i++ )
        {
            unsigned long pfn, pagetype;
            pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
            pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;

            if ( (pagetype != XEN_DOMCTL_PFINFO_XTAB) &&
                 (p2m[pfn] == INVALID_P2M_ENTRY) )
            {
                /* Have a live PFN which hasn't had an MFN allocated */
                p2m_batch[nr_mfns++] = pfn;
                p2m[pfn]--;
            }
        }

        /* Now allocate a bunch of mfns for this batch */
        if ( nr_mfns &&
             (xc_domain_memory_populate_physmap(xc_handle, dom, nr_mfns, 0,
                                                0, p2m_batch) != 0) )
        {
            ERROR("Failed to allocate memory for batch!");
            errno = ENOMEM;
            goto out;
        }

        /* Second pass for this batch: update p2m[] and region_mfn[] */
        nr_mfns = 0;
        for ( i = 0; i < j; i++ )
        {
            unsigned long pfn, pagetype;
            pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
            pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;

            if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
                region_mfn[i] = ~0UL; /* map will fail but we don't care */
            else
            {
                if ( p2m[pfn] == (INVALID_P2M_ENTRY-1) )
                {
                    /* We just allocated a new mfn above; update p2m */
                    p2m[pfn] = p2m_batch[nr_mfns++];
                    nr_pfns++;
                }

                /* setup region_mfn[] for batch map.
                 * For HVM guests, this interface takes PFNs, not MFNs */
                region_mfn[i] = hvm ? pfn : p2m[pfn];
            }
        }

        /* Map relevant mfns */
        region_base = xc_map_foreign_batch(
            xc_handle, dom, PROT_WRITE, region_mfn, j);

        if ( region_base == NULL )
        {
            ERROR("map batch failed");
            goto out;
        }

        for ( i = 0; i < j; i++ )
        {
            void *page;
            unsigned long pagetype;

            pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
            pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;

            if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
                /* a bogus/unmapped page: skip it */
                continue;

            if ( pfn >= p2m_size )
            {
                ERROR("pfn out of range");
                goto out;
            }

            pfn_type[pfn] = pagetype;

            mfn = p2m[pfn];

            /* In verify mode, we use a copy; otherwise we work in place */
            page = verify ? (void *)buf : (region_base + i*PAGE_SIZE);

            if ( !read_exact(io_fd, page, PAGE_SIZE) )
            {
                ERROR("Error when reading page (type was %lx)", pagetype);
                goto out;
            }

            pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;

            if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
                 (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
            {
                /*
                ** A page table page - need to 'uncanonicalize' it, i.e.
                ** replace all the references to pfns with the corresponding
                ** mfns for the new domain.
                **
                ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
                ** so we may need to update the p2m after the main loop.
                ** Hence we defer canonicalization of L1s until then.
                */
                if ( (pt_levels != 3) ||
                     pae_extended_cr3 ||
                     (pagetype != XEN_DOMCTL_PFINFO_L1TAB) )
                {
                    if ( !uncanonicalize_pagetable(xc_handle, dom,
                                                   pagetype, page) )
                    {
                        /*
                        ** Failing to uncanonicalize a page table can be ok
                        ** under live migration since the page's type may have
                        ** changed by now (and we'll get an update later).
                        */
                        DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
                                pagetype >> 28, pfn, mfn);
                        nraces++;
                        continue;
                    }
                }
            }
            else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
            {
                ERROR("Bogus page type %lx: i=%d p2m_size=%lu",
                      pagetype, i, p2m_size);
                goto out;
            }

            if ( verify )
            {
                int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
                if ( res )
                {
                    int v;

                    DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
                            "actualcs=%08lx\n", pfn, pfn_type[pfn],
                            csum_page(region_base + i*PAGE_SIZE),
                            csum_page(buf));

                    for ( v = 0; v < 4; v++ )
                    {
                        unsigned long *p = (unsigned long *)
                            (region_base + i*PAGE_SIZE);
                        if ( buf[v] != p[v] )
                            DPRINTF("    %d: %08lx %08lx\n", v, buf[v], p[v]);
                    }
                }
            }

            if ( !hvm &&
                 xc_add_mmu_update(xc_handle, mmu,
                                   (((unsigned long long)mfn) << PAGE_SHIFT)
                                   | MMU_MACHPHYS_UPDATE, pfn) )
            {
                ERROR("failed machphys update mfn=%lx pfn=%lx", mfn, pfn);
                goto out;
            }
        } /* end of 'batch' for loop */

        munmap(region_base, j*PAGE_SIZE);
        n += j; /* crude stats */

        /*
         * Discard cache for portion of file read so far up to last
         * page boundary every 16MB or so.
         */
        m += j;
        if ( m > MAX_PAGECACHE_USAGE )
        {
            discard_file_cache(io_fd, 0 /* no flush */);
            m = 0;
        }
    }
    /*
     * Ensure we flush all machphys updates before potential PAE-specific
     * reallocations below.
     */
    if ( !hvm && xc_flush_mmu_updates(xc_handle, mmu) )
    {
        ERROR("Error doing flush_mmu_updates()");
        goto out;
    }

    DPRINTF("Received all pages (%d races)\n", nraces);

    if ( hvm )
    {
        uint32_t rec_len;

        /* Set HVM-specific parameters */
        if ( !read_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
        {
            ERROR("error reading magic page addresses");
            goto out;
        }

        /* These comms pages need to be zeroed at the start of day */
        if ( xc_clear_domain_page(xc_handle, dom, magic_pfns[0]) ||
             xc_clear_domain_page(xc_handle, dom, magic_pfns[1]) ||
             xc_clear_domain_page(xc_handle, dom, magic_pfns[2]) )
        {
            ERROR("error zeroing magic pages");
            goto out;
        }

        xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN, magic_pfns[0]);
        xc_set_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN, magic_pfns[1]);
        xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN, magic_pfns[2]);
        xc_set_hvm_param(xc_handle, dom, HVM_PARAM_PAE_ENABLED, pae);
        xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_EVTCHN, store_evtchn);
        *store_mfn = magic_pfns[2];

        /* Read vcpu contexts */
        for ( i = 0; i <= max_vcpu_id; i++ )
        {
            if ( !(vcpumap & (1ULL << i)) )
                continue;

            if ( !read_exact(io_fd, &(ctxt), sizeof(ctxt)) )
            {
                ERROR("error reading vcpu context");
                goto out;
            }

            if ( (rc = xc_vcpu_setcontext(xc_handle, dom, i, &ctxt)) )
            {
                ERROR("Could not set vcpu context, rc=%d", rc);
                goto out;
            }
            rc = 1;
        }

        /* Read HVM context */
        if ( !read_exact(io_fd, &rec_len, sizeof(uint32_t)) )
        {
            ERROR("error reading hvm context size");
            goto out;
        }

        hvm_buf = malloc(rec_len);
        if ( hvm_buf == NULL )
        {
            ERROR("memory alloc for hvm context buffer failed");
            errno = ENOMEM;
            goto out;
        }

        if ( !read_exact(io_fd, hvm_buf, rec_len) )
        {
            ERROR("error loading the HVM context");
            goto out;
        }

        /* The HVM path finishes here: rc == 0 indicates overall success. */
        rc = xc_domain_hvm_setcontext(xc_handle, dom, hvm_buf, rec_len);
        if ( rc )
            ERROR("error setting the HVM context");

        goto out;
    }
    /* Non-HVM guests only from here on */

    if ( (pt_levels == 3) && !pae_extended_cr3 )
    {
        /*
        ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This
        ** is a little awkward and involves (a) finding all such PGDs and
        ** replacing them with 'lowmem' versions; (b) updating the p2m[]
        ** with the new info; and (c) canonicalizing all the L1s using the
        ** (potentially updated) p2m[].
        **
        ** This is relatively slow (and currently involves two passes through
        ** the pfn_type[] array), but at least seems to be correct. May wish
        ** to consider more complex approaches to optimize this later.
        */
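
        /*
        ** (Background, for the curious: a native PAE %cr3 is only 32 bits
        ** wide, so without the pae_extended_cr3 VM assist a guest kernel
        ** cannot name a top-level pagetable that lives above 4GB -- hence
        ** the relocation below.)
        */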
        int j, k;

        /* First pass: find all L3TABs currently in >4G MFNs and get new MFNs */
        for ( i = 0; i < p2m_size; i++ )
        {
            if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
                  XEN_DOMCTL_PFINFO_L3TAB) &&
                 (p2m[i] > 0xfffffUL) )
            {
                unsigned long new_mfn;
                uint64_t l3ptes[4];
                uint64_t *l3tab;

                l3tab = (uint64_t *)
                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
                                         PROT_READ, p2m[i]);

                for ( j = 0; j < 4; j++ )
                    l3ptes[j] = l3tab[j];

                munmap(l3tab, PAGE_SIZE);

                new_mfn = xc_make_page_below_4G(xc_handle, dom, p2m[i]);
                if ( !new_mfn )
                {
                    ERROR("Couldn't get a page below 4GB :-(");
                    goto out;
                }

                p2m[i] = new_mfn;
                if ( xc_add_mmu_update(xc_handle, mmu,
                                       (((unsigned long long)new_mfn)
                                        << PAGE_SHIFT) |
                                       MMU_MACHPHYS_UPDATE, i) )
                {
                    ERROR("Couldn't m2p on PAE root pgdir");
                    goto out;
                }

                l3tab = (uint64_t *)
                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
                                         PROT_READ | PROT_WRITE, p2m[i]);

                for ( j = 0; j < 4; j++ )
                    l3tab[j] = l3ptes[j];

                munmap(l3tab, PAGE_SIZE);
            }
        }

        /* Second pass: find all L1TABs and uncanonicalize them */
        j = 0;

        for ( i = 0; i < p2m_size; i++ )
        {
            if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
                  XEN_DOMCTL_PFINFO_L1TAB) )
            {
                region_mfn[j] = p2m[i];
                j++;
            }

            if ( (i == (p2m_size-1)) || (j == MAX_BATCH_SIZE) )
            {
                region_base = xc_map_foreign_batch(
                    xc_handle, dom, PROT_READ | PROT_WRITE, region_mfn, j);
                if ( region_base == NULL )
                {
                    ERROR("map batch failed");
                    goto out;
                }

                for ( k = 0; k < j; k++ )
                {
                    if ( !uncanonicalize_pagetable(
                        xc_handle, dom, XEN_DOMCTL_PFINFO_L1TAB,
                        region_base + k*PAGE_SIZE) )
                    {
                        ERROR("failed uncanonicalize pt!");
                        goto out;
                    }
                }

                munmap(region_base, j*PAGE_SIZE);
                j = 0;
            }
        }

        if ( xc_flush_mmu_updates(xc_handle, mmu) )
        {
            ERROR("Error doing xc_flush_mmu_updates()");
            goto out;
        }
    }
    /*
     * Pin page tables. Do this after writing to them as otherwise Xen
     * will barf when doing the type-checking.
     */
    nr_pins = 0;
    for ( i = 0; i < p2m_size; i++ )
    {
        if ( (pfn_type[i] & XEN_DOMCTL_PFINFO_LPINTAB) == 0 )
            continue;

        switch ( pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
        {
        case XEN_DOMCTL_PFINFO_L1TAB:
            pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
            break;

        case XEN_DOMCTL_PFINFO_L2TAB:
            pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
            break;

        case XEN_DOMCTL_PFINFO_L3TAB:
            pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
            break;

        case XEN_DOMCTL_PFINFO_L4TAB:
            pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
            break;

        default:
            continue;
        }

        pin[nr_pins].arg1.mfn = p2m[i];
        nr_pins++;

        /* Batch full? Then flush. */
        if ( nr_pins == MAX_PIN_BATCH )
        {
            if ( xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0 )
            {
                ERROR("Failed to pin batch of %d page tables", nr_pins);
                goto out;
            }
            nr_pins = 0;
        }
    }

    /* Flush final partial batch. */
    if ( (nr_pins != 0) && (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) )
    {
        ERROR("Failed to pin batch of %d page tables", nr_pins);
        goto out;
    }

    DPRINTF("\b\b\b\b100%%\n");
    DPRINTF("Memory reloaded (%ld pages)\n", nr_pfns);
    /* Get the list of PFNs that are not in the pseudo-phys map */
    {
        unsigned int count;
        unsigned long *pfntab;
        int nr_frees, rc;

        if ( !read_exact(io_fd, &count, sizeof(count)) )
        {
            ERROR("Error when reading pfn count");
            goto out;
        }

        if ( !(pfntab = malloc(sizeof(unsigned long) * count)) )
        {
            ERROR("Out of memory");
            goto out;
        }

        if ( !read_exact(io_fd, pfntab, sizeof(unsigned long)*count) )
        {
            ERROR("Error when reading pfntab");
            goto out;
        }

        nr_frees = 0;
        for ( i = 0; i < count; i++ )
        {
            unsigned long pfn = pfntab[i];

            if ( p2m[pfn] != INVALID_P2M_ENTRY )
            {
                /* pfn is not in physmap now, but was at some point during
                   the save/migration process - need to free it */
                pfntab[nr_frees++] = p2m[pfn];
                p2m[pfn] = INVALID_P2M_ENTRY; /* not in pseudo-physical map */
            }
        }

        if ( nr_frees > 0 )
        {
            struct xen_memory_reservation reservation = {
                .nr_extents   = nr_frees,
                .extent_order = 0,
                .domid        = dom
            };
            set_xen_guest_handle(reservation.extent_start, pfntab);

            if ( (rc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
                                    &reservation)) != nr_frees )
            {
                ERROR("Could not decrease reservation : %d", rc);
                goto out;
            }
            else
                DPRINTF("Decreased reservation by %d pages\n", nr_frees);
        }

        free(pfntab);
    }
    for ( i = 0; i <= max_vcpu_id; i++ )
    {
        if ( !(vcpumap & (1ULL << i)) )
            continue;

        if ( !read_exact(io_fd, &ctxt, sizeof(ctxt)) )
        {
            ERROR("Error when reading ctxt %d", i);
            goto out;
        }

        if ( !new_ctxt_format )
            ctxt.flags |= VGCF_online;

        if ( i == 0 )
        {
            /*
             * Uncanonicalise the suspend-record frame number and poke
             * resume record.
             */
            pfn = ctxt.user_regs.edx;
            if ( (pfn >= p2m_size) ||
                 (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
            {
                ERROR("Suspend record frame number is bad");
                goto out;
            }
            ctxt.user_regs.edx = mfn = p2m[pfn];
            start_info = xc_map_foreign_range(
                xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
            start_info->nr_pages = p2m_size;
            start_info->shared_info = shared_info_frame << PAGE_SHIFT;
            start_info->flags = 0;
            *store_mfn = start_info->store_mfn = p2m[start_info->store_mfn];
            start_info->store_evtchn = store_evtchn;
            start_info->console.domU.mfn = p2m[start_info->console.domU.mfn];
            start_info->console.domU.evtchn = console_evtchn;
            *console_mfn = start_info->console.domU.mfn;
            munmap(start_info, PAGE_SIZE);
        }

        /* Uncanonicalise each GDT frame number. */
        if ( ctxt.gdt_ents > 8192 )
        {
            ERROR("GDT entry count out of range");
            goto out;
        }

        for ( j = 0; (512*j) < ctxt.gdt_ents; j++ )
        {
            pfn = ctxt.gdt_frames[j];
            if ( (pfn >= p2m_size) ||
                 (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
            {
                ERROR("GDT frame number is bad");
                goto out;
            }
            ctxt.gdt_frames[j] = p2m[pfn];
        }

        /* Uncanonicalise the page table base pointer. */
        pfn = xen_cr3_to_pfn(ctxt.ctrlreg[3]);

        if ( pfn >= p2m_size )
        {
            ERROR("PT base is bad: pfn=%lu p2m_size=%lu type=%08lx",
                  pfn, p2m_size, pfn_type[pfn]);
            goto out;
        }

        if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
             ((unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
        {
            ERROR("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
                  pfn, p2m_size, pfn_type[pfn],
                  (unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT);
            goto out;
        }

        ctxt.ctrlreg[3] = xen_pfn_to_cr3(p2m[pfn]);

        /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
        if ( (pt_levels == 4) && ctxt.ctrlreg[1] )
        {
            pfn = xen_cr3_to_pfn(ctxt.ctrlreg[1]);

            if ( pfn >= p2m_size )
            {
                ERROR("User PT base is bad: pfn=%lu p2m_size=%lu type=%08lx",
                      pfn, p2m_size, pfn_type[pfn]);
                goto out;
            }

            if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
                 ((unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
            {
                ERROR("User PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
                      pfn, p2m_size, pfn_type[pfn],
                      (unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT);
                goto out;
            }

            ctxt.ctrlreg[1] = xen_pfn_to_cr3(p2m[pfn]);
        }

        domctl.cmd = XEN_DOMCTL_setvcpucontext;
        domctl.domain = (domid_t)dom;
        domctl.u.vcpucontext.vcpu = i;
        set_xen_guest_handle(domctl.u.vcpucontext.ctxt, &ctxt);
        rc = xc_domctl(xc_handle, &domctl);
        if ( rc != 0 )
        {
            ERROR("Couldn't build vcpu%d", i);
            goto out;
        }
        rc = 1;
    }

    if ( !read_exact(io_fd, shared_info_page, PAGE_SIZE) )
    {
        ERROR("Error when reading shared info page");
        goto out;
    }

    /* Clear any pending events and the selector. */
    memset(&(shared_info->evtchn_pending[0]), 0,
           sizeof (shared_info->evtchn_pending));
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        shared_info->vcpu_info[i].evtchn_pending_sel = 0;

    /* Copy saved contents of shared-info page. No checking needed. */
    page = xc_map_foreign_range(
        xc_handle, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);
    memcpy(page, shared_info, PAGE_SIZE);
    munmap(page, PAGE_SIZE);

    /* Uncanonicalise the pfn-to-mfn table frame-number list. */
    for ( i = 0; i < P2M_FL_ENTRIES; i++ )
    {
        pfn = p2m_frame_list[i];
        if ( (pfn >= p2m_size) || (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
        {
            ERROR("PFN-to-MFN frame number is bad");
            goto out;
        }
        p2m_frame_list[i] = p2m[pfn];
    }

    /* Copy the P2M we've constructed to the 'live' P2M */
    if ( !(live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_WRITE,
                                           p2m_frame_list, P2M_FL_ENTRIES)) )
    {
        ERROR("Couldn't map p2m table");
        goto out;
    }

    memcpy(live_p2m, p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
    munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));

    DPRINTF("Domain ready to be built.\n");
    rc = 0;

 out:
    if ( (rc != 0) && (dom != 0) )
        xc_domain_destroy(xc_handle, dom);
    free(mmu);
    free(p2m);
    free(pfn_type);
    free(hvm_buf);
    free(p2m_frame_list);

    /* Discard cache for save file. */
    discard_file_cache(io_fd, 1 /* flush */);

    DPRINTF("Restore exit with rc=%d\n", rc);

    return rc;
}