ia64/xen-unstable

tools/libxc/xc_domain_restore.c @ 19526:a6003404e95b

restore: sign extend p2m when restoring on a host with pfn width <
guest pfn width (i.e. 32on64 domain 0 and a 64-bit guest domain).

Otherwise P2M entries which were INVALID_P2M_ENTRY
(== 0xffffffffffffffff) become 0xffffffff after a migrate.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>

author    Keir Fraser <keir.fraser@citrix.com>
date      Wed Apr 08 19:10:33 2009 +0100
parents   6595393a3d28
children  205b1badbcfd
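The fix itself lands near the end of this file (source lines 1198-1200), where the rebuilt p2m[] is copied into the guest-visible P2M. Below is a minimal standalone illustration of the failure mode it addresses, assuming a two's-complement target; the variable names are local to the example and not part of libxc:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* INVALID_P2M_ENTRY as held in a 32-bit toolstack's p2m[] array. */
    uint32_t entry = 0xffffffffU;

    /* Plain widening zero-extends: the 64-bit guest sees 0x00000000ffffffff. */
    uint64_t zero_ext = (uint64_t)entry;

    /* Going through a signed type first sign-extends, recovering the
     * all-ones INVALID_P2M_ENTRY value the 64-bit guest expects. */
    uint64_t sign_ext = (uint64_t)(int64_t)(int32_t)entry;

    printf("zero-extended: %016llx\nsign-extended: %016llx\n",
           (unsigned long long)zero_ext, (unsigned long long)sign_ext);
    return 0;
}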
1 /******************************************************************************
2 * xc_domain_restore.c
3 *
4 * Restore the state of a guest session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 * Copyright (c) 2006, Intel Corporation
8 * Copyright (c) 2007, XenSource Inc.
9 *
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms and conditions of the GNU General Public License,
12 * version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope it will be useful, but WITHOUT
15 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
17 * more details.
18 *
19 * You should have received a copy of the GNU General Public License along with
20 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
21 * Place - Suite 330, Boston, MA 02111-1307 USA.
22 *
23 */
25 #include <stdlib.h>
26 #include <unistd.h>
28 #include "xg_private.h"
29 #include "xg_save_restore.h"
30 #include "xc_dom.h"
32 #include <xen/hvm/ioreq.h>
33 #include <xen/hvm/params.h>
35 /* max mfn of the current host machine */
36 static unsigned long max_mfn;
38 /* virtual starting address of the hypervisor */
39 static unsigned long hvirt_start;
41 /* #levels of page tables used by the current guest */
42 static unsigned int pt_levels;
44 /* number of pfns this guest has (i.e. number of entries in the P2M) */
45 static unsigned long p2m_size;
47 /* number of 'in use' pfns in the guest (i.e. #P2M entries with a valid mfn) */
48 static unsigned long nr_pfns;
50 /* Live mapping of the table mapping each PFN to its current MFN. */
51 static xen_pfn_t *live_p2m = NULL;
53 /* A table mapping each PFN to its new MFN. */
54 static xen_pfn_t *p2m = NULL;
56 /* A table of P2M mappings in the current region */
57 static xen_pfn_t *p2m_batch = NULL;
59 /* Address size of the guest, in bytes */
60 unsigned int guest_width;
62 /*
63 ** In the state file (or during transfer), all page-table pages are
64 ** converted into a 'canonical' form where references to actual mfns
65 ** are replaced with references to the corresponding pfns.
66 ** This function inverts that operation, replacing the pfn values with
67 ** the (now known) appropriate mfn values.
68 */
69 static int uncanonicalize_pagetable(int xc_handle, uint32_t dom,
70 unsigned long type, void *page)
71 {
72 int i, pte_last;
73 unsigned long pfn;
74 uint64_t pte;
75 int nr_mfns = 0;
77 pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8);
79 /* First pass: work out how many (if any) MFNs we need to alloc */
80 for ( i = 0; i < pte_last; i++ )
81 {
82 if ( pt_levels == 2 )
83 pte = ((uint32_t *)page)[i];
84 else
85 pte = ((uint64_t *)page)[i];
87 /* XXX SMH: below needs fixing for PROT_NONE etc */
88 if ( !(pte & _PAGE_PRESENT) )
89 continue;
91 pfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
93 if ( pfn >= p2m_size )
94 {
95 /* This "page table page" is probably not one; bail. */
96 ERROR("Frame number in type %lu page table is out of range: "
97 "i=%d pfn=0x%lx p2m_size=%lu",
98 type >> 28, i, pfn, p2m_size);
99 return 0;
100 }
102 if ( p2m[pfn] == INVALID_P2M_ENTRY )
103 {
104 /* Have a 'valid' PFN without a matching MFN - need to alloc */
105 p2m_batch[nr_mfns++] = pfn;
106 p2m[pfn]--;
107 }
108 }
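/*
 * Note: the p2m[pfn]-- above leaves INVALID_P2M_ENTRY-1 in the slot as an
 * "allocation pending" marker; the second pass below recognises that value
 * and fills in the newly populated MFN from p2m_batch[].
 */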
110 /* Allocate the requisite number of mfns. */
111 if ( nr_mfns &&
112 (xc_domain_memory_populate_physmap(xc_handle, dom, nr_mfns, 0, 0,
113 p2m_batch) != 0) )
114 {
115 ERROR("Failed to allocate memory for batch.!\n");
116 errno = ENOMEM;
117 return 0;
118 }
120 /* Second pass: uncanonicalize each present PTE */
121 nr_mfns = 0;
122 for ( i = 0; i < pte_last; i++ )
123 {
124 if ( pt_levels == 2 )
125 pte = ((uint32_t *)page)[i];
126 else
127 pte = ((uint64_t *)page)[i];
129 /* XXX SMH: below needs fixing for PROT_NONE etc */
130 if ( !(pte & _PAGE_PRESENT) )
131 continue;
133 pfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
135 if ( p2m[pfn] == (INVALID_P2M_ENTRY-1) )
136 p2m[pfn] = p2m_batch[nr_mfns++];
138 pte &= ~MADDR_MASK_X86;
139 pte |= (uint64_t)p2m[pfn] << PAGE_SHIFT;
141 if ( pt_levels == 2 )
142 ((uint32_t *)page)[i] = (uint32_t)pte;
143 else
144 ((uint64_t *)page)[i] = (uint64_t)pte;
145 }
147 return 1;
148 }
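The save side applies the inverse transform (canonicalisation). A hedged sketch of the per-entry operation performed above, with the mask macros expanded and a hypothetical pfn_to_mfn() standing in for the p2m[] lookup:

#include <stdint.h>

/* Sketch only: mirrors the 64-bit PTE case above.  The masks correspond to
 * MFN_MASK_X86 / MADDR_MASK_X86 with PAGE_SHIFT == 12; pfn_to_mfn() is not
 * a real libxc function. */
static uint64_t uncanonicalize_pte(uint64_t pte,
                                   uint64_t (*pfn_to_mfn)(uint64_t))
{
    uint64_t pfn;

    if ( !(pte & 1) )                        /* _PAGE_PRESENT clear */
        return pte;                          /* non-present entries untouched */

    pfn  = (pte >> 12) & ((1ULL << 40) - 1); /* extract the canonical PFN */
    pte &= ~(((1ULL << 40) - 1) << 12);      /* clear the frame field */
    pte |= pfn_to_mfn(pfn) << 12;            /* substitute the real MFN */
    return pte;
}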
151 /* Load the p2m frame list, plus potential extended info chunk */
152 static xen_pfn_t *load_p2m_frame_list(
153 int io_fd, int *pae_extended_cr3, int *ext_vcpucontext)
154 {
155 xen_pfn_t *p2m_frame_list;
156 vcpu_guest_context_any_t ctxt;
157 xen_pfn_t p2m_fl_zero;
159 /* Read first entry of P2M list, or extended-info signature (~0UL). */
160 if ( read_exact(io_fd, &p2m_fl_zero, sizeof(long)) )
161 {
162 ERROR("read extended-info signature failed");
163 return NULL;
164 }
166 if ( p2m_fl_zero == ~0UL )
167 {
168 uint32_t tot_bytes;
170 /* Next 4 bytes: total size of following extended info. */
171 if ( read_exact(io_fd, &tot_bytes, sizeof(tot_bytes)) )
172 {
173 ERROR("read extended-info size failed");
174 return NULL;
175 }
177 while ( tot_bytes )
178 {
179 uint32_t chunk_bytes;
180 char chunk_sig[4];
182 /* 4-character chunk signature + 4-byte remaining chunk size. */
183 if ( read_exact(io_fd, chunk_sig, sizeof(chunk_sig)) ||
184 read_exact(io_fd, &chunk_bytes, sizeof(chunk_bytes)) ||
185 (tot_bytes < (chunk_bytes + 8)) )
186 {
187 ERROR("read extended-info chunk signature failed");
188 return NULL;
189 }
190 tot_bytes -= 8;
192 /* VCPU context structure? */
193 if ( !strncmp(chunk_sig, "vcpu", 4) )
194 {
195 /* Pick a guest word-size and PT depth from the ctxt size */
196 if ( chunk_bytes == sizeof (ctxt.x32) )
197 {
198 guest_width = 4;
199 if ( pt_levels > 2 )
200 pt_levels = 3;
201 }
202 else if ( chunk_bytes == sizeof (ctxt.x64) )
203 {
204 guest_width = 8;
205 pt_levels = 4;
206 }
207 else
208 {
209 ERROR("bad extended-info context size %d", chunk_bytes);
210 return NULL;
211 }
213 if ( read_exact(io_fd, &ctxt, chunk_bytes) )
214 {
215 ERROR("read extended-info vcpu context failed");
216 return NULL;
217 }
218 tot_bytes -= chunk_bytes;
219 chunk_bytes = 0;
221 if ( GET_FIELD(&ctxt, vm_assist)
222 & (1UL << VMASST_TYPE_pae_extended_cr3) )
223 *pae_extended_cr3 = 1;
224 }
225 else if ( !strncmp(chunk_sig, "extv", 4) )
226 {
227 *ext_vcpucontext = 1;
228 }
230 /* Any remaining bytes of this chunk: read and discard. */
231 while ( chunk_bytes )
232 {
233 unsigned long sz = MIN(chunk_bytes, sizeof(xen_pfn_t));
234 if ( read_exact(io_fd, &p2m_fl_zero, sz) )
235 {
236 ERROR("read-and-discard extended-info chunk bytes failed");
237 return NULL;
238 }
239 chunk_bytes -= sz;
240 tot_bytes -= sz;
241 }
242 }
244 /* Now read the real first entry of P2M list. */
245 if ( read_exact(io_fd, &p2m_fl_zero, sizeof(xen_pfn_t)) )
246 {
247 ERROR("read first entry of p2m_frame_list failed");
248 return NULL;
249 }
250 }
252 /* Now that we know the guest's word-size, can safely allocate
253 * the p2m frame list */
254 if ( (p2m_frame_list = malloc(P2M_TOOLS_FL_SIZE)) == NULL )
255 {
256 ERROR("Couldn't allocate p2m_frame_list array");
257 return NULL;
258 }
260 /* First entry has already been read. */
261 p2m_frame_list[0] = p2m_fl_zero;
262 if ( read_exact(io_fd, &p2m_frame_list[1],
263 (P2M_FL_ENTRIES - 1) * sizeof(xen_pfn_t)) )
264 {
265 ERROR("read p2m_frame_list failed");
266 return NULL;
267 }
269 return p2m_frame_list;
270 }
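/*
 * Layout of this part of the stream, as parsed above:
 *
 *   [ unsigned long ]  first p2m_frame_list entry, OR the ~0UL
 *                      extended-info signature; if the latter:
 *     [ uint32_t ]     total bytes of extended info, then chunks of
 *       [ char[4]  ]   chunk signature (e.g. "vcpu", "extv")
 *       [ uint32_t ]   chunk payload size
 *       [ payload  ]   (the "vcpu" payload size reveals the guest word size)
 *     [ xen_pfn_t ]    the real first p2m_frame_list entry
 *   [ xen_pfn_t x (P2M_FL_ENTRIES - 1) ]  remaining p2m_frame_list entries
 */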
272 int xc_domain_restore(int xc_handle, int io_fd, uint32_t dom,
273 unsigned int store_evtchn, unsigned long *store_mfn,
274 unsigned int console_evtchn, unsigned long *console_mfn,
275 unsigned int hvm, unsigned int pae)
276 {
277 DECLARE_DOMCTL;
278 int rc = 1, frc, i, j, n, m, pae_extended_cr3 = 0, ext_vcpucontext = 0;
279 unsigned long mfn, pfn;
280 unsigned int prev_pc, this_pc;
281 int verify = 0;
282 int nraces = 0;
284 /* The new domain's shared-info frame number. */
285 unsigned long shared_info_frame;
286 unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
287 shared_info_any_t *old_shared_info =
288 (shared_info_any_t *)shared_info_page;
289 shared_info_any_t *new_shared_info;
291 /* A copy of the CPU context of the guest. */
292 vcpu_guest_context_any_t ctxt;
294 /* A table containing the type of each PFN (/not/ MFN!). */
295 unsigned long *pfn_type = NULL;
297 /* A table of MFNs to map in the current region */
298 xen_pfn_t *region_mfn = NULL;
300 /* Types of the pfns in the current region */
301 unsigned long region_pfn_type[MAX_BATCH_SIZE];
303 /* A copy of the pfn-to-mfn table frame list. */
304 xen_pfn_t *p2m_frame_list = NULL;
306 /* A temporary mapping of the guest's start_info page. */
307 start_info_any_t *start_info;
309 /* Our mapping of the current region (batch) */
310 char *region_base;
312 struct xc_mmu *mmu = NULL;
314 /* used by debug verify code */
315 unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
317 struct mmuext_op pin[MAX_PIN_BATCH];
318 unsigned int nr_pins;
320 uint64_t vcpumap = 1ULL;
321 unsigned int max_vcpu_id = 0;
322 int new_ctxt_format = 0;
324 /* Magic frames in HVM guests: ioreqs and xenstore comms. */
325 uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
327 /* Buffer for holding HVM context */
328 uint8_t *hvm_buf = NULL;
330 /* For info only */
331 nr_pfns = 0;
333 if ( read_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
334 {
335 ERROR("read: p2m_size");
336 goto out;
337 }
338 DPRINTF("xc_domain_restore start: p2m_size = %lx\n", p2m_size);
340 if ( !get_platform_info(xc_handle, dom,
341 &max_mfn, &hvirt_start, &pt_levels, &guest_width) )
342 {
343 ERROR("Unable to get platform info.");
344 return 1;
345 }
347 /* The *current* word size of the guest isn't very interesting; for now
348 * assume the guest will be the same as we are. We'll fix that later
349 * if we discover otherwise. */
350 guest_width = sizeof(unsigned long);
351 pt_levels = (guest_width == 8) ? 4 : (pt_levels == 2) ? 2 : 3;
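/*
 * i.e. assume 4 paging levels for a 64-bit build; for 32 bits keep the
 * host-reported 2 levels (non-PAE) or fall back to 3 (PAE).  These
 * provisional values may be overridden by load_p2m_frame_list() below once
 * the "vcpu" extended-info chunk reveals the guest's real word size.
 */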
353 if ( !hvm )
354 {
355 /* Load the p2m frame list, plus potential extended info chunk */
356 p2m_frame_list = load_p2m_frame_list(
357 io_fd, &pae_extended_cr3, &ext_vcpucontext);
358 if ( !p2m_frame_list )
359 goto out;
361 /* Now that we know the word size, tell Xen about it */
362 memset(&domctl, 0, sizeof(domctl));
363 domctl.domain = dom;
364 domctl.cmd = XEN_DOMCTL_set_address_size;
365 domctl.u.address_size.size = guest_width * 8;
366 frc = do_domctl(xc_handle, &domctl);
367 if ( frc != 0 )
368 {
369 ERROR("Unable to set guest address size.");
370 goto out;
371 }
372 }
374 /* We want zeroed memory so use calloc rather than malloc. */
375 p2m = calloc(p2m_size, sizeof(xen_pfn_t));
376 pfn_type = calloc(p2m_size, sizeof(unsigned long));
378 region_mfn = xg_memalign(PAGE_SIZE, ROUNDUP(
379 MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
380 p2m_batch = xg_memalign(PAGE_SIZE, ROUNDUP(
381 MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
383 if ( (p2m == NULL) || (pfn_type == NULL) ||
384 (region_mfn == NULL) || (p2m_batch == NULL) )
385 {
386 ERROR("memory alloc failed");
387 errno = ENOMEM;
388 goto out;
389 }
391 memset(region_mfn, 0,
392 ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
393 memset(p2m_batch, 0,
394 ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
396 if ( lock_pages(region_mfn, sizeof(xen_pfn_t) * MAX_BATCH_SIZE) )
397 {
398 ERROR("Could not lock region_mfn");
399 goto out;
400 }
402 if ( lock_pages(p2m_batch, sizeof(xen_pfn_t) * MAX_BATCH_SIZE) )
403 {
404 ERROR("Could not lock p2m_batch");
405 goto out;
406 }
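/*
 * Both arrays are handed to the hypervisor below (p2m_batch as the extent
 * list for populate_physmap, region_mfn for the foreign-batch mappings),
 * so they are locked here to keep them resident while those calls run.
 */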
408 /* Get the domain's shared-info frame. */
409 domctl.cmd = XEN_DOMCTL_getdomaininfo;
410 domctl.domain = (domid_t)dom;
411 if ( xc_domctl(xc_handle, &domctl) < 0 )
412 {
413 ERROR("Could not get information on new domain");
414 goto out;
415 }
416 shared_info_frame = domctl.u.getdomaininfo.shared_info_frame;
418 /* Mark all PFNs as invalid; we allocate on demand */
419 for ( pfn = 0; pfn < p2m_size; pfn++ )
420 p2m[pfn] = INVALID_P2M_ENTRY;
422 mmu = xc_alloc_mmu_updates(xc_handle, dom);
423 if ( mmu == NULL )
424 {
425 ERROR("Could not initialise for MMU updates");
426 goto out;
427 }
429 DPRINTF("Reloading memory pages: 0%%\n");
431 /*
432 * Now simply read each saved frame into its new machine frame.
433 * We uncanonicalise page tables as we go.
434 */
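/*
 * Each iteration below starts with a 4-byte batch header that is either a
 * page count (1..MAX_BATCH_SIZE) or one of the special markers handled
 * first: -1 = enter verify mode, -2 = max_vcpu_id and vcpu map follow,
 * -3 = HVM identity-map PT address follows, -4 = HVM vm86 TSS address
 * follows, 0 = end of the page stream.
 */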
435 prev_pc = 0;
437 n = m = 0;
438 for ( ; ; )
439 {
440 int j, nr_mfns = 0;
442 this_pc = (n * 100) / p2m_size;
443 if ( (this_pc - prev_pc) >= 5 )
444 {
445 PPRINTF("\b\b\b\b%3d%%", this_pc);
446 prev_pc = this_pc;
447 }
449 if ( read_exact(io_fd, &j, sizeof(int)) )
450 {
451 ERROR("Error when reading batch size");
452 goto out;
453 }
455 PPRINTF("batch %d\n",j);
457 if ( j == -1 )
458 {
459 verify = 1;
460 DPRINTF("Entering page verify mode\n");
461 continue;
462 }
464 if ( j == -2 )
465 {
466 new_ctxt_format = 1;
467 if ( read_exact(io_fd, &max_vcpu_id, sizeof(int)) ||
468 (max_vcpu_id >= 64) ||
469 read_exact(io_fd, &vcpumap, sizeof(uint64_t)) )
470 {
471 ERROR("Error when reading max_vcpu_id");
472 goto out;
473 }
474 continue;
475 }
477 if ( j == -3 )
478 {
479 uint64_t ident_pt;
481 /* Skip padding 4 bytes then read the EPT identity PT location. */
482 if ( read_exact(io_fd, &ident_pt, sizeof(uint32_t)) ||
483 read_exact(io_fd, &ident_pt, sizeof(uint64_t)) )
484 {
485 ERROR("error read the address of the EPT identity map");
486 goto out;
487 }
489 xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT, ident_pt);
490 continue;
491 }
493 if ( j == -4 )
494 {
495 uint64_t vm86_tss;
497 /* Skip padding 4 bytes then read the vm86 TSS location. */
498 if ( read_exact(io_fd, &vm86_tss, sizeof(uint32_t)) ||
499 read_exact(io_fd, &vm86_tss, sizeof(uint64_t)) )
500 {
501 ERROR("error read the address of the vm86 TSS");
502 goto out;
503 }
505 xc_set_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, vm86_tss);
506 continue;
507 }
509 if ( j == 0 )
510 break; /* our work here is done */
512 if ( (j > MAX_BATCH_SIZE) || (j < 0) )
513 {
514 ERROR("Max batch size exceeded. Giving up.");
515 goto out;
516 }
518 if ( read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long)) )
519 {
520 ERROR("Error when reading region pfn types");
521 goto out;
522 }
524 /* First pass for this batch: work out how much memory to alloc */
525 nr_mfns = 0;
526 for ( i = 0; i < j; i++ )
527 {
528 unsigned long pfn, pagetype;
529 pfn = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
530 pagetype = region_pfn_type[i] & XEN_DOMCTL_PFINFO_LTAB_MASK;
532 if ( (pagetype != XEN_DOMCTL_PFINFO_XTAB) &&
533 (p2m[pfn] == INVALID_P2M_ENTRY) )
534 {
535 /* Have a live PFN which hasn't had an MFN allocated */
536 p2m_batch[nr_mfns++] = pfn;
537 p2m[pfn]--;
538 }
539 }
541 /* Now allocate a bunch of mfns for this batch */
542 if ( nr_mfns &&
543 (xc_domain_memory_populate_physmap(xc_handle, dom, nr_mfns, 0,
544 0, p2m_batch) != 0) )
545 {
546 ERROR("Failed to allocate memory for batch.!\n");
547 errno = ENOMEM;
548 goto out;
549 }
551 /* Second pass for this batch: update p2m[] and region_mfn[] */
552 nr_mfns = 0;
553 for ( i = 0; i < j; i++ )
554 {
555 unsigned long pfn, pagetype;
556 pfn = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
557 pagetype = region_pfn_type[i] & XEN_DOMCTL_PFINFO_LTAB_MASK;
559 if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
560 region_mfn[i] = ~0UL; /* map will fail but we don't care */
561 else
562 {
563 if ( p2m[pfn] == (INVALID_P2M_ENTRY-1) )
564 {
565 /* We just allocated a new mfn above; update p2m */
566 p2m[pfn] = p2m_batch[nr_mfns++];
567 nr_pfns++;
568 }
570 /* setup region_mfn[] for batch map.
571 * For HVM guests, this interface takes PFNs, not MFNs */
572 region_mfn[i] = hvm ? pfn : p2m[pfn];
573 }
574 }
576 /* Map relevant mfns */
577 region_base = xc_map_foreign_batch(
578 xc_handle, dom, PROT_WRITE, region_mfn, j);
580 if ( region_base == NULL )
581 {
582 ERROR("map batch failed");
583 goto out;
584 }
586 for ( i = 0; i < j; i++ )
587 {
588 void *page;
589 unsigned long pagetype;
591 pfn = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
592 pagetype = region_pfn_type[i] & XEN_DOMCTL_PFINFO_LTAB_MASK;
594 if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
595 /* a bogus/unmapped page: skip it */
596 continue;
598 if ( pfn > p2m_size )
599 {
600 ERROR("pfn out of range");
601 goto out;
602 }
604 pfn_type[pfn] = pagetype;
606 mfn = p2m[pfn];
608 /* In verify mode, we use a copy; otherwise we work in place */
609 page = verify ? (void *)buf : (region_base + i*PAGE_SIZE);
611 if ( read_exact(io_fd, page, PAGE_SIZE) )
612 {
613 ERROR("Error when reading page (type was %lx)", pagetype);
614 goto out;
615 }
617 pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
619 if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
620 (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
621 {
622 /*
623 ** A page table page - need to 'uncanonicalize' it, i.e.
624 ** replace all the references to pfns with the corresponding
625 ** mfns for the new domain.
626 **
627 ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
628 ** so we may need to update the p2m after the main loop.
629 ** Hence we defer canonicalization of L1s until then.
630 */
631 if ((pt_levels != 3) ||
632 pae_extended_cr3 ||
633 (pagetype != XEN_DOMCTL_PFINFO_L1TAB)) {
635 if (!uncanonicalize_pagetable(xc_handle, dom,
636 pagetype, page)) {
637 /*
638 ** Failing to uncanonicalize a page table can be ok
639 ** under live migration since the pages type may have
640 ** changed by now (and we'll get an update later).
641 */
642 DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
643 pagetype >> 28, pfn, mfn);
644 nraces++;
645 continue;
646 }
647 }
648 }
649 else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
650 {
651 ERROR("Bogus page type %lx page table is out of range: "
652 "i=%d p2m_size=%lu", pagetype, i, p2m_size);
653 goto out;
655 }
657 if ( verify )
658 {
659 int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
660 if ( res )
661 {
662 int v;
664 DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
665 "actualcs=%08lx\n", pfn, pfn_type[pfn],
666 csum_page(region_base + i*PAGE_SIZE),
667 csum_page(buf));
669 for ( v = 0; v < 4; v++ )
670 {
671 unsigned long *p = (unsigned long *)
672 (region_base + i*PAGE_SIZE);
673 if ( buf[v] != p[v] )
674 DPRINTF(" %d: %08lx %08lx\n", v, buf[v], p[v]);
675 }
676 }
677 }
679 if ( !hvm &&
680 xc_add_mmu_update(xc_handle, mmu,
681 (((unsigned long long)mfn) << PAGE_SHIFT)
682 | MMU_MACHPHYS_UPDATE, pfn) )
683 {
684 ERROR("failed machpys update mfn=%lx pfn=%lx", mfn, pfn);
685 goto out;
686 }
687 } /* end of 'batch' for loop */
689 munmap(region_base, j*PAGE_SIZE);
690 n+= j; /* crude stats */
692 /*
693 * Discard cache for portion of file read so far up to last
694 * page boundary every 16MB or so.
695 */
696 m += j;
697 if ( m > MAX_PAGECACHE_USAGE )
698 {
699 discard_file_cache(io_fd, 0 /* no flush */);
700 m = 0;
701 }
702 }
704 /*
705 * Ensure we flush all machphys updates before potential PAE-specific
706 * reallocations below.
707 */
708 if ( !hvm && xc_flush_mmu_updates(xc_handle, mmu) )
709 {
710 ERROR("Error doing flush_mmu_updates()");
711 goto out;
712 }
714 DPRINTF("Received all pages (%d races)\n", nraces);
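/*
 * For an HVM guest the remainder of the stream is: the three magic PFNs
 * (ioreq, buffered ioreq, xenstore), then a uint32_t length followed by
 * the opaque HVM context blob handed to xc_domain_hvm_setcontext().
 */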
716 if ( hvm )
717 {
718 uint32_t rec_len;
720 /* Set HVM-specific parameters */
721 if ( read_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
722 {
723 ERROR("error reading magic page addresses");
724 goto out;
725 }
727 /* These comms pages need to be zeroed at the start of day */
728 if ( xc_clear_domain_page(xc_handle, dom, magic_pfns[0]) ||
729 xc_clear_domain_page(xc_handle, dom, magic_pfns[1]) ||
730 xc_clear_domain_page(xc_handle, dom, magic_pfns[2]) )
731 {
732 ERROR("error zeroing magic pages");
733 goto out;
734 }
736 if ( (frc = xc_set_hvm_param(xc_handle, dom,
737 HVM_PARAM_IOREQ_PFN, magic_pfns[0]))
738 || (frc = xc_set_hvm_param(xc_handle, dom,
739 HVM_PARAM_BUFIOREQ_PFN, magic_pfns[1]))
740 || (frc = xc_set_hvm_param(xc_handle, dom,
741 HVM_PARAM_STORE_PFN, magic_pfns[2]))
742 || (frc = xc_set_hvm_param(xc_handle, dom,
743 HVM_PARAM_PAE_ENABLED, pae))
744 || (frc = xc_set_hvm_param(xc_handle, dom,
745 HVM_PARAM_STORE_EVTCHN,
746 store_evtchn)) )
747 {
748 ERROR("error setting HVM params: %i", frc);
749 goto out;
750 }
751 *store_mfn = magic_pfns[2];
753 /* Read HVM context */
754 if ( read_exact(io_fd, &rec_len, sizeof(uint32_t)) )
755 {
756 ERROR("error read hvm context size!\n");
757 goto out;
758 }
760 hvm_buf = malloc(rec_len);
761 if ( hvm_buf == NULL )
762 {
763 ERROR("memory alloc for hvm context buffer failed");
764 errno = ENOMEM;
765 goto out;
766 }
768 if ( read_exact(io_fd, hvm_buf, rec_len) )
769 {
770 ERROR("error loading the HVM context");
771 goto out;
772 }
774 frc = xc_domain_hvm_setcontext(xc_handle, dom, hvm_buf, rec_len);
775 if ( frc )
776 {
777 ERROR("error setting the HVM context");
778 goto out;
779 }
781 /* HVM success! */
782 rc = 0;
783 goto out;
784 }
786 /* Non-HVM guests only from here on */
788 if ( (pt_levels == 3) && !pae_extended_cr3 )
789 {
790 /*
791 ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This
792 ** is a little awkward and involves (a) finding all such PGDs and
793 ** replacing them with 'lowmem' versions; (b) updating the p2m[]
794 ** with the new info; and (c) canonicalizing all the L1s using the
795 ** (potentially updated) p2m[].
796 **
797 ** This is relatively slow (and currently involves two passes through
798 ** the pfn_type[] array), but at least seems to be correct. May wish
799 ** to consider more complex approaches to optimize this later.
800 */
802 int j, k;
804 /* First pass: find all L3TABs currently in > 4G mfns and get new mfns */
805 for ( i = 0; i < p2m_size; i++ )
806 {
807 if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
808 XEN_DOMCTL_PFINFO_L3TAB) &&
809 (p2m[i] > 0xfffffUL) )
810 {
811 unsigned long new_mfn;
812 uint64_t l3ptes[4];
813 uint64_t *l3tab;
815 l3tab = (uint64_t *)
816 xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
817 PROT_READ, p2m[i]);
819 for ( j = 0; j < 4; j++ )
820 l3ptes[j] = l3tab[j];
822 munmap(l3tab, PAGE_SIZE);
824 new_mfn = xc_make_page_below_4G(xc_handle, dom, p2m[i]);
825 if ( !new_mfn )
826 {
827 ERROR("Couldn't get a page below 4GB :-(");
828 goto out;
829 }
831 p2m[i] = new_mfn;
832 if ( xc_add_mmu_update(xc_handle, mmu,
833 (((unsigned long long)new_mfn)
834 << PAGE_SHIFT) |
835 MMU_MACHPHYS_UPDATE, i) )
836 {
837 ERROR("Couldn't m2p on PAE root pgdir");
838 goto out;
839 }
841 l3tab = (uint64_t *)
842 xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
843 PROT_READ | PROT_WRITE, p2m[i]);
845 for ( j = 0; j < 4; j++ )
846 l3tab[j] = l3ptes[j];
848 munmap(l3tab, PAGE_SIZE);
849 }
850 }
852 /* Second pass: find all L1TABs and uncanonicalize them */
853 j = 0;
855 for ( i = 0; i < p2m_size; i++ )
856 {
857 if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
858 XEN_DOMCTL_PFINFO_L1TAB) )
859 {
860 region_mfn[j] = p2m[i];
861 j++;
862 }
864 if ( (i == (p2m_size-1)) || (j == MAX_BATCH_SIZE) )
865 {
866 region_base = xc_map_foreign_batch(
867 xc_handle, dom, PROT_READ | PROT_WRITE, region_mfn, j);
868 if ( region_base == NULL )
869 {
870 ERROR("map batch failed");
871 goto out;
872 }
874 for ( k = 0; k < j; k++ )
875 {
876 if ( !uncanonicalize_pagetable(
877 xc_handle, dom, XEN_DOMCTL_PFINFO_L1TAB,
878 region_base + k*PAGE_SIZE) )
879 {
880 ERROR("failed uncanonicalize pt!");
881 goto out;
882 }
883 }
885 munmap(region_base, j*PAGE_SIZE);
886 j = 0;
887 }
888 }
890 if ( xc_flush_mmu_updates(xc_handle, mmu) )
891 {
892 ERROR("Error doing xc_flush_mmu_updates()");
893 goto out;
894 }
895 }
897 /*
898 * Pin page tables. Do this after writing to them as otherwise Xen
899 * will barf when doing the type-checking.
900 */
901 nr_pins = 0;
902 for ( i = 0; i < p2m_size; i++ )
903 {
904 if ( (pfn_type[i] & XEN_DOMCTL_PFINFO_LPINTAB) == 0 )
905 continue;
907 switch ( pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
908 {
909 case XEN_DOMCTL_PFINFO_L1TAB:
910 pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
911 break;
913 case XEN_DOMCTL_PFINFO_L2TAB:
914 pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
915 break;
917 case XEN_DOMCTL_PFINFO_L3TAB:
918 pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
919 break;
921 case XEN_DOMCTL_PFINFO_L4TAB:
922 pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
923 break;
925 default:
926 continue;
927 }
929 pin[nr_pins].arg1.mfn = p2m[i];
930 nr_pins++;
932 /* Batch full? Then flush. */
933 if ( nr_pins == MAX_PIN_BATCH )
934 {
935 if ( xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0 )
936 {
937 ERROR("Failed to pin batch of %d page tables", nr_pins);
938 goto out;
939 }
940 nr_pins = 0;
941 }
942 }
944 /* Flush final partial batch. */
945 if ( (nr_pins != 0) && (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) )
946 {
947 ERROR("Failed to pin batch of %d page tables", nr_pins);
948 goto out;
949 }
951 DPRINTF("\b\b\b\b100%%\n");
952 DPRINTF("Memory reloaded (%ld pages)\n", nr_pfns);
954 /* Get the list of PFNs that are not in the pseudo-phys map */
955 {
956 unsigned int count = 0;
957 unsigned long *pfntab;
958 int nr_frees;
960 if ( read_exact(io_fd, &count, sizeof(count)) ||
961 (count > (1U << 28)) ) /* up to 1TB of address space */
962 {
963 ERROR("Error when reading pfn count (= %u)", count);
964 goto out;
965 }
967 if ( !(pfntab = malloc(sizeof(unsigned long) * count)) )
968 {
969 ERROR("Out of memory");
970 goto out;
971 }
973 if ( read_exact(io_fd, pfntab, sizeof(unsigned long)*count) )
974 {
975 ERROR("Error when reading pfntab");
976 goto out;
977 }
979 nr_frees = 0;
980 for ( i = 0; i < count; i++ )
981 {
982 unsigned long pfn = pfntab[i];
984 if ( p2m[pfn] != INVALID_P2M_ENTRY )
985 {
986 /* pfn is not in physmap now, but was at some point during
987 the save/migration process - need to free it */
988 pfntab[nr_frees++] = p2m[pfn];
989 p2m[pfn] = INVALID_P2M_ENTRY; /* not in pseudo-physical map */
990 }
991 }
993 if ( nr_frees > 0 )
994 {
995 struct xen_memory_reservation reservation = {
996 .nr_extents = nr_frees,
997 .extent_order = 0,
998 .domid = dom
999 };
1000 set_xen_guest_handle(reservation.extent_start, pfntab);
1002 if ( (frc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
1003 &reservation)) != nr_frees )
1004 {
1005 ERROR("Could not decrease reservation : %d", frc);
1006 goto out;
1007 }
1008 else
1009 DPRINTF("Decreased reservation by %d pages\n", count);
1010 }
1011 }
1013 if ( lock_pages(&ctxt, sizeof(ctxt)) )
1014 {
1015 ERROR("Unable to lock ctxt");
1016 return 1;
1017 }
1019 for ( i = 0; i <= max_vcpu_id; i++ )
1020 {
1021 if ( !(vcpumap & (1ULL << i)) )
1022 continue;
1024 if ( read_exact(io_fd, &ctxt, ((guest_width == 8)
1025 ? sizeof(ctxt.x64)
1026 : sizeof(ctxt.x32))) )
1027 {
1028 ERROR("Error when reading ctxt %d", i);
1029 goto out;
1030 }
1032 if ( !new_ctxt_format )
1033 SET_FIELD(&ctxt, flags, GET_FIELD(&ctxt, flags) | VGCF_online);
1035 if ( i == 0 )
1036 {
1037 /*
1038 * Uncanonicalise the suspend-record frame number and poke
1039 * resume record.
1040 */
1041 pfn = GET_FIELD(&ctxt, user_regs.edx);
1042 if ( (pfn >= p2m_size) ||
1043 (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
1044 {
1045 ERROR("Suspend record frame number is bad");
1046 goto out;
1047 }
1048 mfn = p2m[pfn];
1049 SET_FIELD(&ctxt, user_regs.edx, mfn);
1050 start_info = xc_map_foreign_range(
1051 xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
1052 SET_FIELD(start_info, nr_pages, p2m_size);
1053 SET_FIELD(start_info, shared_info, shared_info_frame<<PAGE_SHIFT);
1054 SET_FIELD(start_info, flags, 0);
1055 *store_mfn = p2m[GET_FIELD(start_info, store_mfn)];
1056 SET_FIELD(start_info, store_mfn, *store_mfn);
1057 SET_FIELD(start_info, store_evtchn, store_evtchn);
1058 *console_mfn = p2m[GET_FIELD(start_info, console.domU.mfn)];
1059 SET_FIELD(start_info, console.domU.mfn, *console_mfn);
1060 SET_FIELD(start_info, console.domU.evtchn, console_evtchn);
1061 munmap(start_info, PAGE_SIZE);
1062 }
1063 /* Uncanonicalise each GDT frame number. */
1064 if ( GET_FIELD(&ctxt, gdt_ents) > 8192 )
1065 {
1066 ERROR("GDT entry count out of range");
1067 goto out;
1068 }
1070 for ( j = 0; (512*j) < GET_FIELD(&ctxt, gdt_ents); j++ )
1071 {
1072 pfn = GET_FIELD(&ctxt, gdt_frames[j]);
1073 if ( (pfn >= p2m_size) ||
1074 (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
1075 {
1076 ERROR("GDT frame number %i (0x%lx) is bad",
1077 j, (unsigned long)pfn);
1078 goto out;
1079 }
1080 SET_FIELD(&ctxt, gdt_frames[j], p2m[pfn]);
1081 }
1082 /* Uncanonicalise the page table base pointer. */
1083 pfn = UNFOLD_CR3(GET_FIELD(&ctxt, ctrlreg[3]));
1085 if ( pfn >= p2m_size )
1086 {
1087 ERROR("PT base is bad: pfn=%lu p2m_size=%lu type=%08lx",
1088 pfn, p2m_size, pfn_type[pfn]);
1089 goto out;
1090 }
1092 if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
1093 ((unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
1094 {
1095 ERROR("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
1096 pfn, p2m_size, pfn_type[pfn],
1097 (unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT);
1098 goto out;
1099 }
1100 SET_FIELD(&ctxt, ctrlreg[3], FOLD_CR3(p2m[pfn]));
1102 /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
1103 if ( (pt_levels == 4) && (ctxt.x64.ctrlreg[1] & 1) )
1104 {
1105 pfn = UNFOLD_CR3(ctxt.x64.ctrlreg[1] & ~1);
1106 if ( pfn >= p2m_size )
1107 {
1108 ERROR("User PT base is bad: pfn=%lu p2m_size=%lu",
1109 pfn, p2m_size);
1110 goto out;
1111 }
1112 if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
1113 ((unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
1114 {
1115 ERROR("User PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
1116 pfn, p2m_size, pfn_type[pfn],
1117 (unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT);
1118 goto out;
1119 }
1120 ctxt.x64.ctrlreg[1] = FOLD_CR3(p2m[pfn]);
1121 }
1122 domctl.cmd = XEN_DOMCTL_setvcpucontext;
1123 domctl.domain = (domid_t)dom;
1124 domctl.u.vcpucontext.vcpu = i;
1125 set_xen_guest_handle(domctl.u.vcpucontext.ctxt, &ctxt.c);
1126 frc = xc_domctl(xc_handle, &domctl);
1127 if ( frc != 0 )
1128 {
1129 ERROR("Couldn't build vcpu%d", i);
1130 goto out;
1131 }
1133 if ( !ext_vcpucontext )
1134 continue;
1135 if ( read_exact(io_fd, &domctl.u.ext_vcpucontext, 128) ||
1136 (domctl.u.ext_vcpucontext.vcpu != i) )
1137 {
1138 ERROR("Error when reading extended ctxt %d", i);
1139 goto out;
1140 }
1141 domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
1142 domctl.domain = dom;
1143 frc = xc_domctl(xc_handle, &domctl);
1144 if ( frc != 0 )
1145 {
1146 ERROR("Couldn't set extended vcpu%d info\n", i);
1147 goto out;
1148 }
1149 }
1151 if ( read_exact(io_fd, shared_info_page, PAGE_SIZE) )
1152 {
1153 ERROR("Error when reading shared info page");
1154 goto out;
1155 }
1157 /* Restore contents of shared-info page. No checking needed. */
1158 new_shared_info = xc_map_foreign_range(
1159 xc_handle, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);
1161 /* restore saved vcpu_info and arch specific info */
1162 MEMCPY_FIELD(new_shared_info, old_shared_info, vcpu_info);
1163 MEMCPY_FIELD(new_shared_info, old_shared_info, arch);
1165 /* clear any pending events and the selector */
1166 MEMSET_ARRAY_FIELD(new_shared_info, evtchn_pending, 0);
1167 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
1168 SET_FIELD(new_shared_info, vcpu_info[i].evtchn_pending_sel, 0);
1170 /* mask event channels */
1171 MEMSET_ARRAY_FIELD(new_shared_info, evtchn_mask, 0xff);
1173 /* leave wallclock time. set by hypervisor */
1174 munmap(new_shared_info, PAGE_SIZE);
1176 /* Uncanonicalise the pfn-to-mfn table frame-number list. */
1177 for ( i = 0; i < P2M_FL_ENTRIES; i++ )
1178 {
1179 pfn = p2m_frame_list[i];
1180 if ( (pfn >= p2m_size) || (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
1181 {
1182 ERROR("PFN-to-MFN frame number %i (%#lx) is bad", i, pfn);
1183 goto out;
1184 }
1185 p2m_frame_list[i] = p2m[pfn];
1186 }
1188 /* Copy the P2M we've constructed to the 'live' P2M */
1189 if ( !(live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_WRITE,
1190 p2m_frame_list, P2M_FL_ENTRIES)) )
1191 {
1192 ERROR("Couldn't map p2m table");
1193 goto out;
1194 }
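/*
 * This is where the changeset above takes effect: when a 32-bit toolstack
 * restores a 64-bit guest (guest_width > sizeof(xen_pfn_t)), the copy into
 * the guest's 64-bit live P2M goes through a signed cast so that
 * INVALID_P2M_ENTRY (all ones) is sign-extended to 64 bits instead of being
 * left as 0x00000000ffffffff.
 */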
1196 /* If the domain we're restoring has a different word size to ours,
1197 * we need to adjust the live_p2m assignment appropriately */
1198 if ( guest_width > sizeof (xen_pfn_t) )
1199 for ( i = p2m_size - 1; i >= 0; i-- )
1200 ((int64_t *)live_p2m)[i] = (long)p2m[i];
1201 else if ( guest_width < sizeof (xen_pfn_t) )
1202 for ( i = 0; i < p2m_size; i++ )
1203 ((uint32_t *)live_p2m)[i] = p2m[i];
1204 else
1205 memcpy(live_p2m, p2m, p2m_size * sizeof(xen_pfn_t));
1206 munmap(live_p2m, P2M_FL_ENTRIES * PAGE_SIZE);
1208 DPRINTF("Domain ready to be built.\n");
1209 rc = 0;
1211 out:
1212 if ( (rc != 0) && (dom != 0) )
1213 xc_domain_destroy(xc_handle, dom);
1214 free(mmu);
1215 free(p2m);
1216 free(pfn_type);
1217 free(hvm_buf);
1219 /* discard cache for save file */
1220 discard_file_cache(io_fd, 1 /*flush*/);
1222 DPRINTF("Restore exit with rc=%d\n", rc);
1224 return rc;
1225 }
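For orientation, a sketch (not from the tree) of how a restore driver such as the xc_restore helper might invoke xc_domain_restore(); it assumes the domain has already been created empty and that the file descriptor, event channels and PAE/HVM flags are supplied by the caller:

#include <stdio.h>
#include "xenctrl.h"
#include "xenguest.h"

/* Hypothetical driver: dom must already exist, io_fd carries the save image. */
int restore_domain(int io_fd, uint32_t dom, unsigned int store_evtchn,
                   unsigned int console_evtchn, unsigned int hvm, unsigned int pae)
{
    unsigned long store_mfn = 0, console_mfn = 0;
    int xc_handle, rc;

    xc_handle = xc_interface_open();
    if ( xc_handle < 0 )
        return 1;

    rc = xc_domain_restore(xc_handle, io_fd, dom,
                           store_evtchn, &store_mfn,
                           console_evtchn, &console_mfn,
                           hvm, pae);
    if ( rc == 0 )
        printf("store_mfn=%lx console_mfn=%lx\n", store_mfn, console_mfn);

    xc_interface_close(xc_handle);
    return rc;
}

On success the returned store_mfn and console_mfn are handed back to the toolstack so it can wire up the xenstore and console rings for the new domain.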