ia64/xen-unstable

view tools/libxc/xc_domain_restore.c @ 19639:205b1badbcfd

Add support for superpages (hugepages) in PV domain

This patch adds the option "superpages" to the domain configuration
file. If it is set, the domain is populated using 2M pages.
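
For illustration, a guest config enabling the option might look like the
sketch below; only the "superpages" line is the option added by this patch,
and the kernel path, memory size and name are placeholders:

    kernel = "/boot/vmlinuz-xen"
    memory = 1024
    name = "pv-superpages"
    superpages = 1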

This code does not support falling back to small pages. If the domain
cannot be created with 2M pages, creation will fail.

The patch also includes support for saving and restoring domains with
the superpage flag set. However, if a domain has freed small pages
within its physical page array and then extended the array, the
restore will re-populate those freed pages, attempt to allocate more
than its memory limit, and fail. This is significant because Linux
apparently does this during boot, so a freshly booted Linux image
cannot be saved and restored successfully.
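
The mechanism behind this limitation can be sketched as follows (an
illustrative fragment only, reusing the SUPERPAGE_* constants and the
populate call from the restore code below): any pfn seen in the save
stream forces allocation of the whole 2M extent containing it, so pfns
the guest had freed inside that extent are allocated again:

    /* round the pfn down to the start of its 512-page (2M) extent */
    unsigned long base_pfn = pfn & ~(SUPERPAGE_NR_PFNS - 1);
    /* one order-9 allocation backs base_pfn .. base_pfn + 511 */
    xc_domain_memory_populate_physmap(xc_handle, dom, 1,
                                      SUPERPAGE_PFN_SHIFT, 0, &base_pfn);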

Signed-off-by: Dave McCracken <dcm@mccr.org>
author Keir Fraser <keir.fraser@citrix.com>
date Tue May 26 09:58:38 2009 +0100 (2009-05-26)
parents a6003404e95b
children 2f9e1348aa98
line source
1 /******************************************************************************
2 * xc_domain_restore.c
3 *
4 * Restore the state of a guest session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 * Copyright (c) 2006, Intel Corporation
8 * Copyright (c) 2007, XenSource Inc.
9 *
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms and conditions of the GNU General Public License,
12 * version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope it will be useful, but WITHOUT
15 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
17 * more details.
18 *
19 * You should have received a copy of the GNU General Public License along with
20 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
21 * Place - Suite 330, Boston, MA 02111-1307 USA.
22 *
23 */
25 #include <stdlib.h>
26 #include <unistd.h>
28 #include "xg_private.h"
29 #include "xg_save_restore.h"
30 #include "xc_dom.h"
32 #include <xen/hvm/ioreq.h>
33 #include <xen/hvm/params.h>
35 /* max mfn of the current host machine */
36 static unsigned long max_mfn;
38 /* virtual starting address of the hypervisor */
39 static unsigned long hvirt_start;
41 /* #levels of page tables used by the current guest */
42 static unsigned int pt_levels;
44 /* number of pfns this guest has (i.e. number of entries in the P2M) */
45 static unsigned long p2m_size;
47 /* number of 'in use' pfns in the guest (i.e. #P2M entries with a valid mfn) */
48 static unsigned long nr_pfns;
50 /* Live mapping of the table mapping each PFN to its current MFN. */
51 static xen_pfn_t *live_p2m = NULL;
53 /* A table mapping each PFN to its new MFN. */
54 static xen_pfn_t *p2m = NULL;
56 /* Address size of the guest, in bytes */
57 unsigned int guest_width;
59 /*
60 ** Defines for 2M superpage support: a superpage covers
61 ** 2^9 = 512 contiguous 4k pages.
62 */
63 #define SUPERPAGE_PFN_SHIFT 9
64 #define SUPERPAGE_NR_PFNS (1UL << SUPERPAGE_PFN_SHIFT)
66 static int allocate_mfn(int xc_handle, uint32_t dom, unsigned long pfn, int superpages)
67 {
68 unsigned long mfn;
70 if (superpages)
71 {
72 unsigned long base_pfn;
74 base_pfn = pfn & ~(SUPERPAGE_NR_PFNS-1);
75 mfn = base_pfn;
77 if (xc_domain_memory_populate_physmap(xc_handle, dom, 1,
78 SUPERPAGE_PFN_SHIFT, 0, &mfn) != 0)
79 {
80 ERROR("Failed to allocate physical memory at pfn 0x%x, base 0x%x.\n", pfn, base_pfn);
81 errno = ENOMEM;
82 return 1;
83 }
84 for (pfn = base_pfn; pfn < base_pfn + SUPERPAGE_NR_PFNS; pfn++, mfn++)
85 {
86 p2m[pfn] = mfn;
87 }
88 }
89 else
90 {
91 mfn = pfn;
92 if (xc_domain_memory_populate_physmap(xc_handle, dom, 1, 0,
93 0, &mfn) != 0)
94 {
95 ERROR("Failed to allocate physical memory.!\n");
96 errno = ENOMEM;
97 return 1;
98 }
99 p2m[pfn] = mfn;
100 }
101 return 0;
102 }
104 static int allocate_physmem(int xc_handle, uint32_t dom,
105 unsigned long *region_pfn_type, int region_size,
106 unsigned int hvm, xen_pfn_t *region_mfn, int superpages)
107 {
108 int i;
109 unsigned long pfn;
110 unsigned long pagetype;
112 for (i = 0; i < region_size; i++)
113 {
114 pfn = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
115 pagetype = region_pfn_type[i] & XEN_DOMCTL_PFINFO_LTAB_MASK;
117 if ( pfn >= p2m_size )
118 {
119 ERROR("pfn out of range");
120 return 1;
121 }
122 if (pagetype == XEN_DOMCTL_PFINFO_XTAB)
123 {
124 region_mfn[i] = ~0UL;
125 }
126 else
127 {
128 if (p2m[pfn] == INVALID_P2M_ENTRY)
129 {
130 if (allocate_mfn(xc_handle, dom, pfn, superpages) != 0)
131 return 1;
132 }
134 /* setup region_mfn[] for batch map.
135 * For HVM guests, this interface takes PFNs, not MFNs */
136 region_mfn[i] = hvm ? pfn : p2m[pfn];
137 }
138 }
139 return 0;
140 }
143 /*
144 ** In the state file (or during transfer), all page-table pages are
145 ** converted into a 'canonical' form where references to actual mfns
146 ** are replaced with references to the corresponding pfns.
147 ** This function inverts that operation, replacing the pfn values with
148 ** the (now known) appropriate mfn values.
149 */
150 static int uncanonicalize_pagetable(int xc_handle, uint32_t dom,
151 unsigned long type, void *page, int superpages)
152 {
153 int i, pte_last;
154 unsigned long pfn;
155 uint64_t pte;
157 pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8);
159 for ( i = 0; i < pte_last; i++ )
160 {
161 if ( pt_levels == 2 )
162 pte = ((uint32_t *)page)[i];
163 else
164 pte = ((uint64_t *)page)[i];
166 /* XXX SMH: below needs fixing for PROT_NONE etc */
167 if ( !(pte & _PAGE_PRESENT) )
168 continue;
170 pfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
172 /* Allocate mfn if necessary */
173 if ( p2m[pfn] == INVALID_P2M_ENTRY )
174 {
175 if (allocate_mfn(xc_handle, dom, pfn, superpages) != 0)
176 return 0;
177 }
178 pte &= ~MADDR_MASK_X86;
179 pte |= (uint64_t)p2m[pfn] << PAGE_SHIFT;
181 if ( pt_levels == 2 )
182 ((uint32_t *)page)[i] = (uint32_t)pte;
183 else
184 ((uint64_t *)page)[i] = (uint64_t)pte;
185 }
187 return 1;
188 }
191 /* Load the p2m frame list, plus potential extended info chunk */
192 static xen_pfn_t *load_p2m_frame_list(
193 int io_fd, int *pae_extended_cr3, int *ext_vcpucontext)
194 {
195 xen_pfn_t *p2m_frame_list;
196 vcpu_guest_context_any_t ctxt;
197 xen_pfn_t p2m_fl_zero;
199 /* Read first entry of P2M list, or extended-info signature (~0UL). */
200 if ( read_exact(io_fd, &p2m_fl_zero, sizeof(long)) )
201 {
202 ERROR("read extended-info signature failed");
203 return NULL;
204 }
206 if ( p2m_fl_zero == ~0UL )
207 {
208 uint32_t tot_bytes;
210 /* Next 4 bytes: total size of following extended info. */
211 if ( read_exact(io_fd, &tot_bytes, sizeof(tot_bytes)) )
212 {
213 ERROR("read extended-info size failed");
214 return NULL;
215 }
217 while ( tot_bytes )
218 {
219 uint32_t chunk_bytes;
220 char chunk_sig[4];
222 /* 4-character chunk signature + 4-byte remaining chunk size. */
223 if ( read_exact(io_fd, chunk_sig, sizeof(chunk_sig)) ||
224 read_exact(io_fd, &chunk_bytes, sizeof(chunk_bytes)) ||
225 (tot_bytes < (chunk_bytes + 8)) )
226 {
227 ERROR("read extended-info chunk signature failed");
228 return NULL;
229 }
230 tot_bytes -= 8;
232 /* VCPU context structure? */
233 if ( !strncmp(chunk_sig, "vcpu", 4) )
234 {
235 /* Pick a guest word-size and PT depth from the ctxt size */
236 if ( chunk_bytes == sizeof (ctxt.x32) )
237 {
238 guest_width = 4;
239 if ( pt_levels > 2 )
240 pt_levels = 3;
241 }
242 else if ( chunk_bytes == sizeof (ctxt.x64) )
243 {
244 guest_width = 8;
245 pt_levels = 4;
246 }
247 else
248 {
249 ERROR("bad extended-info context size %d", chunk_bytes);
250 return NULL;
251 }
253 if ( read_exact(io_fd, &ctxt, chunk_bytes) )
254 {
255 ERROR("read extended-info vcpu context failed");
256 return NULL;
257 }
258 tot_bytes -= chunk_bytes;
259 chunk_bytes = 0;
261 if ( GET_FIELD(&ctxt, vm_assist)
262 & (1UL << VMASST_TYPE_pae_extended_cr3) )
263 *pae_extended_cr3 = 1;
264 }
265 else if ( !strncmp(chunk_sig, "extv", 4) )
266 {
267 *ext_vcpucontext = 1;
268 }
270 /* Any remaining bytes of this chunk: read and discard. */
271 while ( chunk_bytes )
272 {
273 unsigned long sz = MIN(chunk_bytes, sizeof(xen_pfn_t));
274 if ( read_exact(io_fd, &p2m_fl_zero, sz) )
275 {
276 ERROR("read-and-discard extended-info chunk bytes failed");
277 return NULL;
278 }
279 chunk_bytes -= sz;
280 tot_bytes -= sz;
281 }
282 }
284 /* Now read the real first entry of P2M list. */
285 if ( read_exact(io_fd, &p2m_fl_zero, sizeof(xen_pfn_t)) )
286 {
287 ERROR("read first entry of p2m_frame_list failed");
288 return NULL;
289 }
290 }
292 /* Now that we know the guest's word-size, can safely allocate
293 * the p2m frame list */
294 if ( (p2m_frame_list = malloc(P2M_TOOLS_FL_SIZE)) == NULL )
295 {
296 ERROR("Couldn't allocate p2m_frame_list array");
297 return NULL;
298 }
300 /* First entry has already been read. */
301 p2m_frame_list[0] = p2m_fl_zero;
302 if ( read_exact(io_fd, &p2m_frame_list[1],
303 (P2M_FL_ENTRIES - 1) * sizeof(xen_pfn_t)) )
304 {
305 ERROR("read p2m_frame_list failed");
306 return NULL;
307 }
309 return p2m_frame_list;
310 }
312 int xc_domain_restore(int xc_handle, int io_fd, uint32_t dom,
313 unsigned int store_evtchn, unsigned long *store_mfn,
314 unsigned int console_evtchn, unsigned long *console_mfn,
315 unsigned int hvm, unsigned int pae, int superpages)
316 {
317 DECLARE_DOMCTL;
318 int rc = 1, frc, i, j, n, m, pae_extended_cr3 = 0, ext_vcpucontext = 0;
319 unsigned long mfn, pfn;
320 unsigned int prev_pc, this_pc;
321 int verify = 0;
322 int nraces = 0;
324 /* The new domain's shared-info frame number. */
325 unsigned long shared_info_frame;
326 unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
327 shared_info_any_t *old_shared_info =
328 (shared_info_any_t *)shared_info_page;
329 shared_info_any_t *new_shared_info;
331 /* A copy of the CPU context of the guest. */
332 vcpu_guest_context_any_t ctxt;
334 /* A table containing the type of each PFN (/not/ MFN!). */
335 unsigned long *pfn_type = NULL;
337 /* A table of MFNs to map in the current region */
338 xen_pfn_t *region_mfn = NULL;
340 /* Types of the pfns in the current region */
341 unsigned long region_pfn_type[MAX_BATCH_SIZE];
343 /* A copy of the pfn-to-mfn table frame list. */
344 xen_pfn_t *p2m_frame_list = NULL;
346 /* A temporary mapping of the guest's start_info page. */
347 start_info_any_t *start_info;
349 /* Our mapping of the current region (batch) */
350 char *region_base;
352 struct xc_mmu *mmu = NULL;
354 /* used by debug verify code */
355 unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
357 struct mmuext_op pin[MAX_PIN_BATCH];
358 unsigned int nr_pins;
360 uint64_t vcpumap = 1ULL;
361 unsigned int max_vcpu_id = 0;
362 int new_ctxt_format = 0;
364 /* Magic frames in HVM guests: ioreqs and xenstore comms. */
365 uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
367 /* Buffer for holding HVM context */
368 uint8_t *hvm_buf = NULL;
370 /* For info only */
371 nr_pfns = 0;
373 if ( read_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
374 {
375 ERROR("read: p2m_size");
376 goto out;
377 }
378 DPRINTF("xc_domain_restore start: p2m_size = %lx\n", p2m_size);
380 if ( !get_platform_info(xc_handle, dom,
381 &max_mfn, &hvirt_start, &pt_levels, &guest_width) )
382 {
383 ERROR("Unable to get platform info.");
384 return 1;
385 }
387 /* The *current* word size of the guest isn't very interesting; for now
388 * assume the guest will be the same as we are. We'll fix that later
389 * if we discover otherwise. */
390 guest_width = sizeof(unsigned long);
391 pt_levels = (guest_width == 8) ? 4 : (pt_levels == 2) ? 2 : 3;
393 if ( !hvm )
394 {
395 /* Load the p2m frame list, plus potential extended info chunk */
396 p2m_frame_list = load_p2m_frame_list(
397 io_fd, &pae_extended_cr3, &ext_vcpucontext);
398 if ( !p2m_frame_list )
399 goto out;
401 /* Now that we know the word size, tell Xen about it */
402 memset(&domctl, 0, sizeof(domctl));
403 domctl.domain = dom;
404 domctl.cmd = XEN_DOMCTL_set_address_size;
405 domctl.u.address_size.size = guest_width * 8;
406 frc = do_domctl(xc_handle, &domctl);
407 if ( frc != 0 )
408 {
409 ERROR("Unable to set guest address size.");
410 goto out;
411 }
412 }
414 /* We want zeroed memory so use calloc rather than malloc. */
415 p2m = calloc(p2m_size, sizeof(xen_pfn_t));
416 pfn_type = calloc(p2m_size, sizeof(unsigned long));
418 region_mfn = xg_memalign(PAGE_SIZE, ROUNDUP(
419 MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
421 if ( (p2m == NULL) || (pfn_type == NULL) ||
422 (region_mfn == NULL) )
423 {
424 ERROR("memory alloc failed");
425 errno = ENOMEM;
426 goto out;
427 }
429 memset(region_mfn, 0,
430 ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
432 if ( lock_pages(region_mfn, sizeof(xen_pfn_t) * MAX_BATCH_SIZE) )
433 {
434 ERROR("Could not lock region_mfn");
435 goto out;
436 }
438 /* Get the domain's shared-info frame. */
439 domctl.cmd = XEN_DOMCTL_getdomaininfo;
440 domctl.domain = (domid_t)dom;
441 if ( xc_domctl(xc_handle, &domctl) < 0 )
442 {
443 ERROR("Could not get information on new domain");
444 goto out;
445 }
446 shared_info_frame = domctl.u.getdomaininfo.shared_info_frame;
448 /* Mark all PFNs as invalid; we allocate on demand */
449 for ( pfn = 0; pfn < p2m_size; pfn++ )
450 p2m[pfn] = INVALID_P2M_ENTRY;
452 mmu = xc_alloc_mmu_updates(xc_handle, dom);
453 if ( mmu == NULL )
454 {
455 ERROR("Could not initialise for MMU updates");
456 goto out;
457 }
459 DPRINTF("Reloading memory pages: 0%%\n");
461 /*
462 * Now simply read each saved frame into its new machine frame.
463 * We uncanonicalise page tables as we go.
464 */
465 prev_pc = 0;
467 n = m = 0;
468 for ( ; ; )
469 {
470 int j;
472 this_pc = (n * 100) / p2m_size;
473 if ( (this_pc - prev_pc) >= 5 )
474 {
475 PPRINTF("\b\b\b\b%3d%%", this_pc);
476 prev_pc = this_pc;
477 }
479 if ( read_exact(io_fd, &j, sizeof(int)) )
480 {
481 ERROR("Error when reading batch size");
482 goto out;
483 }
485 PPRINTF("batch %d\n",j);
487 if ( j == -1 )
488 {
489 verify = 1;
490 DPRINTF("Entering page verify mode\n");
491 continue;
492 }
494 if ( j == -2 )
495 {
496 new_ctxt_format = 1;
497 if ( read_exact(io_fd, &max_vcpu_id, sizeof(int)) ||
498 (max_vcpu_id >= 64) ||
499 read_exact(io_fd, &vcpumap, sizeof(uint64_t)) )
500 {
501 ERROR("Error when reading max_vcpu_id");
502 goto out;
503 }
504 continue;
505 }
507 if ( j == -3 )
508 {
509 uint64_t ident_pt;
511 /* Skip padding 4 bytes then read the EPT identity PT location. */
512 if ( read_exact(io_fd, &ident_pt, sizeof(uint32_t)) ||
513 read_exact(io_fd, &ident_pt, sizeof(uint64_t)) )
514 {
515 ERROR("error read the address of the EPT identity map");
516 goto out;
517 }
519 xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT, ident_pt);
520 continue;
521 }
523 if ( j == -4 )
524 {
525 uint64_t vm86_tss;
527 /* Skip padding 4 bytes then read the vm86 TSS location. */
528 if ( read_exact(io_fd, &vm86_tss, sizeof(uint32_t)) ||
529 read_exact(io_fd, &vm86_tss, sizeof(uint64_t)) )
530 {
531 ERROR("error read the address of the vm86 TSS");
532 goto out;
533 }
535 xc_set_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, vm86_tss);
536 continue;
537 }
539 if ( j == 0 )
540 break; /* our work here is done */
542 if ( (j > MAX_BATCH_SIZE) || (j < 0) )
543 {
544 ERROR("Max batch size exceeded. Giving up.");
545 goto out;
546 }
548 if ( read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long)) )
549 {
550 ERROR("Error when reading region pfn types");
551 goto out;
552 }
554 if (allocate_physmem(xc_handle, dom, region_pfn_type,
555 j, hvm, region_mfn, superpages) != 0)
556 goto out;
558 /* Map relevant mfns */
559 region_base = xc_map_foreign_batch(
560 xc_handle, dom, PROT_WRITE, region_mfn, j);
562 if ( region_base == NULL )
563 {
564 ERROR("map batch failed");
565 goto out;
566 }
568 for ( i = 0; i < j; i++ )
569 {
570 void *page;
571 unsigned long pagetype;
573 pfn = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
574 pagetype = region_pfn_type[i] & XEN_DOMCTL_PFINFO_LTAB_MASK;
576 if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
577 /* a bogus/unmapped page: skip it */
578 continue;
580 if ( pfn >= p2m_size )
581 {
582 ERROR("pfn out of range");
583 goto out;
584 }
586 pfn_type[pfn] = pagetype;
588 mfn = p2m[pfn];
590 /* In verify mode, we use a copy; otherwise we work in place */
591 page = verify ? (void *)buf : (region_base + i*PAGE_SIZE);
593 if ( read_exact(io_fd, page, PAGE_SIZE) )
594 {
595 ERROR("Error when reading page (type was %lx)", pagetype);
596 goto out;
597 }
599 pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
601 if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
602 (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
603 {
604 /*
605 ** A page table page - need to 'uncanonicalize' it, i.e.
606 ** replace all the references to pfns with the corresponding
607 ** mfns for the new domain.
608 **
609 ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
610 ** so we may need to update the p2m after the main loop.
611 ** Hence we defer canonicalization of L1s until then.
612 */
613 if ((pt_levels != 3) ||
614 pae_extended_cr3 ||
615 (pagetype != XEN_DOMCTL_PFINFO_L1TAB)) {
617 if (!uncanonicalize_pagetable(xc_handle, dom,
618 pagetype, page, superpages)) {
619 /*
620 ** Failing to uncanonicalize a page table can be ok
621 ** under live migration since the pages type may have
622 ** changed by now (and we'll get an update later).
623 */
624 DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
625 pagetype >> 28, pfn, mfn);
626 nraces++;
627 continue;
628 }
629 }
630 }
631 else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
632 {
633 ERROR("Bogus page type %lx page table is out of range: "
634 "i=%d p2m_size=%lu", pagetype, i, p2m_size);
635 goto out;
637 }
639 if ( verify )
640 {
641 int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
642 if ( res )
643 {
644 int v;
646 DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
647 "actualcs=%08lx\n", pfn, pfn_type[pfn],
648 csum_page(region_base + i*PAGE_SIZE),
649 csum_page(buf));
651 for ( v = 0; v < 4; v++ )
652 {
653 unsigned long *p = (unsigned long *)
654 (region_base + i*PAGE_SIZE);
655 if ( buf[v] != p[v] )
656 DPRINTF(" %d: %08lx %08lx\n", v, buf[v], p[v]);
657 }
658 }
659 }
661 if ( !hvm &&
662 xc_add_mmu_update(xc_handle, mmu,
663 (((unsigned long long)mfn) << PAGE_SHIFT)
664 | MMU_MACHPHYS_UPDATE, pfn) )
665 {
666 ERROR("failed machpys update mfn=%lx pfn=%lx", mfn, pfn);
667 goto out;
668 }
669 } /* end of 'batch' for loop */
671 munmap(region_base, j*PAGE_SIZE);
672 n+= j; /* crude stats */
674 /*
675 * Discard cache for portion of file read so far up to last
676 * page boundary every 16MB or so.
677 */
678 m += j;
679 if ( m > MAX_PAGECACHE_USAGE )
680 {
681 discard_file_cache(io_fd, 0 /* no flush */);
682 m = 0;
683 }
684 }
686 /*
687 * Ensure we flush all machphys updates before potential PAE-specific
688 * reallocations below.
689 */
690 if ( !hvm && xc_flush_mmu_updates(xc_handle, mmu) )
691 {
692 ERROR("Error doing flush_mmu_updates()");
693 goto out;
694 }
696 DPRINTF("Received all pages (%d races)\n", nraces);
698 if ( hvm )
699 {
700 uint32_t rec_len;
702 /* Set HVM-specific parameters */
703 if ( read_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
704 {
705 ERROR("error reading magic page addresses");
706 goto out;
707 }
709 /* These comms pages need to be zeroed at the start of day */
710 if ( xc_clear_domain_page(xc_handle, dom, magic_pfns[0]) ||
711 xc_clear_domain_page(xc_handle, dom, magic_pfns[1]) ||
712 xc_clear_domain_page(xc_handle, dom, magic_pfns[2]) )
713 {
714 ERROR("error zeroing magic pages");
715 goto out;
716 }
718 if ( (frc = xc_set_hvm_param(xc_handle, dom,
719 HVM_PARAM_IOREQ_PFN, magic_pfns[0]))
720 || (frc = xc_set_hvm_param(xc_handle, dom,
721 HVM_PARAM_BUFIOREQ_PFN, magic_pfns[1]))
722 || (frc = xc_set_hvm_param(xc_handle, dom,
723 HVM_PARAM_STORE_PFN, magic_pfns[2]))
724 || (frc = xc_set_hvm_param(xc_handle, dom,
725 HVM_PARAM_PAE_ENABLED, pae))
726 || (frc = xc_set_hvm_param(xc_handle, dom,
727 HVM_PARAM_STORE_EVTCHN,
728 store_evtchn)) )
729 {
730 ERROR("error setting HVM params: %i", frc);
731 goto out;
732 }
733 *store_mfn = magic_pfns[2];
735 /* Read HVM context */
736 if ( read_exact(io_fd, &rec_len, sizeof(uint32_t)) )
737 {
738 ERROR("error read hvm context size!\n");
739 goto out;
740 }
742 hvm_buf = malloc(rec_len);
743 if ( hvm_buf == NULL )
744 {
745 ERROR("memory alloc for hvm context buffer failed");
746 errno = ENOMEM;
747 goto out;
748 }
750 if ( read_exact(io_fd, hvm_buf, rec_len) )
751 {
752 ERROR("error loading the HVM context");
753 goto out;
754 }
756 frc = xc_domain_hvm_setcontext(xc_handle, dom, hvm_buf, rec_len);
757 if ( frc )
758 {
759 ERROR("error setting the HVM context");
760 goto out;
761 }
763 /* HVM success! */
764 rc = 0;
765 goto out;
766 }
768 /* Non-HVM guests only from here on */
770 if ( (pt_levels == 3) && !pae_extended_cr3 )
771 {
772 /*
773 ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This
774 ** is a little awkward and involves (a) finding all such PGDs and
775 ** replacing them with 'lowmem' versions; (b) updating the p2m[]
776 ** with the new info; and (c) canonicalizing all the L1s using the
777 ** (potentially updated) p2m[].
778 **
779 ** This is relatively slow (and currently involves two passes through
780 ** the pfn_type[] array), but at least seems to be correct. May wish
781 ** to consider more complex approaches to optimize this later.
782 */
784 int j, k;
786 /* First pass: find all L3TABs current in > 4G mfns and get new mfns */
787 for ( i = 0; i < p2m_size; i++ )
788 {
789 if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
790 XEN_DOMCTL_PFINFO_L3TAB) &&
791 (p2m[i] > 0xfffffUL) )
792 {
793 unsigned long new_mfn;
794 uint64_t l3ptes[4];
795 uint64_t *l3tab;
797 l3tab = (uint64_t *)
798 xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
799 PROT_READ, p2m[i]);
801 for ( j = 0; j < 4; j++ )
802 l3ptes[j] = l3tab[j];
804 munmap(l3tab, PAGE_SIZE);
806 new_mfn = xc_make_page_below_4G(xc_handle, dom, p2m[i]);
807 if ( !new_mfn )
808 {
809 ERROR("Couldn't get a page below 4GB :-(");
810 goto out;
811 }
813 p2m[i] = new_mfn;
814 if ( xc_add_mmu_update(xc_handle, mmu,
815 (((unsigned long long)new_mfn)
816 << PAGE_SHIFT) |
817 MMU_MACHPHYS_UPDATE, i) )
818 {
819 ERROR("Couldn't m2p on PAE root pgdir");
820 goto out;
821 }
823 l3tab = (uint64_t *)
824 xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
825 PROT_READ | PROT_WRITE, p2m[i]);
827 for ( j = 0; j < 4; j++ )
828 l3tab[j] = l3ptes[j];
830 munmap(l3tab, PAGE_SIZE);
831 }
832 }
834 /* Second pass: find all L1TABs and uncanonicalize them */
835 j = 0;
837 for ( i = 0; i < p2m_size; i++ )
838 {
839 if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) ==
840 XEN_DOMCTL_PFINFO_L1TAB) )
841 {
842 region_mfn[j] = p2m[i];
843 j++;
844 }
846 if ( (i == (p2m_size-1)) || (j == MAX_BATCH_SIZE) )
847 {
848 region_base = xc_map_foreign_batch(
849 xc_handle, dom, PROT_READ | PROT_WRITE, region_mfn, j);
850 if ( region_base == NULL )
851 {
852 ERROR("map batch failed");
853 goto out;
854 }
856 for ( k = 0; k < j; k++ )
857 {
858 if ( !uncanonicalize_pagetable(
859 xc_handle, dom, XEN_DOMCTL_PFINFO_L1TAB,
860 region_base + k*PAGE_SIZE, superpages) )
861 {
862 ERROR("failed uncanonicalize pt!");
863 goto out;
864 }
865 }
867 munmap(region_base, j*PAGE_SIZE);
868 j = 0;
869 }
870 }
872 if ( xc_flush_mmu_updates(xc_handle, mmu) )
873 {
874 ERROR("Error doing xc_flush_mmu_updates()");
875 goto out;
876 }
877 }
879 /*
880 * Pin page tables. Do this after writing to them as otherwise Xen
881 * will barf when doing the type-checking.
882 */
883 nr_pins = 0;
884 for ( i = 0; i < p2m_size; i++ )
885 {
886 if ( (pfn_type[i] & XEN_DOMCTL_PFINFO_LPINTAB) == 0 )
887 continue;
889 switch ( pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
890 {
891 case XEN_DOMCTL_PFINFO_L1TAB:
892 pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
893 break;
895 case XEN_DOMCTL_PFINFO_L2TAB:
896 pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
897 break;
899 case XEN_DOMCTL_PFINFO_L3TAB:
900 pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
901 break;
903 case XEN_DOMCTL_PFINFO_L4TAB:
904 pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
905 break;
907 default:
908 continue;
909 }
911 pin[nr_pins].arg1.mfn = p2m[i];
912 nr_pins++;
914 /* Batch full? Then flush. */
915 if ( nr_pins == MAX_PIN_BATCH )
916 {
917 if ( xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0 )
918 {
919 ERROR("Failed to pin batch of %d page tables", nr_pins);
920 goto out;
921 }
922 nr_pins = 0;
923 }
924 }
926 /* Flush final partial batch. */
927 if ( (nr_pins != 0) && (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) )
928 {
929 ERROR("Failed to pin batch of %d page tables", nr_pins);
930 goto out;
931 }
933 DPRINTF("\b\b\b\b100%%\n");
934 DPRINTF("Memory reloaded (%ld pages)\n", nr_pfns);
936 /* Get the list of PFNs that are not in the pseudo-phys map */
937 {
938 unsigned int count = 0;
939 unsigned long *pfntab;
940 int nr_frees;
942 if ( read_exact(io_fd, &count, sizeof(count)) ||
943 (count > (1U << 28)) ) /* up to 1TB of address space */
944 {
945 ERROR("Error when reading pfn count (= %u)", count);
946 goto out;
947 }
949 if ( !(pfntab = malloc(sizeof(unsigned long) * count)) )
950 {
951 ERROR("Out of memory");
952 goto out;
953 }
955 if ( read_exact(io_fd, pfntab, sizeof(unsigned long)*count) )
956 {
957 ERROR("Error when reading pfntab");
958 goto out;
959 }
961 nr_frees = 0;
962 for ( i = 0; i < count; i++ )
963 {
964 unsigned long pfn = pfntab[i];
966 if ( p2m[pfn] != INVALID_P2M_ENTRY )
967 {
968 /* pfn is not in physmap now, but was at some point during
969 the save/migration process - need to free it */
970 pfntab[nr_frees++] = p2m[pfn];
971 p2m[pfn] = INVALID_P2M_ENTRY; /* not in pseudo-physical map */
972 }
973 }
975 if ( nr_frees > 0 )
976 {
977 struct xen_memory_reservation reservation = {
978 .nr_extents = nr_frees,
979 .extent_order = 0,
980 .domid = dom
981 };
982 set_xen_guest_handle(reservation.extent_start, pfntab);
984 if ( (frc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
985 &reservation)) != nr_frees )
986 {
987 ERROR("Could not decrease reservation : %d", frc);
988 goto out;
989 }
990 else
991 DPRINTF("Decreased reservation by %d pages\n", count);
992 }
993 }
995 if ( lock_pages(&ctxt, sizeof(ctxt)) )
996 {
997 ERROR("Unable to lock ctxt");
998 return 1;
999 }
1001 for ( i = 0; i <= max_vcpu_id; i++ )
1002 {
1003 if ( !(vcpumap & (1ULL << i)) )
1004 continue;
1006 if ( read_exact(io_fd, &ctxt, ((guest_width == 8)
1007 ? sizeof(ctxt.x64)
1008 : sizeof(ctxt.x32))) )
1009 {
1010 ERROR("Error when reading ctxt %d", i);
1011 goto out;
1012 }
1014 if ( !new_ctxt_format )
1015 SET_FIELD(&ctxt, flags, GET_FIELD(&ctxt, flags) | VGCF_online);
1017 if ( i == 0 )
1018 {
1019 /*
1020 * Uncanonicalise the suspend-record frame number and poke
1021 * resume record.
1022 */
1023 pfn = GET_FIELD(&ctxt, user_regs.edx);
1024 if ( (pfn >= p2m_size) ||
1025 (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
1026 {
1027 ERROR("Suspend record frame number is bad");
1028 goto out;
1029 }
1030 mfn = p2m[pfn];
1031 SET_FIELD(&ctxt, user_regs.edx, mfn);
1032 start_info = xc_map_foreign_range(
1033 xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
1034 SET_FIELD(start_info, nr_pages, p2m_size);
1035 SET_FIELD(start_info, shared_info, shared_info_frame<<PAGE_SHIFT);
1036 SET_FIELD(start_info, flags, 0);
1037 *store_mfn = p2m[GET_FIELD(start_info, store_mfn)];
1038 SET_FIELD(start_info, store_mfn, *store_mfn);
1039 SET_FIELD(start_info, store_evtchn, store_evtchn);
1040 *console_mfn = p2m[GET_FIELD(start_info, console.domU.mfn)];
1041 SET_FIELD(start_info, console.domU.mfn, *console_mfn);
1042 SET_FIELD(start_info, console.domU.evtchn, console_evtchn);
1043 munmap(start_info, PAGE_SIZE);
1044 }
1045 /* Uncanonicalise each GDT frame number. */
1046 if ( GET_FIELD(&ctxt, gdt_ents) > 8192 )
1047 {
1048 ERROR("GDT entry count out of range");
1049 goto out;
1050 }
1052 for ( j = 0; (512*j) < GET_FIELD(&ctxt, gdt_ents); j++ )
1053 {
1054 pfn = GET_FIELD(&ctxt, gdt_frames[j]);
1055 if ( (pfn >= p2m_size) ||
1056 (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
1057 {
1058 ERROR("GDT frame number %i (0x%lx) is bad",
1059 j, (unsigned long)pfn);
1060 goto out;
1061 }
1062 SET_FIELD(&ctxt, gdt_frames[j], p2m[pfn]);
1063 }
1064 /* Uncanonicalise the page table base pointer. */
1065 pfn = UNFOLD_CR3(GET_FIELD(&ctxt, ctrlreg[3]));
1067 if ( pfn >= p2m_size )
1068 {
1069 ERROR("PT base is bad: pfn=%lu p2m_size=%lu type=%08lx",
1070 pfn, p2m_size, pfn_type[pfn]);
1071 goto out;
1072 }
1074 if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
1075 ((unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
1076 {
1077 ERROR("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
1078 pfn, p2m_size, pfn_type[pfn],
1079 (unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT);
1080 goto out;
1081 }
1082 SET_FIELD(&ctxt, ctrlreg[3], FOLD_CR3(p2m[pfn]));
1084 /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
1085 if ( (pt_levels == 4) && (ctxt.x64.ctrlreg[1] & 1) )
1086 {
1087 pfn = UNFOLD_CR3(ctxt.x64.ctrlreg[1] & ~1);
1088 if ( pfn >= p2m_size )
1089 {
1090 ERROR("User PT base is bad: pfn=%lu p2m_size=%lu",
1091 pfn, p2m_size);
1092 goto out;
1093 }
1094 if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
1095 ((unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
1096 {
1097 ERROR("User PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
1098 pfn, p2m_size, pfn_type[pfn],
1099 (unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT);
1100 goto out;
1101 }
1102 ctxt.x64.ctrlreg[1] = FOLD_CR3(p2m[pfn]);
1103 }
1104 domctl.cmd = XEN_DOMCTL_setvcpucontext;
1105 domctl.domain = (domid_t)dom;
1106 domctl.u.vcpucontext.vcpu = i;
1107 set_xen_guest_handle(domctl.u.vcpucontext.ctxt, &ctxt.c);
1108 frc = xc_domctl(xc_handle, &domctl);
1109 if ( frc != 0 )
1110 {
1111 ERROR("Couldn't build vcpu%d", i);
1112 goto out;
1113 }
1115 if ( !ext_vcpucontext )
1116 continue;
1117 if ( read_exact(io_fd, &domctl.u.ext_vcpucontext, 128) ||
1118 (domctl.u.ext_vcpucontext.vcpu != i) )
1119 {
1120 ERROR("Error when reading extended ctxt %d", i);
1121 goto out;
1122 }
1123 domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
1124 domctl.domain = dom;
1125 frc = xc_domctl(xc_handle, &domctl);
1126 if ( frc != 0 )
1127 {
1128 ERROR("Couldn't set extended vcpu%d info\n", i);
1129 goto out;
1130 }
1131 }
1133 if ( read_exact(io_fd, shared_info_page, PAGE_SIZE) )
1134 {
1135 ERROR("Error when reading shared info page");
1136 goto out;
1137 }
1139 /* Restore contents of shared-info page. No checking needed. */
1140 new_shared_info = xc_map_foreign_range(
1141 xc_handle, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);
1143 /* restore saved vcpu_info and arch specific info */
1144 MEMCPY_FIELD(new_shared_info, old_shared_info, vcpu_info);
1145 MEMCPY_FIELD(new_shared_info, old_shared_info, arch);
1147 /* clear any pending events and the selector */
1148 MEMSET_ARRAY_FIELD(new_shared_info, evtchn_pending, 0);
1149 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
1150 SET_FIELD(new_shared_info, vcpu_info[i].evtchn_pending_sel, 0);
1152 /* mask event channels */
1153 MEMSET_ARRAY_FIELD(new_shared_info, evtchn_mask, 0xff);
1155 /* leave wallclock time. set by hypervisor */
1156 munmap(new_shared_info, PAGE_SIZE);
1158 /* Uncanonicalise the pfn-to-mfn table frame-number list. */
1159 for ( i = 0; i < P2M_FL_ENTRIES; i++ )
1160 {
1161 pfn = p2m_frame_list[i];
1162 if ( (pfn >= p2m_size) || (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) )
1163 {
1164 ERROR("PFN-to-MFN frame number %i (%#lx) is bad", i, pfn);
1165 goto out;
1166 }
1167 p2m_frame_list[i] = p2m[pfn];
1168 }
1170 /* Copy the P2M we've constructed to the 'live' P2M */
1171 if ( !(live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_WRITE,
1172 p2m_frame_list, P2M_FL_ENTRIES)) )
1173 {
1174 ERROR("Couldn't map p2m table");
1175 goto out;
1176 }
1178 /* If the domain we're restoring has a different word size to ours,
1179 * we need to adjust the live_p2m assignment appropriately */
1180 if ( guest_width > sizeof (xen_pfn_t) )
1181 for ( i = p2m_size - 1; i >= 0; i-- )
1182 ((int64_t *)live_p2m)[i] = (long)p2m[i];
1183 else if ( guest_width < sizeof (xen_pfn_t) )
1184 for ( i = 0; i < p2m_size; i++ )
1185 ((uint32_t *)live_p2m)[i] = p2m[i];
1186 else
1187 memcpy(live_p2m, p2m, p2m_size * sizeof(xen_pfn_t));
1188 munmap(live_p2m, P2M_FL_ENTRIES * PAGE_SIZE);
1190 DPRINTF("Domain ready to be built.\n");
1191 rc = 0;
1193 out:
1194 if ( (rc != 0) && (dom != 0) )
1195 xc_domain_destroy(xc_handle, dom);
1196 free(mmu);
1197 free(p2m);
1198 free(pfn_type);
1199 free(hvm_buf);
1201 /* discard cache for save file */
1202 discard_file_cache(io_fd, 1 /*flush*/);
1204 DPRINTF("Restore exit with rc=%d\n", rc);
1206 return rc;
1207 }
1208 /*
1209 * Local variables:
1210 * mode: C
1211 * c-set-style: "BSD"
1212 * c-basic-offset: 4
1213 * tab-width: 4
1214 * indent-tabs-mode: nil
1215 * End:
1216 */