direct-io.hg

view tools/libxc/xc_linux_restore.c @ 10173:954f4dea9da6

[PAE] Allow pgdirs above 4GB for paravirt guests.
**NOTE**: This obviates the need for lowmem_emergency_pool.
Unpriv guests no longer need to be able to allocate memory
below 4GB for PAE PDPTs.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Fri May 26 17:22:30 2006 +0100 (2006-05-26)
parents dfdc32a9814f
children fdc26ec44145
line source
1 /******************************************************************************
2 * xc_linux_restore.c
3 *
4 * Restore the state of a Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <stdlib.h>
10 #include <unistd.h>
12 #include "xg_private.h"
13 #include "xg_save_restore.h"
15 /* Max MFN of the whole machine; filled in by get_platform_info(). */
16 static unsigned long max_mfn;
18 /* Virtual starting address of the hypervisor; filled in by get_platform_info() and used to bound the guest LDT range. */
19 static unsigned long hvirt_start;
21 /* Number of page-table levels used by the current guest (2 => 4-byte PTEs, otherwise 8-byte PTEs). */
22 static unsigned int pt_levels;
24 /* Total number of pages used by the current guest; set from nr_pfns, so valid PFNs are 0 .. max_pfn-1. */
25 static unsigned long max_pfn;
27 /* Live mapping of the guest's own table mapping each PFN to its current MFN. */
28 static unsigned long *live_p2m = NULL;
30 /* A table mapping each PFN to its new MFN; allocated with max_pfn entries in xc_linux_restore(). */
31 static unsigned long *p2m = NULL;
/*
 * Read exactly 'count' bytes from 'fd' into 'buf'.
 *
 * Short reads are continued and reads interrupted by a signal (EINTR) are
 * retried.  The running total is kept in a size_t and the result of read()
 * in an ssize_t so that neither is truncated for large 'count' values.
 *
 * Returns 1 if all 'count' bytes were read (including count == 0),
 * or 0 on end-of-file or any other read error.
 */
static ssize_t
read_exact(int fd, void *buf, size_t count)
{
    size_t done = 0;            /* bytes received so far */
    ssize_t got;                /* result of the most recent read() */
    unsigned char *b = buf;

    while (done < count) {
        got = read(fd, &b[done], count - done);
        if ((got == -1) && (errno == EINTR))
            continue;           /* interrupted before any data: retry */
        if (got <= 0)
            break;              /* EOF or hard error */
        done += (size_t)got;
    }

    return (done == count) ? 1 : 0;
}
53 /*
54 ** In the state file (or during transfer), all page-table pages are
55 ** converted into a 'canonical' form where references to actual mfns
56 ** are replaced with references to the corresponding pfns.
57 ** This function inverts that operation, replacing the pfn values with
58 ** the (now known) appropriate mfn values.
59 */
60 int uncanonicalize_pagetable(unsigned long type, void *page)
61 {
62 int i, pte_last;
63 unsigned long pfn;
64 uint64_t pte;
66 pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8);
68 /* Now iterate through the page table, uncanonicalizing each PTE */
69 for(i = 0; i < pte_last; i++) {
71 if(pt_levels == 2)
72 pte = ((uint32_t *)page)[i];
73 else
74 pte = ((uint64_t *)page)[i];
76 if(pte & _PAGE_PRESENT) {
78 pfn = (pte >> PAGE_SHIFT) & 0xffffffff;
80 if(pfn >= max_pfn) {
81 /* This "page table page" is probably not one; bail. */
82 ERR("Frame number in type %lu page table is out of range: "
83 "i=%d pfn=0x%lx max_pfn=%lu",
84 type >> 28, i, pfn, max_pfn);
85 return 0;
86 }
89 pte &= 0xffffff0000000fffULL;
90 pte |= (uint64_t)p2m[pfn] << PAGE_SHIFT;
92 if(pt_levels == 2)
93 ((uint32_t *)page)[i] = (uint32_t)pte;
94 else
95 ((uint64_t *)page)[i] = (uint64_t)pte;
99 }
100 }
102 return 1;
103 }
105 int xc_linux_restore(int xc_handle, int io_fd,
106 uint32_t dom, unsigned long nr_pfns,
107 unsigned int store_evtchn, unsigned long *store_mfn,
108 unsigned int console_evtchn, unsigned long *console_mfn)
109 {
110 DECLARE_DOM0_OP;
111 int rc = 1, i, n;
112 unsigned long mfn, pfn;
113 unsigned int prev_pc, this_pc;
114 int verify = 0;
115 int nraces = 0;
117 /* The new domain's shared-info frame number. */
118 unsigned long shared_info_frame;
119 unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
120 shared_info_t *shared_info = (shared_info_t *)shared_info_page;
122 /* A copy of the CPU context of the guest. */
123 vcpu_guest_context_t ctxt;
125 /* A table containing the type of each PFN (/not/ MFN!). */
126 unsigned long *pfn_type = NULL;
128 /* A table of MFNs to map in the current region */
129 unsigned long *region_mfn = NULL;
131 /* Types of the pfns in the current region */
132 unsigned long region_pfn_type[MAX_BATCH_SIZE];
134 /* A temporary mapping, and a copy, of one frame of guest memory. */
135 unsigned long *page = NULL;
137 /* A copy of the pfn-to-mfn table frame list. */
138 unsigned long *p2m_frame_list = NULL;
140 /* A temporary mapping of the guest's start_info page. */
141 start_info_t *start_info;
143 char *region_base;
145 xc_mmu_t *mmu = NULL;
147 /* used by debug verify code */
148 unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
150 struct mmuext_op pin[MAX_PIN_BATCH];
151 unsigned int nr_pins;
154 max_pfn = nr_pfns;
156 DPRINTF("xc_linux_restore start: max_pfn = %lx\n", max_pfn);
159 if(!get_platform_info(xc_handle, dom,
160 &max_mfn, &hvirt_start, &pt_levels)) {
161 ERR("Unable to get platform info.");
162 return 1;
163 }
166 if (mlock(&ctxt, sizeof(ctxt))) {
167 /* needed for build dom0 op, but might as well do early */
168 ERR("Unable to mlock ctxt");
169 return 1;
170 }
173 /* Read the saved P2M frame list */
174 if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
175 ERR("Couldn't allocate p2m_frame_list array");
176 goto out;
177 }
179 if (!read_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) {
180 ERR("read p2m_frame_list failed");
181 goto out;
182 }
185 /* We want zeroed memory so use calloc rather than malloc. */
186 p2m = calloc(max_pfn, sizeof(unsigned long));
187 pfn_type = calloc(max_pfn, sizeof(unsigned long));
188 region_mfn = calloc(MAX_BATCH_SIZE, sizeof(unsigned long));
190 if ((p2m == NULL) || (pfn_type == NULL) || (region_mfn == NULL)) {
191 ERR("memory alloc failed");
192 errno = ENOMEM;
193 goto out;
194 }
196 if (mlock(region_mfn, sizeof(unsigned long) * MAX_BATCH_SIZE)) {
197 ERR("Could not mlock region_mfn");
198 goto out;
199 }
201 /* Get the domain's shared-info frame. */
202 op.cmd = DOM0_GETDOMAININFO;
203 op.u.getdomaininfo.domain = (domid_t)dom;
204 if (xc_dom0_op(xc_handle, &op) < 0) {
205 ERR("Could not get information on new domain");
206 goto out;
207 }
208 shared_info_frame = op.u.getdomaininfo.shared_info_frame;
210 if(xc_domain_setmaxmem(xc_handle, dom, PFN_TO_KB(max_pfn)) != 0) {
211 errno = ENOMEM;
212 goto out;
213 }
215 if(xc_domain_memory_increase_reservation(
216 xc_handle, dom, max_pfn, 0, 0, NULL) != 0) {
217 ERR("Failed to increase reservation by %lx KB", PFN_TO_KB(max_pfn));
218 errno = ENOMEM;
219 goto out;
220 }
222 DPRINTF("Increased domain reservation by %lx KB\n", PFN_TO_KB(max_pfn));
224 /* Build the pfn-to-mfn table. We choose MFN ordering returned by Xen. */
225 if (xc_get_pfn_list(xc_handle, dom, p2m, max_pfn) != max_pfn) {
226 ERR("Did not read correct number of frame numbers for new dom");
227 goto out;
228 }
230 if(!(mmu = xc_init_mmu_updates(xc_handle, dom))) {
231 ERR("Could not initialise for MMU updates");
232 goto out;
233 }
236 DPRINTF("Reloading memory pages: 0%%\n");
238 /*
239 * Now simply read each saved frame into its new machine frame.
240 * We uncanonicalise page tables as we go.
241 */
242 prev_pc = 0;
244 n = 0;
245 while (1) {
247 int j;
249 this_pc = (n * 100) / max_pfn;
250 if ( (this_pc - prev_pc) >= 5 )
251 {
252 PPRINTF("\b\b\b\b%3d%%", this_pc);
253 prev_pc = this_pc;
254 }
256 if (!read_exact(io_fd, &j, sizeof(int))) {
257 ERR("Error when reading batch size");
258 goto out;
259 }
261 PPRINTF("batch %d\n",j);
263 if (j == -1) {
264 verify = 1;
265 fprintf(stderr, "Entering page verify mode\n");
266 continue;
267 }
269 if (j == 0)
270 break; /* our work here is done */
272 if (j > MAX_BATCH_SIZE) {
273 ERR("Max batch size exceeded. Giving up.");
274 goto out;
275 }
277 if (!read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long))) {
278 ERR("Error when reading region pfn types");
279 goto out;
280 }
282 for (i = 0; i < j; i++) {
284 if ((region_pfn_type[i] & LTAB_MASK) == XTAB)
285 region_mfn[i] = 0; /* we know map will fail, but don't care */
286 else
287 region_mfn[i] = p2m[region_pfn_type[i] & ~LTAB_MASK];
289 }
291 if (!(region_base = xc_map_foreign_batch(
292 xc_handle, dom, PROT_WRITE, region_mfn, j))) {
293 ERR("map batch failed");
294 goto out;
295 }
297 for ( i = 0; i < j; i++ )
298 {
299 void *page;
300 unsigned long pagetype;
302 pfn = region_pfn_type[i] & ~LTAB_MASK;
303 pagetype = region_pfn_type[i] & LTAB_MASK;
305 if (pagetype == XTAB)
306 /* a bogus/unmapped page: skip it */
307 continue;
309 if (pfn > max_pfn) {
310 ERR("pfn out of range");
311 goto out;
312 }
314 pfn_type[pfn] = pagetype;
316 mfn = p2m[pfn];
318 /* In verify mode, we use a copy; otherwise we work in place */
319 page = verify ? (void *)buf : (region_base + i*PAGE_SIZE);
321 if (!read_exact(io_fd, page, PAGE_SIZE)) {
322 ERR("Error when reading page (type was %lx)", pagetype);
323 goto out;
324 }
326 pagetype &= LTABTYPE_MASK;
328 if(pagetype >= L1TAB && pagetype <= L4TAB) {
330 /*
331 ** A page table page - need to 'uncanonicalize' it, i.e.
332 ** replace all the references to pfns with the corresponding
333 ** mfns for the new domain.
334 */
335 if(!uncanonicalize_pagetable(pagetype, page)) {
336 /*
337 ** Failing to uncanonicalize a page table can be ok
338 ** under live migration since the pages type may have
339 ** changed by now (and we'll get an update later).
340 */
341 DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
342 pagetype >> 28, pfn, mfn);
343 nraces++;
344 continue;
345 }
347 } else if(pagetype != NOTAB) {
349 ERR("Bogus page type %lx page table is out of range: "
350 "i=%d max_pfn=%lu", pagetype, i, max_pfn);
351 goto out;
353 }
356 if (verify) {
358 int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
360 if (res) {
362 int v;
364 DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
365 "actualcs=%08lx\n", pfn, pfn_type[pfn],
366 csum_page(region_base + i*PAGE_SIZE),
367 csum_page(buf));
369 for (v = 0; v < 4; v++) {
371 unsigned long *p = (unsigned long *)
372 (region_base + i*PAGE_SIZE);
373 if (buf[v] != p[v])
374 DPRINTF(" %d: %08lx %08lx\n", v, buf[v], p[v]);
375 }
376 }
377 }
379 if (xc_add_mmu_update(xc_handle, mmu,
380 (((unsigned long long)mfn) << PAGE_SHIFT)
381 | MMU_MACHPHYS_UPDATE, pfn)) {
382 ERR("failed machpys update mfn=%lx pfn=%lx", mfn, pfn);
383 goto out;
384 }
385 } /* end of 'batch' for loop */
387 munmap(region_base, j*PAGE_SIZE);
388 n+= j; /* crude stats */
389 }
391 DPRINTF("Received all pages (%d races)\n", nraces);
394 if (xc_finish_mmu_updates(xc_handle, mmu)) {
395 ERR("Error doing finish_mmu_updates()");
396 goto out;
397 }
400 /*
401 * Pin page tables. Do this after writing to them as otherwise Xen
402 * will barf when doing the type-checking.
403 */
404 nr_pins = 0;
405 for (i = 0; i < max_pfn; i++) {
407 if (i == (max_pfn-1) || nr_pins == MAX_PIN_BATCH) {
408 if (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) {
409 ERR("Failed to pin batch of %d page tables", nr_pins);
410 goto out;
411 }
412 nr_pins = 0;
413 }
415 if ( (pfn_type[i] & LPINTAB) == 0 )
416 continue;
418 switch(pfn_type[i]) {
420 case (L1TAB|LPINTAB):
421 pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
422 break;
424 case (L2TAB|LPINTAB):
425 pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
426 break;
428 case (L3TAB|LPINTAB):
429 pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
430 break;
432 case (L4TAB|LPINTAB):
433 pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
434 break;
436 default:
437 continue;
438 }
440 pin[nr_pins].arg1.mfn = p2m[i];
441 nr_pins++;
443 }
445 DPRINTF("\b\b\b\b100%%\n");
446 DPRINTF("Memory reloaded.\n");
448 /* Get the list of PFNs that are not in the psuedo-phys map */
449 {
450 unsigned int count;
451 unsigned long *pfntab;
452 int rc;
454 if (!read_exact(io_fd, &count, sizeof(count))) {
455 ERR("Error when reading pfn count");
456 goto out;
457 }
459 if(!(pfntab = malloc(sizeof(unsigned long) * count))) {
460 ERR("Out of memory");
461 goto out;
462 }
464 if (!read_exact(io_fd, pfntab, sizeof(unsigned long)*count)) {
465 ERR("Error when reading pfntab");
466 goto out;
467 }
469 for (i = 0; i < count; i++) {
471 unsigned long pfn = pfntab[i];
473 if(pfn > max_pfn)
474 /* shouldn't happen - continue optimistically */
475 continue;
477 pfntab[i] = p2m[pfn];
478 p2m[pfn] = INVALID_P2M_ENTRY; // not in pseudo-physical map
479 }
481 if (count > 0) {
483 struct xen_memory_reservation reservation = {
484 .nr_extents = count,
485 .extent_order = 0,
486 .domid = dom
487 };
488 set_xen_guest_handle(reservation.extent_start, pfntab);
490 if ((rc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
491 &reservation)) != count) {
492 ERR("Could not decrease reservation : %d", rc);
493 goto out;
494 } else
495 DPRINTF("Decreased reservation by %d pages\n", count);
496 }
497 }
499 if (!read_exact(io_fd, &ctxt, sizeof(ctxt)) ||
500 !read_exact(io_fd, shared_info_page, PAGE_SIZE)) {
501 ERR("Error when reading ctxt or shared info page");
502 goto out;
503 }
505 /* Uncanonicalise the suspend-record frame number and poke resume rec. */
506 pfn = ctxt.user_regs.edx;
507 if ((pfn >= max_pfn) || (pfn_type[pfn] != NOTAB)) {
508 ERR("Suspend record frame number is bad");
509 goto out;
510 }
511 ctxt.user_regs.edx = mfn = p2m[pfn];
512 start_info = xc_map_foreign_range(
513 xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
514 start_info->nr_pages = max_pfn;
515 start_info->shared_info = shared_info_frame << PAGE_SHIFT;
516 start_info->flags = 0;
517 *store_mfn = start_info->store_mfn = p2m[start_info->store_mfn];
518 start_info->store_evtchn = store_evtchn;
519 *console_mfn = start_info->console_mfn = p2m[start_info->console_mfn];
520 start_info->console_evtchn = console_evtchn;
521 munmap(start_info, PAGE_SIZE);
523 /* Uncanonicalise each GDT frame number. */
524 if (ctxt.gdt_ents > 8192) {
525 ERR("GDT entry count out of range");
526 goto out;
527 }
529 for (i = 0; i < ctxt.gdt_ents; i += 512) {
530 pfn = ctxt.gdt_frames[i];
531 if ((pfn >= max_pfn) || (pfn_type[pfn] != NOTAB)) {
532 ERR("GDT frame number is bad");
533 goto out;
534 }
535 ctxt.gdt_frames[i] = p2m[pfn];
536 }
538 /* Uncanonicalise the page table base pointer. */
539 pfn = ctxt.ctrlreg[3] >> PAGE_SHIFT;
541 if (pfn >= max_pfn) {
542 ERR("PT base is bad: pfn=%lu max_pfn=%lu type=%08lx",
543 pfn, max_pfn, pfn_type[pfn]);
544 goto out;
545 }
547 if ( (pfn_type[pfn] & LTABTYPE_MASK) !=
548 ((unsigned long)pt_levels<<LTAB_SHIFT) ) {
549 ERR("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
550 pfn, max_pfn, pfn_type[pfn],
551 (unsigned long)pt_levels<<LTAB_SHIFT);
552 goto out;
553 }
555 ctxt.ctrlreg[3] = p2m[pfn] << PAGE_SHIFT;
557 /* clear any pending events and the selector */
558 memset(&(shared_info->evtchn_pending[0]), 0,
559 sizeof (shared_info->evtchn_pending));
560 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
561 shared_info->vcpu_info[i].evtchn_pending_sel = 0;
563 /* Copy saved contents of shared-info page. No checking needed. */
564 page = xc_map_foreign_range(
565 xc_handle, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);
566 memcpy(page, shared_info, sizeof(shared_info_t));
567 munmap(page, PAGE_SIZE);
569 /* Uncanonicalise the pfn-to-mfn table frame-number list. */
570 for (i = 0; i < P2M_FL_ENTRIES; i++) {
571 pfn = p2m_frame_list[i];
572 if ((pfn >= max_pfn) || (pfn_type[pfn] != NOTAB)) {
573 ERR("PFN-to-MFN frame number is bad");
574 goto out;
575 }
577 p2m_frame_list[i] = p2m[pfn];
578 }
580 /* Copy the P2M we've constructed to the 'live' P2M */
581 if (!(live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_WRITE,
582 p2m_frame_list, P2M_FL_ENTRIES))) {
583 ERR("Couldn't map p2m table");
584 goto out;
585 }
587 memcpy(live_p2m, p2m, P2M_SIZE);
588 munmap(live_p2m, P2M_SIZE);
590 /*
591 * Safety checking of saved context:
592 * 1. user_regs is fine, as Xen checks that on context switch.
593 * 2. fpu_ctxt is fine, as it can't hurt Xen.
594 * 3. trap_ctxt needs the code selectors checked.
595 * 4. ldt base must be page-aligned, no more than 8192 ents, ...
596 * 5. gdt already done, and further checking is done by Xen.
597 * 6. check that kernel_ss is safe.
598 * 7. pt_base is already done.
599 * 8. debugregs are checked by Xen.
600 * 9. callback code selectors need checking.
601 */
602 for ( i = 0; i < 256; i++ ) {
603 ctxt.trap_ctxt[i].vector = i;
604 if ((ctxt.trap_ctxt[i].cs & 3) == 0)
605 ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS;
606 }
607 if ((ctxt.kernel_ss & 3) == 0)
608 ctxt.kernel_ss = FLAT_KERNEL_DS;
609 #if defined(__i386__)
610 if ((ctxt.event_callback_cs & 3) == 0)
611 ctxt.event_callback_cs = FLAT_KERNEL_CS;
612 if ((ctxt.failsafe_callback_cs & 3) == 0)
613 ctxt.failsafe_callback_cs = FLAT_KERNEL_CS;
614 #endif
615 if (((ctxt.ldt_base & (PAGE_SIZE - 1)) != 0) ||
616 (ctxt.ldt_ents > 8192) ||
617 (ctxt.ldt_base > hvirt_start) ||
618 ((ctxt.ldt_base + ctxt.ldt_ents*8) > hvirt_start)) {
619 ERR("Bad LDT base or size");
620 goto out;
621 }
623 DPRINTF("Domain ready to be built.\n");
625 op.cmd = DOM0_SETVCPUCONTEXT;
626 op.u.setvcpucontext.domain = (domid_t)dom;
627 op.u.setvcpucontext.vcpu = 0;
628 set_xen_guest_handle(op.u.setvcpucontext.ctxt, &ctxt);
629 rc = xc_dom0_op(xc_handle, &op);
631 if (rc != 0) {
632 ERR("Couldn't build the domain");
633 goto out;
634 }
636 out:
637 if ( (rc != 0) && (dom != 0) )
638 xc_domain_destroy(xc_handle, dom);
639 free(mmu);
640 free(p2m);
641 free(pfn_type);
643 DPRINTF("Restore exit with rc=%d\n", rc);
645 return rc;
646 }