ia64/xen-unstable: tools/libxc/xc_linux_restore.c @ 6766:219d96d545fc

description: merge?
author:      cl349@firebug.cl.cam.ac.uk
date:        Mon Sep 12 20:00:41 2005 +0000 (2005-09-12)
parents:     cdfa7dd00c44 413c911e5780
children:    4d899a738d59 8ca0f98ba8e2
/******************************************************************************
 * xc_linux_restore.c
 *
 * Restore the state of a Linux session.
 *
 * Copyright (c) 2003, K A Fraser.
 */

#include <stdlib.h>
#include <unistd.h>
#include "xg_private.h"
#include <xenctrl.h>
#include <xen/memory.h>

#define MAX_BATCH_SIZE 1024

#define DEBUG 0

#if 1
#define ERR(_f, _a...) do { fprintf ( stderr, _f , ## _a ); fflush(stderr); } while(0)
#else
#define ERR(_f, _a...) ((void)0)
#endif

#if DEBUG
#define DPRINTF(_f, _a...) do { fprintf ( stdout, _f , ## _a ); fflush(stdout); } while (0)
#else
#define DPRINTF(_f, _a...) ((void)0)
#endif

#define PROGRESS 0
#if PROGRESS
#define PPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a ); fflush(stderr)
#else
#define PPRINTF(_f, _a...)
#endif

static ssize_t
read_exact(int fd, void *buf, size_t count)
{
    int r = 0, s;
    unsigned char *b = buf;

    while (r < count) {
        s = read(fd, &b[r], count - r);
        if (s <= 0)
            break;
        r += s;
    }

    return r;
}
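
/*
 * The restore image is consumed from io_fd in the order the reads below
 * perform them: the saved pfn-to-mfn frame list (one page), a sequence of
 * page batches, the list of PFNs that have no entry in the pseudo-physical
 * map, and finally the vcpu context followed by the saved shared-info page.
 */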
int xc_linux_restore(int xc_handle, int io_fd, u32 dom, unsigned long nr_pfns,
                     unsigned int store_evtchn, unsigned long *store_mfn,
                     unsigned int console_evtchn, unsigned long *console_mfn)
{
    dom0_op_t op;
    int rc = 1, i, n, k;
    unsigned long mfn, pfn, xpfn;
    unsigned int prev_pc, this_pc;
    int verify = 0;
    int err;

    /* The new domain's shared-info frame number. */
    unsigned long shared_info_frame;
    unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
    shared_info_t *shared_info = (shared_info_t *)shared_info_page;

    /* A copy of the CPU context of the guest. */
    vcpu_guest_context_t ctxt;

    /* A table containing the type of each PFN (/not/ MFN!). */
    unsigned long *pfn_type = NULL;

    /* A table of MFNs to map in the current region. */
    unsigned long *region_mfn = NULL;

    /* A temporary mapping, and a copy, of one frame of guest memory. */
    unsigned long *ppage = NULL;

    /* A copy of the pfn-to-mfn table frame list. */
    unsigned long pfn_to_mfn_frame_list[1024];

    /* A table mapping each PFN to its new MFN. */
    unsigned long *pfn_to_mfn_table = NULL;

    /* Used by the mapper for updating the domain's copy of the table. */
    unsigned long *live_pfn_to_mfn_table = NULL;

    /* A temporary mapping of the guest's start_info page. */
    start_info_t *start_info;

    int pt_levels = 2; /* XXX auto-detect this */

    char *region_base;

    xc_mmu_t *mmu = NULL;

    /* Used by the debug verify code. */
    unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];

#define MAX_PIN_BATCH 1024
    struct mmuext_op pin[MAX_PIN_BATCH];
    unsigned int nr_pins = 0;

    DPRINTF("xc_linux_restore start: nr_pfns = %lx\n", nr_pfns);

    if (mlock(&ctxt, sizeof(ctxt))) {
        /* needed for when we do the build dom0 op,
           but might as well do early */
        ERR("Unable to mlock ctxt");
        return 1;
    }

    if (read_exact(io_fd, pfn_to_mfn_frame_list, PAGE_SIZE) != PAGE_SIZE) {
        ERR("read pfn_to_mfn_frame_list failed");
        goto out;
    }

    /* We want zeroed memory so use calloc rather than malloc. */
    pfn_to_mfn_table = calloc(4, nr_pfns);
    pfn_type         = calloc(4, nr_pfns);
    region_mfn       = calloc(4, MAX_BATCH_SIZE);

    if ((pfn_to_mfn_table == NULL) || (pfn_type == NULL) ||
        (region_mfn == NULL)) {
        ERR("memory alloc failed");
        errno = ENOMEM;
        goto out;
    }

    if (mlock(region_mfn, 4 * MAX_BATCH_SIZE)) {
        ERR("Could not mlock region_mfn");
        goto out;
    }

    /* Get the domain's shared-info frame. */
    op.cmd = DOM0_GETDOMAININFO;
    op.u.getdomaininfo.domain = (domid_t)dom;
    if (xc_dom0_op(xc_handle, &op) < 0) {
        ERR("Could not get information on new domain");
        goto out;
    }
    shared_info_frame = op.u.getdomaininfo.shared_info_frame;

    err = xc_domain_setmaxmem(xc_handle, dom, nr_pfns * PAGE_SIZE / 1024);
    if (err != 0) {
        errno = ENOMEM;
        goto out;
    }

    err = xc_domain_memory_increase_reservation(xc_handle, dom,
                                                nr_pfns, 0, 0, NULL);
    if (err != 0) {
        ERR("Failed to increase reservation by %lx KB\n",
            nr_pfns * PAGE_SIZE / 1024);
        errno = ENOMEM;
        goto out;
    }

    /* Build the pfn-to-mfn table. We choose MFN ordering returned by Xen. */
    if (xc_get_pfn_list(xc_handle, dom, pfn_to_mfn_table, nr_pfns) !=
        nr_pfns) {
        ERR("Did not read correct number of frame numbers for new dom");
        goto out;
    }

    mmu = xc_init_mmu_updates(xc_handle, dom);
    if (mmu == NULL) {
        ERR("Could not initialise for MMU updates");
        goto out;
    }

    DPRINTF("Reloading memory pages:   0%%");

    /*
     * Now simply read each saved frame into its new machine frame.
     * We uncanonicalise page tables as we go.
     */
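    /*
     * Each batch starts with an int, j: -1 switches on verify mode, 0 marks
     * the end of the page data, and any other value (at most MAX_BATCH_SIZE)
     * is followed by j pfn/type words and then one page of data for every
     * entry not marked XTAB.
     */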
    prev_pc = 0;

    n = 0;
    while ( 1 )
    {
        int j;
        unsigned long region_pfn_type[MAX_BATCH_SIZE];

        this_pc = (n * 100) / nr_pfns;
        if ( (this_pc - prev_pc) >= 5 )
        {
            PPRINTF("\b\b\b\b%3d%%", this_pc);
            prev_pc = this_pc;
        }

        if ( read_exact(io_fd, &j, sizeof(int)) != sizeof(int) )
        {
            ERR("Error when reading batch size");
            goto out;
        }

        PPRINTF("batch %d\n", j);

        if ( j == -1 )
        {
            verify = 1;
            printf("Entering page verify mode\n");
            continue;
        }

        if ( j == 0 )
            break;  /* our work here is done */

        if ( j > MAX_BATCH_SIZE )
        {
            ERR("Max batch size exceeded. Giving up.");
            goto out;
        }

        if ( read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long)) !=
             j*sizeof(unsigned long) ) {
            ERR("Error when reading region pfn types");
            goto out;
        }

        for ( i = 0; i < j; i++ )
        {
            if ( (region_pfn_type[i] & LTAB_MASK) == XTAB )
            {
                region_mfn[i] = 0; /* we know map will fail, but don't care */
            }
            else
            {
                pfn = region_pfn_type[i] & ~LTAB_MASK;
                region_mfn[i] = pfn_to_mfn_table[pfn];
            }
        }

        if ( (region_base = xc_map_foreign_batch( xc_handle, dom,
                                                  PROT_WRITE,
                                                  region_mfn,
                                                  j )) == 0 )
        {
            ERR("map batch failed");
            goto out;
        }

        for ( i = 0; i < j; i++ )
        {
            unsigned long *ppage;

            pfn = region_pfn_type[i] & ~LTAB_MASK;

            if ( (region_pfn_type[i] & LTAB_MASK) == XTAB )
                continue;

            if ( pfn >= nr_pfns )  /* pfn_to_mfn_table has nr_pfns entries */
            {
                ERR("pfn out of range");
                goto out;
            }

            region_pfn_type[i] &= LTAB_MASK;

            pfn_type[pfn] = region_pfn_type[i];

            mfn = pfn_to_mfn_table[pfn];

            if ( verify )
                ppage = (unsigned long*) buf;  /* debug case */
            else
                ppage = (unsigned long*) (region_base + i*PAGE_SIZE);

            if ( read_exact(io_fd, ppage, PAGE_SIZE) != PAGE_SIZE )
            {
                ERR("Error when reading pagetable page");
                goto out;
            }
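
            /*
             * If this frame is a page table, uncanonicalise it: every
             * present entry currently holds a saved-image PFN, which is
             * rewritten below to the new MFN allocated for that PFN
             * (flags such as _PAGE_GLOBAL are stripped in the process).
             */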
            switch( region_pfn_type[i] & LTABTYPE_MASK )
            {
            case 0:
                break;

            case L1TAB:
            {
                for ( k = 0; k < 1024; k++ )
                {
                    if ( ppage[k] & _PAGE_PRESENT )
                    {
                        xpfn = ppage[k] >> PAGE_SHIFT;
                        if ( xpfn >= nr_pfns )
                        {
                            ERR("Frame number in type %lu page "
                                "table is out of range. i=%d k=%d "
                                "pfn=0x%lx nr_pfns=%lu",
                                region_pfn_type[i]>>28, i,
                                k, xpfn, nr_pfns);
                            goto out;
                        }

                        ppage[k] &= (PAGE_SIZE - 1) &
                            ~(_PAGE_GLOBAL | _PAGE_PAT);
                        ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT;
                    }
                }
            }
            break;

            case L2TAB:
            {
                for ( k = 0;
                      k < (HYPERVISOR_VIRT_START>>L2_PAGETABLE_SHIFT);
                      k++ )
                {
                    if ( ppage[k] & _PAGE_PRESENT )
                    {
                        xpfn = ppage[k] >> PAGE_SHIFT;

                        if ( xpfn >= nr_pfns )
                        {
                            ERR("Frame number in type %lu page"
                                " table is out of range. i=%d k=%d "
                                "pfn=%lu nr_pfns=%lu",
                                region_pfn_type[i]>>28, i, k,
                                xpfn, nr_pfns);
                            goto out;
                        }

                        ppage[k] &= (PAGE_SIZE - 1) &
                            ~(_PAGE_GLOBAL | _PAGE_PSE);
                        ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT;
                    }
                }
            }
            break;

            default:
                ERR("Bogus page type %lx. i=%d nr_pfns=%lu",
                    region_pfn_type[i], i, nr_pfns);
                goto out;

            } /* end of page type switch statement */

            if ( verify )
            {
                int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
                if ( res )
                {
                    int v;

                    printf("************** pfn=%lx type=%lx gotcs=%08lx "
                           "actualcs=%08lx\n", pfn, pfn_type[pfn],
                           csum_page(region_base + i*PAGE_SIZE),
                           csum_page(buf));
                    for ( v = 0; v < 4; v++ )
                    {
                        unsigned long *p = (unsigned long *)
                            (region_base + i*PAGE_SIZE);
                        if ( buf[v] != p[v] )
                            printf("    %d: %08lx %08lx\n",
                                   v, buf[v], p[v]);
                    }
                }
            }
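
            /*
             * Queue a machine-to-physical (M2P) table update so the newly
             * populated MFN maps back to this PFN.
             */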
            if ( xc_add_mmu_update(xc_handle, mmu,
                                   (mfn<<PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
                                   pfn) )
            {
                printf("machphys mfn=%ld pfn=%ld\n", mfn, pfn);
                goto out;
            }

        } /* end of 'batch' for loop */

        munmap( region_base, j*PAGE_SIZE );
        n += j; /* crude stats */
    }

    DPRINTF("Received all pages\n");

    if ( pt_levels == 3 )
    {
        /* Get all PGDs below 4GB. */
        for ( i = 0; i < nr_pfns; i++ )
        {
            if ( ((pfn_type[i] & LTABTYPE_MASK) == L3TAB) &&
                 (pfn_to_mfn_table[i] > 0xfffffUL) )
            {
                unsigned long new_mfn = xc_make_page_below_4G(
                    xc_handle, dom, pfn_to_mfn_table[i]);
                if ( new_mfn == 0 )
                {
                    fprintf(stderr, "Couldn't get a page below 4GB :-(\n");
                    goto out;
                }
                pfn_to_mfn_table[i] = new_mfn;
                if ( xc_add_mmu_update(
                         xc_handle, mmu, (new_mfn << PAGE_SHIFT) |
                         MMU_MACHPHYS_UPDATE, i) )
                {
                    fprintf(stderr, "Couldn't m2p on PAE root pgdir\n");
                    goto out;
                }
            }
        }
    }

    if ( xc_finish_mmu_updates(xc_handle, mmu) )
        goto out;

    /*
     * Pin page tables. Do this after writing to them as otherwise Xen
     * will barf when doing the type-checking.
     */
    for ( i = 0; i < nr_pfns; i++ )
    {
        if ( (pfn_type[i] & LPINTAB) == 0 )
            continue;
        if ( pfn_type[i] == (L1TAB|LPINTAB) )
            pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
        else /* pfn_type[i] == (L2TAB|LPINTAB) */
            pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
        pin[nr_pins].mfn = pfn_to_mfn_table[i];
        if ( ++nr_pins == MAX_PIN_BATCH )
        {
            if ( xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0 )
                goto out;
            nr_pins = 0;
        }
    }

    if ( (nr_pins != 0) &&
         (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) )
        goto out;

    DPRINTF("\b\b\b\b100%%\n");
    DPRINTF("Memory reloaded.\n");

    /* Get the list of PFNs that are not in the pseudo-phys map. */
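    /* These PFNs have no entry in the pseudo-physical map; the frames
       allocated for them above are handed back to Xen below. */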
    {
        unsigned int count;
        unsigned long *pfntab;
        int rc;

        if ( read_exact(io_fd, &count, sizeof(count)) != sizeof(count) )
        {
            ERR("Error when reading pfn count");
            goto out;
        }

        pfntab = malloc( sizeof(unsigned int) * count );
        if ( pfntab == NULL )
        {
            ERR("Out of memory");
            goto out;
        }

        if ( read_exact(io_fd, pfntab, sizeof(unsigned int)*count) !=
             sizeof(unsigned int)*count )
        {
            ERR("Error when reading pfntab");
            goto out;
        }

        for ( i = 0; i < count; i++ )
        {
            unsigned long pfn = pfntab[i];
            pfntab[i] = pfn_to_mfn_table[pfn];
            pfn_to_mfn_table[pfn] = 0x80000001;  /* not in pseudo-phys map */
        }

        if ( count > 0 )
        {
            struct xen_memory_reservation reservation = {
                .extent_start = pfntab,
                .nr_extents   = count,
                .extent_order = 0,
                .domid        = dom
            };
            if ( (rc = xc_memory_op(xc_handle,
                                    XENMEM_decrease_reservation,
                                    &reservation)) != count )
            {
                ERR("Could not decrease reservation : %d", rc);
                goto out;
            }
            else
            {
                printf("Decreased reservation by %d pages\n", count);
            }
        }
    }

    if ( read_exact(io_fd, &ctxt, sizeof(ctxt)) != sizeof(ctxt) ||
         read_exact(io_fd, shared_info_page, PAGE_SIZE) != PAGE_SIZE )
    {
        ERR("Error when reading ctxt or shared info page");
        goto out;
    }

    /* Uncanonicalise the suspend-record frame number and poke resume rec. */
    pfn = ctxt.user_regs.esi;
    if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
    {
        ERR("Suspend record frame number is bad");
        goto out;
    }
    ctxt.user_regs.esi = mfn = pfn_to_mfn_table[pfn];
    start_info = xc_map_foreign_range(
        xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
    start_info->nr_pages = nr_pfns;
    start_info->shared_info = shared_info_frame << PAGE_SHIFT;
    start_info->flags = 0;
    *store_mfn = start_info->store_mfn =
        pfn_to_mfn_table[start_info->store_mfn];
    start_info->store_evtchn = store_evtchn;
    *console_mfn = start_info->console_mfn =
        pfn_to_mfn_table[start_info->console_mfn];
    start_info->console_evtchn = console_evtchn;
    munmap(start_info, PAGE_SIZE);

    /* Uncanonicalise each GDT frame number. */
    if ( ctxt.gdt_ents > 8192 )
    {
        ERR("GDT entry count out of range");
        goto out;
    }

    for ( i = 0; (512 * i) < ctxt.gdt_ents; i++ )  /* one frame per 512 ents */
    {
        pfn = ctxt.gdt_frames[i];
        if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
        {
            ERR("GDT frame number is bad");
            goto out;
        }
        ctxt.gdt_frames[i] = pfn_to_mfn_table[pfn];
    }
    /* Uncanonicalise the page table base pointer. */
    pfn = ctxt.ctrlreg[3] >> PAGE_SHIFT;
    if ( (pfn >= nr_pfns) || ((pfn_type[pfn]&LTABTYPE_MASK) != L2TAB) )
    {
        printf("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
               pfn, nr_pfns, pfn_type[pfn], (unsigned long)L2TAB);
        ERR("PT base is bad.");
        goto out;
    }
    ctxt.ctrlreg[3] = pfn_to_mfn_table[pfn] << PAGE_SHIFT;

    /* Clear any pending events and the selector. */
    memset(&(shared_info->evtchn_pending[0]), 0,
           sizeof (shared_info->evtchn_pending));
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        shared_info->vcpu_data[i].evtchn_pending_sel = 0;

    /* Copy saved contents of shared-info page. No checking needed. */
    ppage = xc_map_foreign_range(
        xc_handle, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);
    memcpy(ppage, shared_info, sizeof(shared_info_t));
    munmap(ppage, PAGE_SIZE);

    /* Uncanonicalise the pfn-to-mfn table frame-number list. */
    for ( i = 0; i < (nr_pfns+1023)/1024; i++ )
    {
        unsigned long pfn, mfn;

        pfn = pfn_to_mfn_frame_list[i];
        if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
        {
            ERR("PFN-to-MFN frame number is bad");
            goto out;
        }
        mfn = pfn_to_mfn_table[pfn];
        pfn_to_mfn_frame_list[i] = mfn;
    }

    if ( (live_pfn_to_mfn_table =
          xc_map_foreign_batch(xc_handle, dom,
                               PROT_WRITE,
                               pfn_to_mfn_frame_list,
                               (nr_pfns+1023)/1024 )) == 0 )
    {
        ERR("Couldn't map pfn_to_mfn table");
        goto out;
    }

    memcpy(live_pfn_to_mfn_table, pfn_to_mfn_table,
           nr_pfns*sizeof(unsigned long) );

    munmap(live_pfn_to_mfn_table, ((nr_pfns+1023)/1024)*PAGE_SIZE);

    /*
     * Safety checking of saved context:
     *  1. user_regs is fine, as Xen checks that on context switch.
     *  2. fpu_ctxt is fine, as it can't hurt Xen.
     *  3. trap_ctxt needs the code selectors checked.
     *  4. ldt base must be page-aligned, no more than 8192 ents, ...
     *  5. gdt already done, and further checking is done by Xen.
     *  6. check that kernel_ss is safe.
     *  7. pt_base is already done.
     *  8. debugregs are checked by Xen.
     *  9. callback code selectors need checking.
     */
    for ( i = 0; i < 256; i++ )
    {
        ctxt.trap_ctxt[i].vector = i;
        if ( (ctxt.trap_ctxt[i].cs & 3) == 0 )
            ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS;
    }
    if ( (ctxt.kernel_ss & 3) == 0 )
        ctxt.kernel_ss = FLAT_KERNEL_DS;
#if defined(__i386__)
    if ( (ctxt.event_callback_cs & 3) == 0 )
        ctxt.event_callback_cs = FLAT_KERNEL_CS;
    if ( (ctxt.failsafe_callback_cs & 3) == 0 )
        ctxt.failsafe_callback_cs = FLAT_KERNEL_CS;
#endif
    if ( ((ctxt.ldt_base & (PAGE_SIZE - 1)) != 0) ||
         (ctxt.ldt_ents > 8192) ||
         (ctxt.ldt_base > HYPERVISOR_VIRT_START) ||
         ((ctxt.ldt_base + ctxt.ldt_ents*8) > HYPERVISOR_VIRT_START) )
    {
        ERR("Bad LDT base or size");
        goto out;
    }

    DPRINTF("Domain ready to be built.\n");

    op.cmd = DOM0_SETDOMAININFO;
    op.u.setdomaininfo.domain = (domid_t)dom;
    op.u.setdomaininfo.vcpu   = 0;
    op.u.setdomaininfo.ctxt   = &ctxt;
    rc = xc_dom0_op(xc_handle, &op);

    if ( rc != 0 )
    {
        ERR("Couldn't build the domain");
        goto out;
    }

    DPRINTF("Domain ready to be unpaused\n");
    op.cmd = DOM0_UNPAUSEDOMAIN;
    op.u.unpausedomain.domain = (domid_t)dom;
    rc = xc_dom0_op(xc_handle, &op);
    if (rc == 0) {
        /* Success: print the domain id. */
        DPRINTF("DOM=%u\n", dom);
        return 0;
    }

 out:
    if ( (rc != 0) && (dom != 0) )
        xc_domain_destroy(xc_handle, dom);
    free(mmu);
    free(pfn_to_mfn_table);
    free(pfn_type);

    DPRINTF("Restore exit with rc=%d\n", rc);
    return rc;
}