ia64/xen-unstable

view tools/libxc/xc_linux_restore.c @ 6422:e24fd7012ffb

merge?
author cl349@firebug.cl.cam.ac.uk
date Thu Aug 25 10:09:39 2005 +0000 (2005-08-25)
parents 2f20c2fce2c5 fdfd511768a3
children 4abd299ef2f6
line source
1 /******************************************************************************
2 * xc_linux_restore.c
3 *
4 * Restore the state of a Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <stdlib.h>
10 #include <unistd.h>
12 #include "xg_private.h"
13 #include <xenctrl.h>
15 #include <xen/linux/suspend.h>
#define MAX_BATCH_SIZE 1024

#define DEBUG 0

/*
 * Logging macros. Each variant is wrapped in do { ... } while (0) so it
 * expands to exactly one statement: the original two-statement form
 * (fprintf(...); fflush(...)) silently broke when used as the body of an
 * unbraced if/else. The disabled branches expand to ((void)0) so a
 * trailing ';' is always valid and consistent across all three macros.
 */
#if 1
#define ERR(_f, _a...) \
    do { fprintf(stderr, _f , ## _a); fflush(stderr); } while (0)
#else
#define ERR(_f, _a...) ((void)0)
#endif

#if DEBUG
#define DPRINTF(_f, _a...) \
    do { fprintf(stdout, _f , ## _a); fflush(stdout); } while (0)
#else
#define DPRINTF(_f, _a...) ((void)0)
#endif

#define PROGRESS 0
#if PROGRESS
#define PPRINTF(_f, _a...) \
    do { fprintf(stderr, _f , ## _a); fflush(stderr); } while (0)
#else
#define PPRINTF(_f, _a...) ((void)0)
#endif
/*
 * Read exactly 'count' bytes from 'fd' into 'buf', looping over partial
 * reads. Returns the number of bytes actually read: 'count' on success,
 * or fewer on EOF/error (callers compare the result against the
 * requested size). Fixes in this revision: the accumulator is size_t
 * rather than int, matching the size_t 'count' and the ssize_t return
 * (the old int accumulator compared signed-vs-unsigned and truncated
 * counts larger than INT_MAX); an interrupted read (EINTR) is retried
 * instead of aborting the whole stream.
 */
static ssize_t
read_exact(int fd, void *buf, size_t count)
{
    size_t done = 0;
    ssize_t len;
    unsigned char *b = buf;

    while (done < count) {
        len = read(fd, b + done, count - done);
        if (len < 0 && errno == EINTR)
            continue;           /* transient signal: retry, not a failure */
        if (len <= 0)
            break;              /* EOF or hard error: report short count */
        done += (size_t)len;
    }

    return done;
}
/*
 * Restore a suspended Linux guest from a saved image stream.
 *
 * Reads the save-image format produced by the corresponding save code from
 * io_fd (frame list, page batches, pfn table, vcpu context, shared-info
 * page), rebuilds the pfn->mfn mapping for the new domain, uncanonicalises
 * page tables and context fields, then builds and unpauses the domain.
 *
 * Parameters:
 *   xc_handle    - open control-interface handle, passed to all xc_* calls.
 *   io_fd        - fd the saved image is read from (via read_exact).
 *   dom          - id of the (already created) target domain.
 *   nr_pfns      - number of guest pseudo-physical frames to restore.
 *   store_evtchn - event channel number poked into the guest's resume record.
 *   store_mfn    - out: machine frame of the store page, translated from the
 *                  pfn recorded in the image's resume record.
 *
 * Returns 0 on success; non-zero on failure.  On failure the partially
 * built domain is destroyed (see the 'out' path), unless dom == 0.
 */
int xc_linux_restore(int xc_handle, int io_fd, u32 dom, unsigned long nr_pfns,
                     unsigned int store_evtchn, unsigned long *store_mfn)
{
    dom0_op_t op;
    int rc = 1, i, n, k;
    unsigned long mfn, pfn, xpfn;
    unsigned int prev_pc, this_pc;
    int verify = 0;            /* set when the stream requests verify mode */
    int err;

    /* The new domain's shared-info frame number. */
    unsigned long shared_info_frame;
    unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
    shared_info_t *shared_info = (shared_info_t *)shared_info_page;

    /* A copy of the CPU context of the guest. */
    vcpu_guest_context_t ctxt;

    /* A table containing the type of each PFN (/not/ MFN!). */
    unsigned long *pfn_type = NULL;

    /* A table of MFNs to map in the current region. */
    unsigned long *region_mfn = NULL;

    /* A temporary mapping, and a copy, of one frame of guest memory. */
    unsigned long *ppage = NULL;

    /* A copy of the pfn-to-mfn table frame list. */
    unsigned long pfn_to_mfn_frame_list[1024];

    /* A table mapping each PFN to its new MFN. */
    unsigned long *pfn_to_mfn_table = NULL;

    /* Used by mapper for updating the domain's copy of the table. */
    unsigned long *live_pfn_to_mfn_table = NULL;

    /* A temporary mapping of the guest's suspend record. */
    suspend_record_t *p_srec;

    char *region_base;

    xc_mmu_t *mmu = NULL;

    /* Used by debug verify code: holds the received copy of a page so it
       can be memcmp'd against what actually landed in guest memory. */
    unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];

#define MAX_PIN_BATCH 1024
    struct mmuext_op pin[MAX_PIN_BATCH];
    unsigned int nr_pins = 0;

    DPRINTF("xc_linux_restore start\n");

    if (mlock(&ctxt, sizeof(ctxt))) {
        /* needed for when we do the build dom0 op,
           but might as well do early */
        ERR("Unable to mlock ctxt");
        return 1;
    }

    /* First item in the stream: the canonical pfn->mfn frame list. */
    if (read_exact(io_fd, pfn_to_mfn_frame_list, PAGE_SIZE) != PAGE_SIZE) {
        ERR("read pfn_to_mfn_frame_list failed");
        goto out;
    }

    /* We want zeroed memory so use calloc rather than malloc.
       NOTE(review): the element size of 4 assumes sizeof(unsigned long)==4,
       i.e. a 32-bit build — verify before reusing on 64-bit. */
    pfn_to_mfn_table = calloc(4, nr_pfns);
    pfn_type = calloc(4, nr_pfns);
    region_mfn = calloc(4, MAX_BATCH_SIZE);

    if ((pfn_to_mfn_table == NULL) || (pfn_type == NULL) ||
        (region_mfn == NULL)) {
        ERR("memory alloc failed");
        errno = ENOMEM;
        goto out;
    }

    if (mlock(region_mfn, 4 * MAX_BATCH_SIZE)) {
        ERR("Could not mlock region_mfn");
        goto out;
    }

    /* Get the domain's shared-info frame. */
    op.cmd = DOM0_GETDOMAININFO;
    op.u.getdomaininfo.domain = (domid_t)dom;
    if (xc_dom0_op(xc_handle, &op) < 0) {
        ERR("Could not get information on new domain");
        goto out;
    }
    shared_info_frame = op.u.getdomaininfo.shared_info_frame;

    /* Size the domain's memory allocation (arguments are in KiB). */
    err = xc_domain_setmaxmem(xc_handle, dom, nr_pfns * PAGE_SIZE / 1024);
    if (err != 0) {
        errno = ENOMEM;
        goto out;
    }

    err = xc_domain_memory_increase_reservation(xc_handle, dom,
                                                nr_pfns * PAGE_SIZE / 1024);
    if (err != 0) {
        errno = ENOMEM;
        goto out;
    }

    /* Build the pfn-to-mfn table. We choose MFN ordering returned by Xen. */
    if (xc_get_pfn_list(xc_handle, dom, pfn_to_mfn_table, nr_pfns) !=
        nr_pfns) {
        ERR("Did not read correct number of frame numbers for new dom");
        goto out;
    }

    mmu = xc_init_mmu_updates(xc_handle, dom);
    if (mmu == NULL) {
        ERR("Could not initialise for MMU updates");
        goto out;
    }

    DPRINTF("Reloading memory pages: 0%%");

    /*
     * Now simply read each saved frame into its new machine frame.
     * We uncanonicalise page tables as we go.
     *
     * Stream format per iteration: an int batch size j, then j pfn+type
     * words, then j page-sized payloads.  j == -1 switches on verify
     * mode; j == 0 terminates the page phase.
     */
    prev_pc = 0;

    n = 0;  /* running count of pages received, for the progress meter */
    while ( 1 )
    {
        int j;
        unsigned long region_pfn_type[MAX_BATCH_SIZE];

        this_pc = (n * 100) / nr_pfns;
        if ( (this_pc - prev_pc) >= 5 )
        {
            PPRINTF("\b\b\b\b%3d%%", this_pc);
            prev_pc = this_pc;
        }

        if ( read_exact(io_fd, &j, sizeof(int)) != sizeof(int) )
        {
            ERR("Error when reading batch size");
            goto out;
        }

        PPRINTF("batch %d\n",j);

        if ( j == -1 )
        {
            verify = 1;
            printf("Entering page verify mode\n");
            continue;
        }

        if ( j == 0 )
            break;  /* our work here is done */

        if ( j > MAX_BATCH_SIZE )
        {
            ERR("Max batch size exceeded. Giving up.");
            goto out;
        }

        if ( read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long)) !=
             j*sizeof(unsigned long) ) {
            ERR("Error when reading region pfn types");
            goto out;
        }

        /* Translate each batched pfn to the mfn we will map and fill. */
        for ( i = 0; i < j; i++ )
        {
            if ( (region_pfn_type[i] & LTAB_MASK) == XTAB)
            {
                region_mfn[i] = 0; /* we know map will fail, but don't care */
            }
            else
            {
                pfn = region_pfn_type[i] & ~LTAB_MASK;
                region_mfn[i] = pfn_to_mfn_table[pfn];
            }
        }

        if ( (region_base = xc_map_foreign_batch( xc_handle, dom,
                                                  PROT_WRITE,
                                                  region_mfn,
                                                  j )) == 0 )
        {
            ERR("map batch failed");
            goto out;
        }

        for ( i = 0; i < j; i++ )
        {
            unsigned long *ppage;  /* shadows outer ppage for this batch */

            pfn = region_pfn_type[i] & ~LTAB_MASK;

            /* XTAB marks a pfn with no backing frame: skip its payload. */
            if ( (region_pfn_type[i] & LTAB_MASK) == XTAB) continue;

            if (pfn>nr_pfns)
            {
                ERR("pfn out of range");
                goto out;
            }

            region_pfn_type[i] &= LTAB_MASK;  /* keep only the type bits */

            pfn_type[pfn] = region_pfn_type[i];

            mfn = pfn_to_mfn_table[pfn];

            /* In verify mode the payload goes to a scratch buffer and is
               compared against guest memory instead of overwriting it. */
            if ( verify )
                ppage = (unsigned long*) buf;  /* debug case */
            else
                ppage = (unsigned long*) (region_base + i*PAGE_SIZE);

            if ( read_exact(io_fd, ppage, PAGE_SIZE) != PAGE_SIZE )
            {
                ERR("Error when reading pagetable page");
                goto out;
            }

            /* Page tables were canonicalised (mfn -> pfn) on save;
               rewrite each present PTE/PDE back to a machine frame. */
            switch( region_pfn_type[i] & LTABTYPE_MASK )
            {
            case 0:
                break;  /* plain data page: contents used verbatim */

            case L1TAB:
            {
                for ( k = 0; k < 1024; k++ )
                {
                    if ( ppage[k] & _PAGE_PRESENT )
                    {
                        xpfn = ppage[k] >> PAGE_SHIFT;
                        if ( xpfn >= nr_pfns )
                        {
                            ERR("Frame number in type %lu page "
                                "table is out of range. i=%d k=%d "
                                "pfn=0x%lx nr_pfns=%lu",
                                region_pfn_type[i]>>28, i,
                                k, xpfn, nr_pfns);
                            goto out;
                        }

                        /* Keep flag bits, strip unsupported ones, and
                           substitute the new machine frame number. */
                        ppage[k] &= (PAGE_SIZE - 1) &
                            ~(_PAGE_GLOBAL | _PAGE_PAT);
                        ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT;
                    }
                }
            }
            break;

            case L2TAB:
            {
                /* Only entries below the hypervisor hole are guest-owned. */
                for ( k = 0;
                      k < (HYPERVISOR_VIRT_START>>L2_PAGETABLE_SHIFT);
                      k++ )
                {
                    if ( ppage[k] & _PAGE_PRESENT )
                    {
                        xpfn = ppage[k] >> PAGE_SHIFT;

                        if ( xpfn >= nr_pfns )
                        {
                            ERR("Frame number in type %lu page"
                                " table is out of range. i=%d k=%d "
                                "pfn=%lu nr_pfns=%lu",
                                region_pfn_type[i]>>28, i, k,
                                xpfn, nr_pfns);
                            goto out;
                        }

                        ppage[k] &= (PAGE_SIZE - 1) &
                            ~(_PAGE_GLOBAL | _PAGE_PSE);
                        ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT;
                    }
                }
            }
            break;

            default:
                ERR("Bogus page type %lx page table is "
                    "out of range. i=%d nr_pfns=%lu",
                    region_pfn_type[i], i, nr_pfns);
                goto out;

            } /* end of page type switch statement */

            /* Verify mode: report mismatches between the received page and
               what is actually in guest memory; restore continues anyway. */
            if ( verify )
            {
                int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE );
                if ( res )
                {
                    int v;
                    printf("************** pfn=%lx type=%lx gotcs=%08lx "
                           "actualcs=%08lx\n", pfn, pfn_type[pfn],
                           csum_page(region_base + i*PAGE_SIZE),
                           csum_page(buf));
                    for ( v = 0; v < 4; v++ )
                    {
                        unsigned long *p = (unsigned long *)
                            (region_base + i*PAGE_SIZE);
                        if ( buf[v] != p[v] )
                            printf(" %d: %08lx %08lx\n",
                                   v, buf[v], p[v] );
                    }
                }
            }

            /* Record the machine->pseudo-physical mapping for this frame. */
            if ( xc_add_mmu_update(xc_handle, mmu,
                                   (mfn<<PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
                                   pfn) )
            {
                printf("machpys mfn=%ld pfn=%ld\n",mfn,pfn);
                goto out;
            }

        } /* end of 'batch' for loop */

        munmap( region_base, j*PAGE_SIZE );
        n+=j; /* crude stats */
    }

    DPRINTF("Received all pages\n");

    if ( xc_finish_mmu_updates(xc_handle, mmu) )
        goto out;

    /*
     * Pin page tables. Do this after writing to them as otherwise Xen
     * will barf when doing the type-checking.
     */
    for ( i = 0; i < nr_pfns; i++ )
    {
        if ( (pfn_type[i] & LPINTAB) == 0 )
            continue;
        if ( pfn_type[i] == (L1TAB|LPINTAB) )
            pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
        else /* pfn_type[i] == (L2TAB|LPINTAB) */
            pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
        pin[nr_pins].mfn = pfn_to_mfn_table[i];
        /* Flush the pin batch whenever it fills up. */
        if ( ++nr_pins == MAX_PIN_BATCH )
        {
            if ( xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0 )
                goto out;
            nr_pins = 0;
        }
    }

    /* Flush any final partial batch of pin operations. */
    if ( (nr_pins != 0) &&
         (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) )
        goto out;

    DPRINTF("\b\b\b\b100%%\n");
    DPRINTF("Memory reloaded.\n");

    /* Get the list of PFNs that are not in the pseudo-phys map. */
    {
        unsigned int count, *pfntab;
        int rc;  /* shadows outer rc: local hypercall result only */

        if ( read_exact(io_fd, &count, sizeof(count)) != sizeof(count) )
        {
            ERR("Error when reading pfn count");
            goto out;
        }

        /* NOTE(review): 'count' comes straight off the wire and is used
           unvalidated as an allocation and index bound; pfntab is also
           never freed on any path. */
        pfntab = malloc( sizeof(unsigned int) * count );
        if ( pfntab == NULL )
        {
            ERR("Out of memory");
            goto out;
        }

        if ( read_exact(io_fd, pfntab, sizeof(unsigned int)*count) !=
             sizeof(unsigned int)*count )
        {
            ERR("Error when reading pfntab");
            goto out;
        }

        /* Swap each listed pfn for its mfn and mark the pfn unmapped. */
        for ( i = 0; i < count; i++ )
        {
            unsigned long pfn = pfntab[i];
            pfntab[i]=pfn_to_mfn_table[pfn];
            pfn_to_mfn_table[pfn] = 0x80000001;  // not in pmap
        }

        /* Give the unpopulated frames back to the hypervisor. */
        if ( count > 0 )
        {
            if ( (rc = xc_dom_mem_op( xc_handle,
                                      MEMOP_decrease_reservation,
                                      pfntab, count, 0, dom )) <0 )
            {
                ERR("Could not decrease reservation : %d",rc);
                goto out;
            }
            else
            {
                printf("Decreased reservation by %d pages\n", count);
            }
        }
    }

    /* Tail of the stream: vcpu context then the saved shared-info page. */
    if ( read_exact(io_fd, &ctxt, sizeof(ctxt)) != sizeof(ctxt) ||
         read_exact(io_fd, shared_info_page, PAGE_SIZE) != PAGE_SIZE )
    {
        ERR("Error when reading ctxt or shared info page");
        goto out;
    }

    /* Uncanonicalise the suspend-record frame number and poke resume rec. */
    pfn = ctxt.user_regs.esi;
    if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
    {
        ERR("Suspend record frame number is bad");
        goto out;
    }
    ctxt.user_regs.esi = mfn = pfn_to_mfn_table[pfn];
    /* NOTE(review): xc_map_foreign_range result is dereferenced without a
       NULL check here (and again for the shared-info copy below). */
    p_srec = xc_map_foreign_range(
        xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
    p_srec->resume_info.nr_pages = nr_pfns;
    p_srec->resume_info.shared_info = shared_info_frame << PAGE_SHIFT;
    p_srec->resume_info.flags = 0;
    /* Translate the store page pfn recorded in the image to its new mfn
       and report it to the caller. */
    *store_mfn = p_srec->resume_info.store_mfn =
        pfn_to_mfn_table[p_srec->resume_info.store_mfn];
    p_srec->resume_info.store_evtchn = store_evtchn;
    munmap(p_srec, PAGE_SIZE);

    /* Uncanonicalise each GDT frame number. */
    if ( ctxt.gdt_ents > 8192 )
    {
        ERR("GDT entry count out of range");
        goto out;
    }

    /* Step 512: one frame holds 512 GDT entries.  NOTE(review): the index
       into gdt_frames[] looks like it should be i/512, not i — confirm
       against the save-side layout. */
    for ( i = 0; i < ctxt.gdt_ents; i += 512 )
    {
        pfn = ctxt.gdt_frames[i];
        if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
        {
            ERR("GDT frame number is bad");
            goto out;
        }
        ctxt.gdt_frames[i] = pfn_to_mfn_table[pfn];
    }

    /* Uncanonicalise the page table base pointer. */
    pfn = ctxt.ctrlreg[3] >> PAGE_SHIFT;
    if ( (pfn >= nr_pfns) || ((pfn_type[pfn]&LTABTYPE_MASK) != L2TAB) )
    {
        printf("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
               pfn, nr_pfns, pfn_type[pfn], (unsigned long)L2TAB);
        ERR("PT base is bad.");
        goto out;
    }
    ctxt.ctrlreg[3] = pfn_to_mfn_table[pfn] << PAGE_SHIFT;

    /* Clear any pending events and the per-vcpu selector words in the
       saved copy before installing it in the new domain. */
    memset(&(shared_info->evtchn_pending[0]), 0,
           sizeof (shared_info->evtchn_pending));
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        shared_info->vcpu_data[i].evtchn_pending_sel = 0;

    /* Copy saved contents of shared-info page. No checking needed. */
    ppage = xc_map_foreign_range(
        xc_handle, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);
    memcpy(ppage, shared_info, sizeof(shared_info_t));
    munmap(ppage, PAGE_SIZE);

    /* Uncanonicalise the pfn-to-mfn table frame-number list. */
    for ( i = 0; i < (nr_pfns+1023)/1024; i++ )
    {
        unsigned long pfn, mfn;

        pfn = pfn_to_mfn_frame_list[i];
        if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
        {
            ERR("PFN-to-MFN frame number is bad");
            goto out;
        }
        mfn = pfn_to_mfn_table[pfn];
        pfn_to_mfn_frame_list[i] = mfn;
    }

    /* Map the guest's live pfn->mfn table and fill it in wholesale. */
    if ( (live_pfn_to_mfn_table =
          xc_map_foreign_batch(xc_handle, dom,
                               PROT_WRITE,
                               pfn_to_mfn_frame_list,
                               (nr_pfns+1023)/1024 )) == 0 )
    {
        ERR("Couldn't map pfn_to_mfn table");
        goto out;
    }

    memcpy(live_pfn_to_mfn_table, pfn_to_mfn_table,
           nr_pfns*sizeof(unsigned long) );

    munmap(live_pfn_to_mfn_table, ((nr_pfns+1023)/1024)*PAGE_SIZE);

    /*
     * Safety checking of saved context:
     *  1. user_regs is fine, as Xen checks that on context switch.
     *  2. fpu_ctxt is fine, as it can't hurt Xen.
     *  3. trap_ctxt needs the code selectors checked.
     *  4. ldt base must be page-aligned, no more than 8192 ents, ...
     *  5. gdt already done, and further checking is done by Xen.
     *  6. check that kernel_ss is safe.
     *  7. pt_base is already done.
     *  8. debugregs are checked by Xen.
     *  9. callback code selectors need checking.
     */
    for ( i = 0; i < 256; i++ )
    {
        ctxt.trap_ctxt[i].vector = i;
        /* Force ring-0 code selectors to the flat kernel selector. */
        if ( (ctxt.trap_ctxt[i].cs & 3) == 0 )
            ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS;
    }
    if ( (ctxt.kernel_ss & 3) == 0 )
        ctxt.kernel_ss = FLAT_KERNEL_DS;
#if defined(__i386__)
    if ( (ctxt.event_callback_cs & 3) == 0 )
        ctxt.event_callback_cs = FLAT_KERNEL_CS;
    if ( (ctxt.failsafe_callback_cs & 3) == 0 )
        ctxt.failsafe_callback_cs = FLAT_KERNEL_CS;
#endif
    if ( ((ctxt.ldt_base & (PAGE_SIZE - 1)) != 0) ||
         (ctxt.ldt_ents > 8192) ||
         (ctxt.ldt_base > HYPERVISOR_VIRT_START) ||
         ((ctxt.ldt_base + ctxt.ldt_ents*8) > HYPERVISOR_VIRT_START) )
    {
        ERR("Bad LDT base or size");
        goto out;
    }

    DPRINTF("Domain ready to be built.\n");

    /* Install the restored vcpu context into the new domain. */
    op.cmd = DOM0_SETDOMAININFO;
    op.u.setdomaininfo.domain = (domid_t)dom;
    op.u.setdomaininfo.vcpu = 0;
    op.u.setdomaininfo.ctxt = &ctxt;
    rc = xc_dom0_op(xc_handle, &op);

    if ( rc != 0 )
    {
        ERR("Couldn't build the domain");
        goto out;
    }

    DPRINTF("Domain ready to be unpaused\n");
    op.cmd = DOM0_UNPAUSEDOMAIN;
    op.u.unpausedomain.domain = (domid_t)dom;
    rc = xc_dom0_op(xc_handle, &op);
    if (rc == 0) {
        /* Success: print the domain id.
           NOTE(review): the early return skips the free()s below, leaking
           mmu/pfn_to_mfn_table/pfn_type (and region_mfn, which is never
           freed on any path) — acceptable only if the process exits soon. */
        DPRINTF("DOM=%u\n", dom);
        return 0;
    }

 out:
    /* Shared failure/cleanup path: tear down the half-built domain
       (rc is still non-zero unless unpause failed late). */
    if ( (rc != 0) && (dom != 0) )
        xc_domain_destroy(xc_handle, dom);
    free(mmu);
    free(pfn_to_mfn_table);
    free(pfn_type);

    DPRINTF("Restore exit with rc=%d\n", rc);
    return rc;
}