ia64/xen-unstable

view tools/libxc/xc_linux_restore.c @ 6141:7c2fdcb2c933

another merge
author kaf24@firebug.cl.cam.ac.uk
date Fri Aug 12 14:53:26 2005 +0000 (2005-08-12)
parents 38bee85ddeb8 1fb1877ed6d1
children 4995d5f167c9 f51fe43c5d1c 6783e59e1c45
line source
1 /******************************************************************************
2 * xc_linux_restore.c
3 *
4 * Restore the state of a Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include "xc_private.h"
10 #include <xen/linux/suspend.h>
/* Maximum number of page frames processed per restore batch. */
#define MAX_BATCH_SIZE 1024

#define DEBUG 0

#if 1
/*
 * Error reporting: always enabled.  Wrapped in do { } while (0) so each
 * macro expands to exactly one statement and is safe in unbraced
 * if/else bodies (the original two-statement expansion was not).
 */
#define ERR(_f, _a...) \
    do { fprintf(stderr, _f , ## _a); fflush(stderr); } while (0)
#else
#define ERR(_f, _a...) ((void)0)
#endif

#if DEBUG
/* Debug tracing to stdout; compiled out when DEBUG is 0. */
#define DPRINTF(_f, _a...) \
    do { fprintf(stdout, _f , ## _a); fflush(stdout); } while (0)
#else
#define DPRINTF(_f, _a...) ((void)0)
#endif

#define PROGRESS 0
#if PROGRESS
/* Progress meter to stderr; compiled out when PROGRESS is 0. */
#define PPRINTF(_f, _a...) \
    do { fprintf(stderr, _f , ## _a); fflush(stderr); } while (0)
#else
/* ((void)0) rather than an empty expansion: keeps `PPRINTF(...);` a
 * well-formed statement, consistent with ERR/DPRINTF when disabled. */
#define PPRINTF(_f, _a...) ((void)0)
#endif
/*
 * Read exactly 'count' bytes from 'fd' into 'buf', retrying on short
 * reads.  Returns the number of bytes actually read: 'count' on
 * success, or less if EOF or a read error occurred first (callers
 * detect failure by comparing the result against the requested size).
 *
 * Fix: the accumulator was a signed int compared against the size_t
 * 'count' (signed/unsigned comparison; silent truncation for requests
 * larger than INT_MAX).  Use size_t/ssize_t throughout.
 */
static ssize_t
read_exact(int fd, void *buf, size_t count)
{
    size_t  r = 0;          /* total bytes read so far */
    ssize_t s;              /* result of the last read() */
    unsigned char *b = buf;

    while (r < count) {
        s = read(fd, &b[r], count - r);
        if (s <= 0)
            break;          /* EOF (0) or error (-1): give up early */
        r += (size_t)s;
    }

    return (ssize_t)r;
}
/*
 * Restore a suspended Linux guest from the image stream on io_fd into
 * domain 'dom' (which must already exist with nr_pfns of memory
 * reservable).  Reads the pfn-to-mfn frame list, all page batches
 * (uncanonicalising page-table entries as it goes), the leftover-pfn
 * table, the vcpu context and the shared-info page; then pins page
 * tables, rebuilds the domain context and unpauses it.
 *
 * Returns 0 on success.  On failure returns nonzero and destroys the
 * partially-built domain (unless dom == 0).
 *
 * store_evtchn is written into the guest's resume record; *store_mfn
 * receives the uncanonicalised xenstore frame number.
 */
int xc_linux_restore(int xc_handle, int io_fd, u32 dom, unsigned long nr_pfns,
                     unsigned int store_evtchn, unsigned long *store_mfn)
{
    dom0_op_t op;
    int rc = 1, i, n, k;
    unsigned long mfn, pfn, xpfn;
    unsigned int prev_pc, this_pc;
    int verify = 0;       /* set when the stream requests page-verify mode */
    int err;

    /* The new domain's shared-info frame number. */
    unsigned long shared_info_frame;
    unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
    shared_info_t *shared_info = (shared_info_t *)shared_info_page;

    /* A copy of the CPU context of the guest. */
    vcpu_guest_context_t ctxt;

    /* A table containing the type of each PFN (/not/ MFN!). */
    unsigned long *pfn_type = NULL;

    /* A table of MFNs to map in the current region */
    unsigned long *region_mfn = NULL;

    /* A temporary mapping, and a copy, of one frame of guest memory. */
    /* NOTE(review): this outer 'ppage' is shadowed by a local of the
     * same name inside the batch loop below; only the shared-info copy
     * near the end actually uses this one. */
    unsigned long *ppage = NULL;

    /* A copy of the pfn-to-mfn table frame list. */
    unsigned long pfn_to_mfn_frame_list[1024];

    /* A table mapping each PFN to its new MFN. */
    unsigned long *pfn_to_mfn_table = NULL;

    /* used by mapper for updating the domain's copy of the table */
    unsigned long *live_pfn_to_mfn_table = NULL;

    /* A temporary mapping of the guest's suspend record. */
    suspend_record_t *p_srec;

    char *region_base;

    mmu_t *mmu = NULL;

    /* used by debug verify code */
    unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];

#define MAX_PIN_BATCH 1024
    struct mmuext_op pin[MAX_PIN_BATCH];
    unsigned int nr_pins = 0;

    DPRINTF("xc_linux_restore start\n");

    if (mlock(&ctxt, sizeof(ctxt))) {
        /* needed for when we do the build dom0 op,
           but might as well do early */
        ERR("Unable to mlock ctxt");
        return 1;
    }

    /* First page of the stream: the canonical pfn-to-mfn frame list. */
    if (read_exact(io_fd, pfn_to_mfn_frame_list, PAGE_SIZE) != PAGE_SIZE) {
        ERR("read pfn_to_mfn_frame_list failed");
        goto out;
    }

    /* We want zeroed memory so use calloc rather than malloc. */
    /* NOTE(review): element size is hard-coded as 4; presumably this
     * assumes sizeof(unsigned long) == 4 (32-bit build) — would break
     * on LP64.  sizeof(*pfn_to_mfn_table) would be safer; confirm
     * against the supported targets of this tree. */
    pfn_to_mfn_table = calloc(4, nr_pfns);
    pfn_type = calloc(4, nr_pfns);
    region_mfn = calloc(4, MAX_BATCH_SIZE);

    if ((pfn_to_mfn_table == NULL) || (pfn_type == NULL) ||
        (region_mfn == NULL)) {
        ERR("memory alloc failed");
        errno = ENOMEM;
        goto out;
    }

    /* Locked because the map-batch hypercall reads it from user memory. */
    if (mlock(region_mfn, 4 * MAX_BATCH_SIZE)) {
        ERR("Could not mlock region_mfn");
        goto out;
    }

    /* Get the domain's shared-info frame. */
    op.cmd = DOM0_GETDOMAININFO;
    op.u.getdomaininfo.domain = (domid_t)dom;
    if (do_dom0_op(xc_handle, &op) < 0) {
        ERR("Could not get information on new domain");
        goto out;
    }
    shared_info_frame = op.u.getdomaininfo.shared_info_frame;

    /* Memory limits are expressed in kB: nr_pfns * PAGE_SIZE / 1024. */
    err = xc_domain_setmaxmem(xc_handle, dom, nr_pfns * PAGE_SIZE / 1024);
    if (err != 0) {
        errno = ENOMEM;
        goto out;
    }

    err = xc_domain_memory_increase_reservation(xc_handle, dom,
                                                nr_pfns * PAGE_SIZE / 1024);
    if (err != 0) {
        errno = ENOMEM;
        goto out;
    }

    /* Build the pfn-to-mfn table. We choose MFN ordering returned by Xen. */
    if (xc_get_pfn_list(xc_handle, dom, pfn_to_mfn_table, nr_pfns) !=
        nr_pfns) {
        ERR("Did not read correct number of frame numbers for new dom");
        goto out;
    }

    mmu = init_mmu_updates(xc_handle, dom);
    if (mmu == NULL) {
        ERR("Could not initialise for MMU updates");
        goto out;
    }

    DPRINTF("Reloading memory pages: 0%%");

    /*
     * Now simply read each saved frame into its new machine frame.
     * We uncanonicalise page tables as we go.
     */
    prev_pc = 0;

    n = 0;
    while ( 1 )
    {
        int j;                 /* batch size read from the stream */
        unsigned long region_pfn_type[MAX_BATCH_SIZE];

        /* Progress meter, updated every 5%. */
        this_pc = (n * 100) / nr_pfns;
        if ( (this_pc - prev_pc) >= 5 )
        {
            PPRINTF("\b\b\b\b%3d%%", this_pc);
            prev_pc = this_pc;
        }

        /* Batch header: -1 = enter verify mode, 0 = end of pages,
         * otherwise the number of pfn/type words that follow. */
        if ( read_exact(io_fd, &j, sizeof(int)) != sizeof(int) )
        {
            ERR("Error when reading batch size");
            goto out;
        }

        PPRINTF("batch %d\n",j);

        if ( j == -1 )
        {
            verify = 1;
            printf("Entering page verify mode\n");
            continue;
        }

        if ( j == 0 )
            break;  /* our work here is done */

        if ( j > MAX_BATCH_SIZE )
        {
            ERR("Max batch size exceeded. Giving up.");
            goto out;
        }

        if ( read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long)) !=
             j*sizeof(unsigned long) ) {
            ERR("Error when reading region pfn types");
            goto out;
        }

        /* Translate each batch entry's pfn to the target mfn so the
         * whole batch can be mapped in one hypercall. */
        for ( i = 0; i < j; i++ )
        {
            if ( (region_pfn_type[i] & LTAB_MASK) == XTAB)
            {
                /* XTAB marks a hole in the pseudo-phys map. */
                region_mfn[i] = 0; /* we know map will fail, but don't care */
            }
            else
            {
                pfn = region_pfn_type[i] & ~LTAB_MASK;
                region_mfn[i] = pfn_to_mfn_table[pfn];
            }
        }

        if ( (region_base = xc_map_foreign_batch( xc_handle, dom,
                                                  PROT_WRITE,
                                                  region_mfn,
                                                  j )) == 0 )
        {
            ERR("map batch failed");
            goto out;
        }

        for ( i = 0; i < j; i++ )
        {
            /* Shadows the outer 'ppage'; points either at the mapped
             * guest frame or at the local verify buffer. */
            unsigned long *ppage;

            pfn = region_pfn_type[i] & ~LTAB_MASK;

            if ( (region_pfn_type[i] & LTAB_MASK) == XTAB) continue;

            /* NOTE(review): should this be pfn >= nr_pfns?  As written,
             * pfn == nr_pfns passes and indexes one past the end of
             * pfn_type/pfn_to_mfn_table — looks like an off-by-one. */
            if (pfn>nr_pfns)
            {
                ERR("pfn out of range");
                goto out;
            }

            /* Keep only the type bits from here on. */
            region_pfn_type[i] &= LTAB_MASK;

            pfn_type[pfn] = region_pfn_type[i];

            mfn = pfn_to_mfn_table[pfn];

            if ( verify )
                ppage = (unsigned long*) buf;  /* debug case */
            else
                ppage = (unsigned long*) (region_base + i*PAGE_SIZE);

            if ( read_exact(io_fd, ppage, PAGE_SIZE) != PAGE_SIZE )
            {
                ERR("Error when reading pagetable page");
                goto out;
            }

            /* Uncanonicalise page-table pages: rewrite each present PTE's
             * frame number from pfn to mfn. */
            switch( region_pfn_type[i] & LTABTYPE_MASK )
            {
            case 0:
                /* Plain data page: nothing to translate. */
                break;

            case L1TAB:
            {
                for ( k = 0; k < 1024; k++ )
                {
                    if ( ppage[k] & _PAGE_PRESENT )
                    {
                        xpfn = ppage[k] >> PAGE_SHIFT;
                        if ( xpfn >= nr_pfns )
                        {
                            ERR("Frame number in type %lu page "
                                "table is out of range. i=%d k=%d "
                                "pfn=0x%lx nr_pfns=%lu",
                                region_pfn_type[i]>>28, i,
                                k, xpfn, nr_pfns);
                            goto out;
                        }

                        /* Keep flag bits, strip _PAGE_GLOBAL/_PAGE_PAT
                         * (not valid for guest L1 entries), splice in
                         * the machine frame. */
                        ppage[k] &= (PAGE_SIZE - 1) &
                            ~(_PAGE_GLOBAL | _PAGE_PAT);
                        ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT;
                    }
                }
            }
            break;

            case L2TAB:
            {
                /* Only entries below the hypervisor hole are guest-owned. */
                for ( k = 0;
                      k < (HYPERVISOR_VIRT_START>>L2_PAGETABLE_SHIFT);
                      k++ )
                {
                    if ( ppage[k] & _PAGE_PRESENT )
                    {
                        xpfn = ppage[k] >> PAGE_SHIFT;

                        if ( xpfn >= nr_pfns )
                        {
                            ERR("Frame number in type %lu page"
                                " table is out of range. i=%d k=%d "
                                "pfn=%lu nr_pfns=%lu",
                                region_pfn_type[i]>>28, i, k,
                                xpfn, nr_pfns);
                            goto out;
                        }

                        ppage[k] &= (PAGE_SIZE - 1) &
                            ~(_PAGE_GLOBAL | _PAGE_PSE);
                        ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT;
                    }
                }
            }
            break;

            default:
                ERR("Bogus page type %lx page table is "
                    "out of range. i=%d nr_pfns=%lu",
                    region_pfn_type[i], i, nr_pfns);
                goto out;

            } /* end of page type switch statement */

            /* In verify mode the page was read into 'buf'; compare it
             * against what is already in the guest frame and dump the
             * first differing words. */
            if ( verify )
            {
                int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE );
                if ( res )
                {
                    int v;
                    printf("************** pfn=%lx type=%lx gotcs=%08lx "
                           "actualcs=%08lx\n", pfn, pfn_type[pfn],
                           csum_page(region_base + i*PAGE_SIZE),
                           csum_page(buf));
                    for ( v = 0; v < 4; v++ )
                    {
                        unsigned long *p = (unsigned long *)
                            (region_base + i*PAGE_SIZE);
                        if ( buf[v] != p[v] )
                            printf("    %d: %08lx %08lx\n",
                                   v, buf[v], p[v] );
                    }
                }
            }

            /* Record the machine-to-physical mapping for this frame. */
            if ( add_mmu_update(xc_handle, mmu,
                                (mfn<<PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, pfn) )
            {
                printf("machpys mfn=%ld pfn=%ld\n",mfn,pfn);
                goto out;
            }

        } /* end of 'batch' for loop */

        munmap( region_base, j*PAGE_SIZE );
        n+=j; /* crude stats */
    }

    DPRINTF("Received all pages\n");

    if ( finish_mmu_updates(xc_handle, mmu) )
        goto out;

    /*
     * Pin page tables. Do this after writing to them as otherwise Xen
     * will barf when doing the type-checking.
     */
    for ( i = 0; i < nr_pfns; i++ )
    {
        if ( (pfn_type[i] & LPINTAB) == 0 )
            continue;
        if ( pfn_type[i] == (L1TAB|LPINTAB) )
            pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
        else /* pfn_type[i] == (L2TAB|LPINTAB) */
            pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
        pin[nr_pins].mfn = pfn_to_mfn_table[i];
        /* Flush a full batch of pin requests in one hypercall. */
        if ( ++nr_pins == MAX_PIN_BATCH )
        {
            if ( do_mmuext_op(xc_handle, pin, nr_pins, dom) < 0 )
                goto out;
            nr_pins = 0;
        }
    }

    /* Flush any leftover partial batch. */
    if ( (nr_pins != 0) &&
         (do_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) )
        goto out;

    DPRINTF("\b\b\b\b100%%\n");
    DPRINTF("Memory reloaded.\n");

    /* Get the list of PFNs that are not in the pseudo-phys map */
    {
        unsigned int count, *pfntab;
        int rc;   /* shadows the outer rc; local to this error path */

        if ( read_exact(io_fd, &count, sizeof(count)) != sizeof(count) )
        {
            ERR("Error when reading pfn count");
            goto out;
        }

        /* NOTE(review): pfntab is never freed — leaks on every path,
         * including success. */
        pfntab = malloc( sizeof(unsigned int) * count );
        if ( pfntab == NULL )
        {
            ERR("Out of memory");
            goto out;
        }

        if ( read_exact(io_fd, pfntab, sizeof(unsigned int)*count) !=
             sizeof(unsigned int)*count )
        {
            ERR("Error when reading pfntab");
            goto out;
        }

        /* Convert each entry to its mfn and poison the p2m slot so the
         * frame can be handed back to Xen below. */
        for ( i = 0; i < count; i++ )
        {
            unsigned long pfn = pfntab[i];
            pfntab[i]=pfn_to_mfn_table[pfn];
            pfn_to_mfn_table[pfn] = 0x80000001;  // not in pmap
        }

        if ( count > 0 )
        {
            if ( (rc = do_dom_mem_op( xc_handle,
                                      MEMOP_decrease_reservation,
                                      pfntab, count, 0, dom )) <0 )
            {
                ERR("Could not decrease reservation : %d",rc);
                goto out;
            }
            else
            {
                printf("Decreased reservation by %d pages\n", count);
            }
        }
    }

    /* Tail of the stream: saved vcpu context, then the shared-info page. */
    if ( read_exact(io_fd, &ctxt, sizeof(ctxt)) != sizeof(ctxt) ||
         read_exact(io_fd, shared_info_page, PAGE_SIZE) != PAGE_SIZE )
    {
        ERR("Error when reading ctxt or shared info page");
        goto out;
    }

    /* Uncanonicalise the suspend-record frame number and poke resume rec. */
    pfn = ctxt.user_regs.esi;
    if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
    {
        ERR("Suspend record frame number is bad");
        goto out;
    }
    ctxt.user_regs.esi = mfn = pfn_to_mfn_table[pfn];
    /* NOTE(review): mapping result is not checked for NULL before the
     * dereference below. */
    p_srec = xc_map_foreign_range(
        xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
    p_srec->resume_info.nr_pages = nr_pfns;
    p_srec->resume_info.shared_info = shared_info_frame << PAGE_SHIFT;
    p_srec->resume_info.flags = 0;
    *store_mfn = p_srec->resume_info.store_mfn =
        pfn_to_mfn_table[p_srec->resume_info.store_mfn];
    p_srec->resume_info.store_evtchn = store_evtchn;
    munmap(p_srec, PAGE_SIZE);

    /* Uncanonicalise each GDT frame number. */
    if ( ctxt.gdt_ents > 8192 )
    {
        ERR("GDT entry count out of range");
        goto out;
    }

    /* One frame holds 512 descriptors, hence the stride of 512. */
    /* NOTE(review): indexing gdt_frames[i] with i stepping by 512 looks
     * wrong — presumably gdt_frames[i/512] was intended; for more than
     * 512 entries this reads past the frame array.  Verify against the
     * save-side encoding. */
    for ( i = 0; i < ctxt.gdt_ents; i += 512 )
    {
        pfn = ctxt.gdt_frames[i];
        if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
        {
            ERR("GDT frame number is bad");
            goto out;
        }
        ctxt.gdt_frames[i] = pfn_to_mfn_table[pfn];
    }

    /* Uncanonicalise the page table base pointer. */
    pfn = ctxt.ctrlreg[3] >> PAGE_SHIFT;
    if ( (pfn >= nr_pfns) || ((pfn_type[pfn]&LTABTYPE_MASK) != L2TAB) )
    {
        printf("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
               pfn, nr_pfns, pfn_type[pfn], (unsigned long)L2TAB);
        ERR("PT base is bad.");
        goto out;
    }
    ctxt.ctrlreg[3] = pfn_to_mfn_table[pfn] << PAGE_SHIFT;

    /* clear any pending events and the selector */
    memset(&(shared_info->evtchn_pending[0]), 0,
           sizeof (shared_info->evtchn_pending));
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        shared_info->vcpu_data[i].evtchn_pending_sel = 0;

    /* Copy saved contents of shared-info page. No checking needed. */
    /* NOTE(review): mapping result is not checked for NULL here either. */
    ppage = xc_map_foreign_range(
        xc_handle, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);
    memcpy(ppage, shared_info, sizeof(shared_info_t));
    munmap(ppage, PAGE_SIZE);

    /* Uncanonicalise the pfn-to-mfn table frame-number list. */
    for ( i = 0; i < (nr_pfns+1023)/1024; i++ )
    {
        unsigned long pfn, mfn;  /* shadow the outer pfn/mfn */

        pfn = pfn_to_mfn_frame_list[i];
        if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
        {
            ERR("PFN-to-MFN frame number is bad");
            goto out;
        }
        mfn = pfn_to_mfn_table[pfn];
        pfn_to_mfn_frame_list[i] = mfn;
    }

    /* Map the guest's own copy of the p2m table and fill it in. */
    if ( (live_pfn_to_mfn_table =
          xc_map_foreign_batch(xc_handle, dom,
                               PROT_WRITE,
                               pfn_to_mfn_frame_list,
                               (nr_pfns+1023)/1024 )) == 0 )
    {
        ERR("Couldn't map pfn_to_mfn table");
        goto out;
    }

    memcpy(live_pfn_to_mfn_table, pfn_to_mfn_table,
           nr_pfns*sizeof(unsigned long) );

    munmap(live_pfn_to_mfn_table, ((nr_pfns+1023)/1024)*PAGE_SIZE);

    /*
     * Safety checking of saved context:
     *  1. user_regs is fine, as Xen checks that on context switch.
     *  2. fpu_ctxt is fine, as it can't hurt Xen.
     *  3. trap_ctxt needs the code selectors checked.
     *  4. ldt base must be page-aligned, no more than 8192 ents, ...
     *  5. gdt already done, and further checking is done by Xen.
     *  6. check that kernel_ss is safe.
     *  7. pt_base is already done.
     *  8. debugregs are checked by Xen.
     *  9. callback code selectors need checking.
     */
    for ( i = 0; i < 256; i++ )
    {
        ctxt.trap_ctxt[i].vector = i;
        /* RPL 0 selectors are not allowed from a guest: force the flat
         * kernel code selector instead. */
        if ( (ctxt.trap_ctxt[i].cs & 3) == 0 )
            ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS;
    }
    if ( (ctxt.kernel_ss & 3) == 0 )
        ctxt.kernel_ss = FLAT_KERNEL_DS;
#if defined(__i386__)
    if ( (ctxt.event_callback_cs & 3) == 0 )
        ctxt.event_callback_cs = FLAT_KERNEL_CS;
    if ( (ctxt.failsafe_callback_cs & 3) == 0 )
        ctxt.failsafe_callback_cs = FLAT_KERNEL_CS;
#endif
    if ( ((ctxt.ldt_base & (PAGE_SIZE - 1)) != 0) ||
         (ctxt.ldt_ents > 8192) ||
         (ctxt.ldt_base > HYPERVISOR_VIRT_START) ||
         ((ctxt.ldt_base + ctxt.ldt_ents*8) > HYPERVISOR_VIRT_START) )
    {
        ERR("Bad LDT base or size");
        goto out;
    }

    DPRINTF("Domain ready to be built.\n");

    op.cmd = DOM0_SETDOMAININFO;
    op.u.setdomaininfo.domain = (domid_t)dom;
    op.u.setdomaininfo.vcpu = 0;
    op.u.setdomaininfo.ctxt = &ctxt;
    rc = do_dom0_op(xc_handle, &op);

    if ( rc != 0 )
    {
        ERR("Couldn't build the domain");
        goto out;
    }

    DPRINTF("Domain ready to be unpaused\n");
    op.cmd = DOM0_UNPAUSEDOMAIN;
    op.u.unpausedomain.domain = (domid_t)dom;
    rc = do_dom0_op(xc_handle, &op);
    if (rc == 0) {
        /* Success: print the domain id. */
        DPRINTF("DOM=%u\n", dom);
        return 0;
    }

 out:
    /* On any failure, tear down the half-built domain (never dom0). */
    if ( (rc != 0) && (dom != 0) )
        xc_domain_destroy(xc_handle, dom);
    free(mmu);
    free(pfn_to_mfn_table);
    free(pfn_type);

    DPRINTF("Restore exit with rc=%d\n", rc);
    return rc;
}