ia64/xen-unstable

view tools/libxc/xc_linux_restore.c @ 6708:aa0990ef260f

merge
author iap10@freefall.cl.cam.ac.uk
date Thu Sep 08 17:42:49 2005 +0000 (2005-09-08)
parents 3bde4219c681 12ff9c954ace
children 2704a88c3295 cdfa7dd00c44
line source
1 /******************************************************************************
2 * xc_linux_restore.c
3 *
4 * Restore the state of a Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <stdlib.h>
10 #include <unistd.h>
11 #include "xg_private.h"
12 #include <xenctrl.h>
13 #include <xen/memory.h>
15 #define MAX_BATCH_SIZE 1024
17 #define DEBUG 0
19 #if 1
20 #define ERR(_f, _a...) do { fprintf ( stderr, _f , ## _a ); fflush(stderr); } while(0)
21 #else
22 #define ERR(_f, _a...) ((void)0)
23 #endif
25 #if DEBUG
26 #define DPRINTF(_f, _a...) do { fprintf ( stdout, _f , ## _a ); fflush(stdout); } while (0)
27 #else
28 #define DPRINTF(_f, _a...) ((void)0)
29 #endif
31 #define PROGRESS 0
32 #if PROGRESS
33 #define PPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a ); fflush(stderr)
34 #else
35 #define PPRINTF(_f, _a...)
36 #endif
/*
 * Read exactly 'count' bytes from 'fd', retrying after short reads.
 *
 * Returns the number of bytes actually read: 'count' on success, or a
 * smaller value if EOF (read() == 0) or a read error (read() < 0) was
 * hit first.  Callers detect failure by comparing the result against
 * the requested size.
 *
 * Fixed: the accumulator and chunk size were plain 'int', so the loop
 * condition compared signed against unsigned (size_t) and the function
 * misbehaved for counts above INT_MAX; use size_t/ssize_t throughout.
 */
static ssize_t
read_exact(int fd, void *buf, size_t count)
{
    size_t r = 0;               /* bytes accumulated so far */
    unsigned char *b = buf;

    while (r < count) {
        ssize_t s = read(fd, &b[r], count - r);
        if (s <= 0)
            break;              /* EOF or error: return short count */
        r += (size_t)s;
    }

    return r;
}
/*
 * Restore a saved Linux guest into an already-created domain.
 *
 * Reads the save image from 'io_fd' (frame list, page batches, suspend
 * record, shared-info page), rebuilds the guest's memory image with
 * machine frames allocated by Xen, uncanonicalises all page tables and
 * frame references from pseudo-physical to machine frame numbers, pins
 * the page tables, installs the vCPU context, and unpauses the domain.
 *
 * Parameters:
 *   xc_handle      - open handle to the Xen control interface.
 *   io_fd          - fd the save image is read from.
 *   dom            - id of the target domain (already created, paused).
 *   nr_pfns        - number of pseudo-physical frames in the image.
 *   store_evtchn   - event channel to wire into the xenstore ring.
 *   store_mfn      - OUT: machine frame of the xenstore ring page.
 *   console_evtchn - event channel to wire into the console ring.
 *   console_mfn    - OUT: machine frame of the console ring page.
 *
 * Returns 0 on success.  On failure returns non-zero and destroys the
 * partially-built domain (unless dom == 0).
 */
int xc_linux_restore(int xc_handle, int io_fd, u32 dom, unsigned long nr_pfns,
                     unsigned int store_evtchn, unsigned long *store_mfn,
                     unsigned int console_evtchn, unsigned long *console_mfn)
{
    dom0_op_t op;
    int rc = 1, i, n, k;
    unsigned long mfn, pfn, xpfn;
    unsigned int prev_pc, this_pc;
    int verify = 0;             /* set when the stream requests page verification */
    int err;

    /* The new domain's shared-info frame number. */
    unsigned long shared_info_frame;
    unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
    shared_info_t *shared_info = (shared_info_t *)shared_info_page;

    /* A copy of the CPU context of the guest. */
    vcpu_guest_context_t ctxt;

    /* A table containing the type of each PFN (/not/ MFN!). */
    unsigned long *pfn_type = NULL;

    /* A table of MFNs to map in the current region */
    unsigned long *region_mfn = NULL;

    /* A temporary mapping, and a copy, of one frame of guest memory. */
    unsigned long *ppage = NULL;

    /* A copy of the pfn-to-mfn table frame list. */
    unsigned long pfn_to_mfn_frame_list[1024];

    /* A table mapping each PFN to its new MFN. */
    unsigned long *pfn_to_mfn_table = NULL;

    /* used by mapper for updating the domain's copy of the table */
    unsigned long *live_pfn_to_mfn_table = NULL;

    /* A temporary mapping of the guest's start_info page. */
    start_info_t *start_info;

    char *region_base;

    xc_mmu_t *mmu = NULL;

    /* used by debug verify code: holds the streamed copy of one page */
    unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];

#define MAX_PIN_BATCH 1024
    struct mmuext_op pin[MAX_PIN_BATCH];
    unsigned int nr_pins = 0;

    DPRINTF("xc_linux_restore start: nr_pfns = %lx\n", nr_pfns);

    if (mlock(&ctxt, sizeof(ctxt))) {
        /* needed for when we do the build dom0 op,
           but might as well do early */
        ERR("Unable to mlock ctxt");
        return 1;
    }

    /* First item in the stream: the canonical pfn-to-mfn frame list. */
    if (read_exact(io_fd, pfn_to_mfn_frame_list, PAGE_SIZE) != PAGE_SIZE) {
        ERR("read pfn_to_mfn_frame_list failed");
        goto out;
    }

    /* We want zeroed memory so use calloc rather than malloc. */
    /* NOTE(review): element size is hard-coded as 4, i.e. this assumes
     * sizeof(unsigned long) == 4 (32-bit x86 build) -- confirm before
     * porting; sizeof *pfn_to_mfn_table would be safer. */
    pfn_to_mfn_table = calloc(4, nr_pfns);
    pfn_type = calloc(4, nr_pfns);
    region_mfn = calloc(4, MAX_BATCH_SIZE);

    if ((pfn_to_mfn_table == NULL) || (pfn_type == NULL) ||
        (region_mfn == NULL)) {
        ERR("memory alloc failed");
        errno = ENOMEM;
        goto out;
    }

    if (mlock(region_mfn, 4 * MAX_BATCH_SIZE)) {
        ERR("Could not mlock region_mfn");
        goto out;
    }

    /* Get the domain's shared-info frame. */
    op.cmd = DOM0_GETDOMAININFO;
    op.u.getdomaininfo.domain = (domid_t)dom;
    if (xc_dom0_op(xc_handle, &op) < 0) {
        ERR("Could not get information on new domain");
        goto out;
    }
    shared_info_frame = op.u.getdomaininfo.shared_info_frame;

    /* Cap the domain's allocation; the argument is in KiB. */
    err = xc_domain_setmaxmem(xc_handle, dom, nr_pfns * PAGE_SIZE / 1024);
    if (err != 0) {
        errno = ENOMEM;
        goto out;
    }

    /* Ask Xen to populate the domain with nr_pfns machine frames. */
    err = xc_domain_memory_increase_reservation(xc_handle, dom,
                                                nr_pfns, 0, 0, NULL);
    if (err != 0) {
        ERR("Failed to increase reservation by %lx\n",
            nr_pfns * PAGE_SIZE / 1024);
        errno = ENOMEM;
        goto out;
    }

    /* Build the pfn-to-mfn table. We choose MFN ordering returned by Xen. */
    if (xc_get_pfn_list(xc_handle, dom, pfn_to_mfn_table, nr_pfns) !=
        nr_pfns) {
        ERR("Did not read correct number of frame numbers for new dom");
        goto out;
    }

    mmu = xc_init_mmu_updates(xc_handle, dom);
    if (mmu == NULL) {
        ERR("Could not initialise for MMU updates");
        goto out;
    }

    DPRINTF("Reloading memory pages:   0%%");

    /*
     * Now simply read each saved frame into its new machine frame.
     * We uncanonicalise page tables as we go.
     */
    prev_pc = 0;

    n = 0;
    while ( 1 )
    {
        int j;                  /* batch size read from the stream */
        unsigned long region_pfn_type[MAX_BATCH_SIZE];

        this_pc = (n * 100) / nr_pfns;
        if ( (this_pc - prev_pc) >= 5 )
        {
            PPRINTF("\b\b\b\b%3d%%", this_pc);
            prev_pc = this_pc;
        }

        if ( read_exact(io_fd, &j, sizeof(int)) != sizeof(int) )
        {
            ERR("Error when reading batch size");
            goto out;
        }

        PPRINTF("batch %d\n",j);

        /* Sentinel -1: remaining batches carry verification copies. */
        if ( j == -1 )
        {
            verify = 1;
            printf("Entering page verify mode\n");
            continue;
        }

        /* Sentinel 0: end of the page stream. */
        if ( j == 0 )
            break;  /* our work here is done */

        if ( j > MAX_BATCH_SIZE )
        {
            ERR("Max batch size exceeded. Giving up.");
            goto out;
        }

        /* Per-batch header: j (pfn | type-bits) words. */
        if ( read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long)) !=
             j*sizeof(unsigned long) ) {
            ERR("Error when reading region pfn types");
            goto out;
        }

        /* Translate each batch entry's pfn to the mfn we will map. */
        for ( i = 0; i < j; i++ )
        {
            if ( (region_pfn_type[i] & LTAB_MASK) == XTAB)
            {
                region_mfn[i] = 0; /* we know map will fail, but don't care */
            }
            else
            {
                pfn = region_pfn_type[i] & ~LTAB_MASK;
                region_mfn[i] = pfn_to_mfn_table[pfn];
            }
        }

        if ( (region_base = xc_map_foreign_batch( xc_handle, dom,
                                                  PROT_WRITE,
                                                  region_mfn,
                                                  j )) == 0 )
        {
            ERR("map batch failed");
            goto out;
        }

        /* NOTE(review): every 'goto out' below leaks the region_base
         * mapping for this batch (munmap only happens at loop bottom);
         * benign for a process that exits on failure, but worth noting. */
        for ( i = 0; i < j; i++ )
        {
            unsigned long *ppage;   /* shadows the file-scope-of-function ppage */

            pfn = region_pfn_type[i] & ~LTAB_MASK;

            /* XTAB entries have no backing page in the stream. */
            if ( (region_pfn_type[i] & LTAB_MASK) == XTAB) continue;

            /* NOTE(review): looks like an off-by-one -- pfn == nr_pfns
             * passes this check but indexes one past the tables; the
             * comparison should presumably be '>='. */
            if (pfn>nr_pfns)
            {
                ERR("pfn out of range");
                goto out;
            }

            region_pfn_type[i] &= LTAB_MASK;

            pfn_type[pfn] = region_pfn_type[i];

            mfn = pfn_to_mfn_table[pfn];

            /* In verify mode the page goes to a scratch buffer so it can
             * be compared against the copy already in guest memory. */
            if ( verify )
                ppage = (unsigned long*) buf;  /* debug case */
            else
                ppage = (unsigned long*) (region_base + i*PAGE_SIZE);

            if ( read_exact(io_fd, ppage, PAGE_SIZE) != PAGE_SIZE )
            {
                ERR("Error when reading pagetable page");
                goto out;
            }

            /* Uncanonicalise page-table pages: rewrite each present PTE
             * from (pfn << PAGE_SHIFT) to (mfn << PAGE_SHIFT). */
            switch( region_pfn_type[i] & LTABTYPE_MASK )
            {
            case 0:
                /* Plain data page: nothing to translate. */
                break;

            case L1TAB:
            {
                for ( k = 0; k < 1024; k++ )
                {
                    if ( ppage[k] & _PAGE_PRESENT )
                    {
                        xpfn = ppage[k] >> PAGE_SHIFT;
                        if ( xpfn >= nr_pfns )
                        {
                            ERR("Frame number in type %lu page "
                                "table is out of range. i=%d k=%d "
                                "pfn=0x%lx nr_pfns=%lu",
                                region_pfn_type[i]>>28, i,
                                k, xpfn, nr_pfns);
                            goto out;
                        }

                        /* Keep flag bits, drop GLOBAL/PAT (not valid for
                         * the guest), then splice in the machine frame. */
                        ppage[k] &= (PAGE_SIZE - 1) &
                            ~(_PAGE_GLOBAL | _PAGE_PAT);
                        ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT;
                    }
                }
            }
            break;

            case L2TAB:
            {
                /* Only guest-visible slots; entries at and above
                 * HYPERVISOR_VIRT_START belong to Xen. */
                for ( k = 0;
                      k < (HYPERVISOR_VIRT_START>>L2_PAGETABLE_SHIFT);
                      k++ )
                {
                    if ( ppage[k] & _PAGE_PRESENT )
                    {
                        xpfn = ppage[k] >> PAGE_SHIFT;

                        if ( xpfn >= nr_pfns )
                        {
                            ERR("Frame number in type %lu page"
                                " table is out of range. i=%d k=%d "
                                "pfn=%lu nr_pfns=%lu",
                                region_pfn_type[i]>>28, i, k,
                                xpfn, nr_pfns);
                            goto out;
                        }

                        ppage[k] &= (PAGE_SIZE - 1) &
                            ~(_PAGE_GLOBAL | _PAGE_PSE);
                        ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT;
                    }
                }
            }
            break;

            default:
                ERR("Bogus page type %lx page table is "
                    "out of range. i=%d nr_pfns=%lu",
                    region_pfn_type[i], i, nr_pfns);
                goto out;

            } /* end of page type switch statement */

            /* Verify mode: compare the streamed copy against what is
             * already in the mapped guest page and dump differences. */
            if ( verify )
            {
                int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE );
                if ( res )
                {
                    int v;

                    printf("************** pfn=%lx type=%lx gotcs=%08lx "
                           "actualcs=%08lx\n", pfn, pfn_type[pfn],
                           csum_page(region_base + i*PAGE_SIZE),
                           csum_page(buf));

                    for ( v = 0; v < 4; v++ )
                    {
                        unsigned long *p = (unsigned long *)
                            (region_base + i*PAGE_SIZE);
                        if ( buf[v] != p[v] )
                            printf(" %d: %08lx %08lx\n",
                                   v, buf[v], p[v] );
                    }
                }
            }

            /* Queue the machine-to-phys table update for this frame. */
            if ( xc_add_mmu_update(xc_handle, mmu,
                                   (mfn<<PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
                                   pfn) )
            {
                printf("machpys mfn=%ld pfn=%ld\n",mfn,pfn);
                goto out;
            }

        } /* end of 'batch' for loop */

        munmap( region_base, j*PAGE_SIZE );
        n+=j; /* crude stats */
    }

    DPRINTF("Received all pages\n");

    /* Flush any MMU updates still queued in the mmu batch buffer. */
    if ( xc_finish_mmu_updates(xc_handle, mmu) )
        goto out;

    /*
     * Pin page tables. Do this after writing to them as otherwise Xen
     * will barf when doing the type-checking.
     */
    for ( i = 0; i < nr_pfns; i++ )
    {
        if ( (pfn_type[i] & LPINTAB) == 0 )
            continue;
        if ( pfn_type[i] == (L1TAB|LPINTAB) )
            pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
        else /* pfn_type[i] == (L2TAB|LPINTAB) */
            pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
        pin[nr_pins].mfn = pfn_to_mfn_table[i];
        /* Issue the hypercall whenever the batch buffer fills. */
        if ( ++nr_pins == MAX_PIN_BATCH )
        {
            if ( xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0 )
                goto out;
            nr_pins = 0;
        }
    }

    /* Flush the final, partial pin batch. */
    if ( (nr_pins != 0) &&
         (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) )
        goto out;

    DPRINTF("\b\b\b\b100%%\n");
    DPRINTF("Memory reloaded.\n");

    /* Get the list of PFNs that are not in the pseudo-phys map */
    {
        unsigned int count;
        unsigned long *pfntab;
        int rc;                 /* shadows the outer rc deliberately(?) --
                                 * failures here rely on the outer rc==1 */

        if ( read_exact(io_fd, &count, sizeof(count)) != sizeof(count) )
        {
            ERR("Error when reading pfn count");
            goto out;
        }

        /* NOTE(review): allocates count * sizeof(unsigned int) bytes but
         * the loop below stores unsigned long values into pfntab -- only
         * safe where sizeof(long) == sizeof(int) (32-bit build); confirm.
         * pfntab is also never freed (leaks on every path). */
        pfntab = malloc( sizeof(unsigned int) * count );
        if ( pfntab == NULL )
        {
            ERR("Out of memory");
            goto out;
        }

        if ( read_exact(io_fd, pfntab, sizeof(unsigned int)*count) !=
             sizeof(unsigned int)*count )
        {
            ERR("Error when reading pfntab");
            goto out;
        }

        /* Convert each dead pfn to its mfn (for the hypercall below) and
         * poison its p2m entry so the guest sees it as absent. */
        for ( i = 0; i < count; i++ )
        {
            unsigned long pfn = pfntab[i];
            pfntab[i]=pfn_to_mfn_table[pfn];
            pfn_to_mfn_table[pfn] = 0x80000001;  // not in pmap
        }

        /* Hand the unused frames back to Xen. */
        if ( count > 0 )
        {
            struct xen_memory_reservation reservation = {
                .extent_start = pfntab,
                .nr_extents   = count,
                .extent_order = 0,
                .domid        = dom
            };
            if ( (rc = xc_memory_op(xc_handle,
                                    XENMEM_decrease_reservation,
                                    &reservation)) != count )
            {
                ERR("Could not decrease reservation : %d",rc);
                goto out;
            }
            else
            {
                printf("Decreased reservation by %d pages\n", count);
            }
        }
    }

    /* Tail of the stream: vCPU context, then the saved shared-info page. */
    if ( read_exact(io_fd, &ctxt, sizeof(ctxt)) != sizeof(ctxt) ||
         read_exact(io_fd, shared_info_page, PAGE_SIZE) != PAGE_SIZE )
    {
        ERR("Error when reading ctxt or shared info page");
        goto out;
    }

    /* Uncanonicalise the suspend-record frame number and poke resume rec. */
    pfn = ctxt.user_regs.esi;   /* esi carries the suspend-record pfn */
    if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
    {
        ERR("Suspend record frame number is bad");
        goto out;
    }
    ctxt.user_regs.esi = mfn = pfn_to_mfn_table[pfn];
    /* Rewrite the guest's start_info page with this host's details. */
    start_info = xc_map_foreign_range(
        xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
    start_info->nr_pages = nr_pfns;
    start_info->shared_info = shared_info_frame << PAGE_SHIFT;
    start_info->flags = 0;
    *store_mfn = start_info->store_mfn =
        pfn_to_mfn_table[start_info->store_mfn];
    start_info->store_evtchn = store_evtchn;
    *console_mfn = start_info->console_mfn =
        pfn_to_mfn_table[start_info->console_mfn];
    start_info->console_evtchn = console_evtchn;
    munmap(start_info, PAGE_SIZE);

    /* Uncanonicalise each GDT frame number. */
    if ( ctxt.gdt_ents > 8192 )
    {
        ERR("GDT entry count out of range");
        goto out;
    }

    /* One frame per 512 GDT entries. */
    for ( i = 0; i < ctxt.gdt_ents; i += 512 )
    {
        /* NOTE(review): indexing gdt_frames[i] with a counter that steps
         * by 512 looks wrong for >512 entries -- presumably gdt_frames[i/512]
         * was intended; verify against the save-side encoding. */
        pfn = ctxt.gdt_frames[i];
        if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
        {
            ERR("GDT frame number is bad");
            goto out;
        }
        ctxt.gdt_frames[i] = pfn_to_mfn_table[pfn];
    }

    /* Uncanonicalise the page table base pointer. */
    pfn = ctxt.ctrlreg[3] >> PAGE_SHIFT;
    if ( (pfn >= nr_pfns) || ((pfn_type[pfn]&LTABTYPE_MASK) != L2TAB) )
    {
        printf("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
               pfn, nr_pfns, pfn_type[pfn], (unsigned long)L2TAB);
        ERR("PT base is bad.");
        goto out;
    }
    ctxt.ctrlreg[3] = pfn_to_mfn_table[pfn] << PAGE_SHIFT;

    /* clear any pending events and the selector */
    memset(&(shared_info->evtchn_pending[0]), 0,
           sizeof (shared_info->evtchn_pending));
    for ( i = 0; i < MAX_VIRT_CPUS; i++ )
        shared_info->vcpu_data[i].evtchn_pending_sel = 0;

    /* Copy saved contents of shared-info page. No checking needed. */
    ppage = xc_map_foreign_range(
        xc_handle, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);
    memcpy(ppage, shared_info, sizeof(shared_info_t));
    munmap(ppage, PAGE_SIZE);

    /* Uncanonicalise the pfn-to-mfn table frame-number list. */
    for ( i = 0; i < (nr_pfns+1023)/1024; i++ )
    {
        unsigned long pfn, mfn;     /* shadow the outer pfn/mfn locals */

        pfn = pfn_to_mfn_frame_list[i];
        if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
        {
            ERR("PFN-to-MFN frame number is bad");
            goto out;
        }
        mfn = pfn_to_mfn_table[pfn];
        pfn_to_mfn_frame_list[i] = mfn;
    }

    /* Map the guest's live p2m table and copy our rebuilt table in. */
    if ( (live_pfn_to_mfn_table =
          xc_map_foreign_batch(xc_handle, dom,
                               PROT_WRITE,
                               pfn_to_mfn_frame_list,
                               (nr_pfns+1023)/1024 )) == 0 )
    {
        ERR("Couldn't map pfn_to_mfn table");
        goto out;
    }

    memcpy(live_pfn_to_mfn_table, pfn_to_mfn_table,
           nr_pfns*sizeof(unsigned long) );

    munmap(live_pfn_to_mfn_table, ((nr_pfns+1023)/1024)*PAGE_SIZE);

    /*
     * Safety checking of saved context:
     *  1. user_regs is fine, as Xen checks that on context switch.
     *  2. fpu_ctxt is fine, as it can't hurt Xen.
     *  3. trap_ctxt needs the code selectors checked.
     *  4. ldt base must be page-aligned, no more than 8192 ents, ...
     *  5. gdt already done, and further checking is done by Xen.
     *  6. check that kernel_ss is safe.
     *  7. pt_base is already done.
     *  8. debugregs are checked by Xen.
     *  9. callback code selectors need checking.
     */
    for ( i = 0; i < 256; i++ )
    {
        ctxt.trap_ctxt[i].vector = i;
        /* Force ring-0 code selectors to the flat kernel selector. */
        if ( (ctxt.trap_ctxt[i].cs & 3) == 0 )
            ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS;
    }
    if ( (ctxt.kernel_ss & 3) == 0 )
        ctxt.kernel_ss = FLAT_KERNEL_DS;
#if defined(__i386__)
    if ( (ctxt.event_callback_cs & 3) == 0 )
        ctxt.event_callback_cs = FLAT_KERNEL_CS;
    if ( (ctxt.failsafe_callback_cs & 3) == 0 )
        ctxt.failsafe_callback_cs = FLAT_KERNEL_CS;
#endif
    if ( ((ctxt.ldt_base & (PAGE_SIZE - 1)) != 0) ||
         (ctxt.ldt_ents > 8192) ||
         (ctxt.ldt_base > HYPERVISOR_VIRT_START) ||
         ((ctxt.ldt_base + ctxt.ldt_ents*8) > HYPERVISOR_VIRT_START) )
    {
        ERR("Bad LDT base or size");
        goto out;
    }

    DPRINTF("Domain ready to be built.\n");

    /* Install the (now fully uncanonicalised) vCPU context. */
    op.cmd = DOM0_SETDOMAININFO;
    op.u.setdomaininfo.domain = (domid_t)dom;
    op.u.setdomaininfo.vcpu = 0;
    op.u.setdomaininfo.ctxt = &ctxt;
    rc = xc_dom0_op(xc_handle, &op);

    if ( rc != 0 )
    {
        ERR("Couldn't build the domain");
        goto out;
    }

    DPRINTF("Domain ready to be unpaused\n");
    op.cmd = DOM0_UNPAUSEDOMAIN;
    op.u.unpausedomain.domain = (domid_t)dom;
    rc = xc_dom0_op(xc_handle, &op);
    if (rc == 0) {
        /* Success: print the domain id.
         * NOTE(review): the early return skips the free()s below; the
         * allocations die with the process, but it is asymmetric. */
        DPRINTF("DOM=%u\n", dom);
        return 0;
    }

    /* Failure path: tear down the half-built domain and release memory. */
 out:
    if ( (rc != 0) && (dom != 0) )
        xc_domain_destroy(xc_handle, dom);
    free(mmu);
    free(pfn_to_mfn_table);
    free(pfn_type);

    DPRINTF("Restore exit with rc=%d\n", rc);
    return rc;
}