ia64/xen-unstable

view tools/libxc/xc_linux_restore.c @ 6700:12ff9c954ace

Give each domain some memory below 4GB. This solves the "PGD's must be below 4GB" problem for the initial page tables. I'm not sure we'll stick with this approach, but this is good enough for the time being.

PAE should be a *lot* more robust on systems that actually have more than 4GB thanks to all the various patches that went in today. I find it astounding that it ever appeared to work at all!

Signed-off-by: ian@xensource.com
author iap10@freefall.cl.cam.ac.uk
date Thu Sep 08 01:07:15 2005 +0000 (2005-09-08)
parents 1f460d0fd6c6
children 5db85ba1c4e0 3bde4219c681 aa0990ef260f
line source
1 /******************************************************************************
2 * xc_linux_restore.c
3 *
4 * Restore the state of a Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <stdlib.h>
10 #include <unistd.h>
11 #include "xg_private.h"
12 #include <xenctrl.h>
13 #include <xen/memory.h>
/* Upper bound on the number of pages accepted in a single batch. */
#define MAX_BATCH_SIZE 1024

/* Set to 1 to enable DPRINTF tracing on stdout. */
#define DEBUG 0

/* Error reporting: always enabled, written to stderr and flushed. */
#if 1
#define ERR(_f, _a...) do { fprintf ( stderr, _f , ## _a ); fflush(stderr); } while(0)
#else
#define ERR(_f, _a...) ((void)0)
#endif

/* Debug tracing: compiled out unless DEBUG is non-zero. */
#if DEBUG
#define DPRINTF(_f, _a...) do { fprintf ( stdout, _f , ## _a ); fflush(stdout); } while (0)
#else
#define DPRINTF(_f, _a...) ((void)0)
#endif

/*
 * Progress reporting: compiled out unless PROGRESS is non-zero.
 *
 * Both variants expand to exactly one statement so that PPRINTF can be
 * used safely as the body of an unbraced if/else.
 */
#define PROGRESS 0
#if PROGRESS
#define PPRINTF(_f, _a...) do { fprintf ( stderr, _f , ## _a ); fflush(stderr); } while (0)
#else
#define PPRINTF(_f, _a...) ((void)0)
#endif
/*
 * Read up to 'count' bytes from 'fd' into 'buf', looping over short reads.
 *
 * A read() interrupted by a signal (EINTR) is retried. The loop stops early
 * on end-of-file or on any other read error.
 *
 * Returns the number of bytes actually stored in 'buf'; callers compare the
 * result against 'count' to detect a truncated or failed read.
 */
static ssize_t
read_exact(int fd, void *buf, size_t count)
{
    unsigned char *dst = buf;
    size_t done = 0;

    while ( done < count )
    {
        ssize_t got = read(fd, dst + done, count - done);

        /* Interrupted before any data arrived: just try again. */
        if ( (got < 0) && (errno == EINTR) )
            continue;

        /* EOF or a real error: report what we managed to read so far. */
        if ( got <= 0 )
            break;

        done += (size_t)got;
    }

    return (ssize_t)done;
}
54 int xc_linux_restore(int xc_handle, int io_fd, u32 dom, unsigned long nr_pfns,
55 unsigned int store_evtchn, unsigned long *store_mfn,
56 unsigned int console_evtchn, unsigned long *console_mfn)
57 {
58 dom0_op_t op;
59 int rc = 1, i, n, k;
60 unsigned long mfn, pfn, xpfn;
61 unsigned int prev_pc, this_pc;
62 int verify = 0;
63 int err;
65 /* The new domain's shared-info frame number. */
66 unsigned long shared_info_frame;
67 unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
68 shared_info_t *shared_info = (shared_info_t *)shared_info_page;
70 /* A copy of the CPU context of the guest. */
71 vcpu_guest_context_t ctxt;
73 /* A table containg the type of each PFN (/not/ MFN!). */
74 unsigned long *pfn_type = NULL;
76 /* A table of MFNs to map in the current region */
77 unsigned long *region_mfn = NULL;
79 /* A temporary mapping, and a copy, of one frame of guest memory. */
80 unsigned long *ppage = NULL;
82 /* A copy of the pfn-to-mfn table frame list. */
83 unsigned long pfn_to_mfn_frame_list[1024];
85 /* A table mapping each PFN to its new MFN. */
86 unsigned long *pfn_to_mfn_table = NULL;
88 /* used by mapper for updating the domain's copy of the table */
89 unsigned long *live_pfn_to_mfn_table = NULL;
91 /* A temporary mapping of the guest's start_info page. */
92 start_info_t *start_info;
94 char *region_base;
96 xc_mmu_t *mmu = NULL;
98 /* used by debug verify code */
99 unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
101 #define MAX_PIN_BATCH 1024
102 struct mmuext_op pin[MAX_PIN_BATCH];
103 unsigned int nr_pins = 0;
105 DPRINTF("xc_linux_restore start: nr_pfns = %lx\n", nr_pfns);
107 if (mlock(&ctxt, sizeof(ctxt))) {
108 /* needed for when we do the build dom0 op,
109 but might as well do early */
110 ERR("Unable to mlock ctxt");
111 return 1;
112 }
114 if (read_exact(io_fd, pfn_to_mfn_frame_list, PAGE_SIZE) != PAGE_SIZE) {
115 ERR("read pfn_to_mfn_frame_list failed");
116 goto out;
117 }
119 /* We want zeroed memory so use calloc rather than malloc. */
120 pfn_to_mfn_table = calloc(4, nr_pfns);
121 pfn_type = calloc(4, nr_pfns);
122 region_mfn = calloc(4, MAX_BATCH_SIZE);
124 if ((pfn_to_mfn_table == NULL) || (pfn_type == NULL) ||
125 (region_mfn == NULL)) {
126 ERR("memory alloc failed");
127 errno = ENOMEM;
128 goto out;
129 }
131 if (mlock(region_mfn, 4 * MAX_BATCH_SIZE)) {
132 ERR("Could not mlock region_mfn");
133 goto out;
134 }
136 /* Get the domain's shared-info frame. */
137 op.cmd = DOM0_GETDOMAININFO;
138 op.u.getdomaininfo.domain = (domid_t)dom;
139 if (xc_dom0_op(xc_handle, &op) < 0) {
140 ERR("Could not get information on new domain");
141 goto out;
142 }
143 shared_info_frame = op.u.getdomaininfo.shared_info_frame;
145 err = xc_domain_setmaxmem(xc_handle, dom, nr_pfns * PAGE_SIZE / 1024);
146 if (err != 0) {
147 errno = ENOMEM;
148 goto out;
149 }
151 err = xc_domain_memory_increase_reservation(xc_handle, dom,
152 nr_pfns * PAGE_SIZE / 1024, 0, 0); //FIX ME
153 if (err != 0) {
154 ERR("Failed to increase reservation by %lx\n",
155 nr_pfns * PAGE_SIZE / 1024);
156 errno = ENOMEM;
157 goto out;
158 }
160 /* Build the pfn-to-mfn table. We choose MFN ordering returned by Xen. */
161 if (xc_get_pfn_list(xc_handle, dom, pfn_to_mfn_table, nr_pfns) !=
162 nr_pfns) {
163 ERR("Did not read correct number of frame numbers for new dom");
164 goto out;
165 }
167 mmu = xc_init_mmu_updates(xc_handle, dom);
168 if (mmu == NULL) {
169 ERR("Could not initialise for MMU updates");
170 goto out;
171 }
173 DPRINTF("Reloading memory pages: 0%%");
175 /*
176 * Now simply read each saved frame into its new machine frame.
177 * We uncanonicalise page tables as we go.
178 */
179 prev_pc = 0;
181 n = 0;
182 while ( 1 )
183 {
184 int j;
185 unsigned long region_pfn_type[MAX_BATCH_SIZE];
187 this_pc = (n * 100) / nr_pfns;
188 if ( (this_pc - prev_pc) >= 5 )
189 {
190 PPRINTF("\b\b\b\b%3d%%", this_pc);
191 prev_pc = this_pc;
192 }
194 if ( read_exact(io_fd, &j, sizeof(int)) != sizeof(int) )
195 {
196 ERR("Error when reading batch size");
197 goto out;
198 }
200 PPRINTF("batch %d\n",j);
202 if ( j == -1 )
203 {
204 verify = 1;
205 printf("Entering page verify mode\n");
206 continue;
207 }
209 if ( j == 0 )
210 break; /* our work here is done */
212 if ( j > MAX_BATCH_SIZE )
213 {
214 ERR("Max batch size exceeded. Giving up.");
215 goto out;
216 }
218 if ( read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long)) !=
219 j*sizeof(unsigned long) ) {
220 ERR("Error when reading region pfn types");
221 goto out;
222 }
224 for ( i = 0; i < j; i++ )
225 {
226 if ( (region_pfn_type[i] & LTAB_MASK) == XTAB)
227 {
228 region_mfn[i] = 0; /* we know map will fail, but don't care */
229 }
230 else
231 {
232 pfn = region_pfn_type[i] & ~LTAB_MASK;
233 region_mfn[i] = pfn_to_mfn_table[pfn];
234 }
235 }
237 if ( (region_base = xc_map_foreign_batch( xc_handle, dom,
238 PROT_WRITE,
239 region_mfn,
240 j )) == 0 )
241 {
242 ERR("map batch failed");
243 goto out;
244 }
246 for ( i = 0; i < j; i++ )
247 {
248 unsigned long *ppage;
250 pfn = region_pfn_type[i] & ~LTAB_MASK;
252 if ( (region_pfn_type[i] & LTAB_MASK) == XTAB) continue;
254 if (pfn>nr_pfns)
255 {
256 ERR("pfn out of range");
257 goto out;
258 }
260 region_pfn_type[i] &= LTAB_MASK;
262 pfn_type[pfn] = region_pfn_type[i];
264 mfn = pfn_to_mfn_table[pfn];
266 if ( verify )
267 ppage = (unsigned long*) buf; /* debug case */
268 else
269 ppage = (unsigned long*) (region_base + i*PAGE_SIZE);
271 if ( read_exact(io_fd, ppage, PAGE_SIZE) != PAGE_SIZE )
272 {
273 ERR("Error when reading pagetable page");
274 goto out;
275 }
277 switch( region_pfn_type[i] & LTABTYPE_MASK )
278 {
279 case 0:
280 break;
282 case L1TAB:
283 {
284 for ( k = 0; k < 1024; k++ )
285 {
286 if ( ppage[k] & _PAGE_PRESENT )
287 {
288 xpfn = ppage[k] >> PAGE_SHIFT;
289 if ( xpfn >= nr_pfns )
290 {
291 ERR("Frame number in type %lu page "
292 "table is out of range. i=%d k=%d "
293 "pfn=0x%lx nr_pfns=%lu",
294 region_pfn_type[i]>>28, i,
295 k, xpfn, nr_pfns);
296 goto out;
297 }
299 ppage[k] &= (PAGE_SIZE - 1) &
300 ~(_PAGE_GLOBAL | _PAGE_PAT);
301 ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT;
302 }
303 }
304 }
305 break;
307 case L2TAB:
308 {
309 for ( k = 0;
310 k < (HYPERVISOR_VIRT_START>>L2_PAGETABLE_SHIFT);
311 k++ )
312 {
313 if ( ppage[k] & _PAGE_PRESENT )
314 {
315 xpfn = ppage[k] >> PAGE_SHIFT;
317 if ( xpfn >= nr_pfns )
318 {
319 ERR("Frame number in type %lu page"
320 " table is out of range. i=%d k=%d "
321 "pfn=%lu nr_pfns=%lu",
322 region_pfn_type[i]>>28, i, k,
323 xpfn, nr_pfns);
324 goto out;
325 }
327 ppage[k] &= (PAGE_SIZE - 1) &
328 ~(_PAGE_GLOBAL | _PAGE_PSE);
329 ppage[k] |= pfn_to_mfn_table[xpfn] << PAGE_SHIFT;
330 }
331 }
332 }
333 break;
335 default:
336 ERR("Bogus page type %lx page table is "
337 "out of range. i=%d nr_pfns=%lu",
338 region_pfn_type[i], i, nr_pfns);
339 goto out;
341 } /* end of page type switch statement */
343 if ( verify )
344 {
345 int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE );
346 if ( res )
347 {
348 int v;
349 printf("************** pfn=%lx type=%lx gotcs=%08lx "
350 "actualcs=%08lx\n", pfn, pfn_type[pfn],
351 csum_page(region_base + i*PAGE_SIZE),
352 csum_page(buf));
353 for ( v = 0; v < 4; v++ )
354 {
355 unsigned long *p = (unsigned long *)
356 (region_base + i*PAGE_SIZE);
357 if ( buf[v] != p[v] )
358 printf(" %d: %08lx %08lx\n",
359 v, buf[v], p[v] );
360 }
361 }
362 }
364 if ( xc_add_mmu_update(xc_handle, mmu,
365 (mfn<<PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
366 pfn) )
367 {
368 printf("machpys mfn=%ld pfn=%ld\n",mfn,pfn);
369 goto out;
370 }
372 } /* end of 'batch' for loop */
374 munmap( region_base, j*PAGE_SIZE );
375 n+=j; /* crude stats */
376 }
378 DPRINTF("Received all pages\n");
380 if ( xc_finish_mmu_updates(xc_handle, mmu) )
381 goto out;
383 /*
384 * Pin page tables. Do this after writing to them as otherwise Xen
385 * will barf when doing the type-checking.
386 */
387 for ( i = 0; i < nr_pfns; i++ )
388 {
389 if ( (pfn_type[i] & LPINTAB) == 0 )
390 continue;
391 if ( pfn_type[i] == (L1TAB|LPINTAB) )
392 pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
393 else /* pfn_type[i] == (L2TAB|LPINTAB) */
394 pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
395 pin[nr_pins].mfn = pfn_to_mfn_table[i];
396 if ( ++nr_pins == MAX_PIN_BATCH )
397 {
398 if ( xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0 )
399 goto out;
400 nr_pins = 0;
401 }
402 }
404 if ( (nr_pins != 0) &&
405 (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) )
406 goto out;
408 DPRINTF("\b\b\b\b100%%\n");
409 DPRINTF("Memory reloaded.\n");
411 /* Get the list of PFNs that are not in the psuedo-phys map */
412 {
413 unsigned int count;
414 unsigned long *pfntab;
415 int rc;
417 if ( read_exact(io_fd, &count, sizeof(count)) != sizeof(count) )
418 {
419 ERR("Error when reading pfn count");
420 goto out;
421 }
423 pfntab = malloc( sizeof(unsigned int) * count );
424 if ( pfntab == NULL )
425 {
426 ERR("Out of memory");
427 goto out;
428 }
430 if ( read_exact(io_fd, pfntab, sizeof(unsigned int)*count) !=
431 sizeof(unsigned int)*count )
432 {
433 ERR("Error when reading pfntab");
434 goto out;
435 }
437 for ( i = 0; i < count; i++ )
438 {
439 unsigned long pfn = pfntab[i];
440 pfntab[i]=pfn_to_mfn_table[pfn];
441 pfn_to_mfn_table[pfn] = 0x80000001; // not in pmap
442 }
444 if ( count > 0 )
445 {
446 struct xen_memory_reservation reservation = {
447 .extent_start = pfntab,
448 .nr_extents = count,
449 .extent_order = 0,
450 .domid = dom
451 };
452 if ( (rc = xc_memory_op(xc_handle,
453 XENMEM_decrease_reservation,
454 &reservation)) != count )
455 {
456 ERR("Could not decrease reservation : %d",rc);
457 goto out;
458 }
459 else
460 {
461 printf("Decreased reservation by %d pages\n", count);
462 }
463 }
464 }
466 if ( read_exact(io_fd, &ctxt, sizeof(ctxt)) != sizeof(ctxt) ||
467 read_exact(io_fd, shared_info_page, PAGE_SIZE) != PAGE_SIZE )
468 {
469 ERR("Error when reading ctxt or shared info page");
470 goto out;
471 }
473 /* Uncanonicalise the suspend-record frame number and poke resume rec. */
474 pfn = ctxt.user_regs.esi;
475 if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
476 {
477 ERR("Suspend record frame number is bad");
478 goto out;
479 }
480 ctxt.user_regs.esi = mfn = pfn_to_mfn_table[pfn];
481 start_info = xc_map_foreign_range(
482 xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
483 start_info->nr_pages = nr_pfns;
484 start_info->shared_info = shared_info_frame << PAGE_SHIFT;
485 start_info->flags = 0;
486 *store_mfn = start_info->store_mfn =
487 pfn_to_mfn_table[start_info->store_mfn];
488 start_info->store_evtchn = store_evtchn;
489 *console_mfn = start_info->console_mfn =
490 pfn_to_mfn_table[start_info->console_mfn];
491 start_info->console_evtchn = console_evtchn;
492 munmap(start_info, PAGE_SIZE);
494 /* Uncanonicalise each GDT frame number. */
495 if ( ctxt.gdt_ents > 8192 )
496 {
497 ERR("GDT entry count out of range");
498 goto out;
499 }
501 for ( i = 0; i < ctxt.gdt_ents; i += 512 )
502 {
503 pfn = ctxt.gdt_frames[i];
504 if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
505 {
506 ERR("GDT frame number is bad");
507 goto out;
508 }
509 ctxt.gdt_frames[i] = pfn_to_mfn_table[pfn];
510 }
512 /* Uncanonicalise the page table base pointer. */
513 pfn = ctxt.ctrlreg[3] >> PAGE_SHIFT;
514 if ( (pfn >= nr_pfns) || ((pfn_type[pfn]&LTABTYPE_MASK) != L2TAB) )
515 {
516 printf("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
517 pfn, nr_pfns, pfn_type[pfn], (unsigned long)L2TAB);
518 ERR("PT base is bad.");
519 goto out;
520 }
521 ctxt.ctrlreg[3] = pfn_to_mfn_table[pfn] << PAGE_SHIFT;
523 /* clear any pending events and the selector */
524 memset(&(shared_info->evtchn_pending[0]), 0,
525 sizeof (shared_info->evtchn_pending));
526 for ( i = 0; i < MAX_VIRT_CPUS; i++ )
527 shared_info->vcpu_data[i].evtchn_pending_sel = 0;
529 /* Copy saved contents of shared-info page. No checking needed. */
530 ppage = xc_map_foreign_range(
531 xc_handle, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame);
532 memcpy(ppage, shared_info, sizeof(shared_info_t));
533 munmap(ppage, PAGE_SIZE);
535 /* Uncanonicalise the pfn-to-mfn table frame-number list. */
536 for ( i = 0; i < (nr_pfns+1023)/1024; i++ )
537 {
538 unsigned long pfn, mfn;
540 pfn = pfn_to_mfn_frame_list[i];
541 if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NOTAB) )
542 {
543 ERR("PFN-to-MFN frame number is bad");
544 goto out;
545 }
546 mfn = pfn_to_mfn_table[pfn];
547 pfn_to_mfn_frame_list[i] = mfn;
548 }
550 if ( (live_pfn_to_mfn_table =
551 xc_map_foreign_batch(xc_handle, dom,
552 PROT_WRITE,
553 pfn_to_mfn_frame_list,
554 (nr_pfns+1023)/1024 )) == 0 )
555 {
556 ERR("Couldn't map pfn_to_mfn table");
557 goto out;
558 }
560 memcpy(live_pfn_to_mfn_table, pfn_to_mfn_table,
561 nr_pfns*sizeof(unsigned long) );
563 munmap(live_pfn_to_mfn_table, ((nr_pfns+1023)/1024)*PAGE_SIZE);
565 /*
566 * Safety checking of saved context:
567 * 1. user_regs is fine, as Xen checks that on context switch.
568 * 2. fpu_ctxt is fine, as it can't hurt Xen.
569 * 3. trap_ctxt needs the code selectors checked.
570 * 4. ldt base must be page-aligned, no more than 8192 ents, ...
571 * 5. gdt already done, and further checking is done by Xen.
572 * 6. check that kernel_ss is safe.
573 * 7. pt_base is already done.
574 * 8. debugregs are checked by Xen.
575 * 9. callback code selectors need checking.
576 */
577 for ( i = 0; i < 256; i++ )
578 {
579 ctxt.trap_ctxt[i].vector = i;
580 if ( (ctxt.trap_ctxt[i].cs & 3) == 0 )
581 ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS;
582 }
583 if ( (ctxt.kernel_ss & 3) == 0 )
584 ctxt.kernel_ss = FLAT_KERNEL_DS;
585 #if defined(__i386__)
586 if ( (ctxt.event_callback_cs & 3) == 0 )
587 ctxt.event_callback_cs = FLAT_KERNEL_CS;
588 if ( (ctxt.failsafe_callback_cs & 3) == 0 )
589 ctxt.failsafe_callback_cs = FLAT_KERNEL_CS;
590 #endif
591 if ( ((ctxt.ldt_base & (PAGE_SIZE - 1)) != 0) ||
592 (ctxt.ldt_ents > 8192) ||
593 (ctxt.ldt_base > HYPERVISOR_VIRT_START) ||
594 ((ctxt.ldt_base + ctxt.ldt_ents*8) > HYPERVISOR_VIRT_START) )
595 {
596 ERR("Bad LDT base or size");
597 goto out;
598 }
600 DPRINTF("Domain ready to be built.\n");
602 op.cmd = DOM0_SETDOMAININFO;
603 op.u.setdomaininfo.domain = (domid_t)dom;
604 op.u.setdomaininfo.vcpu = 0;
605 op.u.setdomaininfo.ctxt = &ctxt;
606 rc = xc_dom0_op(xc_handle, &op);
608 if ( rc != 0 )
609 {
610 ERR("Couldn't build the domain");
611 goto out;
612 }
614 DPRINTF("Domain ready to be unpaused\n");
615 op.cmd = DOM0_UNPAUSEDOMAIN;
616 op.u.unpausedomain.domain = (domid_t)dom;
617 rc = xc_dom0_op(xc_handle, &op);
618 if (rc == 0) {
619 /* Success: print the domain id. */
620 DPRINTF("DOM=%u\n", dom);
621 return 0;
622 }
624 out:
625 if ( (rc != 0) && (dom != 0) )
626 xc_domain_destroy(xc_handle, dom);
627 free(mmu);
628 free(pfn_to_mfn_table);
629 free(pfn_type);
631 DPRINTF("Restore exit with rc=%d\n", rc);
632 return rc;
633 }