direct-io.hg

view tools/libxc/xc_linux_save.c @ 7967:13b2e5c94595

Ignore live flag on PAE/64-bit migrations, rather than
failing the migration entirely.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Tue Nov 22 12:04:03 2005 +0100 (2005-11-22)
parents bdab22f56efe
children 72a1421dcf1b
line source
1 /******************************************************************************
2 * xc_linux_save.c
3 *
4 * Save the state of a running Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <inttypes.h>
10 #include <time.h>
11 #include <stdlib.h>
12 #include <unistd.h>
13 #include <sys/time.h>
15 #include "xg_private.h"
16 #include "xg_save_restore.h"
18 /*
19 ** Default values for important tuning parameters. Can override by passing
20 ** non-zero replacement values to xc_linux_save().
21 **
22 ** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
23 **
24 */
25 #define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */
26 #define DEF_MAX_FACTOR 3 /* never send more than 3x nr_pfns */
29 /* max mfn of the whole machine */
30 static unsigned long max_mfn;
32 /* virtual starting address of the hypervisor */
33 static unsigned long hvirt_start;
35 /* #levels of page tables used by the current guest */
36 static unsigned int pt_levels;
38 /* total number of pages used by the current guest */
39 static unsigned long max_pfn;
41 /* Live mapping of the table mapping each PFN to its current MFN. */
42 static unsigned long *live_p2m = NULL;
44 /* Live mapping of system MFN to PFN table. */
45 static unsigned long *live_m2p = NULL;
48 /*
49 * Returns TRUE if the given machine frame number has a unique mapping
50 * in the guest's pseudophysical map.
51 */
52 #define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \
53 (((_mfn) < (max_mfn)) && \
54 ((live_m2p[_mfn] < (max_pfn)) && \
55 (live_p2m[live_m2p[_mfn]] == (_mfn))))
58 /* Returns TRUE if MFN is successfully converted to a PFN. */
59 #define translate_mfn_to_pfn(_pmfn) \
60 ({ \
61 unsigned long mfn = *(_pmfn); \
62 int _res = 1; \
63 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) \
64 _res = 0; \
65 else \
66 *(_pmfn) = live_m2p[mfn]; \
67 _res; \
68 })
70 /*
71 ** During (live) save/migrate, we maintain a number of bitmaps to track
72 ** which pages we have to send, to fixup, and to skip.
73 */
/*
** Bitmap operations for the to_send / to_fix / to_skip page bitmaps.
** Bitmaps are arrays of unsigned long, padded to a whole number of
** longs (see BITMAP_SIZE), indexed by pfn.
*/

#define BITS_PER_LONG (sizeof(unsigned long) * 8)

/* Size in bytes of a bitmap covering max_pfn pages, rounded up so the
   array is padded to a whole unsigned long. */
#define BITMAP_SIZE ((max_pfn + BITS_PER_LONG - 1) / 8)

/* The unsigned long word of _bmap holding bit _nr. */
#define BITMAP_ENTRY(_nr,_bmap) \
   ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]

/* Bit position of _nr within its word. */
#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)

/* Return the value (0 or 1) of bit nr in the bitmap at addr. */
static inline int test_bit (int nr, volatile void * addr)
{
    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
}

/* Clear bit nr in the bitmap at addr.
   NB: the shifted constant must be 1UL, not 1: on LP64 builds
   BITS_PER_LONG is 64, so BITMAP_SHIFT() can be >= 32, and shifting a
   plain int by that amount is undefined behaviour (and in practice
   fails to touch bits 32..63). */
static inline void clear_bit (int nr, volatile void * addr)
{
    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
}

/* Set bit nr in the bitmap at addr (same 1UL rationale as clear_bit). */
static inline void set_bit ( int nr, volatile void * addr)
{
    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
}
98 /* Returns the hamming weight (i.e. the number of bits set) in a N-bit word */
99 static inline unsigned int hweight32(unsigned int w)
100 {
101 unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
102 res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
103 res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
104 res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
105 return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
106 }
/* Population count of a single unsigned long, correct for both 32-bit
   and 64-bit longs.  (The previous code fed the whole long straight
   into hweight32(), which silently ignored bits 32..63 on LP64.) */
static inline unsigned int count_bits_long(unsigned long w)
{
    unsigned int count = 0;

    while ( w != 0 )
    {
        w &= w - 1;   /* clear the lowest set bit */
        count++;
    }

    return count;
}

/* Count the set bits in the first nr bits of the bitmap at addr.
   nr is rounded down to a whole number of unsigned longs; callers
   always pass bitmaps padded to unsigned long (see BITMAP_SIZE). */
static inline int count_bits ( int nr, volatile void *addr)
{
    int i, count = 0;
    unsigned long *p = (unsigned long *)addr;
    /* We know that the array is padded to unsigned long. */
    for( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
        count += count_bits_long(*p);
    return count;
}
/*
** Simple pseudo-random permutation of [0, nr): scanning pages in this
** order gives a better estimate of the domain's page-dirtying rate as we
** go (contiguous pfn ranges often behave similarly, so we mix them up).
**
** e.g. nr->order 15->4 16->4 17->5
**
** 512MB domain, 128k pages, order 17:
**
**     QPONMLKJIHGFEDCBA
**         QPONMLKJIH
**         GFEDCBA
**
**     QPONMLKJIHGFEDCBA
**                 EDCBA
**           QPONM
**            LKJIHGF
**
** NOTE(review): the rotation amount is (order_nr - 10), so this assumes
** order_nr >= 10 (i.e. the domain has at least 2^10 pages) -- confirm
** against callers before relying on smaller orders.
*/
static inline int permute( int i, int nr, int order_nr )
{
    int result = i;

    /* Rotate the low order_nr bits and retry until the value lands in
       [0, nr).  This never loops when nr is a power of 2. */
    do {
        result = ( (result >> (order_nr - 10)) | (result << 10) ) &
            ((1 << order_nr) - 1);
    } while ( result >= nr );

    return result;
}
151 static uint64_t tv_to_us(struct timeval *new)
152 {
153 return (new->tv_sec * 1000000) + new->tv_usec;
154 }
156 static uint64_t llgettimeofday(void)
157 {
158 struct timeval now;
159 gettimeofday(&now, NULL);
160 return tv_to_us(&now);
161 }
/*
** Microseconds elapsed between 'old' and 'new'.  The seconds
** difference is widened to 64 bits before scaling: on 32-bit
** platforms a delta of more than ~35 minutes would overflow the
** 32-bit long multiply.  The result is only meaningful when
** new >= old (unsigned wrap otherwise, as before).
*/
static uint64_t tv_delta(struct timeval *new, struct timeval *old)
{
    return ((uint64_t)(new->tv_sec - old->tv_sec) * 1000000ULL) +
        (new->tv_usec - old->tv_usec);
}
170 #ifdef ADAPTIVE_SAVE
173 /*
174 ** We control the rate at which we transmit (or save) to minimize impact
175 ** on running domains (including the target if we're doing live migrate).
176 */
178 #define MAX_MBIT_RATE 500 /* maximum transmit rate for migrate */
179 #define START_MBIT_RATE 100 /* initial transmit rate for migrate */
182 /* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
183 #define RATE_TO_BTU 781250
185 /* Amount in bytes we allow ourselves to send in a burst */
186 #define BURST_BUDGET (100*1024)
189 /* We keep track of the current and previous transmission rate */
190 static int mbit_rate, ombit_rate = 0;
192 /* Have we reached the maximum transmission rate? */
193 #define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
/* Reset the transmit rate to its starting value (called once per save). */
196 static inline void initialize_mbit_rate()
197 {
198 mbit_rate = START_MBIT_RATE;
199 }
/*
** Rate-limited write(): a token-bucket limiter around write(io_fd,...).
** A byte "budget" is replenished by BURST_BUDGET once per burst slot
** (burst_time_us, derived from mbit_rate via RATE_TO_BTU); when the
** budget is exhausted we nanosleep() until the next slot.  All limiter
** state (budget, slot length, last replenish time) is static, so this
** is not reentrant -- fine for the single-threaded save path.
** Returns whatever write() returns for the final transfer.
*/
202 static int ratewrite(int io_fd, void *buf, int n)
203 {
204 static int budget = 0;
205 static int burst_time_us = -1;
206 static struct timeval last_put = { 0 };
207 struct timeval now;
208 struct timespec delay;
209 long long delta;
/* Rate limiting compiled in but disabled: pass straight through. */
211 if (START_MBIT_RATE == 0)
212 return write(io_fd, buf, n);
214 budget -= n;
215 if (budget < 0) {
/* Recompute the slot length if the target rate changed since last time. */
216 if (mbit_rate != ombit_rate) {
217 burst_time_us = RATE_TO_BTU / mbit_rate;
218 ombit_rate = mbit_rate;
219 DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
220 mbit_rate, BURST_BUDGET, burst_time_us);
221 }
/* First call: seed the bucket and the reference timestamp. */
222 if (last_put.tv_sec == 0) {
223 budget += BURST_BUDGET;
224 gettimeofday(&last_put, NULL);
225 } else {
/* Credit one BURST_BUDGET per elapsed slot, then sleep out the
   remainder of the current slot if we are still over budget. */
226 while (budget < 0) {
227 gettimeofday(&now, NULL);
228 delta = tv_delta(&now, &last_put);
229 while (delta > burst_time_us) {
230 budget += BURST_BUDGET;
231 last_put.tv_usec += burst_time_us;
232 if (last_put.tv_usec > 1000000) {
233 last_put.tv_usec -= 1000000;
234 last_put.tv_sec++;
235 }
236 delta -= burst_time_us;
237 }
238 if (budget > 0)
239 break;
240 delay.tv_sec = 0;
241 delay.tv_nsec = 1000 * (burst_time_us - delta);
/* Retry nanosleep until the full delay has elapsed (it updates
   'delay' with the remaining time when interrupted). */
242 while (delay.tv_nsec > 0)
243 if (nanosleep(&delay, &delay) == 0)
244 break;
245 }
246 }
247 }
248 return write(io_fd, buf, n);
249 }
251 #else /* ! ADAPTIVE SAVE */
/* Without ADAPTIVE_SAVE the limiter compiles away to a plain write(). */
253 #define RATE_IS_MAX() (0)
254 #define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
255 #define initialize_mbit_rate()
257 #endif
/*
** Write exactly 'count' bytes from 'buf' to fd, retrying on partial
** writes and EINTR.  POSIX allows write() to transfer fewer bytes
** than requested (e.g. on a pipe or socket), so a single call is not
** enough -- the old code would report failure, or worse, the caller
** would silently lose the tail of the buffer.
** Returns 1 on success, 0 on failure (same contract as before).
*/
static inline ssize_t write_exact(int fd, void *buf, size_t count)
{
    size_t done = 0;

    while ( done < count )
    {
        ssize_t len = write(fd, (char *)buf + done, count - done);
        if ( (len == -1) && (errno == EINTR) )
            continue;          /* interrupted before any transfer: retry */
        if ( len <= 0 )
            return 0;          /* real error (or impossible zero write) */
        done += len;
    }

    return 1;
}
/*
** Report per-iteration transfer statistics: wall-clock delta, dom0 and
** target-domain CPU usage, and observed send/dirty bandwidth.  Static
** locals hold the previous sample so each call reports the delta since
** the last one.  Under ADAPTIVE_SAVE, also raises mbit_rate to chase
** the observed dirty rate (capped at MAX_MBIT_RATE).  Always returns 0.
*/
269 static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
270 xc_shadow_control_stats_t *stats, int print)
271 {
272 static struct timeval wall_last;
273 static long long d0_cpu_last;
274 static long long d1_cpu_last;
276 struct timeval wall_now;
277 long long wall_delta;
278 long long d0_cpu_now, d0_cpu_delta;
279 long long d1_cpu_now, d1_cpu_delta;
281 gettimeofday(&wall_now, NULL);
/* CPU usage in ms for dom0 and the target domain (vcpu 0 only). */
283 d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
284 d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
286 if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
287 fprintf(stderr, "ARRHHH!!\n");
289 wall_delta = tv_delta(&wall_now,&wall_last)/1000;
/* Guard against division by zero below. */
291 if (wall_delta == 0) wall_delta = 1;
293 d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
294 d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
296 if (print)
297 fprintf(stderr,
298 "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
299 "dirtied %dMb/s %" PRId32 " pages\n",
300 wall_delta,
301 (int)((d0_cpu_delta*100)/wall_delta),
302 (int)((d1_cpu_delta*100)/wall_delta),
303 (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
304 (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
305 stats->dirty_count);
/* Adapt the transmit rate upward to keep pace with the dirty rate. */
307 #ifdef ADAPTIVE_SAVE
308 if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
309 mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
310 + 50;
311 if (mbit_rate > MAX_MBIT_RATE)
312 mbit_rate = MAX_MBIT_RATE;
313 }
314 #endif
/* Remember this sample for the next call's deltas. */
316 d0_cpu_last = d0_cpu_now;
317 d1_cpu_last = d1_cpu_now;
318 wall_last = wall_now;
320 return 0;
321 }
/*
** Debug/measurement aid: repeatedly clean the shadow log-dirty bitmap
** and then sample (peek) the dirty statistics 40 times at 50ms
** intervals, printing fault/dirty counts to stderr.  Runs 'runs'
** clean-and-sample cycles; with runs == 0 (as called from
** xc_linux_save) it does nothing but read the clock.
** Always returns -1; callers ignore the return value.
*/
324 static int analysis_phase(int xc_handle, uint32_t domid, int max_pfn,
325 unsigned long *arr, int runs)
326 {
327 long long start, now;
328 xc_shadow_control_stats_t stats;
329 int j;
331 start = llgettimeofday();
333 for (j = 0; j < runs; j++) {
334 int i;
/* Reset the dirty bitmap, collecting it into arr. */
336 xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_CLEAN,
337 arr, max_pfn, NULL);
338 fprintf(stderr, "#Flush\n");
339 for ( i = 0; i < 40; i++ ) {
340 usleep(50000);
341 now = llgettimeofday();
/* Sample stats without clearing the bitmap. */
342 xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_PEEK,
343 NULL, 0, &stats);
345 fprintf(stderr, "now= %lld faults= %" PRId32 " dirty= %" PRId32
346 " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n",
347 ((now-start)+500)/1000,
348 stats.fault_count, stats.dirty_count,
349 stats.dirty_net_count, stats.dirty_block_count);
350 }
351 }
353 return -1;
354 }
/*
** Ask the controlling process (xend, over stdin/stdout) to suspend the
** domain, then poll until the domain actually reaches the suspended
** state.  Protocol: we print "suspend\n" and expect "done\n" back.
** A paused domain is unpaused and re-tested; otherwise we retry up to
** 100 times at 10ms intervals.  On success *info and *ctxt hold the
** domain info and vcpu-0 context.  Returns 0 on success, -1 on failure.
*/
357 static int suspend_and_state(int xc_handle, int io_fd, int dom,
358 xc_dominfo_t *info,
359 vcpu_guest_context_t *ctxt)
360 {
361 int i = 0;
362 char ans[30];
364 printf("suspend\n");
365 fflush(stdout);
366 if (fgets(ans, sizeof(ans), stdin) == NULL) {
367 ERR("failed reading suspend reply");
368 return -1;
369 }
370 if (strncmp(ans, "done\n", 5)) {
371 ERR("suspend reply incorrect: %s", ans);
372 return -1;
373 }
375 retry:
377 if (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) {
378 ERR("Could not get domain info");
379 return -1;
380 }
/* NOTE(review): a failure here is only logged, not returned -- the
   retry loop continues with a possibly stale *ctxt; confirm intended. */
382 if ( xc_domain_get_vcpu_context(xc_handle, dom, 0 /* XXX */, ctxt))
383 ERR("Could not get vcpu context");
386 if (info->shutdown && info->shutdown_reason == SHUTDOWN_suspend)
387 return 0; // success
389 if (info->paused) {
390 // try unpausing domain, wait, and retest
391 xc_domain_unpause( xc_handle, dom );
393 ERR("Domain was paused. Wait and re-test.");
394 usleep(10000); // 10ms
396 goto retry;
397 }
/* Not suspended yet: back off briefly and retry, bounded at 100 tries. */
400 if( ++i < 100 ) {
401 ERR("Retry suspend domain.");
402 usleep(10000); // 10ms
403 goto retry;
404 }
406 ERR("Unable to suspend domain.");
408 return -1;
409 }
412 /*
413 ** During transfer (or in the state file), all page-table pages must be
414 ** converted into a 'canonical' form where references to actual mfns
415 ** are replaced with references to the corresponding pfns.
416 **
417 ** This function performs the appropriate conversion, taking into account
418 ** which entries do not require canonicalization (in particular, those
419 ** entries which map the virtual address reserved for the hypervisor).
420 */
421 void canonicalize_pagetable(unsigned long type, unsigned long pfn,
422 const void *spage, void *dpage)
423 {
425 int i, pte_last, xen_start, xen_end;
426 uint64_t pte;
428 /*
429 ** We need to determine which entries in this page table hold
430 ** reserved hypervisor mappings. This depends on the current
431 ** page table type as well as the number of paging levels.
432 */
/* PTEs are 4 bytes for 2-level (non-PAE) guests, 8 bytes otherwise;
   by default assume no Xen-reserved entries (xen_start == pte_last). */
433 xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8);
/* Non-PAE L2: everything at/above hvirt_start belongs to Xen. */
435 if (pt_levels == 2 && type == L2TAB)
436 xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);
/* PAE L3: only the first L3_PAGETABLE_ENTRIES_PAE entries are guest's. */
438 if (pt_levels == 3 && type == L3TAB)
439 xen_start = L3_PAGETABLE_ENTRIES_PAE;
441 /*
442 ** in PAE only the L2 mapping the top 1GB contains Xen mappings.
443 ** We can spot this by looking for the guest linear mapping which
444 ** Xen always ensures is present in that L2. Guests must ensure
445 ** that this check will fail for other L2s.
446 */
447 if (pt_levels == 3 && type == L2TAB) {
449 /* XXX index of the L2 entry in PAE mode which holds the guest LPT */
450 #define PAE_GLPT_L2ENTRY (495)
451 pte = ((uint64_t*)spage)[PAE_GLPT_L2ENTRY];
/* If that entry maps this very page, this is the top-1GB L2: mask
   the Xen range starting at hvirt_start's entry within this L2. */
453 if(((pte >> PAGE_SHIFT) & 0x0fffffff) == live_p2m[pfn])
454 xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
455 }
457 /* Now iterate through the page table, canonicalizing each PTE */
458 for (i = 0; i < pte_last; i++ ) {
460 unsigned long pfn, mfn;
462 if (pt_levels == 2)
463 pte = ((uint32_t*)spage)[i];
464 else
465 pte = ((uint64_t*)spage)[i];
/* Xen-reserved entries are zeroed: the restore side rebuilds them. */
467 if (i >= xen_start && i < xen_end)
468 pte = 0;
470 if (pte & _PAGE_PRESENT) {
/* Extract the mfn and look up the guest pfn it corresponds to. */
472 mfn = (pte >> PAGE_SHIFT) & 0xfffffff;
473 pfn = live_m2p[mfn];
475 if (!MFN_IS_IN_PSEUDOPHYS_MAP(mfn)) {
476 /* I don't think this should ever happen */
477 DPRINTF("FNI: [%08lx,%d] pte=%llx,"
478 " mfn=%08lx, pfn=%08lx [mfn]=%08lx\n",
479 type, i, (unsigned long long)pte, mfn,
480 live_m2p[mfn],
481 (live_m2p[mfn] < max_pfn) ?
482 live_p2m[live_m2p[mfn]] : 0xdeadbeaf);
484 pfn = 0; /* be suspicious */
485 }
/* Keep flag bits, replace the frame number with the pfn. */
487 pte &= 0xffffff0000000fffULL;
488 pte |= (uint64_t)pfn << PAGE_SHIFT;
489 }
491 if (pt_levels == 2)
492 ((uint32_t*)dpage)[i] = pte;
493 else
494 ((uint64_t*)dpage)[i] = pte;
496 }
498 return;
499 }
503 static unsigned long *xc_map_m2p(int xc_handle,
504 unsigned long max_mfn,
505 int prot)
506 {
507 privcmd_m2pmfns_t m2p_mfns;
508 privcmd_mmap_t ioctlx;
509 privcmd_mmap_entry_t *entries;
510 unsigned long m2p_chunks, m2p_size;
511 unsigned long *m2p;
512 int i, rc;
514 m2p_size = M2P_SIZE(max_mfn);
515 m2p_chunks = M2P_CHUNKS(max_mfn);
518 m2p_mfns.num = m2p_chunks;
520 if(!(m2p_mfns.arr = malloc(m2p_chunks * sizeof(unsigned long)))) {
521 ERR("failed to allocate space for m2p mfns!\n");
522 return NULL;
523 }
525 if (ioctl(xc_handle, IOCTL_PRIVCMD_GET_MACH2PHYS_MFNS, &m2p_mfns) < 0) {
526 ERR("xc_get_m2p_mfns:");
527 return NULL;
528 }
530 if((m2p = mmap(NULL, m2p_size, prot,
531 MAP_SHARED, xc_handle, 0)) == MAP_FAILED) {
532 ERR("failed to mmap m2p");
533 return NULL;
534 }
537 if(!(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t)))) {
538 ERR("failed to allocate space for mmap entries!\n");
539 return NULL;
540 }
543 ioctlx.num = m2p_chunks;
544 ioctlx.dom = DOMID_XEN;
545 ioctlx.entry = entries;
547 for(i=0; i < m2p_chunks; i++) {
549 entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
550 entries[i].mfn = m2p_mfns.arr[i];
551 entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
553 }
555 if((rc = ioctl(xc_handle, IOCTL_PRIVCMD_MMAP, &ioctlx)) < 0) {
556 ERR("ioctl_mmap failed (rc = %d)", rc);
557 return NULL;
558 }
560 free(m2p_mfns.arr);
561 free(entries);
563 return m2p;
564 }
568 int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
569 uint32_t max_factor, uint32_t flags)
570 {
571 xc_dominfo_t info;
573 int rc = 1, i, j, last_iter, iter = 0;
574 int live = (flags & XCFLAGS_LIVE);
575 int debug = (flags & XCFLAGS_DEBUG);
576 int sent_last_iter, skip_this_iter;
578 /* The new domain's shared-info frame number. */
579 unsigned long shared_info_frame;
581 /* A copy of the CPU context of the guest. */
582 vcpu_guest_context_t ctxt;
584 /* A table containg the type of each PFN (/not/ MFN!). */
585 unsigned long *pfn_type = NULL;
586 unsigned long *pfn_batch = NULL;
588 /* A temporary mapping, and a copy, of one frame of guest memory. */
589 char page[PAGE_SIZE];
591 /* Double and single indirect references to the live P2M table */
592 unsigned long *live_p2m_frame_list_list = NULL;
593 unsigned long *live_p2m_frame_list = NULL;
595 /* A copy of the pfn-to-mfn table frame list. */
596 unsigned long *p2m_frame_list = NULL;
598 /* Live mapping of shared info structure */
599 shared_info_t *live_shinfo = NULL;
601 /* base of the region in which domain memory is mapped */
602 unsigned char *region_base = NULL;
604 /* power of 2 order of max_pfn */
605 int order_nr;
607 /* bitmap of pages:
608 - that should be sent this iteration (unless later marked as skip);
609 - to skip this iteration because already dirty;
610 - to fixup by sending at the end if not already resent; */
611 unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
613 xc_shadow_control_stats_t stats;
615 unsigned long needed_to_fix = 0;
616 unsigned long total_sent = 0;
619 /* If no explicit control parameters given, use defaults */
620 if(!max_iters)
621 max_iters = DEF_MAX_ITERS;
622 if(!max_factor)
623 max_factor = DEF_MAX_FACTOR;
625 initialize_mbit_rate();
627 if(!get_platform_info(xc_handle, dom,
628 &max_mfn, &hvirt_start, &pt_levels)) {
629 ERR("Unable to get platform info.");
630 return 1;
631 }
633 if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
634 ERR("Could not get domain info");
635 return 1;
636 }
638 if (mlock(&ctxt, sizeof(ctxt))) {
639 ERR("Unable to mlock ctxt");
640 return 1;
641 }
643 /* Only have to worry about vcpu 0 even for SMP */
644 if (xc_domain_get_vcpu_context(xc_handle, dom, 0, &ctxt)) {
645 ERR("Could not get vcpu context");
646 goto out;
647 }
648 shared_info_frame = info.shared_info_frame;
650 /* A cheesy test to see whether the domain contains valid state. */
651 if (ctxt.ctrlreg[3] == 0)
652 {
653 ERR("Domain is not in a valid Linux guest OS state");
654 goto out;
655 }
657 /* cheesy sanity check */
658 if ((info.max_memkb >> (PAGE_SHIFT - 10)) > max_mfn) {
659 ERR("Invalid state record -- pfn count out of range: %lu",
660 (info.max_memkb >> (PAGE_SHIFT - 10)));
661 goto out;
662 }
664 /* Map the shared info frame */
665 if(!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
666 PROT_READ, shared_info_frame))) {
667 ERR("Couldn't map live_shinfo");
668 goto out;
669 }
671 max_pfn = live_shinfo->arch.max_pfn;
673 live_p2m_frame_list_list =
674 xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
675 live_shinfo->arch.pfn_to_mfn_frame_list_list);
677 if (!live_p2m_frame_list_list) {
678 ERR("Couldn't map p2m_frame_list_list");
679 goto out;
680 }
682 live_p2m_frame_list =
683 xc_map_foreign_batch(xc_handle, dom, PROT_READ,
684 live_p2m_frame_list_list,
685 P2M_FLL_ENTRIES);
687 if (!live_p2m_frame_list) {
688 ERR("Couldn't map p2m_frame_list");
689 goto out;
690 }
692 /* Map all the frames of the pfn->mfn table. For migrate to succeed,
693 the guest must not change which frames are used for this purpose.
694 (it's not clear why it would want to change them, and we'll be OK
695 from a safety POV anyhow. */
697 live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
698 live_p2m_frame_list,
699 P2M_FL_ENTRIES);
701 if (!live_p2m) {
702 ERR("Couldn't map p2m table");
703 goto out;
704 }
706 /* Setup the mfn_to_pfn table mapping */
707 if(!(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ))) {
708 ERR("Failed to map live M2P table");
709 goto out;
710 }
713 /* Get a local copy of the live_P2M_frame_list */
714 if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
715 ERR("Couldn't allocate p2m_frame_list array");
716 goto out;
717 }
718 memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);
720 /* Canonicalise the pfn-to-mfn table frame-number list. */
721 for (i = 0; i < max_pfn; i += ulpp) {
722 if (!translate_mfn_to_pfn(&p2m_frame_list[i/ulpp])) {
723 ERR("Frame# in pfn-to-mfn frame list is not in pseudophys");
724 ERR("entry %d: p2m_frame_list[%ld] is 0x%lx", i, i/ulpp,
725 p2m_frame_list[i/ulpp]);
726 goto out;
727 }
728 }
730 /* Domain is still running at this point */
732 if (live && (pt_levels != 2)) {
733 ERR("Live migration supported only for 32-bit non-pae");
734 live = 0;
735 }
737 if (live) {
739 if (xc_shadow_control(xc_handle, dom,
740 DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
741 NULL, 0, NULL ) < 0) {
742 ERR("Couldn't enable shadow mode");
743 goto out;
744 }
746 last_iter = 0;
748 } else {
750 /* This is a non-live suspend. Issue the call back to get the
751 domain suspended */
753 last_iter = 1;
755 if (suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt)) {
756 ERR("Domain appears not to have suspended");
757 goto out;
758 }
760 }
762 /* pretend we sent all the pages last iteration */
763 sent_last_iter = max_pfn;
766 /* calculate the power of 2 order of max_pfn, e.g.
767 15->4 16->4 17->5 */
768 for (i = max_pfn-1, order_nr = 0; i ; i >>= 1, order_nr++)
769 continue;
771 /* Setup to_send / to_fix and to_skip bitmaps */
772 to_send = malloc(BITMAP_SIZE);
773 to_fix = calloc(1, BITMAP_SIZE);
774 to_skip = malloc(BITMAP_SIZE);
776 if (!to_send || !to_fix || !to_skip) {
777 ERR("Couldn't allocate to_send array");
778 goto out;
779 }
781 memset(to_send, 0xff, BITMAP_SIZE);
783 if (mlock(to_send, BITMAP_SIZE)) {
784 ERR("Unable to mlock to_send");
785 return 1;
786 }
788 /* (to fix is local only) */
789 if (mlock(to_skip, BITMAP_SIZE)) {
790 ERR("Unable to mlock to_skip");
791 return 1;
792 }
794 analysis_phase(xc_handle, dom, max_pfn, to_skip, 0);
796 /* We want zeroed memory so use calloc rather than malloc. */
797 pfn_type = calloc(MAX_BATCH_SIZE, sizeof(unsigned long));
798 pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(unsigned long));
800 if ((pfn_type == NULL) || (pfn_batch == NULL)) {
801 ERR("failed to alloc memory for pfn_type and/or pfn_batch arays.");
802 errno = ENOMEM;
803 goto out;
804 }
806 if (mlock(pfn_type, MAX_BATCH_SIZE * sizeof(unsigned long))) {
807 ERR("Unable to mlock");
808 goto out;
809 }
812 /*
813 * Quick belt and braces sanity check.
814 */
815 {
816 int err=0;
817 unsigned long mfn;
818 for (i = 0; i < max_pfn; i++) {
820 mfn = live_p2m[i];
821 if((mfn != 0xffffffffUL) && (live_m2p[mfn] != i)) {
822 DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
823 mfn, live_m2p[mfn]);
824 err++;
825 }
826 }
827 DPRINTF("Had %d unexplained entries in p2m table\n", err);
828 }
831 /* Start writing out the saved-domain record. */
833 if(!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) {
834 ERR("write: max_pfn");
835 goto out;
836 }
838 if(!write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) {
839 ERR("write: p2m_frame_list");
840 goto out;
841 }
843 print_stats(xc_handle, dom, 0, &stats, 0);
845 /* Now write out each data page, canonicalising page tables as we go... */
847 while(1) {
849 unsigned int prev_pc, sent_this_iter, N, batch;
851 iter++;
852 sent_this_iter = 0;
853 skip_this_iter = 0;
854 prev_pc = 0;
855 N=0;
857 DPRINTF("Saving memory pages: iter %d 0%%", iter);
859 while( N < max_pfn ){
861 unsigned int this_pc = (N * 100) / max_pfn;
863 if ((this_pc - prev_pc) >= 5) {
864 DPRINTF("\b\b\b\b%3d%%", this_pc);
865 prev_pc = this_pc;
866 }
868 /* slightly wasteful to peek the whole array every time,
869 but this is fast enough for the moment. */
870 if (!last_iter && xc_shadow_control(
871 xc_handle, dom, DOM0_SHADOW_CONTROL_OP_PEEK,
872 to_skip, max_pfn, NULL) != max_pfn) {
873 ERR("Error peeking shadow bitmap");
874 goto out;
875 }
878 /* load pfn_type[] with the mfn of all the pages we're doing in
879 this batch. */
880 for (batch = 0; batch < MAX_BATCH_SIZE && N < max_pfn ; N++) {
882 int n = permute(N, max_pfn, order_nr);
884 if (debug) {
885 DPRINTF("%d pfn= %08lx mfn= %08lx %d [mfn]= %08lx\n",
886 iter, (unsigned long)n, live_p2m[n],
887 test_bit(n, to_send),
888 live_m2p[live_p2m[n]&0xFFFFF]);
889 }
891 if (!last_iter && test_bit(n, to_send)&& test_bit(n, to_skip))
892 skip_this_iter++; /* stats keeping */
894 if (!((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
895 (test_bit(n, to_send) && last_iter) ||
896 (test_bit(n, to_fix) && last_iter)))
897 continue;
899 /*
900 ** we get here if:
901 ** 1. page is marked to_send & hasn't already been re-dirtied
902 ** 2. (ignore to_skip in last iteration)
903 ** 3. add in pages that still need fixup (net bufs)
904 */
906 pfn_batch[batch] = n;
907 pfn_type[batch] = live_p2m[n];
909 if(!is_mapped(pfn_type[batch])) {
911 /* not currently in pseudo-physical map -- set bit
912 in to_fix that we must send this page in last_iter
913 unless its sent sooner anyhow */
915 set_bit(n, to_fix);
916 if( (iter > 1) && IS_REAL_PFN(n) )
917 DPRINTF("netbuf race: iter %d, pfn %x. mfn %lx\n",
918 iter, n, pfn_type[batch]);
919 continue;
920 }
922 if(last_iter && test_bit(n, to_fix) && !test_bit(n, to_send)) {
923 needed_to_fix++;
924 DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
925 iter,n,pfn_type[batch]);
926 }
928 clear_bit(n, to_fix);
930 batch++;
931 }
933 if (batch == 0)
934 goto skip; /* vanishingly unlikely... */
936 if ((region_base = xc_map_foreign_batch(
937 xc_handle, dom, PROT_READ, pfn_type, batch)) == 0) {
938 ERR("map batch failed");
939 goto out;
940 }
942 if (xc_get_pfn_type_batch(xc_handle, dom, batch, pfn_type)) {
943 ERR("get_pfn_type_batch failed");
944 goto out;
945 }
947 for (j = 0; j < batch; j++) {
949 if ((pfn_type[j] & LTAB_MASK) == XTAB) {
950 DPRINTF("type fail: page %i mfn %08lx\n", j, pfn_type[j]);
951 continue;
952 }
954 if (debug)
955 fprintf(stderr, "%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
956 " sum= %08lx\n",
957 iter,
958 (pfn_type[j] & LTAB_MASK) | pfn_batch[j],
959 pfn_type[j],
960 live_m2p[pfn_type[j]&(~LTAB_MASK)],
961 csum_page(region_base + (PAGE_SIZE*j)));
963 /* canonicalise mfn->pfn */
964 pfn_type[j] = (pfn_type[j] & LTAB_MASK) | pfn_batch[j];
965 }
967 if(!write_exact(io_fd, &batch, sizeof(unsigned int))) {
968 ERR("Error when writing to state file (2)");
969 goto out;
970 }
972 if(!write_exact(io_fd, pfn_type, sizeof(unsigned long)*j)) {
973 ERR("Error when writing to state file (3)");
974 goto out;
975 }
977 /* entering this loop, pfn_type is now in pfns (Not mfns) */
978 for (j = 0; j < batch; j++) {
980 unsigned long pfn = pfn_type[j] & ~LTAB_MASK;
981 unsigned long pagetype = pfn_type[j] & LTAB_MASK;
982 void *spage = (void *) region_base + (PAGE_SIZE*j);
985 /* write out pages in batch */
986 if (pagetype == XTAB)
987 continue;
989 pagetype &= LTABTYPE_MASK;
991 if (pagetype >= L1TAB && pagetype <= L4TAB) {
993 /* We have a pagetable page: need to rewrite it. */
994 canonicalize_pagetable(pagetype, pfn, spage, page);
996 if (ratewrite(io_fd, page, PAGE_SIZE) != PAGE_SIZE) {
997 ERR("Error when writing to state file (4)");
998 goto out;
999 }
1001 } else {
1003 /* We have a normal page: just write it directly. */
1004 if (ratewrite(io_fd, spage, PAGE_SIZE) != PAGE_SIZE) {
1005 ERR("Error when writing to state file (5)");
1006 goto out;
1009 } /* end of the write out for this batch */
1011 sent_this_iter += batch;
1013 munmap(region_base, batch*PAGE_SIZE);
1015 } /* end of this while loop for this iteration */
1017 skip:
1019 total_sent += sent_this_iter;
1021 DPRINTF("\r %d: sent %d, skipped %d, ",
1022 iter, sent_this_iter, skip_this_iter );
1024 if (last_iter) {
1025 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
1027 DPRINTF("Total pages sent= %ld (%.2fx)\n",
1028 total_sent, ((float)total_sent)/max_pfn );
1029 DPRINTF("(of which %ld were fixups)\n", needed_to_fix );
1032 if (last_iter && debug){
1033 int minusone = -1;
1034 memset( to_send, 0xff, (max_pfn+8)/8 );
1035 debug = 0;
1036 fprintf(stderr, "Entering debug resend-all mode\n");
1038 /* send "-1" to put receiver into debug mode */
1039 if(!write_exact(io_fd, &minusone, sizeof(int))) {
1040 ERR("Error when writing to state file (6)");
1041 goto out;
1044 continue;
1047 if (last_iter) break;
1049 if (live) {
1052 if(
1053 ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
1054 (iter >= max_iters) ||
1055 (sent_this_iter+skip_this_iter < 50) ||
1056 (total_sent > max_pfn*max_factor) ) {
1058 DPRINTF("Start last iteration\n");
1059 last_iter = 1;
1061 if (suspend_and_state(xc_handle, io_fd, dom, &info, &ctxt)) {
1062 ERR("Domain appears not to have suspended");
1063 goto out;
1066 DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
1067 info.shared_info_frame,
1068 (unsigned long)ctxt.user_regs.eip,
1069 (unsigned long)ctxt.user_regs.edx);
1072 if (xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_CLEAN,
1073 to_send, max_pfn, &stats ) != max_pfn) {
1074 ERR("Error flushing shadow PT");
1075 goto out;
1078 sent_last_iter = sent_this_iter;
1080 print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
1085 } /* end of while 1 */
1087 DPRINTF("All memory is saved\n");
1089 /* Zero terminate */
1090 i = 0;
1091 if (!write_exact(io_fd, &i, sizeof(int))) {
1092 ERR("Error when writing to state file (6)");
1093 goto out;
1096 /* Send through a list of all the PFNs that were not in map at the close */
1098 unsigned int i,j;
1099 unsigned long pfntab[1024];
1101 for (i = 0, j = 0; i < max_pfn; i++) {
1102 if (!is_mapped(live_p2m[i]))
1103 j++;
1106 if(!write_exact(io_fd, &j, sizeof(unsigned int))) {
1107 ERR("Error when writing to state file (6a)");
1108 goto out;
1111 for (i = 0, j = 0; i < max_pfn; ) {
1113 if (!is_mapped(live_p2m[i]))
1114 pfntab[j++] = i;
1116 i++;
1117 if (j == 1024 || i == max_pfn) {
1118 if(!write_exact(io_fd, &pfntab, sizeof(unsigned long)*j)) {
1119 ERR("Error when writing to state file (6b)");
1120 goto out;
1122 j = 0;
1128 /* Canonicalise the suspend-record frame number. */
1129 if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) ){
1130 ERR("Suspend record is not in range of pseudophys map");
1131 goto out;
1134 /* Canonicalise each GDT frame number. */
1135 for ( i = 0; i < ctxt.gdt_ents; i += 512 ) {
1136 if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
1137 ERR("GDT frame is not in range of pseudophys map");
1138 goto out;
1142 /* Canonicalise the page table base pointer. */
1143 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.ctrlreg[3] >> PAGE_SHIFT) ) {
1144 ERR("PT base is not in range of pseudophys map");
1145 goto out;
1147 ctxt.ctrlreg[3] = live_m2p[ctxt.ctrlreg[3] >> PAGE_SHIFT] <<
1148 PAGE_SHIFT;
1150 if (!write_exact(io_fd, &ctxt, sizeof(ctxt)) ||
1151 !write_exact(io_fd, live_shinfo, PAGE_SIZE)) {
1152 ERR("Error when writing to state file (1)");
1153 goto out;
1156 /* Success! */
1157 rc = 0;
1159 out:
1161 if (live) {
1162 if(xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_OFF,
1163 NULL, 0, NULL ) < 0) {
1164 DPRINTF("Warning - couldn't disable shadow mode");
1168 if (live_shinfo)
1169 munmap(live_shinfo, PAGE_SIZE);
1171 if (live_p2m_frame_list)
1172 munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
1174 if(live_p2m)
1175 munmap(live_p2m, P2M_SIZE);
1177 if(live_m2p)
1178 munmap(live_m2p, M2P_SIZE(max_mfn));
1180 free(pfn_type);
1181 free(pfn_batch);
1182 free(to_send);
1183 free(to_fix);
1184 free(to_skip);
1186 DPRINTF("Save exit rc=%d\n",rc);
1188 return !!rc;
1191 /*
1192 * Local variables:
1193 * mode: C
1194 * c-set-style: "BSD"
1195 * c-basic-offset: 4
1196 * tab-width: 4
1197 * indent-tabs-mode: nil
1198 * End:
1199 */