ia64/xen-unstable

tools/libxc/xc_linux_save.c @ 13221:62ef527eb19f

Remove 'netbuf race' debug output - we get tons of these for ballooned
down domains, and there's no way for us to distinguish that from the
(transient) case of network buffers.

Also tidy up comment.

Signed-off-by: Steven Hand <steven@xensource.com>
author Steven Hand <steven@xensource.com>
date Fri Dec 29 13:00:08 2006 +0000 (2006-12-29)
parents 3e2d3d737624
children 973e4d233461
/******************************************************************************
 * xc_linux_save.c
 *
 * Save the state of a running Linux session.
 *
 * Copyright (c) 2003, K A Fraser.
 */

#include <inttypes.h>
#include <time.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>

#include "xc_private.h"
#include "xg_private.h"
#include "xg_save_restore.h"

/*
** Default values for important tuning parameters. Can override by passing
** non-zero replacement values to xc_linux_save().
**
** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
**
*/
#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop  */
#define DEF_MAX_FACTOR   3   /* never send more than 3x nr_pfns  */

/* max mfn of the whole machine */
static unsigned long max_mfn;

/* virtual starting address of the hypervisor */
static unsigned long hvirt_start;

/* #levels of page tables used by the current guest */
static unsigned int pt_levels;

/* total number of pages used by the current guest */
static unsigned long max_pfn;

/* Live mapping of the table mapping each PFN to its current MFN. */
static xen_pfn_t *live_p2m = NULL;

/* Live mapping of system MFN to PFN table. */
static xen_pfn_t *live_m2p = NULL;

/* grep fodder: machine_to_phys */

#define mfn_to_pfn(_mfn) live_m2p[(_mfn)]

/*
 * Returns TRUE if the given machine frame number has a unique mapping
 * in the guest's pseudophysical map.
 */
#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \
    (((_mfn) < (max_mfn)) && \
     ((mfn_to_pfn(_mfn) < (max_pfn)) && \
      (live_p2m[mfn_to_pfn(_mfn)] == (_mfn))))

/* Returns TRUE if MFN is successfully converted to a PFN. */
#define translate_mfn_to_pfn(_pmfn)         \
({                                          \
    unsigned long mfn = *(_pmfn);           \
    int _res = 1;                           \
    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )   \
        _res = 0;                           \
    else                                    \
        *(_pmfn) = mfn_to_pfn(mfn);         \
    _res;                                   \
})
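
/*
** Illustrative use (as later in this file): translate a frame number
** in place and treat failure as fatal:
**
**     if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) )
**         goto out;
*/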
/*
** During (live) save/migrate, we maintain a number of bitmaps to track
** which pages we have to send, to fixup, and to skip.
*/

#define BITS_PER_LONG (sizeof(unsigned long) * 8)
#define BITMAP_SIZE   ((max_pfn + BITS_PER_LONG - 1) / 8)

#define BITMAP_ENTRY(_nr,_bmap) \
   ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]

#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
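
/*
** Worked example (illustrative): with 64-bit longs, the bit for pfn 70
** lives in word 70/64 = 1 of the bitmap, at bit position 70%64 = 6
** within that word.
*/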
static inline int test_bit (int nr, volatile void * addr)
{
    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
}

static inline void clear_bit (int nr, volatile void * addr)
{
    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
}

static inline void set_bit ( int nr, volatile void * addr)
{
    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
}

/* Returns the hamming weight (i.e. the number of bits set) in an N-bit word */
static inline unsigned int hweight32(unsigned int w)
{
    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
}

static inline int count_bits ( int nr, volatile void *addr)
{
    int i, count = 0;
    volatile unsigned long *p = (volatile unsigned long *)addr;
    /* We know that the array is padded to unsigned long. */
    for( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
        count += hweight32(*p);
    return count;
}
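
/*
** NB (observation): hweight32() takes an unsigned int, so on an LP64
** build count_bits() only sees the low 32 bits of each long it visits.
*/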
static inline int permute( int i, int nr, int order_nr )
{
    /* Need a simple permutation function so that we scan pages in a
       pseudo random order, enabling us to get a better estimate of
       the domain's page dirtying rate as we go (there are often
       contiguous ranges of pfns that have similar behaviour, and we
       want to mix them up.) */

    /* e.g. nr->order 15->4 16->4 17->5 */
    /* 512MB domain, 128k pages, order 17 */

    /*
      QPONMLKJIHGFEDCBA
             QPONMLKJIH
      GFEDCBA
     */

    /*
      QPONMLKJIHGFEDCBA
                  EDCBA
             QPONM
      LKJIHGF
     */

    do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */

    return i;
}
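
/*
** NB (assumption): the (order_nr-10) shift assumes order_nr >= 10,
** i.e. a domain of at least 1024 pages; a smaller guest would make the
** shift count negative, which is undefined behaviour in C.
*/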
static uint64_t tv_to_us(struct timeval *new)
{
    return (new->tv_sec * 1000000) + new->tv_usec;
}

static uint64_t llgettimeofday(void)
{
    struct timeval now;
    gettimeofday(&now, NULL);
    return tv_to_us(&now);
}

static uint64_t tv_delta(struct timeval *new, struct timeval *old)
{
    return ((new->tv_sec - old->tv_sec)*1000000 ) +
        (new->tv_usec - old->tv_usec);
}
#ifdef ADAPTIVE_SAVE

/*
** We control the rate at which we transmit (or save) to minimize impact
** on running domains (including the target if we're doing live migrate).
*/

#define MAX_MBIT_RATE    500      /* maximum transmit rate for migrate */
#define START_MBIT_RATE  100      /* initial transmit rate for migrate */

/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
#define RATE_TO_BTU      781250

/* Amount in bytes we allow ourselves to send in a burst */
#define BURST_BUDGET (100*1024)

/* We keep track of the current and previous transmission rate */
static int mbit_rate, ombit_rate = 0;

/* Have we reached the maximum transmission rate? */
#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)

static inline void initialize_mbit_rate()
{
    mbit_rate = START_MBIT_RATE;
}
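
/*
** ratewrite() below is a simple token-bucket limiter: each slot of
** burst_time_us microseconds adds BURST_BUDGET bytes of credit, and a
** write that overdraws the budget nanosleep()s until the next slot.
** burst_time_us is derived as RATE_TO_BTU / mbit_rate, i.e. the time
** one BURST_BUDGET-sized burst takes at the current rate.
*/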
static int ratewrite(int io_fd, void *buf, int n)
{
    static int budget = 0;
    static int burst_time_us = -1;
    static struct timeval last_put = { 0 };
    struct timeval now;
    struct timespec delay;
    long long delta;

    if (START_MBIT_RATE == 0)
        return write(io_fd, buf, n);

    budget -= n;
    if (budget < 0) {
        if (mbit_rate != ombit_rate) {
            burst_time_us = RATE_TO_BTU / mbit_rate;
            ombit_rate = mbit_rate;
            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
                    mbit_rate, BURST_BUDGET, burst_time_us);
        }
        if (last_put.tv_sec == 0) {
            budget += BURST_BUDGET;
            gettimeofday(&last_put, NULL);
        } else {
            while (budget < 0) {
                gettimeofday(&now, NULL);
                delta = tv_delta(&now, &last_put);
                while (delta > burst_time_us) {
                    budget += BURST_BUDGET;
                    last_put.tv_usec += burst_time_us;
                    if (last_put.tv_usec > 1000000) {
                        last_put.tv_usec -= 1000000;
                        last_put.tv_sec++;
                    }
                    delta -= burst_time_us;
                }
                if (budget > 0)
                    break;
                delay.tv_sec = 0;
                delay.tv_nsec = 1000 * (burst_time_us - delta);
                while (delay.tv_nsec > 0)
                    if (nanosleep(&delay, &delay) == 0)
                        break;
            }
        }
    }
    return write(io_fd, buf, n);
}
#else /* ! ADAPTIVE SAVE */

#define RATE_IS_MAX() (0)
#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
#define initialize_mbit_rate()

#endif
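
/*
** NB (observation): write_exact() does not retry short writes -- any
** write() that returns less than count is reported as failure (0).
*/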
static inline ssize_t write_exact(int fd, void *buf, size_t count)
{
    if(write(fd, buf, count) != count)
        return 0;
    return 1;
}
static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
                       xc_shadow_op_stats_t *stats, int print)
{
    static struct timeval wall_last;
    static long long      d0_cpu_last;
    static long long      d1_cpu_last;

    struct timeval        wall_now;
    long long             wall_delta;
    long long             d0_cpu_now, d0_cpu_delta;
    long long             d1_cpu_now, d1_cpu_delta;

    gettimeofday(&wall_now, NULL);

    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;

    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
        DPRINTF("ARRHHH!!\n");

    wall_delta = tv_delta(&wall_now,&wall_last)/1000;

    if (wall_delta == 0) wall_delta = 1;

    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;

    if (print)
        DPRINTF(
            "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
            "dirtied %dMb/s %" PRId32 " pages\n",
            wall_delta,
            (int)((d0_cpu_delta*100)/wall_delta),
            (int)((d1_cpu_delta*100)/wall_delta),
            (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
            (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
            stats->dirty_count);

#ifdef ADAPTIVE_SAVE
    if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
        mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
            + 50;
        if (mbit_rate > MAX_MBIT_RATE)
            mbit_rate = MAX_MBIT_RATE;
    }
#endif

    d0_cpu_last = d0_cpu_now;
    d1_cpu_last = d1_cpu_now;
    wall_last   = wall_now;

    return 0;
}
static int analysis_phase(int xc_handle, uint32_t domid, int max_pfn,
                          unsigned long *arr, int runs)
{
    long long start, now;
    xc_shadow_op_stats_t stats;
    int j;

    start = llgettimeofday();

    for (j = 0; j < runs; j++) {
        int i;

        xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
                          arr, max_pfn, NULL, 0, NULL);
        DPRINTF("#Flush\n");
        for ( i = 0; i < 40; i++ ) {
            usleep(50000);
            now = llgettimeofday();
            xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
                              NULL, 0, NULL, 0, &stats);

            DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
                    ((now-start)+500)/1000,
                    stats.fault_count, stats.dirty_count);
        }
    }

    return -1;
}
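
/*
** NB (observation): analysis_phase() is a debugging aid; xc_linux_save()
** below calls it with runs == 0, so the sampling loop never executes in
** normal operation.
*/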
static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
                             int dom, xc_dominfo_t *info,
                             vcpu_guest_context_t *ctxt)
{
    int i = 0;

    if (!(*suspend)(dom)) {
        ERROR("Suspend request failed");
        return -1;
    }

 retry:

    if (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) {
        ERROR("Could not get domain info");
        return -1;
    }

    if ( xc_vcpu_getcontext(xc_handle, dom, 0 /* XXX */, ctxt))
        ERROR("Could not get vcpu context");

    if (info->shutdown && info->shutdown_reason == SHUTDOWN_suspend)
        return 0; // success

    if (info->paused) {
        // try unpausing domain, wait, and retest
        xc_domain_unpause( xc_handle, dom );

        ERROR("Domain was paused. Wait and re-test.");
        usleep(10000); // 10ms

        goto retry;
    }

    if( ++i < 100 ) {
        ERROR("Retry suspend domain.");
        usleep(10000); // 10ms
        goto retry;
    }

    ERROR("Unable to suspend domain.");

    return -1;
}
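
/*
** NB (observation): the retry counter above is only incremented on the
** final path, so a domain that stays merely paused is re-tested
** indefinitely, while other non-suspended states are retried for
** roughly a second (100 x 10ms) before giving up.
*/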
/*
** During transfer (or in the state file), all page-table pages must be
** converted into a 'canonical' form where references to actual mfns
** are replaced with references to the corresponding pfns.
**
** This function performs the appropriate conversion, taking into account
** which entries do not require canonicalization (in particular, those
** entries which map the virtual address reserved for the hypervisor).
*/
static int canonicalize_pagetable(unsigned long type, unsigned long pfn,
                                  const void *spage, void *dpage)
{

    int i, pte_last, xen_start, xen_end, race = 0;
    uint64_t pte;

    /*
    ** We need to determine which entries in this page table hold
    ** reserved hypervisor mappings. This depends on the current
    ** page table type as well as the number of paging levels.
    */
    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8);

    if (pt_levels == 2 && type == XEN_DOMCTL_PFINFO_L2TAB)
        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);

    if (pt_levels == 3 && type == XEN_DOMCTL_PFINFO_L3TAB)
        xen_start = L3_PAGETABLE_ENTRIES_PAE;

    /*
    ** in PAE only the L2 mapping the top 1GB contains Xen mappings.
    ** We can spot this by looking for the guest linear mapping which
    ** Xen always ensures is present in that L2. Guests must ensure
    ** that this check will fail for other L2s.
    */
    if (pt_levels == 3 && type == XEN_DOMCTL_PFINFO_L2TAB) {

        /* XXX index of the L2 entry in PAE mode which holds the guest LPT */
#define PAE_GLPT_L2ENTRY (495)
        pte = ((const uint64_t*)spage)[PAE_GLPT_L2ENTRY];

        if(((pte >> PAGE_SHIFT) & 0x0fffffff) == live_p2m[pfn])
            xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
    }

    if (pt_levels == 4 && type == XEN_DOMCTL_PFINFO_L4TAB) {
        /*
        ** XXX SMH: should compute these from hvirt_start (which we have)
        ** and hvirt_end (which we don't)
        */
        xen_start = 256;
        xen_end   = 272;
    }

    /* Now iterate through the page table, canonicalizing each PTE */
    for (i = 0; i < pte_last; i++ ) {

        unsigned long pfn, mfn;

        if (pt_levels == 2)
            pte = ((const uint32_t*)spage)[i];
        else
            pte = ((const uint64_t*)spage)[i];

        if (i >= xen_start && i < xen_end)
            pte = 0;

        if (pte & _PAGE_PRESENT) {

            mfn = (pte >> PAGE_SHIFT) & 0xfffffff;
            if (!MFN_IS_IN_PSEUDOPHYS_MAP(mfn)) {
                /* This will happen if the type info is stale which
                   is quite feasible under live migration */
                DPRINTF("PT Race: [%08lx,%d] pte=%llx, mfn=%08lx\n",
                        type, i, (unsigned long long)pte, mfn);
                pfn  = 0;  /* zap it - we'll retransmit this page later */
                race = 1;  /* inform the caller of race; fatal if !live */
            } else
                pfn = mfn_to_pfn(mfn);

            pte &= 0xffffff0000000fffULL;
            pte |= (uint64_t)pfn << PAGE_SHIFT;
        }

        if (pt_levels == 2)
            ((uint32_t*)dpage)[i] = pte;
        else
            ((uint64_t*)dpage)[i] = pte;

    }

    return race;
}
static xen_pfn_t *xc_map_m2p(int xc_handle,
                             unsigned long max_mfn,
                             int prot)
{
    struct xen_machphys_mfn_list xmml;
    privcmd_mmap_entry_t *entries;
    unsigned long m2p_chunks, m2p_size;
    xen_pfn_t *m2p;
    xen_pfn_t *extent_start;
    int i, rc;

    m2p_size   = M2P_SIZE(max_mfn);
    m2p_chunks = M2P_CHUNKS(max_mfn);

    xmml.max_extents = m2p_chunks;
    if (!(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t)))) {
        ERROR("failed to allocate space for m2p mfns");
        return NULL;
    }
    set_xen_guest_handle(xmml.extent_start, extent_start);

    if (xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
        (xmml.nr_extents != m2p_chunks)) {
        ERROR("xc_get_m2p_mfns");
        return NULL;
    }

    if ((m2p = mmap(NULL, m2p_size, prot,
                    MAP_SHARED, xc_handle, 0)) == MAP_FAILED) {
        ERROR("failed to mmap m2p");
        return NULL;
    }

    if (!(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t)))) {
        ERROR("failed to allocate space for mmap entries");
        return NULL;
    }

    for (i=0; i < m2p_chunks; i++) {
        entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
        entries[i].mfn = extent_start[i];
        entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
    }

    if ((rc = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
                                    entries, m2p_chunks)) < 0) {
        ERROR("xc_mmap_foreign_ranges failed (rc = %d)", rc);
        return NULL;
    }

    free(extent_start);
    free(entries);

    return m2p;
}
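
/*
** NB (observation): the error paths above return without freeing
** extent_start/entries (or unmapping m2p); the caller below treats a
** NULL return as fatal, so nothing is ever reused after failure.
*/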
int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
                  uint32_t max_factor, uint32_t flags, int (*suspend)(int))
{
    xc_dominfo_t info;

    int rc = 1, i, j, last_iter, iter = 0;
    int live  = (flags & XCFLAGS_LIVE);
    int debug = (flags & XCFLAGS_DEBUG);
    int race = 0, sent_last_iter, skip_this_iter;

    /* The new domain's shared-info frame number. */
    unsigned long shared_info_frame;

    /* A copy of the CPU context of the guest. */
    vcpu_guest_context_t ctxt;

    /* A table containing the type of each PFN (/not/ MFN!). */
    unsigned long *pfn_type = NULL;
    unsigned long *pfn_batch = NULL;

    /* A temporary mapping, and a copy, of one frame of guest memory. */
    char page[PAGE_SIZE];

    /* Double and single indirect references to the live P2M table */
    xen_pfn_t *live_p2m_frame_list_list = NULL;
    xen_pfn_t *live_p2m_frame_list = NULL;

    /* A copy of the pfn-to-mfn table frame list. */
    xen_pfn_t *p2m_frame_list = NULL;

    /* Live mapping of shared info structure */
    shared_info_t *live_shinfo = NULL;

    /* base of the region in which domain memory is mapped */
    unsigned char *region_base = NULL;

    /* power of 2 order of max_pfn */
    int order_nr;

    /* bitmap of pages:
       - that should be sent this iteration (unless later marked as skip);
       - to skip this iteration because already dirty;
       - to fixup by sending at the end if not already resent; */
    unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;

    xc_shadow_op_stats_t stats;

    unsigned long needed_to_fix = 0;
    unsigned long total_sent    = 0;

    /* If no explicit control parameters given, use defaults */
    if(!max_iters)
        max_iters = DEF_MAX_ITERS;
    if(!max_factor)
        max_factor = DEF_MAX_FACTOR;

    initialize_mbit_rate();

    if(!get_platform_info(xc_handle, dom,
                          &max_mfn, &hvirt_start, &pt_levels)) {
        ERROR("Unable to get platform info.");
        return 1;
    }

    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
        ERROR("Could not get domain info");
        return 1;
    }

    if (lock_pages(&ctxt, sizeof(ctxt))) {
        ERROR("Unable to lock ctxt");
        return 1;
    }

    /* Only have to worry about vcpu 0 even for SMP */
    if (xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt)) {
        ERROR("Could not get vcpu context");
        goto out;
    }
    shared_info_frame = info.shared_info_frame;

    /* A cheesy test to see whether the domain contains valid state. */
    if (ctxt.ctrlreg[3] == 0)
    {
        ERROR("Domain is not in a valid Linux guest OS state");
        goto out;
    }

    /* cheesy sanity check */
    if ((info.max_memkb >> (PAGE_SHIFT - 10)) > max_mfn) {
        ERROR("Invalid state record -- pfn count out of range: %lu",
              (info.max_memkb >> (PAGE_SHIFT - 10)));
        goto out;
    }

    /* Map the shared info frame */
    if(!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
                                            PROT_READ, shared_info_frame))) {
        ERROR("Couldn't map live_shinfo");
        goto out;
    }

    max_pfn = live_shinfo->arch.max_pfn;

    live_p2m_frame_list_list =
        xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
                             live_shinfo->arch.pfn_to_mfn_frame_list_list);

    if (!live_p2m_frame_list_list) {
        ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);
        goto out;
    }

    live_p2m_frame_list =
        xc_map_foreign_batch(xc_handle, dom, PROT_READ,
                             live_p2m_frame_list_list,
                             P2M_FLL_ENTRIES);

    if (!live_p2m_frame_list) {
        ERROR("Couldn't map p2m_frame_list");
        goto out;
    }

    /* Map all the frames of the pfn->mfn table. For migrate to succeed,
       the guest must not change which frames are used for this purpose.
       (It's not clear why it would want to change them, and we'll be OK
       from a safety POV anyhow.) */

    live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
                                    live_p2m_frame_list,
                                    P2M_FL_ENTRIES);

    if (!live_p2m) {
        ERROR("Couldn't map p2m table");
        goto out;
    }

    /* Setup the mfn_to_pfn table mapping */
    if(!(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ))) {
        ERROR("Failed to map live M2P table");
        goto out;
    }

    /* Get a local copy of the live_P2M_frame_list */
    if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
        ERROR("Couldn't allocate p2m_frame_list array");
        goto out;
    }
    memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);

    /* Canonicalise the pfn-to-mfn table frame-number list. */
    for (i = 0; i < max_pfn; i += fpp) {
        if (!translate_mfn_to_pfn(&p2m_frame_list[i/fpp])) {
            ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
            ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64, i, i/fpp,
                  (uint64_t)p2m_frame_list[i/fpp]);
            goto out;
        }
    }
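
    /*
    ** (fpp, from xg_save_restore.h, is the number of p2m entries that
    ** fit in one frame, so the loop above translates one frame-list
    ** entry per fpp pfns.)
    */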
    /* Domain is still running at this point */
    if (live) {

        if (xc_shadow_control(xc_handle, dom,
                              XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
                              NULL, 0, NULL, 0, NULL) < 0) {
            ERROR("Couldn't enable shadow mode");
            goto out;
        }

        last_iter = 0;

    } else {

        /* This is a non-live suspend. Issue the call back to get the
           domain suspended */

        last_iter = 1;

        if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt)) {
            ERROR("Domain appears not to have suspended");
            goto out;
        }

    }

    /* pretend we sent all the pages last iteration */
    sent_last_iter = max_pfn;

    /* calculate the power of 2 order of max_pfn, e.g.
       15->4 16->4 17->5 */
    for (i = max_pfn-1, order_nr = 0; i ; i >>= 1, order_nr++)
        continue;

    /* Setup to_send / to_fix and to_skip bitmaps */
    to_send = malloc(BITMAP_SIZE);
    to_fix  = calloc(1, BITMAP_SIZE);
    to_skip = malloc(BITMAP_SIZE);

    if (!to_send || !to_fix || !to_skip) {
        ERROR("Couldn't allocate to_send array");
        goto out;
    }

    memset(to_send, 0xff, BITMAP_SIZE);

    if (lock_pages(to_send, BITMAP_SIZE)) {
        ERROR("Unable to lock to_send");
        return 1;
    }

    /* (to fix is local only) */
    if (lock_pages(to_skip, BITMAP_SIZE)) {
        ERROR("Unable to lock to_skip");
        return 1;
    }

    analysis_phase(xc_handle, dom, max_pfn, to_skip, 0);

    /* We want zeroed memory so use calloc rather than malloc. */
    pfn_type  = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type));
    pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));

    if ((pfn_type == NULL) || (pfn_batch == NULL)) {
        ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
        errno = ENOMEM;
        goto out;
    }

    if (lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type))) {
        ERROR("Unable to lock");
        goto out;
    }

    /*
     * Quick belt and braces sanity check.
     */
    {
        int err=0;
        unsigned long mfn;
        for (i = 0; i < max_pfn; i++) {

            mfn = live_p2m[i];
            if((mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i)) {
                DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
                        mfn, mfn_to_pfn(mfn));
                err++;
            }
        }
        DPRINTF("Had %d unexplained entries in p2m table\n", err);
    }

    /* Start writing out the saved-domain record. */

    if (!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) {
        ERROR("write: max_pfn");
        goto out;
    }

    /*
     * Write an extended-info structure to inform the restore code that
     * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
     * slow paths in the restore code.
     */
    if ((pt_levels == 3) &&
        (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3))) {
        unsigned long signature = ~0UL;
        uint32_t tot_sz   = sizeof(struct vcpu_guest_context) + 8;
        uint32_t chunk_sz = sizeof(struct vcpu_guest_context);
        char chunk_sig[]  = "vcpu";
        if (!write_exact(io_fd, &signature, sizeof(signature)) ||
            !write_exact(io_fd, &tot_sz,    sizeof(tot_sz)) ||
            !write_exact(io_fd, &chunk_sig, 4) ||
            !write_exact(io_fd, &chunk_sz,  sizeof(chunk_sz)) ||
            !write_exact(io_fd, &ctxt,      sizeof(ctxt))) {
            ERROR("write: extended info");
            goto out;
        }
    }
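
    /*
    ** (The all-ones signature word is distinguishable from any real
    ** p2m frame-list entry, which is how the restore side detects the
    ** presence of this optional chunk.)
    */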
    if (!write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) {
        ERROR("write: p2m_frame_list");
        goto out;
    }

    print_stats(xc_handle, dom, 0, &stats, 0);

    /* Now write out each data page, canonicalising page tables as we go... */

    while(1) {

        unsigned int prev_pc, sent_this_iter, N, batch;

        iter++;
        sent_this_iter = 0;
        skip_this_iter = 0;
        prev_pc = 0;
        N=0;

        DPRINTF("Saving memory pages: iter %d   0%%", iter);

        while( N < max_pfn ){

            unsigned int this_pc = (N * 100) / max_pfn;

            if ((this_pc - prev_pc) >= 5) {
                DPRINTF("\b\b\b\b%3d%%", this_pc);
                prev_pc = this_pc;
            }

            /* slightly wasteful to peek the whole array every time,
               but this is fast enough for the moment. */
            if (!last_iter && xc_shadow_control(
                    xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK,
                    to_skip, max_pfn, NULL, 0, NULL) != max_pfn) {
                ERROR("Error peeking shadow bitmap");
                goto out;
            }

            /* load pfn_type[] with the mfn of all the pages we're doing in
               this batch. */
            for (batch = 0; batch < MAX_BATCH_SIZE && N < max_pfn ; N++) {

                int n = permute(N, max_pfn, order_nr);

                if (debug) {
                    DPRINTF("%d pfn= %08lx mfn= %08lx %d [mfn]= %08lx\n",
                            iter, (unsigned long)n, live_p2m[n],
                            test_bit(n, to_send),
                            mfn_to_pfn(live_p2m[n]&0xFFFFF));
                }

                if (!last_iter && test_bit(n, to_send) && test_bit(n, to_skip))
                    skip_this_iter++; /* stats keeping */

                if (!((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
                      (test_bit(n, to_send) && last_iter) ||
                      (test_bit(n, to_fix)  && last_iter)))
                    continue;

                /*
                ** we get here if:
                **  1. page is marked to_send & hasn't already been re-dirtied
                **  2. (ignore to_skip in last iteration)
                **  3. add in pages that still need fixup (net bufs)
                */

                pfn_batch[batch] = n;
                pfn_type[batch]  = live_p2m[n];

                if(!is_mapped(pfn_type[batch])) {

                    /*
                    ** not currently in pseudo-physical map -- set bit
                    ** in to_fix since we must send this page in last_iter
                    ** unless it's sent sooner anyhow, or it never enters
                    ** pseudo-physical map (e.g. for ballooned down domains)
                    */

                    set_bit(n, to_fix);
                    continue;
                }

                if(last_iter && test_bit(n, to_fix) && !test_bit(n, to_send)) {
                    needed_to_fix++;
                    DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
                            iter, n, pfn_type[batch]);
                }

                clear_bit(n, to_fix);

                batch++;
            }

            if (batch == 0)
                goto skip; /* vanishingly unlikely... */
            if ((region_base = xc_map_foreign_batch(
                     xc_handle, dom, PROT_READ, pfn_type, batch)) == 0) {
                ERROR("map batch failed");
                goto out;
            }

            if (xc_get_pfn_type_batch(xc_handle, dom, batch, pfn_type)) {
                ERROR("get_pfn_type_batch failed");
                goto out;
            }

            for ( j = 0; j < batch; j++ )
            {

                if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
                     XEN_DOMCTL_PFINFO_XTAB )
                {
                    DPRINTF("type fail: page %i mfn %08lx\n", j, pfn_type[j]);
                    continue;
                }

                if (debug)
                    DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
                            " sum= %08lx\n",
                            iter,
                            (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
                            pfn_batch[j],
                            pfn_type[j],
                            mfn_to_pfn(pfn_type[j] &
                                       ~XEN_DOMCTL_PFINFO_LTAB_MASK),
                            csum_page(region_base + (PAGE_SIZE*j)));

                /* canonicalise mfn->pfn */
                pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
                    pfn_batch[j];
            }

            if(!write_exact(io_fd, &batch, sizeof(unsigned int))) {
                ERROR("Error when writing to state file (2) (errno %d)",
                      errno);
                goto out;
            }

            if(!write_exact(io_fd, pfn_type, sizeof(unsigned long)*j)) {
                ERROR("Error when writing to state file (3) (errno %d)",
                      errno);
                goto out;
            }

            /* entering this loop, pfn_type is now in pfns (Not mfns) */
            for ( j = 0; j < batch; j++ )
            {
                unsigned long pfn, pagetype;
                void *spage = (char *)region_base + (PAGE_SIZE*j);

                pfn      = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
                pagetype = pfn_type[j] &  XEN_DOMCTL_PFINFO_LTAB_MASK;

                /* write out pages in batch */
                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
                    continue;

                pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;

                if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
                     (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
                {
                    /* We have a pagetable page: need to rewrite it. */
                    race =
                        canonicalize_pagetable(pagetype, pfn, spage, page);

                    if(race && !live)
                        goto out;

                    if (ratewrite(io_fd, page, PAGE_SIZE) != PAGE_SIZE) {
                        ERROR("Error when writing to state file (4)"
                              " (errno %d)", errno);
                        goto out;
                    }

                } else {

                    /* We have a normal page: just write it directly. */
                    if (ratewrite(io_fd, spage, PAGE_SIZE) != PAGE_SIZE) {
                        ERROR("Error when writing to state file (5)"
                              " (errno %d)", errno);
                        goto out;
                    }
                }
            } /* end of the write out for this batch */

            sent_this_iter += batch;

            munmap(region_base, batch*PAGE_SIZE);

        } /* end of this while loop for this iteration */
  skip:

        total_sent += sent_this_iter;

        DPRINTF("\r %d: sent %d, skipped %d, ",
                iter, sent_this_iter, skip_this_iter );

        if (last_iter) {
            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);

            DPRINTF("Total pages sent= %ld (%.2fx)\n",
                    total_sent, ((float)total_sent)/max_pfn );
            DPRINTF("(of which %ld were fixups)\n", needed_to_fix  );
        }

        if (last_iter && debug) {
            int minusone = -1;
            memset(to_send, 0xff, BITMAP_SIZE);
            debug = 0;
            DPRINTF("Entering debug resend-all mode\n");

            /* send "-1" to put receiver into debug mode */
            if(!write_exact(io_fd, &minusone, sizeof(int))) {
                ERROR("Error when writing to state file (6) (errno %d)",
                      errno);
                goto out;
            }

            continue;
        }

        if (last_iter)
            break;

        if (live) {
            if (((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
                (iter >= max_iters) ||
                (sent_this_iter+skip_this_iter < 50) ||
                (total_sent > max_pfn*max_factor)) {

                DPRINTF("Start last iteration\n");
                last_iter = 1;

                if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
                                      &ctxt)) {
                    ERROR("Domain appears not to have suspended");
                    goto out;
                }

                DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
                        info.shared_info_frame,
                        (unsigned long)ctxt.user_regs.eip,
                        (unsigned long)ctxt.user_regs.edx);
            }

            if (xc_shadow_control(xc_handle, dom,
                                  XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
                                  max_pfn, NULL, 0, &stats) != max_pfn) {
                ERROR("Error flushing shadow PT");
                goto out;
            }

            sent_last_iter = sent_this_iter;

            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);

        }
    } /* end of while 1 */
    DPRINTF("All memory is saved\n");

    /* Zero terminate */
    i = 0;
    if (!write_exact(io_fd, &i, sizeof(int))) {
        ERROR("Error when writing to state file (6') (errno %d)", errno);
        goto out;
    }

    /* Send through a list of all the PFNs that were not in map at the close */
    {
        unsigned int i,j;
        unsigned long pfntab[1024];

        for (i = 0, j = 0; i < max_pfn; i++) {
            if (!is_mapped(live_p2m[i]))
                j++;
        }

        if(!write_exact(io_fd, &j, sizeof(unsigned int))) {
            ERROR("Error when writing to state file (6a) (errno %d)", errno);
            goto out;
        }

        for (i = 0, j = 0; i < max_pfn; ) {

            if (!is_mapped(live_p2m[i]))
                pfntab[j++] = i;

            i++;
            if (j == 1024 || i == max_pfn) {
                if(!write_exact(io_fd, &pfntab, sizeof(unsigned long)*j)) {
                    ERROR("Error when writing to state file (6b) (errno %d)",
                          errno);
                    goto out;
                }
                j = 0;
            }
        }
    }

    /* Canonicalise the suspend-record frame number. */
    if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) ) {
        ERROR("Suspend record is not in range of pseudophys map");
        goto out;
    }

    /* Canonicalise each GDT frame number. */
    for ( i = 0; (512*i) < ctxt.gdt_ents; i++ ) {
        if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
            ERROR("GDT frame is not in range of pseudophys map");
            goto out;
        }
    }

    /* Canonicalise the page table base pointer. */
    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[3])) ) {
        ERROR("PT base is not in range of pseudophys map");
        goto out;
    }
    ctxt.ctrlreg[3] =
        xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[3])));

    if (!write_exact(io_fd, &ctxt, sizeof(ctxt)) ||
        !write_exact(io_fd, live_shinfo, PAGE_SIZE)) {
        ERROR("Error when writing to state file (1) (errno %d)", errno);
        goto out;
    }
    /* Success! */
    rc = 0;

 out:

    if (live) {
        if(xc_shadow_control(xc_handle, dom,
                             XEN_DOMCTL_SHADOW_OP_OFF,
                             NULL, 0, NULL, 0, NULL) < 0) {
            DPRINTF("Warning - couldn't disable shadow mode");
        }
    }

    if (live_shinfo)
        munmap(live_shinfo, PAGE_SIZE);

    if (live_p2m_frame_list_list)
        munmap(live_p2m_frame_list_list, PAGE_SIZE);

    if (live_p2m_frame_list)
        munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);

    if(live_p2m)
        munmap(live_p2m, P2M_SIZE);

    if(live_m2p)
        munmap(live_m2p, M2P_SIZE(max_mfn));

    free(pfn_type);
    free(pfn_batch);
    free(to_send);
    free(to_fix);
    free(to_skip);

    DPRINTF("Save exit rc=%d\n",rc);

    return !!rc;
}
/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */