ia64/xen-unstable

view tools/libxc/xc_linux_save.c @ 14114:59b8d5168cc1

Reduce impact of saving/restoring/dumping large domains on Dom0 memory
usage by means of fadvise64() to tell the OS to discard the cache
pages used for the save/dump file.

Signed-off-by: Simon Graham <Simon.Graham@stratus.com>
author Keir Fraser <keir@xensource.com>
date Sat Feb 24 14:48:17 2007 +0000 (2007-02-24)
parents f61992cb82fe
children dd8c88744433
/******************************************************************************
 * xc_linux_save.c
 *
 * Save the state of a running Linux session.
 *
 * Copyright (c) 2003, K A Fraser.
 */
#include <inttypes.h>
#include <time.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>

#include "xc_private.h"
#include "xc_dom.h"
#include "xg_private.h"
#include "xg_save_restore.h"
/*
** Default values for important tuning parameters. Can override by passing
** non-zero replacement values to xc_linux_save().
**
** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
**
*/
#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop   */
#define DEF_MAX_FACTOR   3   /* never send more than 3x nr_pfns   */
/* max mfn of the whole machine */
static unsigned long max_mfn;

/* virtual starting address of the hypervisor */
static unsigned long hvirt_start;

/* #levels of page tables used by the current guest */
static unsigned int pt_levels;

/* total number of pages used by the current guest */
static unsigned long max_pfn;

/* Live mapping of the table mapping each PFN to its current MFN. */
static xen_pfn_t *live_p2m = NULL;

/* Live mapping of system MFN to PFN table. */
static xen_pfn_t *live_m2p = NULL;
static unsigned long m2p_mfn0;

/* grep fodder: machine_to_phys */

#define mfn_to_pfn(_mfn) live_m2p[(_mfn)]
/*
 * Returns TRUE if the given machine frame number has a unique mapping
 * in the guest's pseudophysical map.
 */
#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)          \
    (((_mfn) < (max_mfn)) &&                    \
     ((mfn_to_pfn(_mfn) < (max_pfn)) &&         \
      (live_p2m[mfn_to_pfn(_mfn)] == (_mfn))))

/* Returns TRUE if MFN is successfully converted to a PFN. */
#define translate_mfn_to_pfn(_pmfn)             \
({                                              \
    unsigned long mfn = *(_pmfn);               \
    int _res = 1;                               \
    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )       \
        _res = 0;                               \
    else                                        \
        *(_pmfn) = mfn_to_pfn(mfn);             \
    _res;                                       \
})
/*
** During (live) save/migrate, we maintain a number of bitmaps to track
** which pages we have to send, to fixup, and to skip.
*/

#define BITS_PER_LONG (sizeof(unsigned long) * 8)
#define BITMAP_SIZE   ((max_pfn + BITS_PER_LONG - 1) / 8)

#define BITMAP_ENTRY(_nr,_bmap) \
   ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]

#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
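
/*
** NB: BITMAP_SIZE is a byte count: max_pfn bits, rounded up so that every
** bitmap occupies a whole number of unsigned longs (count_bits() below
** relies on this padding).
*/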
static inline int test_bit (int nr, volatile void * addr)
{
    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
}

static inline void clear_bit (int nr, volatile void * addr)
{
    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
}

static inline void set_bit ( int nr, volatile void * addr)
{
    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
}
/* Returns the hamming weight (i.e. the number of bits set) in an N-bit word */
static inline unsigned int hweight32(unsigned int w)
{
    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
}
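
/*
** Note: count_bits() below passes an unsigned long to hweight32(), so on
** 64-bit builds only the low 32 bits of each word contribute to the count.
*/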
static inline int count_bits ( int nr, volatile void *addr)
{
    int i, count = 0;
    volatile unsigned long *p = (volatile unsigned long *)addr;
    /* We know that the array is padded to unsigned long. */
    for( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
        count += hweight32(*p);
    return count;
}
static inline int permute( int i, int nr, int order_nr )
{
    /* Need a simple permutation function so that we scan pages in a
       pseudo random order, enabling us to get a better estimate of
       the domain's page dirtying rate as we go (there are often
       contiguous ranges of pfns that have similar behaviour, and we
       want to mix them up). */

    /* e.g. nr->order 15->4 16->4 17->5 */
    /* 512MB domain, 128k pages, order 17 */

    /*
      QPONMLKJIHGFEDCBA
             QPONMLKJIH
      GFEDCBA
    */

    /*
      QPONMLKJIHGFEDCBA
                  EDCBA
            QPONM
      LKJIHGF
    */
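
    /*
    ** Each step rotates the low order_nr bits of i left by 10 (e.g. with
    ** order_nr = 17 the top 7 bits wrap round to the bottom). A rotation
    ** is a bijection on [0, 2^order_nr), so re-applying it until the
    ** result drops below nr visits every pfn exactly once. (Note this
    ** assumes order_nr >= 10.)
    */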
    do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */

    return i;
}
static uint64_t tv_to_us(struct timeval *new)
{
    return (new->tv_sec * 1000000) + new->tv_usec;
}

static uint64_t llgettimeofday(void)
{
    struct timeval now;
    gettimeofday(&now, NULL);
    return tv_to_us(&now);
}

static uint64_t tv_delta(struct timeval *new, struct timeval *old)
{
    return ((new->tv_sec - old->tv_sec)*1000000 ) +
        (new->tv_usec - old->tv_usec);
}
static int noncached_write(int fd, int live, void *buffer, int len)
{
    static int write_count = 0;

    int rc = write(fd,buffer,len);

    if (!live) {
        write_count += len;

        if (write_count >= MAX_PAGECACHE_USAGE*PAGE_SIZE) {
            int serrno = errno;

            /* Time to discard cache - don't care if this fails */
            discard_file_cache(fd, 0 /* no flush */);

            write_count = 0;

            errno = serrno;
        }
    }

    return rc;
}
#ifdef ADAPTIVE_SAVE

/*
** We control the rate at which we transmit (or save) to minimize impact
** on running domains (including the target if we're doing live migrate).
*/

#define MAX_MBIT_RATE    500      /* maximum transmit rate for migrate */
#define START_MBIT_RATE  100      /* initial transmit rate for migrate */

/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
#define RATE_TO_BTU      781250

/* Amount in bytes we allow ourselves to send in a burst */
#define BURST_BUDGET     (100*1024)

/* We keep track of the current and previous transmission rate */
static int mbit_rate, ombit_rate = 0;

/* Have we reached the maximum transmission rate? */
#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)

static inline void initialize_mbit_rate()
{
    mbit_rate = START_MBIT_RATE;
}
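
/*
** ratewrite() is a simple token-bucket limiter: each slot of burst_time_us
** microseconds earns BURST_BUDGET bytes of credit, and we sleep whenever
** the budget goes negative. BURST_BUDGET bytes are 0.78125 Mbit, so at a
** rate of R Mbit/s one burst slot lasts RATE_TO_BTU/R = 781250/R usecs.
*/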
static int ratewrite(int io_fd, int live, void *buf, int n)
{
    static int budget = 0;
    static int burst_time_us = -1;
    static struct timeval last_put = { 0 };
    struct timeval now;
    struct timespec delay;
    long long delta;

    if (START_MBIT_RATE == 0)
        return noncached_write(io_fd, live, buf, n);

    budget -= n;
    if (budget < 0) {
        if (mbit_rate != ombit_rate) {
            burst_time_us = RATE_TO_BTU / mbit_rate;
            ombit_rate = mbit_rate;
            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
                    mbit_rate, BURST_BUDGET, burst_time_us);
        }
        if (last_put.tv_sec == 0) {
            budget += BURST_BUDGET;
            gettimeofday(&last_put, NULL);
        } else {
            while (budget < 0) {
                gettimeofday(&now, NULL);
                delta = tv_delta(&now, &last_put);
                while (delta > burst_time_us) {
                    budget += BURST_BUDGET;
                    last_put.tv_usec += burst_time_us;
                    if (last_put.tv_usec > 1000000) {
                        last_put.tv_usec -= 1000000;
                        last_put.tv_sec++;
                    }
                    delta -= burst_time_us;
                }
                if (budget > 0)
                    break;
                delay.tv_sec = 0;
                delay.tv_nsec = 1000 * (burst_time_us - delta);
                while (delay.tv_nsec > 0)
                    if (nanosleep(&delay, &delay) == 0)
                        break;
            }
        }
    }
    return noncached_write(io_fd, live, buf, n);
}
#else /* ! ADAPTIVE SAVE */

#define RATE_IS_MAX() (0)
#define ratewrite(_io_fd, _live, _buf, _n) noncached_write((_io_fd), (_live), (_buf), (_n))
#define initialize_mbit_rate()

#endif
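
/*
** NB: unlike write(2), write_exact() returns 1 on success and 0 on any
** error or short write, hence the "if (!write_exact(...))" idiom below.
*/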
static inline ssize_t write_exact(int fd, void *buf, size_t count)
{
    if(write(fd, buf, count) != count)
        return 0;
    return 1;
}
static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
                       xc_shadow_op_stats_t *stats, int print)
{
    static struct timeval wall_last;
    static long long      d0_cpu_last;
    static long long      d1_cpu_last;

    struct timeval        wall_now;
    long long             wall_delta;
    long long             d0_cpu_now, d0_cpu_delta;
    long long             d1_cpu_now, d1_cpu_delta;

    gettimeofday(&wall_now, NULL);

    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;

    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
        DPRINTF("ARRHHH!!\n");

    wall_delta = tv_delta(&wall_now,&wall_last)/1000;

    if (wall_delta == 0) wall_delta = 1;

    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;

    if (print)
        DPRINTF(
            "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
            "dirtied %dMb/s %" PRId32 " pages\n",
            wall_delta,
            (int)((d0_cpu_delta*100)/wall_delta),
            (int)((d1_cpu_delta*100)/wall_delta),
            (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
            (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
            stats->dirty_count);

#ifdef ADAPTIVE_SAVE
    if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
        mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
            + 50;
        if (mbit_rate > MAX_MBIT_RATE)
            mbit_rate = MAX_MBIT_RATE;
    }
#endif

    d0_cpu_last = d0_cpu_now;
    d1_cpu_last = d1_cpu_now;
    wall_last   = wall_now;

    return 0;
}
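
/*
** Debug aid: repeatedly clean the log-dirty bitmap, then sample the
** fault/dirty counters every 50ms to observe how quickly the domain
** re-dirties its pages. The save path below calls this with runs == 0,
** so the loop body is normally skipped.
*/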
static int analysis_phase(int xc_handle, uint32_t domid, int max_pfn,
                          unsigned long *arr, int runs)
{
    long long start, now;
    xc_shadow_op_stats_t stats;
    int j;

    start = llgettimeofday();

    for (j = 0; j < runs; j++) {
        int i;

        xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
                          arr, max_pfn, NULL, 0, NULL);
        DPRINTF("#Flush\n");
        for ( i = 0; i < 40; i++ ) {
            usleep(50000);
            now = llgettimeofday();
            xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
                              NULL, 0, NULL, 0, &stats);

            DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
                    ((now-start)+500)/1000,
                    stats.fault_count, stats.dirty_count);
        }
    }

    return -1;
}
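
/*
** Invoke the caller-supplied suspend callback and poll the domain's state
** until it reports SHUTDOWN_suspend. A paused domain is unpaused and
** re-tested; otherwise we retry roughly 100 times at 10ms intervals
** before giving up.
*/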
static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
                             int dom, xc_dominfo_t *info,
                             vcpu_guest_context_t *ctxt)
{
    int i = 0;

    if (!(*suspend)(dom)) {
        ERROR("Suspend request failed");
        return -1;
    }

 retry:

    if (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) {
        ERROR("Could not get domain info");
        return -1;
    }

    if ( xc_vcpu_getcontext(xc_handle, dom, 0 /* XXX */, ctxt))
        ERROR("Could not get vcpu context");

    if (info->dying) {
        ERROR("domain is dying");
        return -1;
    }

    if (info->crashed) {
        ERROR("domain has crashed");
        return -1;
    }

    if (info->shutdown) {
        switch (info->shutdown_reason) {
        case SHUTDOWN_poweroff:
        case SHUTDOWN_reboot:
            ERROR("domain has shut down");
            return -1;
        case SHUTDOWN_suspend:
            return 0;
        case SHUTDOWN_crash:
            ERROR("domain has crashed");
            return -1;
        }
    }

    if (info->paused) {
        // try unpausing domain, wait, and retest
        xc_domain_unpause( xc_handle, dom );

        ERROR("Domain was paused. Wait and re-test.");
        usleep(10000); // 10ms

        goto retry;
    }

    if( ++i < 100 ) {
        ERROR("Retry suspend domain");
        usleep(10000); // 10ms
        goto retry;
    }

    ERROR("Unable to suspend domain.");

    return -1;
}
/*
** Map the top-level page of MFNs from the guest. The guest might not have
** finished resuming from a previous restore operation, so we wait a while for
** it to update the MFN to a reasonable value.
*/
static void *map_frame_list_list(int xc_handle, uint32_t dom,
                                 shared_info_t *shinfo)
{
    int count = 100;
    void *p;

    while (count-- && shinfo->arch.pfn_to_mfn_frame_list_list == 0)
        usleep(10000);

    if (shinfo->arch.pfn_to_mfn_frame_list_list == 0) {
        ERROR("Timed out waiting for frame list updated.");
        return NULL;
    }

    p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
                             shinfo->arch.pfn_to_mfn_frame_list_list);

    if (p == NULL)
        ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);

    return p;
}
/*
** During transfer (or in the state file), all page-table pages must be
** converted into a 'canonical' form where references to actual mfns
** are replaced with references to the corresponding pfns.
**
** This function performs the appropriate conversion, taking into account
** which entries do not require canonicalization (in particular, those
** entries which map the virtual address reserved for the hypervisor).
*/
static int canonicalize_pagetable(unsigned long type, unsigned long pfn,
                                  const void *spage, void *dpage)
{

    int i, pte_last, xen_start, xen_end, race = 0;
    uint64_t pte;

    /*
    ** We need to determine which entries in this page table hold
    ** reserved hypervisor mappings. This depends on the current
    ** page table type as well as the number of paging levels.
    */
    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8);

    if (pt_levels == 2 && type == XEN_DOMCTL_PFINFO_L2TAB)
        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);

    if (pt_levels == 3 && type == XEN_DOMCTL_PFINFO_L3TAB)
        xen_start = L3_PAGETABLE_ENTRIES_PAE;

    /*
    ** In PAE only the L2 mapping the top 1GB contains Xen mappings.
    ** We can spot this by looking for the guest linear mapping which
    ** Xen always ensures is present in that L2. Guests must ensure
    ** that this check will fail for other L2s.
    */
    if (pt_levels == 3 && type == XEN_DOMCTL_PFINFO_L2TAB) {
        int hstart;
        unsigned long he;

        hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
        he = ((const uint64_t *) spage)[hstart];

        if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 ) {
            /* hvirt starts with xen stuff... */
            xen_start = hstart;
        } else if ( hvirt_start != 0xf5800000 ) {
            /* old L2s from before hole was shrunk... */
            hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
            he = ((const uint64_t *) spage)[hstart];

            if( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
                xen_start = hstart;
        }
    }

    if (pt_levels == 4 && type == XEN_DOMCTL_PFINFO_L4TAB) {
        /*
        ** XXX SMH: should compute these from hvirt_start (which we have)
        ** and hvirt_end (which we don't)
        */
        xen_start = 256;
        xen_end   = 272;
    }

    /* Now iterate through the page table, canonicalizing each PTE */
    for (i = 0; i < pte_last; i++ ) {

        unsigned long pfn, mfn;

        if (pt_levels == 2)
            pte = ((const uint32_t*)spage)[i];
        else
            pte = ((const uint64_t*)spage)[i];

        if (i >= xen_start && i < xen_end)
            pte = 0;

        if (pte & _PAGE_PRESENT) {

            mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
            if (!MFN_IS_IN_PSEUDOPHYS_MAP(mfn)) {
                /* This will happen if the type info is stale which
                   is quite feasible under live migration */
                pfn  = 0;  /* zap it - we'll retransmit this page later */
                race = 1;  /* inform the caller of race; fatal if !live */
            } else
                pfn = mfn_to_pfn(mfn);

            pte &= ~MADDR_MASK_X86;
            pte |= (uint64_t)pfn << PAGE_SHIFT;

            /*
             * PAE guest L3Es can contain these flags when running on
             * a 64bit hypervisor. We zap these here to avoid any
             * surprise at restore time...
             */
            if ( pt_levels == 3 &&
                 type == XEN_DOMCTL_PFINFO_L3TAB &&
                 pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED) )
            {
                pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
            }
        }

        if (pt_levels == 2)
            ((uint32_t*)dpage)[i] = pte;
        else
            ((uint64_t*)dpage)[i] = pte;

    }

    return race;
}
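
/*
** Map the machine_to_phys (M2P) table into our address space. The table
** lives in Xen's address space, so we fetch the list of MFNs backing it
** with XENMEM_machphys_mfn_list and then map each chunk from DOMID_XEN.
*/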
static xen_pfn_t *xc_map_m2p(int xc_handle,
                             unsigned long max_mfn,
                             int prot)
{
    struct xen_machphys_mfn_list xmml;
    privcmd_mmap_entry_t *entries;
    unsigned long m2p_chunks, m2p_size;
    xen_pfn_t *m2p;
    xen_pfn_t *extent_start;
    int i, rc;

    m2p_size   = M2P_SIZE(max_mfn);
    m2p_chunks = M2P_CHUNKS(max_mfn);

    xmml.max_extents = m2p_chunks;
    if (!(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t)))) {
        ERROR("failed to allocate space for m2p mfns");
        return NULL;
    }
    set_xen_guest_handle(xmml.extent_start, extent_start);

    if (xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
        (xmml.nr_extents != m2p_chunks)) {
        ERROR("xc_get_m2p_mfns");
        return NULL;
    }

    if ((m2p = mmap(NULL, m2p_size, prot,
                    MAP_SHARED, xc_handle, 0)) == MAP_FAILED) {
        ERROR("failed to mmap m2p");
        return NULL;
    }

    if (!(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t)))) {
        ERROR("failed to allocate space for mmap entries");
        return NULL;
    }

    for (i=0; i < m2p_chunks; i++) {
        entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
        entries[i].mfn = extent_start[i];
        entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
    }

    if ((rc = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
                                    entries, m2p_chunks)) < 0) {
        ERROR("xc_mmap_foreign_ranges failed (rc = %d)", rc);
        return NULL;
    }

    m2p_mfn0 = entries[0].mfn;

    free(extent_start);
    free(entries);

    return m2p;
}
int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
                  uint32_t max_factor, uint32_t flags, int (*suspend)(int))
{
    xc_dominfo_t info;

    int rc = 1, i, j, last_iter, iter = 0;
    int live  = (flags & XCFLAGS_LIVE);
    int debug = (flags & XCFLAGS_DEBUG);
    int race = 0, sent_last_iter, skip_this_iter;

    /* The new domain's shared-info frame number. */
    unsigned long shared_info_frame;

    /* A copy of the CPU context of the guest. */
    vcpu_guest_context_t ctxt;

    /* A table containing the type of each PFN (/not/ MFN!). */
    unsigned long *pfn_type = NULL;
    unsigned long *pfn_batch = NULL;

    /* A temporary mapping, and a copy, of one frame of guest memory. */
    char page[PAGE_SIZE];

    /* Double and single indirect references to the live P2M table */
    xen_pfn_t *live_p2m_frame_list_list = NULL;
    xen_pfn_t *live_p2m_frame_list = NULL;

    /* A copy of the pfn-to-mfn table frame list. */
    xen_pfn_t *p2m_frame_list = NULL;

    /* Live mapping of shared info structure */
    shared_info_t *live_shinfo = NULL;

    /* base of the region in which domain memory is mapped */
    unsigned char *region_base = NULL;

    /* power of 2 order of max_pfn */
    int order_nr;

    /* bitmap of pages:
       - that should be sent this iteration (unless later marked as skip);
       - to skip this iteration because already dirty;
       - to fixup by sending at the end if not already resent; */
    unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;

    xc_shadow_op_stats_t stats;

    unsigned long needed_to_fix = 0;
    unsigned long total_sent    = 0;
    /* If no explicit control parameters given, use defaults */
    if(!max_iters)
        max_iters = DEF_MAX_ITERS;
    if(!max_factor)
        max_factor = DEF_MAX_FACTOR;

    initialize_mbit_rate();

    if(!get_platform_info(xc_handle, dom,
                          &max_mfn, &hvirt_start, &pt_levels)) {
        ERROR("Unable to get platform info.");
        return 1;
    }

    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
        ERROR("Could not get domain info");
        return 1;
    }

    if (lock_pages(&ctxt, sizeof(ctxt))) {
        ERROR("Unable to lock ctxt");
        return 1;
    }

    /* Only have to worry about vcpu 0 even for SMP */
    if (xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt)) {
        ERROR("Could not get vcpu context");
        goto out;
    }
    shared_info_frame = info.shared_info_frame;

    /* A cheesy test to see whether the domain contains valid state. */
    if (ctxt.ctrlreg[3] == 0)
    {
        ERROR("Domain is not in a valid Linux guest OS state");
        goto out;
    }

    /* Map the shared info frame */
    if(!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
                                            PROT_READ, shared_info_frame))) {
        ERROR("Couldn't map live_shinfo");
        goto out;
    }

    max_pfn = live_shinfo->arch.max_pfn;

    live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom,
                                                   live_shinfo);

    if (!live_p2m_frame_list_list)
        goto out;

    live_p2m_frame_list =
        xc_map_foreign_batch(xc_handle, dom, PROT_READ,
                             live_p2m_frame_list_list,
                             P2M_FLL_ENTRIES);

    if (!live_p2m_frame_list) {
        ERROR("Couldn't map p2m_frame_list");
        goto out;
    }

    /* Map all the frames of the pfn->mfn table. For migrate to succeed,
       the guest must not change which frames are used for this purpose.
       (It's not clear why it would want to change them, and we'll be OK
       from a safety POV anyhow.) */

    live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
                                    live_p2m_frame_list,
                                    P2M_FL_ENTRIES);

    if (!live_p2m) {
        ERROR("Couldn't map p2m table");
        goto out;
    }

    /* Set up the mfn_to_pfn table mapping */
    if(!(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ))) {
        ERROR("Failed to map live M2P table");
        goto out;
    }

    /* Get a local copy of the live_p2m_frame_list */
    if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
        ERROR("Couldn't allocate p2m_frame_list array");
        goto out;
    }
    memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);

    /* Canonicalise the pfn-to-mfn table frame-number list. */
    for (i = 0; i < max_pfn; i += fpp) {
        if (!translate_mfn_to_pfn(&p2m_frame_list[i/fpp])) {
            ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
            ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64, i, i/fpp,
                  (uint64_t)p2m_frame_list[i/fpp]);
            goto out;
        }
    }
    /* Domain is still running at this point */
    if (live) {

        if (xc_shadow_control(xc_handle, dom,
                              XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
                              NULL, 0, NULL, 0, NULL) < 0) {
            ERROR("Couldn't enable shadow mode");
            goto out;
        }

        last_iter = 0;

    } else {

        /* This is a non-live suspend. Issue the call back to get the
           domain suspended */

        last_iter = 1;

        if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt)) {
            ERROR("Domain appears not to have suspended");
            goto out;
        }

    }

    /* pretend we sent all the pages last iteration */
    sent_last_iter = max_pfn;

    /* calculate the power of 2 order of max_pfn, e.g.
       15->4 16->4 17->5 */
    for (i = max_pfn-1, order_nr = 0; i ; i >>= 1, order_nr++)
        continue;

    /* Setup to_send / to_fix and to_skip bitmaps */
    to_send = malloc(BITMAP_SIZE);
    to_fix  = calloc(1, BITMAP_SIZE);
    to_skip = malloc(BITMAP_SIZE);

    if (!to_send || !to_fix || !to_skip) {
        ERROR("Couldn't allocate to_send array");
        goto out;
    }

    memset(to_send, 0xff, BITMAP_SIZE);

    if (lock_pages(to_send, BITMAP_SIZE)) {
        ERROR("Unable to lock to_send");
        return 1;
    }

    /* (to fix is local only) */
    if (lock_pages(to_skip, BITMAP_SIZE)) {
        ERROR("Unable to lock to_skip");
        return 1;
    }

    analysis_phase(xc_handle, dom, max_pfn, to_skip, 0);

    /* We want zeroed memory so use calloc rather than malloc. */
    pfn_type  = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type));
    pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));

    if ((pfn_type == NULL) || (pfn_batch == NULL)) {
        ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
        errno = ENOMEM;
        goto out;
    }

    if (lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type))) {
        ERROR("Unable to lock");
        goto out;
    }

    /*
     * Quick belt and braces sanity check.
     */
    {
        int err=0;
        unsigned long mfn;
        for (i = 0; i < max_pfn; i++) {

            mfn = live_p2m[i];
            if((mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i)) {
                DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
                        mfn, mfn_to_pfn(mfn));
                err++;
            }
        }
        DPRINTF("Had %d unexplained entries in p2m table\n", err);
    }
    /* Start writing out the saved-domain record. */

    if (!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) {
        ERROR("write: max_pfn");
        goto out;
    }

    /*
     * Write an extended-info structure to inform the restore code that
     * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
     * slow paths in the restore code.
     */
    if ((pt_levels == 3) &&
        (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3))) {
        unsigned long signature = ~0UL;
        uint32_t tot_sz   = sizeof(struct vcpu_guest_context) + 8;
        uint32_t chunk_sz = sizeof(struct vcpu_guest_context);
        char chunk_sig[]  = "vcpu";
        if (!write_exact(io_fd, &signature, sizeof(signature)) ||
            !write_exact(io_fd, &tot_sz,    sizeof(tot_sz)) ||
            !write_exact(io_fd, &chunk_sig, 4) ||
            !write_exact(io_fd, &chunk_sz,  sizeof(chunk_sz)) ||
            !write_exact(io_fd, &ctxt,      sizeof(ctxt))) {
            ERROR("write: extended info");
            goto out;
        }
    }

    if (!write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) {
        ERROR("write: p2m_frame_list");
        goto out;
    }

    print_stats(xc_handle, dom, 0, &stats, 0);

    /* Now write out each data page, canonicalising page tables as we go... */
    while(1) {

        unsigned int prev_pc, sent_this_iter, N, batch;

        iter++;
        sent_this_iter = 0;
        skip_this_iter = 0;
        prev_pc = 0;
        N = 0;

        DPRINTF("Saving memory pages: iter %d   0%%", iter);

        while( N < max_pfn ) {

            unsigned int this_pc = (N * 100) / max_pfn;

            if ((this_pc - prev_pc) >= 5) {
                DPRINTF("\b\b\b\b%3d%%", this_pc);
                prev_pc = this_pc;
            }

            /* slightly wasteful to peek the whole array every time,
               but this is fast enough for the moment. */
            if (!last_iter && xc_shadow_control(
                    xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK,
                    to_skip, max_pfn, NULL, 0, NULL) != max_pfn) {
                ERROR("Error peeking shadow bitmap");
                goto out;
            }

            /* load pfn_type[] with the mfn of all the pages we're doing in
               this batch. */
            for (batch = 0; batch < MAX_BATCH_SIZE && N < max_pfn ; N++) {

                int n = permute(N, max_pfn, order_nr);

                if (debug) {
                    DPRINTF("%d pfn= %08lx mfn= %08lx %d  [mfn]= %08lx\n",
                            iter, (unsigned long)n, live_p2m[n],
                            test_bit(n, to_send),
                            mfn_to_pfn(live_p2m[n]&0xFFFFF));
                }
                if (!last_iter && test_bit(n, to_send) && test_bit(n, to_skip))
                    skip_this_iter++; /* stats keeping */

                if (!((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
                      (test_bit(n, to_send) && last_iter) ||
                      (test_bit(n, to_fix)  && last_iter)))
                    continue;

                /*
                ** we get here if:
                **  1. page is marked to_send & hasn't already been re-dirtied
                **  2. (ignore to_skip in last iteration)
                **  3. add in pages that still need fixup (net bufs)
                */

                pfn_batch[batch] = n;
                pfn_type[batch]  = live_p2m[n];

                if(!is_mapped(pfn_type[batch])) {

                    /*
                    ** not currently in pseudo-physical map -- set bit
                    ** in to_fix since we must send this page in last_iter
                    ** unless it's sent sooner anyhow, or it never enters
                    ** pseudo-physical map (e.g. for ballooned down domains)
                    */

                    set_bit(n, to_fix);
                    continue;
                }

                if(last_iter && test_bit(n, to_fix) && !test_bit(n, to_send)) {
                    needed_to_fix++;
                    DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
                            iter, n, pfn_type[batch]);
                }

                clear_bit(n, to_fix);

                batch++;
            }
            if (batch == 0)
                goto skip; /* vanishingly unlikely... */

            if ((region_base = xc_map_foreign_batch(
                     xc_handle, dom, PROT_READ, pfn_type, batch)) == 0) {
                ERROR("map batch failed");
                goto out;
            }

            for ( j = 0; j < batch; j++ )
                ((uint32_t *)pfn_type)[j] = pfn_type[j];
            if ( xc_get_pfn_type_batch(xc_handle, dom, batch,
                                       (uint32_t *)pfn_type) )
            {
                ERROR("get_pfn_type_batch failed");
                goto out;
            }
            for ( j = batch-1; j >= 0; j-- )
                pfn_type[j] = ((uint32_t *)pfn_type)[j];
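
            /*
            ** The in-place narrowing/widening above exists because
            ** xc_get_pfn_type_batch() takes an array of uint32_t: we pack
            ** the unsigned longs down before the call, and unpack in
            ** reverse order afterwards so no entry is overwritten before
            ** it is read.
            */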
            for ( j = 0; j < batch; j++ )
            {

                if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
                     XEN_DOMCTL_PFINFO_XTAB )
                {
                    DPRINTF("type fail: page %i mfn %08lx\n", j, pfn_type[j]);
                    continue;
                }

                if (debug)
                    DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
                            " sum= %08lx\n",
                            iter,
                            (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
                            pfn_batch[j],
                            pfn_type[j],
                            mfn_to_pfn(pfn_type[j] &
                                       ~XEN_DOMCTL_PFINFO_LTAB_MASK),
                            csum_page(region_base + (PAGE_SIZE*j)));

                /* canonicalise mfn->pfn */
                pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
                    pfn_batch[j];
            }
            if(!write_exact(io_fd, &batch, sizeof(unsigned int))) {
                ERROR("Error when writing to state file (2) (errno %d)",
                      errno);
                goto out;
            }

            if(!write_exact(io_fd, pfn_type, sizeof(unsigned long)*j)) {
                ERROR("Error when writing to state file (3) (errno %d)",
                      errno);
                goto out;
            }
            /* entering this loop, pfn_type is now in pfns (Not mfns) */
            for ( j = 0; j < batch; j++ )
            {
                unsigned long pfn, pagetype;
                void *spage = (char *)region_base + (PAGE_SIZE*j);

                pfn      = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
                pagetype = pfn_type[j] &  XEN_DOMCTL_PFINFO_LTAB_MASK;

                /* write out pages in batch */
                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
                    continue;

                pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;

                if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
                     (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
                {
                    /* We have a pagetable page: need to rewrite it. */
                    race =
                        canonicalize_pagetable(pagetype, pfn, spage, page);

                    if(race && !live)
                        goto out;

                    if (ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE) {
                        ERROR("Error when writing to state file (4)"
                              " (errno %d)", errno);
                        goto out;
                    }

                } else {

                    /* We have a normal page: just write it directly. */
                    if (ratewrite(io_fd, live, spage, PAGE_SIZE) != PAGE_SIZE) {
                        ERROR("Error when writing to state file (5)"
                              " (errno %d)", errno);
                        goto out;
                    }
                }
            } /* end of the write out for this batch */

            sent_this_iter += batch;

            munmap(region_base, batch*PAGE_SIZE);

        } /* end of this while loop for this iteration */
      skip:

        total_sent += sent_this_iter;

        DPRINTF("\r %d: sent %d, skipped %d, ",
                iter, sent_this_iter, skip_this_iter );

        if (last_iter) {
            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);

            DPRINTF("Total pages sent= %ld (%.2fx)\n",
                    total_sent, ((float)total_sent)/max_pfn );
            DPRINTF("(of which %ld were fixups)\n", needed_to_fix  );
        }

        if (last_iter && debug) {
            int minusone = -1;
            memset(to_send, 0xff, BITMAP_SIZE);
            debug = 0;
            DPRINTF("Entering debug resend-all mode\n");

            /* send "-1" to put receiver into debug mode */
            if(!write_exact(io_fd, &minusone, sizeof(int))) {
                ERROR("Error when writing to state file (6) (errno %d)",
                      errno);
                goto out;
            }

            continue;
        }

        if (last_iter)
            break;
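
        /*
        ** Decide whether the next pass should be the last: stop once the
        ** guest dirties pages faster than our (already maxed-out) send
        ** rate, we hit the iteration limit, almost nothing was sent this
        ** round, or we have already sent max_factor times the domain's
        ** memory.
        */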
        if (live) {
            if (((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
                (iter >= max_iters) ||
                (sent_this_iter+skip_this_iter < 50) ||
                (total_sent > max_pfn*max_factor)) {
                DPRINTF("Start last iteration\n");
                last_iter = 1;

                if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
                                      &ctxt)) {
                    ERROR("Domain appears not to have suspended");
                    goto out;
                }

                DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
                        info.shared_info_frame,
                        (unsigned long)ctxt.user_regs.eip,
                        (unsigned long)ctxt.user_regs.edx);
            }

            if (xc_shadow_control(xc_handle, dom,
                                  XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
                                  max_pfn, NULL, 0, &stats) != max_pfn) {
                ERROR("Error flushing shadow PT");
                goto out;
            }

            sent_last_iter = sent_this_iter;

            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);

        }
    } /* end of while 1 */
    DPRINTF("All memory is saved\n");

    /* Zero terminate */
    i = 0;
    if (!write_exact(io_fd, &i, sizeof(int))) {
        ERROR("Error when writing to state file (6') (errno %d)", errno);
        goto out;
    }

    /* Send through a list of all the PFNs that were not in map at the close */
    {
        unsigned int i,j;
        unsigned long pfntab[1024];

        for (i = 0, j = 0; i < max_pfn; i++) {
            if (!is_mapped(live_p2m[i]))
                j++;
        }

        if(!write_exact(io_fd, &j, sizeof(unsigned int))) {
            ERROR("Error when writing to state file (6a) (errno %d)", errno);
            goto out;
        }

        for (i = 0, j = 0; i < max_pfn; ) {

            if (!is_mapped(live_p2m[i]))
                pfntab[j++] = i;

            i++;
            if (j == 1024 || i == max_pfn) {
                if(!write_exact(io_fd, &pfntab, sizeof(unsigned long)*j)) {
                    ERROR("Error when writing to state file (6b) (errno %d)",
                          errno);
                    goto out;
                }
                j = 0;
            }
        }
    }
    /* Canonicalise the suspend-record frame number. */
    if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) ) {
        ERROR("Suspend record is not in range of pseudophys map");
        goto out;
    }

    /* Canonicalise each GDT frame number. */
    for ( i = 0; (512*i) < ctxt.gdt_ents; i++ ) {
        if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
            ERROR("GDT frame is not in range of pseudophys map");
            goto out;
        }
    }

    /* Canonicalise the page table base pointer. */
    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[3])) ) {
        ERROR("PT base is not in range of pseudophys map");
        goto out;
    }
    ctxt.ctrlreg[3] =
        xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[3])));

    /*
     * Reset the MFN to be a known-invalid value. See map_frame_list_list().
     */
    memcpy(page, live_shinfo, PAGE_SIZE);
    ((shared_info_t *)page)->arch.pfn_to_mfn_frame_list_list = 0;

    if (!write_exact(io_fd, &ctxt, sizeof(ctxt)) ||
        !write_exact(io_fd, page, PAGE_SIZE)) {
        ERROR("Error when writing to state file (1) (errno %d)", errno);
        goto out;
    }

    /* Success! */
    rc = 0;
 out:

    if (live) {
        if(xc_shadow_control(xc_handle, dom,
                             XEN_DOMCTL_SHADOW_OP_OFF,
                             NULL, 0, NULL, 0, NULL) < 0) {
            DPRINTF("Warning - couldn't disable shadow mode");
        }
    }
    else {
        // flush last write and discard cache for file
        discard_file_cache(io_fd, 1 /* flush */);
    }

    if (live_shinfo)
        munmap(live_shinfo, PAGE_SIZE);

    if (live_p2m_frame_list_list)
        munmap(live_p2m_frame_list_list, PAGE_SIZE);

    if (live_p2m_frame_list)
        munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);

    if(live_p2m)
        munmap(live_p2m, P2M_SIZE);

    if(live_m2p)
        munmap(live_m2p, M2P_SIZE(max_mfn));

    free(pfn_type);
    free(pfn_batch);
    free(to_send);
    free(to_fix);
    free(to_skip);

    DPRINTF("Save exit rc=%d\n",rc);

    return !!rc;
}
/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */