ia64/xen-unstable

tools/libxc/xc_linux_save.c @ 13608:30af6cfdb05c

Make domctl/sysctl interfaces 32-/64-bit invariant.
This kills off a fair amount of unpleasant CONFIG_COMPAT shimming and
avoids needing to keep the compat paths in sync as these interfaces
continue to develop.
Signed-off-by: Keir Fraser <keir@xensource.com>
author kfraser@localhost.localdomain
date Wed Jan 24 16:33:19 2007 +0000
parents 73b88d158ec9
children a9165141e52d
/******************************************************************************
 * xc_linux_save.c
 *
 * Save the state of a running Linux session.
 *
 * Copyright (c) 2003, K A Fraser.
 */

#include <inttypes.h>
#include <time.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>

#include "xc_private.h"
#include "xg_private.h"
#include "xg_save_restore.h"
/*
** Default values for important tuning parameters. Can override by passing
** non-zero replacement values to xc_linux_save().
**
** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
**
*/
#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop   */
#define DEF_MAX_FACTOR   3   /* never send more than 3x nr_pfns   */
/* max mfn of the whole machine */
static unsigned long max_mfn;

/* virtual starting address of the hypervisor */
static unsigned long hvirt_start;

/* #levels of page tables used by the current guest */
static unsigned int pt_levels;

/* total number of pages used by the current guest */
static unsigned long max_pfn;

/* Live mapping of the table mapping each PFN to its current MFN. */
static xen_pfn_t *live_p2m = NULL;

/* Live mapping of system MFN to PFN table. */
static xen_pfn_t *live_m2p = NULL;
static unsigned long m2p_mfn0;

/* grep fodder: machine_to_phys */

#define mfn_to_pfn(_mfn) live_m2p[(_mfn)]
/*
 * Returns TRUE if the given machine frame number has a unique mapping
 * in the guest's pseudophysical map.
 */
#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)          \
    (((_mfn) < (max_mfn)) &&                    \
     ((mfn_to_pfn(_mfn) < (max_pfn)) &&         \
      (live_p2m[mfn_to_pfn(_mfn)] == (_mfn))))

/* Returns TRUE if MFN is successfully converted to a PFN. */
#define translate_mfn_to_pfn(_pmfn)             \
({                                              \
    unsigned long mfn = *(_pmfn);               \
    int _res = 1;                               \
    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )       \
        _res = 0;                               \
    else                                        \
        *(_pmfn) = mfn_to_pfn(mfn);             \
    _res;                                       \
})
/*
** During (live) save/migrate, we maintain a number of bitmaps to track
** which pages we have to send, to fixup, and to skip.
*/

#define BITS_PER_LONG (sizeof(unsigned long) * 8)
#define BITMAP_SIZE   ((max_pfn + BITS_PER_LONG - 1) / 8)

#define BITMAP_ENTRY(_nr,_bmap) \
   ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]

#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
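
/*
 * BITMAP_SIZE is in bytes: rounding max_pfn up to a multiple of
 * BITS_PER_LONG before dividing by 8 pads the bitmap out to whole
 * unsigned longs, which count_bits() below relies on. E.g. on a 64-bit
 * build, max_pfn = 70 gives (70+63)/8 = 16 bytes (two longs), and
 * BITMAP_ENTRY/BITMAP_SHIFT place pfn 69 at bit 5 of the second long.
 */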
static inline int test_bit (int nr, volatile void * addr)
{
    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
}

static inline void clear_bit (int nr, volatile void * addr)
{
    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
}

static inline void set_bit (int nr, volatile void * addr)
{
    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
}
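
/*
 * Note these are plain read-modify-write operations, not atomic ops:
 * that is safe here because the save code is single-threaded, and the
 * hypervisor only fills these bitmaps synchronously, via the shadow-op
 * hypercalls issued below.
 */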
/* Returns the hamming weight (i.e. the number of bits set) in a 32-bit word */
static inline unsigned int hweight32(unsigned int w)
{
    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
}
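
/*
 * This is the classic branch-free popcount: each step sums adjacent
 * bit fields of width 1, 2, 4, 8 and finally 16, leaving the total
 * number of set bits in the word.
 */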
static inline int count_bits ( int nr, volatile void *addr)
{
    int i, count = 0;
    volatile unsigned long *p = (volatile unsigned long *)addr;
    /* We know that the array is padded to unsigned long. */
    for( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ ) {
        unsigned long l = *p;
        /* hweight32() counts only 32 bits; on a 64-bit build count both
           halves. The double shift avoids an undefined 32-bit shift when
           unsigned long is only 32 bits wide. */
        count += hweight32(l);
        if ( sizeof(unsigned long) > 4 )
            count += hweight32(l >> 16 >> 16);
    }
    return count;
}
static inline int permute( int i, int nr, int order_nr  )
{
    /* Need a simple permutation function so that we scan pages in a
       pseudo random order, enabling us to get a better estimate of
       the domain's page dirtying rate as we go (there are often
       contiguous ranges of pfns that have similar behaviour, and we
       want to mix them up). */

    /* e.g. nr->order 15->4 16->4 17->5 */
    /* 512MB domain, 128k pages, order 17 */

    /*
      QPONMLKJIHGFEDCBA
             QPONMLKJIH
      GFEDCBA
     */

    /*
      QPONMLKJIHGFEDCBA
                  EDCBA
             QPONM
      LKJIHGF
     */

    do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */

    return i;
}
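
/*
 * The loop body is a left-rotation of the order_nr-bit index by 10
 * bits, so successive values of i are visited roughly 2^10 slots apart
 * and contiguous pfn ranges get spread across the whole scan. When nr
 * is not a power of two a rotation can land on an index >= nr, but
 * repeated rotation eventually returns to the starting index, which is
 * < nr, so the do/while always terminates.
 */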
static uint64_t tv_to_us(struct timeval *new)
{
    return (new->tv_sec * 1000000) + new->tv_usec;
}

static uint64_t llgettimeofday(void)
{
    struct timeval now;
    gettimeofday(&now, NULL);
    return tv_to_us(&now);
}

static uint64_t tv_delta(struct timeval *new, struct timeval *old)
{
    return ((new->tv_sec - old->tv_sec)*1000000 ) +
        (new->tv_usec - old->tv_usec);
}
#ifdef ADAPTIVE_SAVE

/*
** We control the rate at which we transmit (or save) to minimize impact
** on running domains (including the target if we're doing live migrate).
*/

#define MAX_MBIT_RATE    500      /* maximum transmit rate for migrate */
#define START_MBIT_RATE  100      /* initial transmit rate for migrate */

/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
#define RATE_TO_BTU      781250

/* Amount in bytes we allow ourselves to send in a burst */
#define BURST_BUDGET     (100*1024)

/* We keep track of the current and previous transmission rate */
static int mbit_rate, ombit_rate = 0;

/* Have we reached the maximum transmission rate? */
#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)

static inline void initialize_mbit_rate()
{
    mbit_rate = START_MBIT_RATE;
}
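
/*
 * ratewrite() below is a simple token bucket: each call spends n bytes
 * of budget; once the budget goes negative we credit BURST_BUDGET bytes
 * for every burst_time_us of wall-clock time elapsed since last_put,
 * and nanosleep() for the remainder of the current slot. The slot time
 * burst_time_us = RATE_TO_BTU / mbit_rate is how long one BURST_BUDGET
 * takes at the current rate (the constant 781250 corresponds to
 * treating one megabit as 2^20 bits: 100KiB = 819200 bits, and
 * 819200 / 2^20 seconds is 781250 microseconds per Mbit/s).
 */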
static int ratewrite(int io_fd, void *buf, int n)
{
    static int budget = 0;
    static int burst_time_us = -1;
    static struct timeval last_put = { 0 };
    struct timeval now;
    struct timespec delay;
    long long delta;

    if (START_MBIT_RATE == 0)
        return write(io_fd, buf, n);

    budget -= n;
    if (budget < 0) {
        if (mbit_rate != ombit_rate) {
            burst_time_us = RATE_TO_BTU / mbit_rate;
            ombit_rate = mbit_rate;
            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
                    mbit_rate, BURST_BUDGET, burst_time_us);
        }
        if (last_put.tv_sec == 0) {
            budget += BURST_BUDGET;
            gettimeofday(&last_put, NULL);
        } else {
            while (budget < 0) {
                gettimeofday(&now, NULL);
                delta = tv_delta(&now, &last_put);
                while (delta > burst_time_us) {
                    budget += BURST_BUDGET;
                    last_put.tv_usec += burst_time_us;
                    if (last_put.tv_usec > 1000000) {
                        last_put.tv_usec -= 1000000;
                        last_put.tv_sec++;
                    }
                    delta -= burst_time_us;
                }
                if (budget > 0)
                    break;
                delay.tv_sec = 0;
                delay.tv_nsec = 1000 * (burst_time_us - delta);
                while (delay.tv_nsec > 0)
                    if (nanosleep(&delay, &delay) == 0)
                        break;
            }
        }
    }
    return write(io_fd, buf, n);
}
#else /* ! ADAPTIVE_SAVE */

#define RATE_IS_MAX() (0)
#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
#define initialize_mbit_rate()

#endif

static inline ssize_t write_exact(int fd, void *buf, size_t count)
{
    if(write(fd, buf, count) != count)
        return 0;
    return 1;
}
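
/*
 * Note that write_exact() does not loop on short writes: a write()
 * that transfers fewer than count bytes (possible on a socket during
 * live migration) is reported as failure, and callers treat that as
 * fatal.
 */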
static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
                       xc_shadow_op_stats_t *stats, int print)
{
    static struct timeval wall_last;
    static long long      d0_cpu_last;
    static long long      d1_cpu_last;

    struct timeval        wall_now;
    long long             wall_delta;
    long long             d0_cpu_now, d0_cpu_delta;
    long long             d1_cpu_now, d1_cpu_delta;

    gettimeofday(&wall_now, NULL);

    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;

    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
        DPRINTF("ARRHHH!!\n");

    wall_delta = tv_delta(&wall_now,&wall_last)/1000;

    if (wall_delta == 0) wall_delta = 1;

    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;

    if (print)
        DPRINTF(
            "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
            "dirtied %dMb/s %" PRId32 " pages\n",
            wall_delta,
            (int)((d0_cpu_delta*100)/wall_delta),
            (int)((d1_cpu_delta*100)/wall_delta),
            (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
            (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
            stats->dirty_count);

#ifdef ADAPTIVE_SAVE
    if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
        mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
            + 50;
        if (mbit_rate > MAX_MBIT_RATE)
            mbit_rate = MAX_MBIT_RATE;
    }
#endif

    d0_cpu_last = d0_cpu_now;
    d1_cpu_last = d1_cpu_now;
    wall_last   = wall_now;

    return 0;
}
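
/*
 * In print_stats() the wall-clock and CPU-usage deltas both end up in
 * milliseconds (xc_domain_get_cpu_usage() returns ns, divided by 1000
 * twice), so (cpu_delta*100)/wall_delta is a utilisation percentage,
 * and dividing bytes-per-millisecond by (1000/8) = 125 converts the
 * sent/dirtied figures to Mbit/s.
 */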
static int analysis_phase(int xc_handle, uint32_t domid, int max_pfn,
                          unsigned long *arr, int runs)
{
    long long start, now;
    xc_shadow_op_stats_t stats;
    int j;

    start = llgettimeofday();

    for (j = 0; j < runs; j++) {
        int i;

        xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
                          arr, max_pfn, NULL, 0, NULL);
        DPRINTF("#Flush\n");
        for ( i = 0; i < 40; i++ ) {
            usleep(50000);
            now = llgettimeofday();
            xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
                              NULL, 0, NULL, 0, &stats);

            DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
                    ((now-start)+500)/1000,
                    stats.fault_count, stats.dirty_count);
        }
    }

    return -1;
}
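
/*
 * Kick the domain with the caller-provided suspend callback, then poll
 * xc_domain_getinfo() until the hypervisor reports SHUTDOWN_suspend.
 * A paused domain is unpaused and re-tested; otherwise we retry up to
 * 100 times at 10ms intervals before giving up.
 */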
static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
                             int dom, xc_dominfo_t *info,
                             vcpu_guest_context_t *ctxt)
{
    int i = 0;

    if (!(*suspend)(dom)) {
        ERROR("Suspend request failed");
        return -1;
    }

 retry:

    if (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) {
        ERROR("Could not get domain info");
        return -1;
    }

    if ( xc_vcpu_getcontext(xc_handle, dom, 0 /* XXX */, ctxt))
        ERROR("Could not get vcpu context");

    if (info->shutdown && info->shutdown_reason == SHUTDOWN_suspend)
        return 0; /* success */

    if (info->paused) {
        /* try unpausing domain, wait, and retest */
        xc_domain_unpause( xc_handle, dom );

        ERROR("Domain was paused. Wait and re-test.");
        usleep(10000); /* 10ms */

        goto retry;
    }

    if( ++i < 100 ) {
        ERROR("Retry suspend domain.");
        usleep(10000); /* 10ms */
        goto retry;
    }

    ERROR("Unable to suspend domain.");

    return -1;
}
/*
** Map the top-level page of MFNs from the guest. The guest might not have
** finished resuming from a previous restore operation, so we wait a while for
** it to update the MFN to a reasonable value.
*/
static void *map_frame_list_list(int xc_handle, uint32_t dom,
                                 shared_info_t *shinfo)
{
    int count = 100;
    void *p;

    while (count-- && shinfo->arch.pfn_to_mfn_frame_list_list == 0)
        usleep(10000);

    if (shinfo->arch.pfn_to_mfn_frame_list_list == 0) {
        ERROR("Timed out waiting for frame list to be updated.");
        return NULL;
    }

    p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
                             shinfo->arch.pfn_to_mfn_frame_list_list);

    if (p == NULL)
        ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);

    return p;
}
/*
** During transfer (or in the state file), all page-table pages must be
** converted into a 'canonical' form where references to actual mfns
** are replaced with references to the corresponding pfns.
**
** This function performs the appropriate conversion, taking into account
** which entries do not require canonicalization (in particular, those
** entries which map the virtual address reserved for the hypervisor).
*/
static int canonicalize_pagetable(unsigned long type, unsigned long pfn,
                                  const void *spage, void *dpage)
{

    int i, pte_last, xen_start, xen_end, race = 0;
    uint64_t pte;

    /*
    ** We need to determine which entries in this page table hold
    ** reserved hypervisor mappings. This depends on the current
    ** page table type as well as the number of paging levels.
    */
    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8);

    if (pt_levels == 2 && type == XEN_DOMCTL_PFINFO_L2TAB)
        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);

    if (pt_levels == 3 && type == XEN_DOMCTL_PFINFO_L3TAB)
        xen_start = L3_PAGETABLE_ENTRIES_PAE;

    /*
    ** In PAE only the L2 mapping the top 1GB contains Xen mappings.
    ** We can spot this by looking for the guest linear mapping which
    ** Xen always ensures is present in that L2. Guests must ensure
    ** that this check will fail for other L2s.
    */
    if (pt_levels == 3 && type == XEN_DOMCTL_PFINFO_L2TAB) {
        int hstart;
        unsigned long he;

        hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
        he = ((const uint64_t *) spage)[hstart];

        if ( ((he >> PAGE_SHIFT) & 0x0fffffff) == m2p_mfn0 ) {
            /* hvirt starts with xen stuff... */
            xen_start = hstart;
        } else if ( hvirt_start != 0xf5800000 ) {
            /* old L2s from before hole was shrunk... */
            hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
            he = ((const uint64_t *) spage)[hstart];

            if( ((he >> PAGE_SHIFT) & 0x0fffffff) == m2p_mfn0 )
                xen_start = hstart;
        }
    }

    if (pt_levels == 4 && type == XEN_DOMCTL_PFINFO_L4TAB) {
        /*
        ** XXX SMH: should compute these from hvirt_start (which we have)
        ** and hvirt_end (which we don't)
        */
        xen_start = 256;
        xen_end   = 272;
    }
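
    /*
     * Note on the PTE rewriting below: the machine frame lives in bits
     * PAGE_SHIFT..PAGE_SHIFT+27, so the mask 0xffffff0000000fffULL keeps
     * the low flag bits and the high attribute (e.g. NX) bits while
     * clearing the frame field, which is then refilled with the pfn.
     */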
    /* Now iterate through the page table, canonicalizing each PTE */
    for (i = 0; i < pte_last; i++ ) {

        unsigned long pfn, mfn;

        if (pt_levels == 2)
            pte = ((const uint32_t*)spage)[i];
        else
            pte = ((const uint64_t*)spage)[i];

        if (i >= xen_start && i < xen_end)
            pte = 0;

        if (pte & _PAGE_PRESENT) {

            mfn = (pte >> PAGE_SHIFT) & 0xfffffff;
            if (!MFN_IS_IN_PSEUDOPHYS_MAP(mfn)) {
                /* This will happen if the type info is stale which
                   is quite feasible under live migration */
                DPRINTF("PT Race: [%08lx,%d] pte=%llx, mfn=%08lx\n",
                        type, i, (unsigned long long)pte, mfn);
                pfn  = 0; /* zap it - we'll retransmit this page later */
                race = 1; /* inform the caller of race; fatal if !live */
            } else
                pfn = mfn_to_pfn(mfn);

            pte &= 0xffffff0000000fffULL;
            pte |= (uint64_t)pfn << PAGE_SHIFT;
        }

        if (pt_levels == 2)
            ((uint32_t*)dpage)[i] = pte;
        else
            ((uint64_t*)dpage)[i] = pte;

    }

    return race;
}
static xen_pfn_t *xc_map_m2p(int xc_handle,
                             unsigned long max_mfn,
                             int prot)
{
    struct xen_machphys_mfn_list xmml;
    privcmd_mmap_entry_t *entries;
    unsigned long m2p_chunks, m2p_size;
    xen_pfn_t *m2p;
    xen_pfn_t *extent_start;
    int i, rc;

    m2p_size   = M2P_SIZE(max_mfn);
    m2p_chunks = M2P_CHUNKS(max_mfn);

    xmml.max_extents = m2p_chunks;
    if (!(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t)))) {
        ERROR("failed to allocate space for m2p mfns");
        return NULL;
    }
    set_xen_guest_handle(xmml.extent_start, extent_start);

    if (xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
        (xmml.nr_extents != m2p_chunks)) {
        ERROR("xc_get_m2p_mfns");
        return NULL;
    }

    if ((m2p = mmap(NULL, m2p_size, prot,
                    MAP_SHARED, xc_handle, 0)) == MAP_FAILED) {
        ERROR("failed to mmap m2p");
        return NULL;
    }

    if (!(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t)))) {
        ERROR("failed to allocate space for mmap entries");
        return NULL;
    }

    for (i=0; i < m2p_chunks; i++) {
        entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
        entries[i].mfn = extent_start[i];
        entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
    }

    if ((rc = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
                                    entries, m2p_chunks)) < 0) {
        ERROR("xc_mmap_foreign_ranges failed (rc = %d)", rc);
        return NULL;
    }

    m2p_mfn0 = entries[0].mfn;

    free(extent_start);
    free(entries);

    return m2p;
}
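
/*
 * xc_map_m2p() stitches the machine-to-phys table into our address
 * space: XENMEM_machphys_mfn_list yields the MFN of each M2P chunk, and
 * xc_map_foreign_ranges() maps those chunks contiguously over the
 * mmap() window, so live_m2p[] can be indexed directly by machine
 * frame number.
 */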
int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
                  uint32_t max_factor, uint32_t flags, int (*suspend)(int))
{
    xc_dominfo_t info;

    int rc = 1, i, j, last_iter, iter = 0;
    int live  = (flags & XCFLAGS_LIVE);
    int debug = (flags & XCFLAGS_DEBUG);
    int race = 0, sent_last_iter, skip_this_iter;

    /* The new domain's shared-info frame number. */
    unsigned long shared_info_frame;

    /* A copy of the CPU context of the guest. */
    vcpu_guest_context_t ctxt;

    /* A table containing the type of each PFN (/not/ MFN!). */
    unsigned long *pfn_type = NULL;
    unsigned long *pfn_batch = NULL;

    /* A temporary mapping, and a copy, of one frame of guest memory. */
    char page[PAGE_SIZE];

    /* Double and single indirect references to the live P2M table */
    xen_pfn_t *live_p2m_frame_list_list = NULL;
    xen_pfn_t *live_p2m_frame_list = NULL;

    /* A copy of the pfn-to-mfn table frame list. */
    xen_pfn_t *p2m_frame_list = NULL;

    /* Live mapping of shared info structure */
    shared_info_t *live_shinfo = NULL;

    /* base of the region in which domain memory is mapped */
    unsigned char *region_base = NULL;

    /* power of 2 order of max_pfn */
    int order_nr;

    /* bitmap of pages:
       - that should be sent this iteration (unless later marked as skip);
       - to skip this iteration because already dirty;
       - to fixup by sending at the end if not already resent; */
    unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;

    xc_shadow_op_stats_t stats;

    unsigned long needed_to_fix = 0;
    unsigned long total_sent    = 0;
    /* If no explicit control parameters given, use defaults */
    if(!max_iters)
        max_iters = DEF_MAX_ITERS;
    if(!max_factor)
        max_factor = DEF_MAX_FACTOR;

    initialize_mbit_rate();

    if(!get_platform_info(xc_handle, dom,
                          &max_mfn, &hvirt_start, &pt_levels)) {
        ERROR("Unable to get platform info.");
        return 1;
    }

    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
        ERROR("Could not get domain info");
        return 1;
    }

    if (lock_pages(&ctxt, sizeof(ctxt))) {
        ERROR("Unable to lock ctxt");
        return 1;
    }

    /* Only have to worry about vcpu 0 even for SMP */
    if (xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt)) {
        ERROR("Could not get vcpu context");
        goto out;
    }
    shared_info_frame = info.shared_info_frame;

    /* A cheesy test to see whether the domain contains valid state. */
    if (ctxt.ctrlreg[3] == 0)
    {
        ERROR("Domain is not in a valid Linux guest OS state");
        goto out;
    }

    /* Map the shared info frame */
    if(!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
                                            PROT_READ, shared_info_frame))) {
        ERROR("Couldn't map live_shinfo");
        goto out;
    }

    max_pfn = live_shinfo->arch.max_pfn;

    live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom,
                                                   live_shinfo);

    if (!live_p2m_frame_list_list)
        goto out;

    live_p2m_frame_list =
        xc_map_foreign_batch(xc_handle, dom, PROT_READ,
                             live_p2m_frame_list_list,
                             P2M_FLL_ENTRIES);

    if (!live_p2m_frame_list) {
        ERROR("Couldn't map p2m_frame_list");
        goto out;
    }

    /* Map all the frames of the pfn->mfn table. For migrate to succeed,
       the guest must not change which frames are used for this purpose.
       (It's not clear why it would want to change them, and we'll be OK
       from a safety POV anyhow.) */

    live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
                                    live_p2m_frame_list,
                                    P2M_FL_ENTRIES);

    if (!live_p2m) {
        ERROR("Couldn't map p2m table");
        goto out;
    }
    /* Setup the mfn_to_pfn table mapping */
    if(!(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ))) {
        ERROR("Failed to map live M2P table");
        goto out;
    }

    /* Get a local copy of the live_p2m_frame_list */
    if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
        ERROR("Couldn't allocate p2m_frame_list array");
        goto out;
    }
    memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);

    /* Canonicalise the pfn-to-mfn table frame-number list. */
    for (i = 0; i < max_pfn; i += fpp) {
        if (!translate_mfn_to_pfn(&p2m_frame_list[i/fpp])) {
            ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
            ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64, i, i/fpp,
                  (uint64_t)p2m_frame_list[i/fpp]);
            goto out;
        }
    }
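
    /*
     * fpp (defined with the other save/restore parameters in
     * xg_save_restore.h) is the number of p2m entries that fit in one
     * page, so the loop above touches one frame-list entry per fpp pfns.
     */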
    /* Domain is still running at this point */
    if (live) {

        if (xc_shadow_control(xc_handle, dom,
                              XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
                              NULL, 0, NULL, 0, NULL) < 0) {
            ERROR("Couldn't enable shadow mode");
            goto out;
        }

        last_iter = 0;

    } else {

        /* This is a non-live suspend. Issue the call back to get the
           domain suspended */

        last_iter = 1;

        if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt)) {
            ERROR("Domain appears not to have suspended");
            goto out;
        }

    }
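
    /*
     * From here on, in the live case, the hypervisor tracks guest writes
     * in its log-dirty bitmap: SHADOW_OP_PEEK copies it out without
     * clearing (used below to build to_skip), while SHADOW_OP_CLEAN
     * copies and resets it between iterations (used to refresh to_send).
     */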
    /* pretend we sent all the pages last iteration */
    sent_last_iter = max_pfn;

    /* calculate the power of 2 order of max_pfn, e.g.
       15->4 16->4 17->5 */
    for (i = max_pfn-1, order_nr = 0; i ; i >>= 1, order_nr++)
        continue;
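
    /*
     * e.g. a 512MB domain has max_pfn = 131072 = 2^17; i starts at
     * 131071 and needs 17 shifts to reach zero, giving order_nr = 17,
     * matching the worked example above permute().
     */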
    /* Setup to_send / to_fix and to_skip bitmaps */
    to_send = malloc(BITMAP_SIZE);
    to_fix  = calloc(1, BITMAP_SIZE);
    to_skip = malloc(BITMAP_SIZE);

    if (!to_send || !to_fix || !to_skip) {
        ERROR("Couldn't allocate to_send array");
        goto out;
    }

    memset(to_send, 0xff, BITMAP_SIZE);

    if (lock_pages(to_send, BITMAP_SIZE)) {
        ERROR("Unable to lock to_send");
        return 1;
    }

    /* (to fix is local only) */
    if (lock_pages(to_skip, BITMAP_SIZE)) {
        ERROR("Unable to lock to_skip");
        return 1;
    }

    analysis_phase(xc_handle, dom, max_pfn, to_skip, 0);

    /* We want zeroed memory so use calloc rather than malloc. */
    pfn_type  = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type));
    pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));

    if ((pfn_type == NULL) || (pfn_batch == NULL)) {
        ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
        errno = ENOMEM;
        goto out;
    }

    if (lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type))) {
        ERROR("Unable to lock");
        goto out;
    }

    /*
     * Quick belt and braces sanity check.
     */
    {
        int err=0;
        unsigned long mfn;
        for (i = 0; i < max_pfn; i++) {

            mfn = live_p2m[i];
            if((mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i)) {
                DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
                        mfn, mfn_to_pfn(mfn));
                err++;
            }
        }
        DPRINTF("Had %d unexplained entries in p2m table\n", err);
    }
    /* Start writing out the saved-domain record. */

    if (!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) {
        ERROR("write: max_pfn");
        goto out;
    }

    /*
     * Write an extended-info structure to inform the restore code that
     * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
     * slow paths in the restore code.
     */
    if ((pt_levels == 3) &&
        (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3))) {
        unsigned long signature = ~0UL;
        uint32_t tot_sz   = sizeof(struct vcpu_guest_context) + 8;
        uint32_t chunk_sz = sizeof(struct vcpu_guest_context);
        char chunk_sig[]  = "vcpu";
        if (!write_exact(io_fd, &signature, sizeof(signature)) ||
            !write_exact(io_fd, &tot_sz,    sizeof(tot_sz)) ||
            !write_exact(io_fd, &chunk_sig, 4) ||
            !write_exact(io_fd, &chunk_sz,  sizeof(chunk_sz)) ||
            !write_exact(io_fd, &ctxt,      sizeof(ctxt))) {
            ERROR("write: extended info");
            goto out;
        }
    }
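
    /*
     * The restore side reads the first long as max_pfn; ~0UL can never
     * be a genuine page count, so it doubles as the signature announcing
     * this optional extended-info block (the "vcpu" chunk header followed
     * by the full context) without disturbing the basic image layout.
     */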
    if (!write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) {
        ERROR("write: p2m_frame_list");
        goto out;
    }

    print_stats(xc_handle, dom, 0, &stats, 0);

    /* Now write out each data page, canonicalising page tables as we go... */
    while(1) {

        unsigned int prev_pc, sent_this_iter, N, batch;

        iter++;
        sent_this_iter = 0;
        skip_this_iter = 0;
        prev_pc = 0;
        N=0;

        DPRINTF("Saving memory pages: iter %d   0%%", iter);

        while( N < max_pfn ){

            unsigned int this_pc = (N * 100) / max_pfn;

            if ((this_pc - prev_pc) >= 5) {
                DPRINTF("\b\b\b\b%3d%%", this_pc);
                prev_pc = this_pc;
            }

            /* slightly wasteful to peek the whole array every time,
               but this is fast enough for the moment. */
            if (!last_iter && xc_shadow_control(
                    xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK,
                    to_skip, max_pfn, NULL, 0, NULL) != max_pfn) {
                ERROR("Error peeking shadow bitmap");
                goto out;
            }

            /* load pfn_type[] with the mfn of all the pages we're doing in
               this batch. */
            for (batch = 0; batch < MAX_BATCH_SIZE && N < max_pfn ; N++) {

                int n = permute(N, max_pfn, order_nr);

                if (debug) {
                    DPRINTF("%d pfn= %08lx mfn= %08lx %d   [mfn]= %08lx\n",
                            iter, (unsigned long)n, live_p2m[n],
                            test_bit(n, to_send),
                            mfn_to_pfn(live_p2m[n]&0xFFFFF));
                }

                if (!last_iter && test_bit(n, to_send) && test_bit(n, to_skip))
                    skip_this_iter++; /* stats keeping */

                if (!((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
                      (test_bit(n, to_send) && last_iter) ||
                      (test_bit(n, to_fix)  && last_iter)))
                    continue;

                /*
                ** we get here if:
                **  1. page is marked to_send & hasn't already been re-dirtied
                **  2. (ignore to_skip in last iteration)
                **  3. add in pages that still need fixup (net bufs)
                */

                pfn_batch[batch] = n;
                pfn_type[batch]  = live_p2m[n];

                if(!is_mapped(pfn_type[batch])) {

                    /*
                    ** not currently in pseudo-physical map -- set bit
                    ** in to_fix since we must send this page in last_iter
                    ** unless it's sent sooner anyhow, or it never enters
                    ** pseudo-physical map (e.g. for ballooned down domains)
                    */

                    set_bit(n, to_fix);
                    continue;
                }

                if(last_iter && test_bit(n, to_fix) && !test_bit(n, to_send)) {
                    needed_to_fix++;
                    DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
                            iter, n, pfn_type[batch]);
                }

                clear_bit(n, to_fix);

                batch++;
            }

            if (batch == 0)
                goto skip; /* vanishingly unlikely... */
            if ((region_base = xc_map_foreign_batch(
                     xc_handle, dom, PROT_READ, pfn_type, batch)) == 0) {
                ERROR("map batch failed");
                goto out;
            }

            for ( j = 0; j < batch; j++ )
                ((uint32_t *)pfn_type)[j] = pfn_type[j];
            if ( xc_get_pfn_type_batch(xc_handle, dom, batch,
                                       (uint32_t *)pfn_type) )
            {
                ERROR("get_pfn_type_batch failed");
                goto out;
            }
            for ( j = batch-1; j >= 0; j-- )
                pfn_type[j] = ((uint32_t *)pfn_type)[j];
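
            /*
             * The in-place 64->32-bit conversion above is safe because
             * 32-bit slot j never sits beyond 64-bit slot j: packing
             * ascends, and unpacking must descend so that widening a
             * later entry cannot clobber one not yet unpacked. The
             * round trip exists because xc_get_pfn_type_batch() now
             * takes uint32_t entries (see the changeset description).
             */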
            for ( j = 0; j < batch; j++ )
            {

                if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
                     XEN_DOMCTL_PFINFO_XTAB )
                {
                    DPRINTF("type fail: page %i mfn %08lx\n", j, pfn_type[j]);
                    continue;
                }

                if (debug)
                    DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
                            " sum= %08lx\n",
                            iter,
                            (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
                            pfn_batch[j],
                            pfn_type[j],
                            mfn_to_pfn(pfn_type[j] &
                                       ~XEN_DOMCTL_PFINFO_LTAB_MASK),
                            csum_page(region_base + (PAGE_SIZE*j)));

                /* canonicalise mfn->pfn */
                pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
                    pfn_batch[j];
            }

            if(!write_exact(io_fd, &batch, sizeof(unsigned int))) {
                ERROR("Error when writing to state file (2) (errno %d)",
                      errno);
                goto out;
            }

            if(!write_exact(io_fd, pfn_type, sizeof(unsigned long)*j)) {
                ERROR("Error when writing to state file (3) (errno %d)",
                      errno);
                goto out;
            }

            /* entering this loop, pfn_type is now in pfns (not mfns) */
            for ( j = 0; j < batch; j++ )
            {
                unsigned long pfn, pagetype;
                void *spage = (char *)region_base + (PAGE_SIZE*j);

                pfn      = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
                pagetype = pfn_type[j] &  XEN_DOMCTL_PFINFO_LTAB_MASK;

                /* write out pages in batch */
                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
                    continue;

                pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;

                if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
                     (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
                {
                    /* We have a pagetable page: need to rewrite it. */
                    race =
                        canonicalize_pagetable(pagetype, pfn, spage, page);

                    if(race && !live)
                        goto out;

                    if (ratewrite(io_fd, page, PAGE_SIZE) != PAGE_SIZE) {
                        ERROR("Error when writing to state file (4)"
                              " (errno %d)", errno);
                        goto out;
                    }

                } else {

                    /* We have a normal page: just write it directly. */
                    if (ratewrite(io_fd, spage, PAGE_SIZE) != PAGE_SIZE) {
                        ERROR("Error when writing to state file (5)"
                              " (errno %d)", errno);
                        goto out;
                    }
                }
            } /* end of the write out for this batch */

            sent_this_iter += batch;

            munmap(region_base, batch*PAGE_SIZE);

        } /* end of this while loop for this iteration */
      skip:

        total_sent += sent_this_iter;

        DPRINTF("\r %d: sent %d, skipped %d, ",
                iter, sent_this_iter, skip_this_iter );

        if (last_iter) {
            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);

            DPRINTF("Total pages sent= %ld (%.2fx)\n",
                    total_sent, ((float)total_sent)/max_pfn );
            DPRINTF("(of which %ld were fixups)\n", needed_to_fix  );
        }

        if (last_iter && debug) {
            int minusone = -1;
            memset(to_send, 0xff, BITMAP_SIZE);
            debug = 0;
            DPRINTF("Entering debug resend-all mode\n");

            /* send "-1" to put receiver into debug mode */
            if(!write_exact(io_fd, &minusone, sizeof(int))) {
                ERROR("Error when writing to state file (6) (errno %d)",
                      errno);
                goto out;
            }

            continue;
        }

        if (last_iter)
            break;

        if (live) {
            if (((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
                (iter >= max_iters) ||
                (sent_this_iter+skip_this_iter < 50) ||
                (total_sent > max_pfn*max_factor)) {
                DPRINTF("Start last iteration\n");
                last_iter = 1;

                if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
                                      &ctxt)) {
                    ERROR("Domain appears not to have suspended");
                    goto out;
                }

                DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
                        info.shared_info_frame,
                        (unsigned long)ctxt.user_regs.eip,
                        (unsigned long)ctxt.user_regs.edx);
            }

            if (xc_shadow_control(xc_handle, dom,
                                  XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
                                  max_pfn, NULL, 0, &stats) != max_pfn) {
                ERROR("Error flushing shadow PT");
                goto out;
            }

            sent_last_iter = sent_this_iter;

            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);

        }
    } /* end of while 1 */
    DPRINTF("All memory is saved\n");

    /* Zero terminate */
    i = 0;
    if (!write_exact(io_fd, &i, sizeof(int))) {
        ERROR("Error when writing to state file (6') (errno %d)", errno);
        goto out;
    }

    /* Send through a list of all the PFNs that were not in map at the close */
    {
        unsigned int i,j;
        unsigned long pfntab[1024];

        for (i = 0, j = 0; i < max_pfn; i++) {
            if (!is_mapped(live_p2m[i]))
                j++;
        }

        if(!write_exact(io_fd, &j, sizeof(unsigned int))) {
            ERROR("Error when writing to state file (6a) (errno %d)", errno);
            goto out;
        }

        for (i = 0, j = 0; i < max_pfn; ) {

            if (!is_mapped(live_p2m[i]))
                pfntab[j++] = i;

            i++;
            if (j == 1024 || i == max_pfn) {
                if(!write_exact(io_fd, &pfntab, sizeof(unsigned long)*j)) {
                    ERROR("Error when writing to state file (6b) (errno %d)",
                          errno);
                    goto out;
                }
                j = 0;
            }
        }
    }

    /* Canonicalise the suspend-record frame number. */
    if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) ) {
        ERROR("Suspend record is not in range of pseudophys map");
        goto out;
    }

    /* Canonicalise each GDT frame number. */
    for ( i = 0; (512*i) < ctxt.gdt_ents; i++ ) {
        if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
            ERROR("GDT frame is not in range of pseudophys map");
            goto out;
        }
    }

    /* Canonicalise the page table base pointer. */
    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[3])) ) {
        ERROR("PT base is not in range of pseudophys map");
        goto out;
    }
    ctxt.ctrlreg[3] =
        xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[3])));

    /*
     * Reset the MFN to be a known-invalid value. See map_frame_list_list().
     */
    memcpy(page, live_shinfo, PAGE_SIZE);
    ((shared_info_t *)page)->arch.pfn_to_mfn_frame_list_list = 0;

    if (!write_exact(io_fd, &ctxt, sizeof(ctxt)) ||
        !write_exact(io_fd, page, PAGE_SIZE)) {
        ERROR("Error when writing to state file (1) (errno %d)", errno);
        goto out;
    }

    /* Success! */
    rc = 0;
 out:

    if (live) {
        if(xc_shadow_control(xc_handle, dom,
                             XEN_DOMCTL_SHADOW_OP_OFF,
                             NULL, 0, NULL, 0, NULL) < 0) {
            DPRINTF("Warning - couldn't disable shadow mode");
        }
    }

    if (live_shinfo)
        munmap(live_shinfo, PAGE_SIZE);

    if (live_p2m_frame_list_list)
        munmap(live_p2m_frame_list_list, PAGE_SIZE);

    if (live_p2m_frame_list)
        munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);

    if(live_p2m)
        munmap(live_p2m, P2M_SIZE);

    if(live_m2p)
        munmap(live_m2p, M2P_SIZE(max_mfn));

    free(pfn_type);
    free(pfn_batch);
    free(to_send);
    free(to_fix);
    free(to_skip);

    DPRINTF("Save exit rc=%d\n",rc);

    return !!rc;
}
/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */