ia64/xen-unstable

view tools/libxc/xc_linux_save.c @ 8075:bf09a8db5bb4

Fix formatting.

Signed-off-by: Steven Hand <steven@xensource.com>
author smh22@firebug.cl.cam.ac.uk
date Sat Nov 26 12:17:35 2005 +0100 (2005-11-26)
parents 486f4c9e1c22
children 7acd50d945d7
line source
1 /******************************************************************************
2 * xc_linux_save.c
3 *
4 * Save the state of a running Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <inttypes.h>
10 #include <time.h>
11 #include <stdlib.h>
12 #include <unistd.h>
13 #include <sys/time.h>
15 #include "xg_private.h"
16 #include "xg_save_restore.h"
18 /*
19 ** Default values for important tuning parameters. Can override by passing
20 ** non-zero replacement values to xc_linux_save().
21 **
22 ** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
23 **
24 */
25 #define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */
26 #define DEF_MAX_FACTOR 3 /* never send more than 3x nr_pfns */
29 /* max mfn of the whole machine */
30 static unsigned long max_mfn;
32 /* virtual starting address of the hypervisor */
33 static unsigned long hvirt_start;
35 /* #levels of page tables used by the current guest */
36 static unsigned int pt_levels;
38 /* total number of pages used by the current guest */
39 static unsigned long max_pfn;
41 /* Live mapping of the table mapping each PFN to its current MFN. */
42 static unsigned long *live_p2m = NULL;
44 /* Live mapping of system MFN to PFN table. */
45 static unsigned long *live_m2p = NULL;
47 /* grep fodder: machine_to_phys */
49 #define mfn_to_pfn(_mfn) live_m2p[(_mfn)]
51 /*
52 * Returns TRUE if the given machine frame number has a unique mapping
53 * in the guest's pseudophysical map.
54 */
55 #define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \
56 (((_mfn) < (max_mfn)) && \
57 ((mfn_to_pfn(_mfn) < (max_pfn)) && \
58 (live_p2m[mfn_to_pfn(_mfn)] == (_mfn))))
61 /* Returns TRUE if MFN is successfully converted to a PFN. */
62 #define translate_mfn_to_pfn(_pmfn) \
63 ({ \
64 unsigned long mfn = *(_pmfn); \
65 int _res = 1; \
66 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) \
67 _res = 0; \
68 else \
69 *(_pmfn) = mfn_to_pfn(mfn); \
70 _res; \
71 })
73 /*
74 ** During (live) save/migrate, we maintain a number of bitmaps to track
75 ** which pages we have to send, to fixup, and to skip.
76 */
78 #define BITS_PER_LONG (sizeof(unsigned long) * 8)
79 #define BITMAP_SIZE ((max_pfn + BITS_PER_LONG - 1) / 8)
81 #define BITMAP_ENTRY(_nr,_bmap) \
82 ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
84 #define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
/*
** Return 1 if bit `nr` is set in the bitmap at `addr`, else 0.
** The bitmap is an array of unsigned longs, bit 0 of word 0 first.
*/
static inline int test_bit (int nr, volatile void * addr)
{
    const unsigned long *bmap = (const unsigned long *)addr;
    const int bits_per_word = sizeof(unsigned long) * 8;  /* == BITS_PER_LONG */
    return (int)((bmap[nr / bits_per_word] >> (nr % bits_per_word)) & 1);
}
/*
** Clear bit `nr` in the bitmap at `addr`.
**
** Fix: the original built the mask as ~(1 << shift) with an int-typed 1;
** for nr % BITS_PER_LONG >= 32 on 64-bit builds that shift is undefined
** behaviour and the mask never reaches the upper half of the word.  Use
** 1UL so the mask has the full width of the bitmap word.
*/
static inline void clear_bit (int nr, volatile void * addr)
{
    unsigned long *bmap = (unsigned long *)addr;
    const int bits_per_word = sizeof(unsigned long) * 8;  /* == BITS_PER_LONG */
    bmap[nr / bits_per_word] &= ~(1UL << (nr % bits_per_word));
}
/*
** Set bit `nr` in the bitmap at `addr`.
**
** Fix: the original shifted an int-typed 1; for nr % BITS_PER_LONG >= 32
** on 64-bit builds that is undefined behaviour and cannot reach the upper
** half of the word.  Shift 1UL instead.
*/
static inline void set_bit ( int nr, volatile void * addr)
{
    unsigned long *bmap = (unsigned long *)addr;
    const int bits_per_word = sizeof(unsigned long) * 8;  /* == BITS_PER_LONG */
    bmap[nr / bits_per_word] |= 1UL << (nr % bits_per_word);
}
/*
** Population count: number of set bits in a 32-bit word.
** Parallel bit-summing into nibbles, then one multiply folds the four
** byte sums into the top byte.
*/
static inline unsigned int hweight32(unsigned int w)
{
    w = w - ((w >> 1) & 0x55555555);                 /* 2-bit sums   */
    w = (w & 0x33333333) + ((w >> 2) & 0x33333333);  /* 4-bit sums   */
    w = (w + (w >> 4)) & 0x0F0F0F0F;                 /* byte sums    */
    return (w * 0x01010101) >> 24;                   /* fold to top  */
}
/*
** Count the set bits in the first `nr` bits of the bitmap at `addr`.
** Only whole unsigned-long words are inspected: callers pad their
** bitmaps to word size, so a trailing partial word is deliberately
** ignored (matching the original semantics).
**
** Fix: the original fed each unsigned long through hweight32(), which
** silently discards the upper 32 bits of every word on 64-bit builds
** and so under-counts.  Use Kernighan's clear-lowest-bit loop instead,
** which is word-width independent and self-contained.
*/
static inline int count_bits ( int nr, volatile void *addr)
{
    int i, count = 0;
    unsigned long *p = (unsigned long *)addr;

    /* We know that the array is padded to unsigned long. */
    for ( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ ) {
        unsigned long w = *p;
        while (w) {
            w &= w - 1;   /* clears the lowest set bit */
            count++;
        }
    }
    return count;
}
/*
** Pseudo-random permutation of [0, nr): we scan pages in a shuffled
** order so the dirty-rate estimate is not skewed by contiguous runs of
** pfns with similar behaviour.  order_nr is the power-of-2 order of nr
** (e.g. nr->order: 15->4, 16->4, 17->5).
**
** The permutation rotates the low order_nr bits of the index left by
** 10 and repeats until the result lands inside [0, nr); when nr is an
** exact power of two a single rotation always suffices.
**
** NOTE(review): assumes order_nr >= 10 (true for all callers here); a
** smaller order would make the right shift's count negative -- confirm
** before reusing elsewhere.
*/
static inline int permute( int i, int nr, int order_nr )
{
    const int mask = (1 << order_nr) - 1;

    for (;;) {
        /* rotate the low order_nr bits of i left by 10 */
        i = ((i >> (order_nr - 10)) | (i << 10)) & mask;
        if (i < nr)
            return i;
    }
}
/*
** Convert a struct timeval to microseconds since the epoch.
**
** Fix: the original multiplied new->tv_sec * 1000000 in the type of
** tv_sec (a long), which overflows on ILP32 for any realistic wall-clock
** time; widen to uint64_t before multiplying.
*/
static uint64_t tv_to_us(struct timeval *new)
{
    return ((uint64_t)new->tv_sec * 1000000U) + (uint64_t)new->tv_usec;
}
159 static uint64_t llgettimeofday(void)
160 {
161 struct timeval now;
162 gettimeofday(&now, NULL);
163 return tv_to_us(&now);
164 }
/*
** Microseconds elapsed between *old and *new (callers pass new >= old).
**
** Fix: the original computed (new->tv_sec - old->tv_sec) * 1000000 in
** time_t/long, which overflows on ILP32 for intervals over ~35 minutes;
** do the arithmetic in 64 bits.
*/
static uint64_t tv_delta(struct timeval *new, struct timeval *old)
{
    return ((uint64_t)(new->tv_sec - old->tv_sec) * 1000000U) +
        (new->tv_usec - old->tv_usec);
}
173 #ifdef ADAPTIVE_SAVE
176 /*
177 ** We control the rate at which we transmit (or save) to minimize impact
178 ** on running domains (including the target if we're doing live migrate).
179 */
181 #define MAX_MBIT_RATE 500 /* maximum transmit rate for migrate */
182 #define START_MBIT_RATE 100 /* initial transmit rate for migrate */
185 /* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
186 #define RATE_TO_BTU 781250
188 /* Amount in bytes we allow ourselves to send in a burst */
189 #define BURST_BUDGET (100*1024)
192 /* We keep track of the current and previous transmission rate */
193 static int mbit_rate, ombit_rate = 0;
195 /* Have we reached the maximum transmission rate? */
196 #define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
199 static inline void initialize_mbit_rate()
200 {
201 mbit_rate = START_MBIT_RATE;
202 }
/*
** Token-bucket rate limiter around write(), used during live migrate to
** cap the transmit bandwidth at the global mbit_rate (Mb/s).  We may
** send BURST_BUDGET bytes per burst_time_us time slot and nanosleep()
** until the next slot once the current budget is exhausted.  The slot
** length is recomputed from mbit_rate whenever that global changes.
** Falls through to a plain write() when START_MBIT_RATE is 0.
** Returns whatever the final write() returns.
*/
205 static int ratewrite(int io_fd, void *buf, int n)
206 {
/* bytes still allowed in the current burst; goes negative when overdrawn */
207 static int budget = 0;
/* length of one burst slot in microseconds; -1 until first computed */
208 static int burst_time_us = -1;
/* start time of the slot the budget was last topped up for */
209 static struct timeval last_put = { 0 };
210 struct timeval now;
211 struct timespec delay;
212 long long delta;
214 if (START_MBIT_RATE == 0)
215 return write(io_fd, buf, n);
216 
217 budget -= n;
218 if (budget < 0) {
/* rate changed since last time: recompute the slot length */
219 if (mbit_rate != ombit_rate) {
220 burst_time_us = RATE_TO_BTU / mbit_rate;
221 ombit_rate = mbit_rate;
222 DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
223 mbit_rate, BURST_BUDGET, burst_time_us);
224 }
/* first ever call: seed the clock and grant one burst */
225 if (last_put.tv_sec == 0) {
226 budget += BURST_BUDGET;
227 gettimeofday(&last_put, NULL);
228 } else {
229 while (budget < 0) {
230 gettimeofday(&now, NULL);
231 delta = tv_delta(&now, &last_put);
/* credit one burst for every full slot that has elapsed */
232 while (delta > burst_time_us) {
233 budget += BURST_BUDGET;
234 last_put.tv_usec += burst_time_us;
235 if (last_put.tv_usec > 1000000) {
236 last_put.tv_usec -= 1000000;
237 last_put.tv_sec++;
238 }
239 delta -= burst_time_us;
240 }
241 if (budget > 0)
242 break;
/* still overdrawn: sleep out the remainder of the slot */
243 delay.tv_sec = 0;
244 delay.tv_nsec = 1000 * (burst_time_us - delta);
245 while (delay.tv_nsec > 0)
246 if (nanosleep(&delay, &delay) == 0)
247 break;
248 }
249 }
250 }
251 return write(io_fd, buf, n);
252 }
254 #else /* ! ADAPTIVE SAVE */
256 #define RATE_IS_MAX() (0)
257 #define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
258 #define initialize_mbit_rate()
260 #endif
/*
** Write exactly `count` bytes from `buf` to `fd`.
** Returns 1 on success, 0 on failure.
**
** Fix: the original issued a single write() and reported failure on any
** short write, but write(2) may legitimately transfer fewer bytes than
** requested (pipes, sockets, signal interruption).  Loop until the whole
** buffer has been written, treating only n <= 0 as a real error.
*/
static inline ssize_t write_exact(int fd, void *buf, size_t count)
{
    size_t done = 0;

    while (done < count) {
        ssize_t n = write(fd, (char *)buf + done, count - done);
        if (n <= 0)
            return 0;
        done += (size_t)n;
    }
    return 1;
}
/*
** Sample wall-clock time plus dom0 and target-domain CPU usage and, when
** `print` is non-zero, report the deltas since the previous call together
** with the send/dirty bandwidth implied by pages_sent and
** stats->dirty_count.  Under ADAPTIVE_SAVE, also raises the global
** mbit_rate to track the observed dirty rate (capped at MAX_MBIT_RATE).
** Baselines live in function statics, so the first call (made with
** print == 0) only establishes them.  Always returns 0.
*/
272 static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
273 xc_shadow_control_stats_t *stats, int print)
274 {
/* baselines from the previous invocation */
275 static struct timeval wall_last;
276 static long long d0_cpu_last;
277 static long long d1_cpu_last;
278 
279 struct timeval wall_now;
280 long long wall_delta;
281 long long d0_cpu_now, d0_cpu_delta;
282 long long d1_cpu_now, d1_cpu_delta;
283 
284 gettimeofday(&wall_now, NULL);
285 
/* CPU usage is returned in ns; scale to us here, then to ms below */
286 d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
287 d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
288 
289 if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
290 fprintf(stderr, "ARRHHH!!\n");
291 
292 wall_delta = tv_delta(&wall_now,&wall_last)/1000;
293 
/* avoid dividing by zero when calls are less than 1ms apart */
294 if (wall_delta == 0) wall_delta = 1;
295 
296 d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
297 d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
298 
299 if (print)
300 fprintf(stderr,
301 "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
302 "dirtied %dMb/s %" PRId32 " pages\n",
303 wall_delta,
304 (int)((d0_cpu_delta*100)/wall_delta),
305 (int)((d1_cpu_delta*100)/wall_delta),
306 (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
307 (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
308 stats->dirty_count);
309 
310 #ifdef ADAPTIVE_SAVE
/* chase the dirty rate (plus 50 Mb/s headroom), capped at the maximum */
311 if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
312 mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
313 + 50;
314 if (mbit_rate > MAX_MBIT_RATE)
315 mbit_rate = MAX_MBIT_RATE;
316 }
317 #endif
318 
319 d0_cpu_last = d0_cpu_now;
320 d1_cpu_last = d1_cpu_now;
321 wall_last = wall_now;
322 
323 return 0;
324 }
/*
** Debug aid: for each of `runs` passes, clean the shadow log-dirty
** bitmap and then poll the fault/dirty statistics 40 times at 50ms
** intervals (~2s per run), printing the counters to stderr so the
** domain's page-dirtying behaviour can be measured.  The normal save
** path calls this with runs == 0, making it a no-op.
**
** NOTE(review): always returns -1; the caller ignores the return value.
*/
327 static int analysis_phase(int xc_handle, uint32_t domid, int max_pfn,
328 unsigned long *arr, int runs)
329 {
330 long long start, now;
331 xc_shadow_control_stats_t stats;
332 int j;
333 
334 start = llgettimeofday();
335 
336 for (j = 0; j < runs; j++) {
337 int i;
338 
/* reset the dirty bitmap so this run starts from a clean slate */
339 xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_CLEAN,
340 arr, max_pfn, NULL);
341 fprintf(stderr, "#Flush\n");
/* sample the counters 40 times at 50ms intervals */
342 for ( i = 0; i < 40; i++ ) {
343 usleep(50000);
344 now = llgettimeofday();
345 xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_PEEK,
346 NULL, 0, &stats);
347 
348 fprintf(stderr, "now= %lld faults= %" PRId32 " dirty= %" PRId32
349 " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n",
350 ((now-start)+500)/1000,
351 stats.fault_count, stats.dirty_count,
352 stats.dirty_net_count, stats.dirty_block_count);
353 }
354 }
355 
356 return -1;
357 }
/*
** Ask the controlling tools to suspend the domain, then wait until it
** really has suspended.
**
** Protocol: print "suspend\n" on stdout and expect "done\n" back on
** stdin; the tool at the other end performs the actual suspend.  We then
** poll the domain info (refreshing vcpu0's context into *ctxt as we go)
** until shutdown_reason == SHUTDOWN_suspend, unpausing the domain if it
** was merely paused, and otherwise retrying up to 100 times at 10ms
** intervals.
**
** Returns 0 on success, -1 on failure.  io_fd is unused here.
**
** NOTE(review): the `paused` path jumps back to retry without bumping
** the retry counter `i`, so a domain that stays paused loops forever --
** confirm whether that is intended.
*/
360 static int suspend_and_state(int xc_handle, int io_fd, int dom,
361 xc_dominfo_t *info,
362 vcpu_guest_context_t *ctxt)
363 {
364 int i = 0;
365 char ans[30];
366 
367 printf("suspend\n");
368 fflush(stdout);
369 if (fgets(ans, sizeof(ans), stdin) == NULL) {
370 ERR("failed reading suspend reply");
371 return -1;
372 }
373 if (strncmp(ans, "done\n", 5)) {
374 ERR("suspend reply incorrect: %s", ans);
375 return -1;
376 }
377 
378 retry:
379 
380 if (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) {
381 ERR("Could not get domain info");
382 return -1;
383 }
384 
/* failure here is only logged; the stale context is still usable */
385 if ( xc_domain_get_vcpu_context(xc_handle, dom, 0 /* XXX */, ctxt))
386 ERR("Could not get vcpu context");
387 
388 
389 if (info->shutdown && info->shutdown_reason == SHUTDOWN_suspend)
390 return 0; // success
391 
392 if (info->paused) {
393 // try unpausing domain, wait, and retest
394 xc_domain_unpause( xc_handle, dom );
395 
396 ERR("Domain was paused. Wait and re-test.");
397 usleep(10000); // 10ms
398 
399 goto retry;
400 }
401 
402 
403 if( ++i < 100 ) {
404 ERR("Retry suspend domain.");
405 usleep(10000); // 10ms
406 goto retry;
407 }
408 
409 ERR("Unable to suspend domain.");
410 
411 return -1;
412 }
415 /*
416 ** During transfer (or in the state file), all page-table pages must be
417 ** converted into a 'canonical' form where references to actual mfns
418 ** are replaced with references to the corresponding pfns.
419 **
420 ** This function performs the appropriate conversion, taking into account
421 ** which entries do not require canonicalization (in particular, those
422 ** entries which map the virtual address reserved for the hypervisor).
423 */
/*
** Rewrite one page-table page from `spage` into `dpage`, replacing each
** present PTE's mfn with the corresponding pfn (the 'canonical' form used
** in the save image).  `type` is the page's table level (L1TAB..L4TAB)
** and `pfn` its own pseudophysical frame number.  Entries in the range
** [xen_start, xen_end) map the hypervisor's reserved virtual area and
** are zeroed rather than translated.
*/
424 void canonicalize_pagetable(unsigned long type, unsigned long pfn,
425 const void *spage, void *dpage)
426 {
427 
428 int i, pte_last, xen_start, xen_end;
429 uint64_t pte;
430 
431 /*
432 ** We need to determine which entries in this page table hold
433 ** reserved hypervisor mappings. This depends on the current
434 ** page table type as well as the number of paging levels.
435 */
/* PTEs are 4 bytes on 2-level (non-PAE x86), 8 bytes otherwise */
436 xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8);
437 
438 if (pt_levels == 2 && type == L2TAB)
439 xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);
440 
441 if (pt_levels == 3 && type == L3TAB)
442 xen_start = L3_PAGETABLE_ENTRIES_PAE;
443 
444 /*
445 ** in PAE only the L2 mapping the top 1GB contains Xen mappings.
446 ** We can spot this by looking for the guest linear mapping which
447 ** Xen always ensures is present in that L2. Guests must ensure
448 ** that this check will fail for other L2s.
449 */
450 if (pt_levels == 3 && type == L2TAB) {
451 
452 /* XXX index of the L2 entry in PAE mode which holds the guest LPT */
453 #define PAE_GLPT_L2ENTRY (495)
454 pte = ((uint64_t*)spage)[PAE_GLPT_L2ENTRY];
455 
/* the linear-map entry points back at this page itself */
456 if(((pte >> PAGE_SHIFT) & 0x0fffffff) == live_p2m[pfn])
457 xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
458 }
459 
460 if (pt_levels == 4 && type == L4TAB) {
461 /*
462 ** XXX SMH: should compute these from hvirt_start (which we have)
463 ** and hvirt_end (which we don't)
464 */
465 xen_start = 256;
466 xen_end = 272;
467 }
468 
469 /* Now iterate through the page table, canonicalizing each PTE */
470 for (i = 0; i < pte_last; i++ ) {
471 
472 unsigned long pfn, mfn;
473 
474 if (pt_levels == 2)
475 pte = ((uint32_t*)spage)[i];
476 else
477 pte = ((uint64_t*)spage)[i];
478 
/* entries mapping the hypervisor area are not sent */
479 if (i >= xen_start && i < xen_end)
480 pte = 0;
481 
482 if (pte & _PAGE_PRESENT) {
483 
484 mfn = (pte >> PAGE_SHIFT) & 0xfffffff;
485 if (!MFN_IS_IN_PSEUDOPHYS_MAP(mfn)) {
486 /* This will happen if the type info is stale which
487 is quite feasible under live migration */
488 DPRINTF("PT Race: [%08lx,%d] pte=%llx, mfn=%08lx\n",
489 type, i, (unsigned long long)pte, mfn);
490 pfn = 0; /* zap it - we'll retransmit this page later */
491 } else
492 pfn = mfn_to_pfn(mfn);
493 
/* keep flag bits, replace the frame number with the pfn */
494 pte &= 0xffffff0000000fffULL;
495 pte |= (uint64_t)pfn << PAGE_SHIFT;
496 }
497 
498 if (pt_levels == 2)
499 ((uint32_t*)dpage)[i] = pte;
500 else
501 ((uint64_t*)dpage)[i] = pte;
502 
503 }
504 
505 return;
506 }
510 static unsigned long *xc_map_m2p(int xc_handle,
511 unsigned long max_mfn,
512 int prot)
513 {
514 struct xen_machphys_mfn_list xmml;
515 privcmd_mmap_t ioctlx;
516 privcmd_mmap_entry_t *entries;
517 unsigned long m2p_chunks, m2p_size;
518 unsigned long *m2p;
519 int i, rc;
521 m2p_size = M2P_SIZE(max_mfn);
522 m2p_chunks = M2P_CHUNKS(max_mfn);
524 xmml.max_extents = m2p_chunks;
525 if (!(xmml.extent_start = malloc(m2p_chunks * sizeof(unsigned long)))) {
526 ERR("failed to allocate space for m2p mfns!\n");
527 return NULL;
528 }
530 if (xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
531 (xmml.nr_extents != m2p_chunks)) {
532 ERR("xc_get_m2p_mfns:");
533 return NULL;
534 }
536 if ((m2p = mmap(NULL, m2p_size, prot,
537 MAP_SHARED, xc_handle, 0)) == MAP_FAILED) {
538 ERR("failed to mmap m2p");
539 return NULL;
540 }
542 if (!(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t)))) {
543 ERR("failed to allocate space for mmap entries!\n");
544 return NULL;
545 }
547 ioctlx.num = m2p_chunks;
548 ioctlx.dom = DOMID_XEN;
549 ioctlx.entry = entries;
551 for (i=0; i < m2p_chunks; i++) {
552 entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
553 entries[i].mfn = xmml.extent_start[i];
554 entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
555 }
557 if ((rc = ioctl(xc_handle, IOCTL_PRIVCMD_MMAP, &ioctlx)) < 0) {
558 ERR("ioctl_mmap failed (rc = %d)", rc);
559 return NULL;
560 }
562 free(xmml.extent_start);
563 free(entries);
565 return m2p;
566 }
570 int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
571 uint32_t max_factor, uint32_t flags)
572 {
573 xc_dominfo_t info;
575 int rc = 1, i, j, last_iter, iter = 0;
576 int live = (flags & XCFLAGS_LIVE);
577 int debug = (flags & XCFLAGS_DEBUG);
578 int sent_last_iter, skip_this_iter;
580 /* The new domain's shared-info frame number. */
581 unsigned long shared_info_frame;
583 /* A copy of the CPU context of the guest. */
584 vcpu_guest_context_t ctxt;
586 /* A table containg the type of each PFN (/not/ MFN!). */
587 unsigned long *pfn_type = NULL;
588 unsigned long *pfn_batch = NULL;
590 /* A temporary mapping, and a copy, of one frame of guest memory. */
591 char page[PAGE_SIZE];
593 /* Double and single indirect references to the live P2M table */
594 unsigned long *live_p2m_frame_list_list = NULL;
595 unsigned long *live_p2m_frame_list = NULL;
597 /* A copy of the pfn-to-mfn table frame list. */
598 unsigned long *p2m_frame_list = NULL;
600 /* Live mapping of shared info structure */
601 shared_info_t *live_shinfo = NULL;
603 /* base of the region in which domain memory is mapped */
604 unsigned char *region_base = NULL;
606 /* power of 2 order of max_pfn */
607 int order_nr;
609 /* bitmap of pages:
610 - that should be sent this iteration (unless later marked as skip);
611 - to skip this iteration because already dirty;
612 - to fixup by sending at the end if not already resent; */
613 unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
615 xc_shadow_control_stats_t stats;
617 unsigned long needed_to_fix = 0;
618 unsigned long total_sent = 0;
621 /* If no explicit control parameters given, use defaults */
622 if(!max_iters)
623 max_iters = DEF_MAX_ITERS;
624 if(!max_factor)
625 max_factor = DEF_MAX_FACTOR;
627 initialize_mbit_rate();
629 if(!get_platform_info(xc_handle, dom,
630 &max_mfn, &hvirt_start, &pt_levels)) {
631 ERR("Unable to get platform info.");
632 return 1;
633 }
635 if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
636 ERR("Could not get domain info");
637 return 1;
638 }
640 if (mlock(&ctxt, sizeof(ctxt))) {
641 ERR("Unable to mlock ctxt");
642 return 1;
643 }
645 /* Only have to worry about vcpu 0 even for SMP */
646 if (xc_domain_get_vcpu_context(xc_handle, dom, 0, &ctxt)) {
647 ERR("Could not get vcpu context");
648 goto out;
649 }
650 shared_info_frame = info.shared_info_frame;
652 /* A cheesy test to see whether the domain contains valid state. */
653 if (ctxt.ctrlreg[3] == 0)
654 {
655 ERR("Domain is not in a valid Linux guest OS state");
656 goto out;
657 }
659 /* cheesy sanity check */
660 if ((info.max_memkb >> (PAGE_SHIFT - 10)) > max_mfn) {
661 ERR("Invalid state record -- pfn count out of range: %lu",
662 (info.max_memkb >> (PAGE_SHIFT - 10)));
663 goto out;
664 }
666 /* Map the shared info frame */
667 if(!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
668 PROT_READ, shared_info_frame))) {
669 ERR("Couldn't map live_shinfo");
670 goto out;
671 }
673 max_pfn = live_shinfo->arch.max_pfn;
675 live_p2m_frame_list_list =
676 xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
677 live_shinfo->arch.pfn_to_mfn_frame_list_list);
679 if (!live_p2m_frame_list_list) {
680 ERR("Couldn't map p2m_frame_list_list");
681 goto out;
682 }
684 live_p2m_frame_list =
685 xc_map_foreign_batch(xc_handle, dom, PROT_READ,
686 live_p2m_frame_list_list,
687 P2M_FLL_ENTRIES);
689 if (!live_p2m_frame_list) {
690 ERR("Couldn't map p2m_frame_list");
691 goto out;
692 }
694 /* Map all the frames of the pfn->mfn table. For migrate to succeed,
695 the guest must not change which frames are used for this purpose.
696 (its not clear why it would want to change them, and we'll be OK
697 from a safety POV anyhow. */
699 live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
700 live_p2m_frame_list,
701 P2M_FL_ENTRIES);
703 if (!live_p2m) {
704 ERR("Couldn't map p2m table");
705 goto out;
706 }
708 /* Setup the mfn_to_pfn table mapping */
709 if(!(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ))) {
710 ERR("Failed to map live M2P table");
711 goto out;
712 }
715 /* Get a local copy of the live_P2M_frame_list */
716 if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
717 ERR("Couldn't allocate p2m_frame_list array");
718 goto out;
719 }
720 memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);
722 /* Canonicalise the pfn-to-mfn table frame-number list. */
723 for (i = 0; i < max_pfn; i += ulpp) {
724 if (!translate_mfn_to_pfn(&p2m_frame_list[i/ulpp])) {
725 ERR("Frame# in pfn-to-mfn frame list is not in pseudophys");
726 ERR("entry %d: p2m_frame_list[%ld] is 0x%lx", i, i/ulpp,
727 p2m_frame_list[i/ulpp]);
728 goto out;
729 }
730 }
732 /* Domain is still running at this point */
733 if (live) {
735 if (xc_shadow_control(xc_handle, dom,
736 DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
737 NULL, 0, NULL ) < 0) {
738 ERR("Couldn't enable shadow mode");
739 goto out;
740 }
742 last_iter = 0;
744 } else {
746 /* This is a non-live suspend. Issue the call back to get the
747 domain suspended */
749 last_iter = 1;
751 if (suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt)) {
752 ERR("Domain appears not to have suspended");
753 goto out;
754 }
756 }
758 /* pretend we sent all the pages last iteration */
759 sent_last_iter = max_pfn;
762 /* calculate the power of 2 order of max_pfn, e.g.
763 15->4 16->4 17->5 */
764 for (i = max_pfn-1, order_nr = 0; i ; i >>= 1, order_nr++)
765 continue;
767 /* Setup to_send / to_fix and to_skip bitmaps */
768 to_send = malloc(BITMAP_SIZE);
769 to_fix = calloc(1, BITMAP_SIZE);
770 to_skip = malloc(BITMAP_SIZE);
772 if (!to_send || !to_fix || !to_skip) {
773 ERR("Couldn't allocate to_send array");
774 goto out;
775 }
777 memset(to_send, 0xff, BITMAP_SIZE);
779 if (mlock(to_send, BITMAP_SIZE)) {
780 ERR("Unable to mlock to_send");
781 return 1;
782 }
784 /* (to fix is local only) */
785 if (mlock(to_skip, BITMAP_SIZE)) {
786 ERR("Unable to mlock to_skip");
787 return 1;
788 }
790 analysis_phase(xc_handle, dom, max_pfn, to_skip, 0);
792 /* We want zeroed memory so use calloc rather than malloc. */
793 pfn_type = calloc(MAX_BATCH_SIZE, sizeof(unsigned long));
794 pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(unsigned long));
796 if ((pfn_type == NULL) || (pfn_batch == NULL)) {
797 ERR("failed to alloc memory for pfn_type and/or pfn_batch arays.");
798 errno = ENOMEM;
799 goto out;
800 }
802 if (mlock(pfn_type, MAX_BATCH_SIZE * sizeof(unsigned long))) {
803 ERR("Unable to mlock");
804 goto out;
805 }
808 /*
809 * Quick belt and braces sanity check.
810 */
811 {
812 int err=0;
813 unsigned long mfn;
814 for (i = 0; i < max_pfn; i++) {
816 mfn = live_p2m[i];
817 if((mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i)) {
818 DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
819 mfn, mfn_to_pfn(mfn));
820 err++;
821 }
822 }
823 DPRINTF("Had %d unexplained entries in p2m table\n", err);
824 }
827 /* Start writing out the saved-domain record. */
829 if(!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) {
830 ERR("write: max_pfn");
831 goto out;
832 }
834 if(!write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) {
835 ERR("write: p2m_frame_list");
836 goto out;
837 }
839 print_stats(xc_handle, dom, 0, &stats, 0);
841 /* Now write out each data page, canonicalising page tables as we go... */
843 while(1) {
845 unsigned int prev_pc, sent_this_iter, N, batch;
847 iter++;
848 sent_this_iter = 0;
849 skip_this_iter = 0;
850 prev_pc = 0;
851 N=0;
853 DPRINTF("Saving memory pages: iter %d 0%%", iter);
855 while( N < max_pfn ){
857 unsigned int this_pc = (N * 100) / max_pfn;
859 if ((this_pc - prev_pc) >= 5) {
860 DPRINTF("\b\b\b\b%3d%%", this_pc);
861 prev_pc = this_pc;
862 }
864 /* slightly wasteful to peek the whole array evey time,
865 but this is fast enough for the moment. */
866 if (!last_iter && xc_shadow_control(
867 xc_handle, dom, DOM0_SHADOW_CONTROL_OP_PEEK,
868 to_skip, max_pfn, NULL) != max_pfn) {
869 ERR("Error peeking shadow bitmap");
870 goto out;
871 }
874 /* load pfn_type[] with the mfn of all the pages we're doing in
875 this batch. */
876 for (batch = 0; batch < MAX_BATCH_SIZE && N < max_pfn ; N++) {
878 int n = permute(N, max_pfn, order_nr);
880 if (debug) {
881 DPRINTF("%d pfn= %08lx mfn= %08lx %d [mfn]= %08lx\n",
882 iter, (unsigned long)n, live_p2m[n],
883 test_bit(n, to_send),
884 mfn_to_pfn(live_p2m[n]&0xFFFFF));
885 }
887 if (!last_iter && test_bit(n, to_send)&& test_bit(n, to_skip))
888 skip_this_iter++; /* stats keeping */
890 if (!((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
891 (test_bit(n, to_send) && last_iter) ||
892 (test_bit(n, to_fix) && last_iter)))
893 continue;
895 /*
896 ** we get here if:
897 ** 1. page is marked to_send & hasn't already been re-dirtied
898 ** 2. (ignore to_skip in last iteration)
899 ** 3. add in pages that still need fixup (net bufs)
900 */
902 pfn_batch[batch] = n;
903 pfn_type[batch] = live_p2m[n];
905 if(!is_mapped(pfn_type[batch])) {
907 /* not currently in pusedo-physical map -- set bit
908 in to_fix that we must send this page in last_iter
909 unless its sent sooner anyhow */
911 set_bit(n, to_fix);
912 if( (iter > 1) && IS_REAL_PFN(n) )
913 DPRINTF("netbuf race: iter %d, pfn %x. mfn %lx\n",
914 iter, n, pfn_type[batch]);
915 continue;
916 }
918 if(last_iter && test_bit(n, to_fix) && !test_bit(n, to_send)) {
919 needed_to_fix++;
920 DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
921 iter,n,pfn_type[batch]);
922 }
924 clear_bit(n, to_fix);
926 batch++;
927 }
929 if (batch == 0)
930 goto skip; /* vanishingly unlikely... */
932 if ((region_base = xc_map_foreign_batch(
933 xc_handle, dom, PROT_READ, pfn_type, batch)) == 0) {
934 ERR("map batch failed");
935 goto out;
936 }
938 if (xc_get_pfn_type_batch(xc_handle, dom, batch, pfn_type)) {
939 ERR("get_pfn_type_batch failed");
940 goto out;
941 }
943 for (j = 0; j < batch; j++) {
945 if ((pfn_type[j] & LTAB_MASK) == XTAB) {
946 DPRINTF("type fail: page %i mfn %08lx\n", j, pfn_type[j]);
947 continue;
948 }
950 if (debug)
951 fprintf(stderr, "%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
952 " sum= %08lx\n",
953 iter,
954 (pfn_type[j] & LTAB_MASK) | pfn_batch[j],
955 pfn_type[j],
956 mfn_to_pfn(pfn_type[j]&(~LTAB_MASK)),
957 csum_page(region_base + (PAGE_SIZE*j)));
959 /* canonicalise mfn->pfn */
960 pfn_type[j] = (pfn_type[j] & LTAB_MASK) | pfn_batch[j];
961 }
963 if(!write_exact(io_fd, &batch, sizeof(unsigned int))) {
964 ERR("Error when writing to state file (2)");
965 goto out;
966 }
968 if(!write_exact(io_fd, pfn_type, sizeof(unsigned long)*j)) {
969 ERR("Error when writing to state file (3)");
970 goto out;
971 }
973 /* entering this loop, pfn_type is now in pfns (Not mfns) */
974 for (j = 0; j < batch; j++) {
976 unsigned long pfn = pfn_type[j] & ~LTAB_MASK;
977 unsigned long pagetype = pfn_type[j] & LTAB_MASK;
978 void *spage = (void *) region_base + (PAGE_SIZE*j);
981 /* write out pages in batch */
982 if (pagetype == XTAB)
983 continue;
985 pagetype &= LTABTYPE_MASK;
987 if (pagetype >= L1TAB && pagetype <= L4TAB) {
989 /* We have a pagetable page: need to rewrite it. */
990 canonicalize_pagetable(pagetype, pfn, spage, page);
992 if (ratewrite(io_fd, page, PAGE_SIZE) != PAGE_SIZE) {
993 ERR("Error when writing to state file (4)");
994 goto out;
995 }
997 } else {
999 /* We have a normal page: just write it directly. */
1000 if (ratewrite(io_fd, spage, PAGE_SIZE) != PAGE_SIZE) {
1001 ERR("Error when writing to state file (5)");
1002 goto out;
1005 } /* end of the write out for this batch */
1007 sent_this_iter += batch;
1009 munmap(region_base, batch*PAGE_SIZE);
1011 } /* end of this while loop for this iteration */
1013 skip:
1015 total_sent += sent_this_iter;
1017 DPRINTF("\r %d: sent %d, skipped %d, ",
1018 iter, sent_this_iter, skip_this_iter );
1020 if (last_iter) {
1021 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
1023 DPRINTF("Total pages sent= %ld (%.2fx)\n",
1024 total_sent, ((float)total_sent)/max_pfn );
1025 DPRINTF("(of which %ld were fixups)\n", needed_to_fix );
1028 if (last_iter && debug){
1029 int minusone = -1;
1030 memset(to_send, 0xff, BITMAP_SIZE);
1031 debug = 0;
1032 fprintf(stderr, "Entering debug resend-all mode\n");
1034 /* send "-1" to put receiver into debug mode */
1035 if(!write_exact(io_fd, &minusone, sizeof(int))) {
1036 ERR("Error when writing to state file (6)");
1037 goto out;
1040 continue;
1043 if (last_iter) break;
1045 if (live) {
1048 if(
1049 ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
1050 (iter >= max_iters) ||
1051 (sent_this_iter+skip_this_iter < 50) ||
1052 (total_sent > max_pfn*max_factor) ) {
1054 DPRINTF("Start last iteration\n");
1055 last_iter = 1;
1057 if (suspend_and_state(xc_handle, io_fd, dom, &info, &ctxt)) {
1058 ERR("Domain appears not to have suspended");
1059 goto out;
1062 DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
1063 info.shared_info_frame,
1064 (unsigned long)ctxt.user_regs.eip,
1065 (unsigned long)ctxt.user_regs.edx);
1068 if (xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_CLEAN,
1069 to_send, max_pfn, &stats ) != max_pfn) {
1070 ERR("Error flushing shadow PT");
1071 goto out;
1074 sent_last_iter = sent_this_iter;
1076 print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
1081 } /* end of while 1 */
1083 DPRINTF("All memory is saved\n");
1085 /* Zero terminate */
1086 i = 0;
1087 if (!write_exact(io_fd, &i, sizeof(int))) {
1088 ERR("Error when writing to state file (6)");
1089 goto out;
1092 /* Send through a list of all the PFNs that were not in map at the close */
1094 unsigned int i,j;
1095 unsigned long pfntab[1024];
1097 for (i = 0, j = 0; i < max_pfn; i++) {
1098 if (!is_mapped(live_p2m[i]))
1099 j++;
1102 if(!write_exact(io_fd, &j, sizeof(unsigned int))) {
1103 ERR("Error when writing to state file (6a)");
1104 goto out;
1107 for (i = 0, j = 0; i < max_pfn; ) {
1109 if (!is_mapped(live_p2m[i]))
1110 pfntab[j++] = i;
1112 i++;
1113 if (j == 1024 || i == max_pfn) {
1114 if(!write_exact(io_fd, &pfntab, sizeof(unsigned long)*j)) {
1115 ERR("Error when writing to state file (6b)");
1116 goto out;
1118 j = 0;
1124 /* Canonicalise the suspend-record frame number. */
1125 if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) ){
1126 ERR("Suspend record is not in range of pseudophys map");
1127 goto out;
1130 /* Canonicalise each GDT frame number. */
1131 for ( i = 0; i < ctxt.gdt_ents; i += 512 ) {
1132 if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
1133 ERR("GDT frame is not in range of pseudophys map");
1134 goto out;
1138 /* Canonicalise the page table base pointer. */
1139 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.ctrlreg[3] >> PAGE_SHIFT) ) {
1140 ERR("PT base is not in range of pseudophys map");
1141 goto out;
1143 ctxt.ctrlreg[3] = mfn_to_pfn(ctxt.ctrlreg[3] >> PAGE_SHIFT) <<
1144 PAGE_SHIFT;
1146 if (!write_exact(io_fd, &ctxt, sizeof(ctxt)) ||
1147 !write_exact(io_fd, live_shinfo, PAGE_SIZE)) {
1148 ERR("Error when writing to state file (1)");
1149 goto out;
1152 /* Success! */
1153 rc = 0;
1155 out:
1157 if (live) {
1158 if(xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_OFF,
1159 NULL, 0, NULL ) < 0) {
1160 DPRINTF("Warning - couldn't disable shadow mode");
1164 if (live_shinfo)
1165 munmap(live_shinfo, PAGE_SIZE);
1167 if (live_p2m_frame_list)
1168 munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
1170 if(live_p2m)
1171 munmap(live_p2m, P2M_SIZE);
1173 if(live_m2p)
1174 munmap(live_m2p, M2P_SIZE(max_mfn));
1176 free(pfn_type);
1177 free(pfn_batch);
1178 free(to_send);
1179 free(to_fix);
1180 free(to_skip);
1182 DPRINTF("Save exit rc=%d\n",rc);
1184 return !!rc;
1187 /*
1188 * Local variables:
1189 * mode: C
1190 * c-set-style: "BSD"
1191 * c-basic-offset: 4
1192 * tab-width: 4
1193 * indent-tabs-mode: nil
1194 * End:
1195 */