direct-io.hg

view tools/libxc/xc_linux_save.c @ 7703:539b2757642e

Fix 64-bit build.

Signed-off-by: Steven Hand <steven@xensource.com>
Author:   smh22@firebug.cl.cam.ac.uk
Date:     Wed Nov 09 09:02:59 2005 +0100 (2005-11-09)
Parents:  b3c2bc39d815
Children: fe3a892b33b4
line source
1 /******************************************************************************
2 * xc_linux_save.c
3 *
4 * Save the state of a running Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <inttypes.h>
10 #include <time.h>
11 #include <stdlib.h>
12 #include <unistd.h>
13 #include <sys/time.h>
15 #include "xg_private.h"
16 #include "xg_save_restore.h"
/*
** Default values for important tuning parameters. Can override by passing
** non-zero replacement values to xc_linux_save().
**
** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
**
*/
#define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */
#define DEF_MAX_FACTOR 3 /* never send more than 3x nr_pfns */

/* max mfn of the whole machine (filled by get_platform_info() in
   xc_linux_save()) */
static uint32_t max_mfn;

/* virtual starting address of the hypervisor (also from get_platform_info) */
static uint32_t hvirt_start;

/* #levels of page tables used by the current guest (2 = non-PAE, 3 = PAE) */
static uint32_t pt_levels;

/* total number of pages used by the current guest
   (read from live_shinfo->arch.max_pfn in xc_linux_save()) */
static unsigned long max_pfn;

/* Live mapping of the table mapping each PFN to its current MFN. */
static unsigned long *live_p2m = NULL;

/* Live mapping of system MFN to PFN table. */
static unsigned long *live_m2p = NULL;
/*
 * Returns TRUE if the given machine frame number has a unique mapping
 * in the guest's pseudophysical map.
 *
 * NOTE: a function-like macro -- _mfn is evaluated several times, so do
 * not pass an expression with side effects.  Relies on the live_m2p and
 * live_p2m global mappings being set up.
 */
#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \
    (((_mfn) < (max_mfn)) && \
     ((live_m2p[_mfn] < (max_pfn)) && \
      (live_p2m[live_m2p[_mfn]] == (_mfn))))

/* Returns TRUE if MFN is successfully converted to a PFN.
 * On success the value pointed at by _pmfn is rewritten in place from a
 * machine frame number to the corresponding pseudo-physical frame number;
 * on failure it is left untouched.  (GNU statement-expression macro.) */
#define translate_mfn_to_pfn(_pmfn) \
({ \
    unsigned long mfn = *(_pmfn); \
    int _res = 1; \
    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) \
        _res = 0; \
    else \
        *(_pmfn) = live_m2p[mfn]; \
    _res; \
})

/*
** During (live) save/migrate, we maintain a number of bitmaps to track
** which pages we have to send, to fixup, and to skip.
*/
#define BITS_PER_LONG (sizeof(unsigned long) * 8)
/* NOTE: number of longs, not bytes -- #undef'd and redefined (as a byte
   count) in xc_linux_save() before any use. */
#define BITMAP_SIZE ((max_pfn + BITS_PER_LONG - 1) / BITS_PER_LONG)

/* The unsigned long that holds bit _nr of bitmap _bmap. */
#define BITMAP_ENTRY(_nr,_bmap) \
    ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]

/* Position of bit _nr within its unsigned long. */
#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)

/* Test bit 'nr' in the bitmap at 'addr'. */
static inline int test_bit (int nr, volatile void * addr)
{
    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
}

/* Clear bit 'nr' in the bitmap at 'addr'. */
static inline void clear_bit (int nr, volatile void * addr)
{
    /* 1UL, not 1: shifting an int by >= 32 is undefined behaviour and
       breaks on 64-bit longs (BITMAP_SHIFT can reach 63). */
    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
}

/* Set bit 'nr' in the bitmap at 'addr'. */
static inline void set_bit ( int nr, volatile void * addr)
{
    /* 1UL for the same 64-bit reason as clear_bit(). */
    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
}
/* Returns the hamming weight (i.e. the number of bits set) in a N-bit word */
static inline unsigned int hweight32(unsigned int w)
{
    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
}

/* Count the set bits in the first 'nr' bits of the bitmap at 'addr'.
 * 'nr' is rounded down to a whole number of unsigned longs. */
static inline int count_bits ( int nr, volatile void *addr)
{
    int i, count = 0;
    unsigned long *p = (unsigned long *)addr;
    /* We know that the array is padded to unsigned long. */
    for( i = 0; i < (int)(nr / (sizeof(unsigned long)*8)); i++, p++ ) {
        unsigned long w = *p;
        /* Count both 32-bit halves: feeding the whole long to
           hweight32() silently dropped the upper 32 bits on 64-bit
           builds.  (w >> 16) >> 16 is 0 when long is 32 bits and the
           high word when it is 64, without an undefined 32-bit shift. */
        count += hweight32((unsigned int)w);
        count += hweight32((unsigned int)((w >> 16) >> 16));
    }
    return count;
}
/*
 * Pseudo-random permutation of the page index 'i' in [0, nr).
 *
 * Scanning pages in a shuffled order gives a better estimate of the
 * domain's dirtying rate as we go: contiguous pfn ranges tend to behave
 * alike, so we mix them up.  The permutation is a bit-rotate of the
 * order_nr-bit index: the bottom (order_nr - 10) bits move to the top
 * and the remaining 10 bits drop to the bottom.
 *
 * e.g. nr->order: 15->4 16->4 17->5; a 512MB domain has 128k pages,
 * order 17:
 *
 *   QPONMLKJIHGFEDCBA  ->  EDCBA | QPONMLKJIHGF... rotated as below:
 *   QPONMLKJIHGFEDCBA
 *           EDCBA
 *               QPONM
 *        LKJIHGF
 */
static inline int permute( int i, int nr, int order_nr )
{
    const int mask = (1 << order_nr) - 1;
    const int down = order_nr - 10;

    /* Rotate until the result lands inside [0, nr); when nr is an exact
       power of two this never iterates more than once. */
    do {
        i = ((i >> down) | (i << 10)) & mask;
    } while ( i >= nr );

    return i;
}
/* Convert a struct timeval to microseconds.
 * Widen tv_sec before the multiply: with a 32-bit time_t,
 * tv_sec * 1000000 overflows long (undefined behaviour). */
static uint64_t tv_to_us(struct timeval *new)
{
    return ((uint64_t)new->tv_sec * 1000000ULL) + (uint64_t)new->tv_usec;
}
/* Current wall-clock time in microseconds.
 * Computes the conversion with 64-bit arithmetic directly: going through
 * the old tv_to_us() overflowed on platforms with a 32-bit time_t. */
static uint64_t llgettimeofday(void)
{
    struct timeval now;
    gettimeofday(&now, NULL);
    return ((uint64_t)now.tv_sec * 1000000ULL) + (uint64_t)now.tv_usec;
}
/* Microseconds elapsed from 'old' to 'new'.
 * The subtraction/multiply is done in 64 bits: the previous long
 * arithmetic overflowed for large tv_sec values on 32-bit builds.
 * Callers are expected to pass new >= old. */
static uint64_t tv_delta(struct timeval *new, struct timeval *old)
{
    return (uint64_t)((((int64_t)new->tv_sec - old->tv_sec) * 1000000)
                      + (new->tv_usec - old->tv_usec));
}
#ifdef ADAPTIVE_SAVE

/*
** We control the rate at which we transmit (or save) to minimize impact
** on running domains (including the target if we're doing live migrate).
*/

#define MAX_MBIT_RATE 500 /* maximum transmit rate for migrate */
#define START_MBIT_RATE 100 /* initial transmit rate for migrate */

/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
#define RATE_TO_BTU 781250

/* Amount in bytes we allow ourselves to send in a burst */
#define BURST_BUDGET (100*1024)

/* We keep track of the current and previous transmission rate */
static int mbit_rate, ombit_rate = 0;

/* Have we reached the maximum transmission rate? */
#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)

/* Reset the limiter to its initial transmit rate; called once at the
   start of xc_linux_save(). */
static inline void initialize_mbit_rate()
{
    mbit_rate = START_MBIT_RATE;
}
/*
 * Rate-limited write(): a token-bucket limiter over write(io_fd, buf, n).
 *
 * 'budget' is the number of bytes we may still send in the current burst
 * slot; it is debited by every call and refilled by BURST_BUDGET for each
 * burst_time_us of wall-clock time that has elapsed since 'last_put'.
 * When the budget is exhausted we nanosleep() for the remainder of the
 * slot.  All state is static, so this is single-caller only.
 *
 * Returns whatever write() returns for the final transfer.
 */
static int ratewrite(int io_fd, void *buf, int n)
{
    static int budget = 0;                  /* bytes left in current burst */
    static int burst_time_us = -1;          /* length of one burst slot */
    static struct timeval last_put = { 0 }; /* start of current slot */
    struct timeval now;
    struct timespec delay;
    long long delta;

    /* Rate limiting disabled at compile time: plain write(). */
    if (START_MBIT_RATE == 0)
        return write(io_fd, buf, n);

    budget -= n;
    if (budget < 0) {
        /* Rate was changed (by print_stats) -- recompute the slot time. */
        if (mbit_rate != ombit_rate) {
            burst_time_us = RATE_TO_BTU / mbit_rate;
            ombit_rate = mbit_rate;
            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
                    mbit_rate, BURST_BUDGET, burst_time_us);
        }
        if (last_put.tv_sec == 0) {
            /* First ever call: open the first slot now. */
            budget += BURST_BUDGET;
            gettimeofday(&last_put, NULL);
        } else {
            while (budget < 0) {
                gettimeofday(&now, NULL);
                delta = tv_delta(&now, &last_put);
                /* Credit one BURST_BUDGET per whole slot elapsed. */
                while (delta > burst_time_us) {
                    budget += BURST_BUDGET;
                    last_put.tv_usec += burst_time_us;
                    /* NOTE(review): '>' looks like it should be '>=' for
                       exact usec normalization -- confirm before changing. */
                    if (last_put.tv_usec > 1000000) {
                        last_put.tv_usec -= 1000000;
                        last_put.tv_sec++;
                    }
                    delta -= burst_time_us;
                }
                if (budget > 0)
                    break;
                /* Sleep out the rest of the slot; nanosleep() may be
                   interrupted, in which case it updates 'delay' and we
                   retry with the remaining time. */
                delay.tv_sec = 0;
                delay.tv_nsec = 1000 * (burst_time_us - delta);
                while (delay.tv_nsec > 0)
                    if (nanosleep(&delay, &delay) == 0)
                        break;
            }
        }
    }
    return write(io_fd, buf, n);
}
#else /* ! ADAPTIVE SAVE */

/* Without ADAPTIVE_SAVE: no throttling -- ratewrite is a plain write(),
   the rate is never "max", and there is nothing to initialize. */
#define RATE_IS_MAX() (0)
#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
#define initialize_mbit_rate()

#endif
/* Write exactly 'count' bytes from 'buf' to 'fd'.
 * Retries on partial writes and EINTR: a single write() may legally
 * transfer fewer bytes than requested (e.g. on pipes and sockets), which
 * the old code treated as failure.
 * Returns 1 on success, 0 on failure (original contract preserved). */
static inline ssize_t write_exact(int fd, void *buf, size_t count)
{
    size_t done = 0;

    while (done < count) {
        ssize_t n = write(fd, (char *)buf + done, count - done);
        if (n < 0) {
            if (errno == EINTR)
                continue;       /* interrupted before any transfer: retry */
            return 0;
        }
        if (n == 0)
            return 0;           /* no progress possible */
        done += (size_t)n;
    }
    return 1;
}
/*
 * Report progress for one iteration: wall-clock delta, dom0 and target
 * CPU usage, transmit and dirtying rates.  Static locals carry the
 * previous sample so each call reports deltas since the last call.
 * When ADAPTIVE_SAVE is on, also raises mbit_rate to track the observed
 * dirtying rate (capped at MAX_MBIT_RATE).
 *
 * @pages_sent  pages written since the previous call.
 * @print       non-zero to emit the summary line to stderr; with zero the
 *              call just (re)initializes the static baselines.
 * Always returns 0.
 */
static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
                       xc_shadow_control_stats_t *stats, int print)
{
    /* Baselines from the previous invocation. */
    static struct timeval wall_last;
    static long long d0_cpu_last;
    static long long d1_cpu_last;

    struct timeval wall_now;
    long long wall_delta;
    long long d0_cpu_now, d0_cpu_delta;
    long long d1_cpu_now, d1_cpu_delta;

    gettimeofday(&wall_now, NULL);

    /* CPU usage of dom0 and of the domain being saved (vcpu 0 only). */
    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;

    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
        fprintf(stderr, "ARRHHH!!\n");

    wall_delta = tv_delta(&wall_now,&wall_last)/1000;   /* in ms */

    if (wall_delta == 0) wall_delta = 1;  /* guard the divisions below */

    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;

    if (print)
        fprintf(stderr,
                "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
                "dirtied %dMb/s %" PRId32 " pages\n",
                wall_delta,
                (int)((d0_cpu_delta*100)/wall_delta),
                (int)((d1_cpu_delta*100)/wall_delta),
                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
                stats->dirty_count);

#ifdef ADAPTIVE_SAVE
    /* Follow the observed dirtying rate upwards (never downwards here). */
    if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
        mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
            + 50;
        if (mbit_rate > MAX_MBIT_RATE)
            mbit_rate = MAX_MBIT_RATE;
    }
#endif

    /* Save this sample as the baseline for the next call. */
    d0_cpu_last = d0_cpu_now;
    d1_cpu_last = d1_cpu_now;
    wall_last   = wall_now;

    return 0;
}
/*
 * Debug/measurement aid: for each of 'runs' cycles, clean the shadow
 * log-dirty bitmap and then sample ("peek") the dirtying statistics 40
 * times at 50ms intervals, printing the counters to stderr.
 *
 * @arr/@max_pfn are passed to the CLEAN operation as the bitmap to fill.
 * @runs number of clean-then-sample cycles; 0 makes this a no-op.
 *
 * NOTE(review): always returns -1; the one caller in xc_linux_save()
 * ignores the result.
 */
static int analysis_phase(int xc_handle, uint32_t domid, int max_pfn,
                          unsigned long *arr, int runs)
{
    long long start, now;
    xc_shadow_control_stats_t stats;
    int j;

    start = llgettimeofday();

    for (j = 0; j < runs; j++) {
        int i;

        /* Reset the dirty tracking... */
        xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_CLEAN,
                          arr, max_pfn, NULL);
        fprintf(stderr, "#Flush\n");
        /* ...then watch the counters accumulate for ~2 seconds. */
        for ( i = 0; i < 40; i++ ) {
            usleep(50000);
            now = llgettimeofday();
            xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_PEEK,
                              NULL, 0, &stats);

            fprintf(stderr, "now= %lld faults= %" PRId32 " dirty= %" PRId32
                    " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n",
                    ((now-start)+500)/1000,
                    stats.fault_count, stats.dirty_count,
                    stats.dirty_net_count, stats.dirty_block_count);
        }
    }

    return -1;
}
/*
 * Ask the controlling process (over stdout/stdin) to suspend the domain,
 * then poll until the domain is actually observed in the suspended state.
 *
 * On success @info and @ctxt hold fresh domain info and the vcpu0 context.
 * Returns 0 on success, -1 on failure.
 */
static int suspend_and_state(int xc_handle, int io_fd, int dom,
                             xc_dominfo_t *info,
                             vcpu_guest_context_t *ctxt)
{
    int i = 0;          /* plain "not suspended yet" retries (capped at 100) */
    char ans[30];

    /* Handshake: emit "suspend", expect "done" back on stdin. */
    printf("suspend\n");
    fflush(stdout);
    if (fgets(ans, sizeof(ans), stdin) == NULL) {
        ERR("failed reading suspend reply");
        return -1;
    }
    if (strncmp(ans, "done\n", 5)) {
        ERR("suspend reply incorrect: %s", ans);
        return -1;
    }

retry:

    if (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) {
        ERR("Could not get domain info");
        return -1;
    }

    /* Failure here is logged but not fatal -- we retest below anyway. */
    if ( xc_domain_get_vcpu_context(xc_handle, dom, 0 /* XXX */, ctxt))
        ERR("Could not get vcpu context");

    if (info->shutdown && info->shutdown_reason == SHUTDOWN_suspend)
        return 0; // success

    if (info->paused) {
        // try unpausing domain, wait, and retest
        xc_domain_unpause( xc_handle, dom );

        ERR("Domain was paused. Wait and re-test.");
        usleep(10000); // 10ms

        /* NOTE(review): this path does not bump 'i', so a perpetually
           paused domain loops forever here. */
        goto retry;
    }

    if( ++i < 100 ) {
        ERR("Retry suspend domain.");
        usleep(10000); // 10ms
        goto retry;
    }

    ERR("Unable to suspend domain.");

    return -1;
}
/*
** During transfer (or in the state file), all page-table pages must be
** converted into a 'canonical' form where references to actual mfns
** are replaced with references to the corresponding pfns.
**
** This function performs the appropriate conversion, taking into account
** which entries do not require canonicalization (in particular, those
** entries which map the virtual address reserved for the hypervisor).
**
** @type   page-table type of the page (L1TAB..L4TAB, from pfn_type[]).
** @pfn    pseudo-physical frame number of the page-table page itself.
** @spage  the live source page (read only).
** @dpage  destination buffer that receives the canonicalized copy.
*/
void canonicalize_pagetable(unsigned long type, unsigned long pfn,
                            const void *spage, void *dpage)
{

    int i, pte_last, xen_start, xen_end;
    uint64_t pte;

    /*
    ** We need to determine which entries in this page table hold
    ** reserved hypervisor mappings. This depends on the current
    ** page table type as well as the number of paging levels.
    */
    /* 2-level guests have 4-byte PTEs (1024/page); 3-level (PAE) 8-byte. */
    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8);

    /* Non-PAE L2: everything mapping hvirt_start and above is Xen's. */
    if (pt_levels == 2 && type == L2TAB)
        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);

    /* PAE L3: all entries beyond the architected 4 are zapped. */
    if (pt_levels == 3 && type == L3TAB)
        xen_start = L3_PAGETABLE_ENTRIES_PAE;

    /*
    ** in PAE only the L2 mapping the top 1GB contains Xen mappings.
    ** We can spot this by looking for the guest linear mapping which
    ** Xen always ensures is present in that L2. Guests must ensure
    ** that this check will fail for other L2s.
    */
    if (pt_levels == 3 && type == L2TAB) {

        /* XXX index of the L2 entry in PAE mode which holds the guest LPT */
#define PAE_GLPT_L2ENTRY (495)
        pte = ((uint64_t*)spage)[PAE_GLPT_L2ENTRY];

        /* Is the linear-map slot pointing back at this very page? */
        if(((pte >> PAGE_SHIFT) & 0x0fffffff) == live_p2m[pfn])
            xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
    }

    /* Now iterate through the page table, canonicalizing each PTE */
    for (i = 0; i < pte_last; i++ ) {

        /* NOTE(review): this 'pfn' shadows the function parameter above. */
        unsigned long pfn, mfn;

        if (pt_levels == 2)
            pte = ((uint32_t*)spage)[i];
        else
            pte = ((uint64_t*)spage)[i];

        /* Zap entries in the reserved hypervisor window. */
        if (i >= xen_start && i < xen_end)
            pte = 0;

        if (pte & _PAGE_PRESENT) {

            mfn = (pte >> PAGE_SHIFT) & 0xfffffff;
            pfn = live_m2p[mfn];

            if (!MFN_IS_IN_PSEUDOPHYS_MAP(mfn)) {
                /* I don't think this should ever happen */
                DPRINTF("FNI: [%08lx,%d] pte=%llx,"
                        " mfn=%08lx, pfn=%08lx [mfn]=%08lx\n",
                        type, i, (unsigned long long)pte, mfn,
                        live_m2p[mfn],
                        (live_m2p[mfn] < max_pfn) ?
                        live_p2m[live_m2p[mfn]] : 0xdeadbeaf);

                pfn = 0; /* be suspicious */
            }

            /* Keep the flag bits (low 12) and high reserved/NX bits,
               substitute the pfn for the mfn in the address field. */
            pte &= 0xffffff0000000fffULL;
            pte |= (uint64_t)pfn << PAGE_SHIFT;
        }

        if (pt_levels == 2)
            ((uint32_t*)dpage)[i] = pte;
        else
            ((uint64_t*)dpage)[i] = pte;

    }

    return;
}
504 int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
505 uint32_t max_factor, uint32_t flags)
506 {
507 xc_dominfo_t info;
509 int rc = 1, i, j, last_iter, iter = 0;
510 int live = (flags & XCFLAGS_LIVE);
511 int debug = (flags & XCFLAGS_DEBUG);
512 int sent_last_iter, skip_this_iter;
514 /* The new domain's shared-info frame number. */
515 unsigned long shared_info_frame;
517 /* A copy of the CPU context of the guest. */
518 vcpu_guest_context_t ctxt;
520 /* A table containg the type of each PFN (/not/ MFN!). */
521 unsigned long *pfn_type = NULL;
522 unsigned long *pfn_batch = NULL;
524 /* A temporary mapping, and a copy, of one frame of guest memory. */
525 char page[PAGE_SIZE];
527 /* Double and single indirect references to the live P2M table */
528 unsigned long *live_p2m_frame_list_list = NULL;
529 unsigned long *live_p2m_frame_list = NULL;
531 /* A copy of the pfn-to-mfn table frame list. */
532 unsigned long *p2m_frame_list = NULL;
534 unsigned long m2p_start_mfn;
536 /* Live mapping of shared info structure */
537 shared_info_t *live_shinfo = NULL;
539 /* base of the region in which domain memory is mapped */
540 unsigned char *region_base = NULL;
544 /* power of 2 order of max_pfn */
545 int order_nr;
547 /* bitmap of pages:
548 - that should be sent this iteration (unless later marked as skip);
549 - to skip this iteration because already dirty;
550 - to fixup by sending at the end if not already resent; */
551 unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
553 xc_shadow_control_stats_t stats;
555 unsigned long needed_to_fix = 0;
556 unsigned long total_sent = 0;
559 /* If no explicit control parameters given, use defaults */
560 if(!max_iters)
561 max_iters = DEF_MAX_ITERS;
562 if(!max_factor)
563 max_factor = DEF_MAX_FACTOR;
565 initialize_mbit_rate();
567 DPRINTF("xc_linux_save start DOM%u live=%s\n", dom, live ?
568 "true" : "false");
570 if(!get_platform_info(xc_handle, dom,
571 &max_mfn, &hvirt_start, &pt_levels)) {
572 ERR("Unable to get platform info.");
573 return 1;
574 }
576 if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
577 ERR("Could not get domain info");
578 return 1;
579 }
581 if (mlock(&ctxt, sizeof(ctxt))) {
582 ERR("Unable to mlock ctxt");
583 return 1;
584 }
586 /* Only have to worry about vcpu 0 even for SMP */
587 if (xc_domain_get_vcpu_context(xc_handle, dom, 0, &ctxt)) {
588 ERR("Could not get vcpu context");
589 goto out;
590 }
591 shared_info_frame = info.shared_info_frame;
593 /* A cheesy test to see whether the domain contains valid state. */
594 if (ctxt.ctrlreg[3] == 0)
595 {
596 ERR("Domain is not in a valid Linux guest OS state");
597 goto out;
598 }
600 /* cheesy sanity check */
601 if ((info.max_memkb >> (PAGE_SHIFT - 10)) > max_mfn) {
602 ERR("Invalid state record -- pfn count out of range: %lu",
603 (info.max_memkb >> (PAGE_SHIFT - 10)));
604 goto out;
605 }
607 /* Map the shared info frame */
608 if(!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
609 PROT_READ, shared_info_frame))) {
610 ERR("Couldn't map live_shinfo");
611 goto out;
612 }
614 max_pfn = live_shinfo->arch.max_pfn;
616 live_p2m_frame_list_list =
617 xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
618 live_shinfo->arch.pfn_to_mfn_frame_list_list);
620 if (!live_p2m_frame_list_list) {
621 ERR("Couldn't map p2m_frame_list_list");
622 goto out;
623 }
625 live_p2m_frame_list =
626 xc_map_foreign_batch(xc_handle, dom, PROT_READ,
627 live_p2m_frame_list_list,
628 P2M_FLL_ENTRIES);
630 if (!live_p2m_frame_list) {
631 ERR("Couldn't map p2m_frame_list");
632 goto out;
633 }
635 /* Map all the frames of the pfn->mfn table. For migrate to succeed,
636 the guest must not change which frames are used for this purpose.
637 (its not clear why it would want to change them, and we'll be OK
638 from a safety POV anyhow. */
640 live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
641 live_p2m_frame_list,
642 P2M_FL_ENTRIES);
644 if (!live_p2m) {
645 ERR("Couldn't map p2m table");
646 goto out;
647 }
649 /* Setup the mfn_to_pfn table mapping */
650 m2p_start_mfn = xc_get_m2p_start_mfn(xc_handle);
651 live_m2p = xc_map_foreign_range(xc_handle, DOMID_XEN, M2P_SIZE,
652 PROT_READ, m2p_start_mfn);
654 /* Get a local copy fo the live_P2M_frame_list */
655 if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
656 ERR("Couldn't allocate p2m_frame_list array");
657 goto out;
658 }
659 memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);
661 /* Canonicalise the pfn-to-mfn table frame-number list. */
662 for (i = 0; i < max_pfn; i += ulpp) {
663 if (!translate_mfn_to_pfn(&p2m_frame_list[i/ulpp])) {
664 ERR("Frame# in pfn-to-mfn frame list is not in pseudophys");
665 goto out;
666 }
667 }
669 /* Domain is still running at this point */
671 if (live) {
673 if (xc_shadow_control(xc_handle, dom,
674 DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
675 NULL, 0, NULL ) < 0) {
676 ERR("Couldn't enable shadow mode");
677 goto out;
678 }
680 last_iter = 0;
682 } else {
684 /* This is a non-live suspend. Issue the call back to get the
685 domain suspended */
687 last_iter = 1;
689 if (suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt)) {
690 ERR("Domain appears not to have suspended");
691 goto out;
692 }
694 }
696 #if 0
697 sent_last_iter = 0xFFFFFFFF; /* Pretend we sent a /lot/ last time */
698 #else
699 sent_last_iter = 1 << 20;
700 #endif
703 /* calculate the power of 2 order of max_pfn, e.g.
704 15->4 16->4 17->5 */
705 for (i = max_pfn-1, order_nr = 0; i ; i >>= 1, order_nr++)
706 continue;
708 #undef BITMAP_SIZE
709 #define BITMAP_SIZE ((1<<20)/8)
711 /* Setup to_send / to_fix and to_skip bitmaps */
712 to_send = malloc(BITMAP_SIZE);
713 to_fix = calloc(1, BITMAP_SIZE);
714 to_skip = malloc(BITMAP_SIZE);
716 if (!to_send || !to_fix || !to_skip) {
717 ERR("Couldn't allocate to_send array");
718 goto out;
719 }
721 memset(to_send, 0xff, BITMAP_SIZE);
723 if (mlock(to_send, BITMAP_SIZE)) {
724 ERR("Unable to mlock to_send");
725 return 1;
726 }
728 /* (to fix is local only) */
729 if (mlock(to_skip, BITMAP_SIZE)) {
730 ERR("Unable to mlock to_skip");
731 return 1;
732 }
734 analysis_phase(xc_handle, dom, max_pfn, to_skip, 0);
736 /* We want zeroed memory so use calloc rather than malloc. */
737 pfn_type = calloc(MAX_BATCH_SIZE, sizeof(unsigned long));
738 pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(unsigned long));
740 if ((pfn_type == NULL) || (pfn_batch == NULL)) {
741 errno = ENOMEM;
742 goto out;
743 }
745 if (mlock(pfn_type, MAX_BATCH_SIZE * sizeof(unsigned long))) {
746 ERR("Unable to mlock");
747 goto out;
748 }
751 /*
752 * Quick belt and braces sanity check.
753 */
754 {
755 int err=0;
756 unsigned long mfn;
757 for (i = 0; i < max_pfn; i++) {
759 mfn = live_p2m[i];
760 if((live_m2p[mfn] != i) && (mfn != 0xffffffffUL)) {
761 DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
762 mfn, live_m2p[mfn]);
763 err++;
764 }
765 }
766 DPRINTF("Had %d unexplained entries in p2m table\n", err);
767 }
770 /* Start writing out the saved-domain record. */
772 if(!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) {
773 ERR("write: max_pfn");
774 goto out;
775 }
777 if(!write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) {
778 ERR("write: p2m_frame_list");
779 goto out;
780 }
782 print_stats(xc_handle, dom, 0, &stats, 0);
784 /* Now write out each data page, canonicalising page tables as we go... */
786 while(1) {
788 unsigned int prev_pc, sent_this_iter, N, batch;
790 iter++;
791 sent_this_iter = 0;
792 skip_this_iter = 0;
793 prev_pc = 0;
794 N=0;
796 DPRINTF("Saving memory pages: iter %d 0%%", iter);
798 while( N < max_pfn ){
800 unsigned int this_pc = (N * 100) / max_pfn;
802 if ((this_pc - prev_pc) >= 5) {
803 DPRINTF("\b\b\b\b%3d%%", this_pc);
804 prev_pc = this_pc;
805 }
807 /* slightly wasteful to peek the whole array evey time,
808 but this is fast enough for the moment. */
809 if (!last_iter && xc_shadow_control(
810 xc_handle, dom, DOM0_SHADOW_CONTROL_OP_PEEK,
811 to_skip, max_pfn, NULL) != max_pfn) {
812 ERR("Error peeking shadow bitmap");
813 goto out;
814 }
817 /* load pfn_type[] with the mfn of all the pages we're doing in
818 this batch. */
819 for (batch = 0; batch < MAX_BATCH_SIZE && N < max_pfn ; N++) {
821 int n = permute(N, max_pfn, order_nr);
823 if (debug) {
824 DPRINTF("%d pfn= %08lx mfn= %08lx %d [mfn]= %08lx\n",
825 iter, (unsigned long)n, live_p2m[n],
826 test_bit(n, to_send),
827 live_m2p[live_p2m[n]&0xFFFFF]);
828 }
830 if (!last_iter && test_bit(n, to_send)&& test_bit(n, to_skip))
831 skip_this_iter++; /* stats keeping */
833 if (!((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
834 (test_bit(n, to_send) && last_iter) ||
835 (test_bit(n, to_fix) && last_iter)))
836 continue;
838 /*
839 ** we get here if:
840 ** 1. page is marked to_send & hasn't already been re-dirtied
841 ** 2. (ignore to_skip in last iteration)
842 ** 3. add in pages that still need fixup (net bufs)
843 */
845 pfn_batch[batch] = n;
846 pfn_type[batch] = live_p2m[n];
848 if(!is_mapped(pfn_type[batch])) {
850 /* not currently in pusedo-physical map -- set bit
851 in to_fix that we must send this page in last_iter
852 unless its sent sooner anyhow */
854 set_bit(n, to_fix);
855 if(iter > 1)
856 DPRINTF("netbuf race: iter %d, pfn %x. mfn %lx\n",
857 iter, n, pfn_type[batch]);
858 continue;
859 }
861 if(last_iter && test_bit(n, to_fix) && !test_bit(n, to_send)) {
862 needed_to_fix++;
863 DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
864 iter,n,pfn_type[batch]);
865 }
867 clear_bit(n, to_fix);
869 batch++;
870 }
872 if (batch == 0)
873 goto skip; /* vanishingly unlikely... */
875 if ((region_base = xc_map_foreign_batch(
876 xc_handle, dom, PROT_READ, pfn_type, batch)) == 0) {
877 ERR("map batch failed");
878 goto out;
879 }
881 if (xc_get_pfn_type_batch(xc_handle, dom, batch, pfn_type)) {
882 ERR("get_pfn_type_batch failed");
883 goto out;
884 }
886 for (j = 0; j < batch; j++) {
888 if ((pfn_type[j] & LTAB_MASK) == XTAB) {
889 DPRINTF("type fail: page %i mfn %08lx\n", j, pfn_type[j]);
890 continue;
891 }
893 if (debug)
894 fprintf(stderr, "%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
895 " sum= %08lx\n",
896 iter,
897 (pfn_type[j] & LTAB_MASK) | pfn_batch[j],
898 pfn_type[j],
899 live_m2p[pfn_type[j]&(~LTAB_MASK)],
900 csum_page(region_base + (PAGE_SIZE*j)));
902 /* canonicalise mfn->pfn */
903 pfn_type[j] = (pfn_type[j] & LTAB_MASK) | pfn_batch[j];
904 }
906 if(!write_exact(io_fd, &batch, sizeof(unsigned int))) {
907 ERR("Error when writing to state file (2)");
908 goto out;
909 }
911 if(!write_exact(io_fd, pfn_type, sizeof(unsigned long)*j)) {
912 ERR("Error when writing to state file (3)");
913 goto out;
914 }
916 /* entering this loop, pfn_type is now in pfns (Not mfns) */
917 for (j = 0; j < batch; j++) {
919 unsigned long pfn = pfn_type[j] & ~LTAB_MASK;
920 unsigned long pagetype = pfn_type[j] & LTAB_MASK;
921 void *spage = (void *) region_base + (PAGE_SIZE*j);
924 /* write out pages in batch */
925 if (pagetype == XTAB) {
926 DPRINTF("SKIP BOGUS page %i mfn %08lx\n", j, pfn_type[j]);
927 continue;
928 }
930 pagetype &= LTABTYPE_MASK;
932 if (pagetype >= L1TAB && pagetype <= L4TAB) {
934 /* We have a pagetable page: need to rewrite it. */
935 canonicalize_pagetable(pagetype, pfn, spage, page);
937 if (ratewrite(io_fd, page, PAGE_SIZE) != PAGE_SIZE) {
938 ERR("Error when writing to state file (4)");
939 goto out;
940 }
942 } else {
944 /* We have a normal page: just write it directly. */
945 if (ratewrite(io_fd, spage, PAGE_SIZE) != PAGE_SIZE) {
946 ERR("Error when writing to state file (5)");
947 goto out;
948 }
949 }
950 } /* end of the write out for this batch */
952 sent_this_iter += batch;
954 } /* end of this while loop for this iteration */
956 munmap(region_base, batch*PAGE_SIZE);
958 skip:
960 total_sent += sent_this_iter;
962 DPRINTF("\r %d: sent %d, skipped %d, ",
963 iter, sent_this_iter, skip_this_iter );
965 if (last_iter) {
966 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
968 DPRINTF("Total pages sent= %ld (%.2fx)\n",
969 total_sent, ((float)total_sent)/max_pfn );
970 DPRINTF("(of which %ld were fixups)\n", needed_to_fix );
971 }
973 if (last_iter && debug){
974 int minusone = -1;
975 memset( to_send, 0xff, (max_pfn+8)/8 );
976 debug = 0;
977 fprintf(stderr, "Entering debug resend-all mode\n");
979 /* send "-1" to put receiver into debug mode */
980 if(!write_exact(io_fd, &minusone, sizeof(int))) {
981 ERR("Error when writing to state file (6)");
982 goto out;
983 }
985 continue;
986 }
988 if (last_iter) break;
990 if (live) {
993 if(
994 ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
995 (iter >= max_iters) ||
996 (sent_this_iter+skip_this_iter < 50) ||
997 (total_sent > max_pfn*max_factor) ) {
999 DPRINTF("Start last iteration\n");
1000 last_iter = 1;
1002 if (suspend_and_state(xc_handle, io_fd, dom, &info, &ctxt)) {
1003 ERR("Domain appears not to have suspended");
1004 goto out;
1007 DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
1008 info.shared_info_frame,
1009 (unsigned long)ctxt.user_regs.eip,
1010 (unsigned long)ctxt.user_regs.edx);
1013 if (xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_CLEAN,
1014 to_send, max_pfn, &stats ) != max_pfn) {
1015 ERR("Error flushing shadow PT");
1016 goto out;
1019 sent_last_iter = sent_this_iter;
1021 print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
1026 } /* end of while 1 */
1028 DPRINTF("All memory is saved\n");
1030 /* Success! */
1031 rc = 0;
1033 /* ^^^^^^ XXX SMH: hmm.. not sure that's really success! */
1035 /* Zero terminate */
1036 if (!write_exact(io_fd, &rc, sizeof(int))) {
1037 ERR("Error when writing to state file (6)");
1038 goto out;
1041 /* Send through a list of all the PFNs that were not in map at the close */
1043 unsigned int i,j;
1044 unsigned long pfntab[1024];
1046 for ( i = 0, j = 0; i < max_pfn; i++ ) {
1047 if ( ! is_mapped(live_p2m[i]) )
1048 j++;
1051 if(!write_exact(io_fd, &j, sizeof(unsigned int))) {
1052 ERR("Error when writing to state file (6a)");
1053 goto out;
1056 for ( i = 0, j = 0; i < max_pfn; ) {
1058 if (!is_mapped(live_p2m[i]))
1059 pfntab[j++] = i;
1061 i++;
1062 if (j == 1024 || i == max_pfn) {
1063 if(!write_exact(io_fd, &pfntab, sizeof(unsigned long)*j)) {
1064 ERR("Error when writing to state file (6b)");
1065 goto out;
1067 j = 0;
1073 /* Canonicalise the suspend-record frame number. */
1074 if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) ){
1075 ERR("Suspend record is not in range of pseudophys map");
1076 goto out;
1079 /* Canonicalise each GDT frame number. */
1080 for ( i = 0; i < ctxt.gdt_ents; i += 512 ) {
1081 if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
1082 ERR("GDT frame is not in range of pseudophys map");
1083 goto out;
1087 /* Canonicalise the page table base pointer. */
1088 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.ctrlreg[3] >> PAGE_SHIFT) ) {
1089 ERR("PT base is not in range of pseudophys map");
1090 goto out;
1092 ctxt.ctrlreg[3] = live_m2p[ctxt.ctrlreg[3] >> PAGE_SHIFT] <<
1093 PAGE_SHIFT;
1095 if (!write_exact(io_fd, &ctxt, sizeof(ctxt)) ||
1096 !write_exact(io_fd, live_shinfo, PAGE_SIZE)) {
1097 ERR("Error when writing to state file (1)");
1098 goto out;
1101 out:
1103 if (live_shinfo)
1104 munmap(live_shinfo, PAGE_SIZE);
1106 if (live_p2m_frame_list)
1107 munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
1109 if(live_p2m)
1110 munmap(live_p2m, P2M_SIZE);
1112 if(live_m2p)
1113 munmap(live_m2p, M2P_SIZE);
1115 free(pfn_type);
1116 free(pfn_batch);
1117 free(to_send);
1118 free(to_fix);
1119 free(to_skip);
1121 DPRINTF("Save exit rc=%d\n",rc);
1123 return !!rc;
1126 /*
1127 * Local variables:
1128 * mode: C
1129 * c-set-style: "BSD"
1130 * c-basic-offset: 4
1131 * tab-width: 4
1132 * indent-tabs-mode: nil
1133 * End:
1134 */