ia64/xen-unstable

view tools/libxc/xc_linux_save.c @ 7238:971e7c7411b3

Raise an exception if an error appears on the pipes to our children, and make
sure that the child's pipes are closed even under that exception. Move the
handling of POLLHUP to the end of the loop, so that we guarantee to read any
remaining data from the child if POLLHUP and POLLIN appear at the same time.

Signed-off-by: Ewan Mellor <ewan@xensource.com>
author emellor@ewan
date Thu Oct 06 10:13:11 2005 +0100 (2005-10-06)
parents 540d17fe32ce
children f15892b95965
line source
1 /******************************************************************************
2 * xc_linux_save.c
3 *
4 * Save the state of a running Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <inttypes.h>
10 #include <time.h>
11 #include <stdlib.h>
12 #include <unistd.h>
13 #include <sys/time.h>
15 #include "xg_private.h"
17 #define BATCH_SIZE 1024 /* 1024 pages (4MB) at a time */
19 #define MAX_MBIT_RATE 500
21 /*
22 ** Default values for important tuning parameters. Can override by passing
23 ** non-zero replacement values to xc_linux_save().
24 **
25 ** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
26 **
27 */
28 #define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */
29 #define DEF_MAX_FACTOR 3 /* never send more than 3x nr_pfns */
31 /* Flags to control behaviour of xc_linux_save */
32 #define XCFLAGS_LIVE 1
33 #define XCFLAGS_DEBUG 2
35 #define DEBUG 0
37 #if 1
38 #define ERR(_f, _a...) do { fprintf(stderr, _f , ## _a); fflush(stderr); } while (0)
39 #else
40 #define ERR(_f, _a...) ((void)0)
41 #endif
43 #if DEBUG
44 #define DPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
45 #else
46 #define DPRINTF(_f, _a...) ((void)0)
47 #endif
49 #define PROGRESS 0
50 #if PROGRESS
51 #define PPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
52 #else
53 #define PPRINTF(_f, _a...)
54 #endif
/*
 * Returns TRUE if the given machine frame number has a unique mapping
 * in the guest's pseudophysical map.
 *
 * Expands in the caller's scope and relies on the caller's
 * live_mfn_to_pfn_table, live_pfn_to_mfn_table and nr_pfns. The argument is
 * evaluated more than once, so it must be free of side effects.
 */
#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)                                    \
    (((_mfn) < (1024*1024)) &&                                            \
     ((live_mfn_to_pfn_table[(_mfn)] < nr_pfns) &&                        \
      (live_pfn_to_mfn_table[live_mfn_to_pfn_table[(_mfn)]] == (_mfn))))

/*
 * Returns TRUE if MFN is successfully converted to a PFN.
 *
 * _pmfn points at a frame number that is rewritten in place on success and
 * left untouched on failure. The temporaries use reserved-looking names so
 * that an argument such as '&mfn' still refers to the caller's variable
 * rather than to a temporary of the same name declared inside the macro.
 */
#define translate_mfn_to_pfn(_pmfn)                                       \
({                                                                        \
    unsigned long _xlat_mfn = *(_pmfn);                                   \
    int _xlat_ok = 1;                                                     \
    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(_xlat_mfn) )                           \
        _xlat_ok = 0;                                                     \
    else                                                                  \
        *(_pmfn) = live_mfn_to_pfn_table[_xlat_mfn];                      \
    _xlat_ok;                                                             \
})
79 #define is_mapped(pfn) (!((pfn) & 0x80000000UL))
/*
 * Return the value (0 or 1) of bit 'nr' in the bitmap at 'addr', which is
 * accessed as an array of unsigned long words.
 */
static inline int test_bit ( int nr, volatile void * addr)
{
    unsigned long *words = (unsigned long *)addr;
    unsigned long word = words[nr / (sizeof(unsigned long) * 8)];

    return (word >> (nr % (sizeof(unsigned long) * 8))) & 1;
}
/*
 * Clear bit 'nr' in the bitmap at 'addr', which is accessed as an array of
 * unsigned long words.
 *
 * The mask is built from an unsigned long constant so that it is as wide as
 * the word being modified: an int-typed '1 << 31' would sign-extend when
 * inverted and wipe the upper half of a 64-bit word, and shifting an int by
 * 32 or more is undefined.
 */
static inline void clear_bit ( int nr, volatile void * addr)
{
    const unsigned long bits_per_word = sizeof(unsigned long) * 8;
    unsigned long *words = (unsigned long *)addr;

    words[nr / bits_per_word] &= ~(1UL << (nr % bits_per_word));
}
/*
 * Set bit 'nr' in the bitmap at 'addr', which is accessed as an array of
 * unsigned long words.
 *
 * The mask is built from an unsigned long constant so that it is as wide as
 * the word being modified: an int-typed '1 << 31' is negative and would
 * sign-extend into the upper half of a 64-bit word, and shifting an int by
 * 32 or more is undefined.
 */
static inline void set_bit ( int nr, volatile void * addr)
{
    const unsigned long bits_per_word = sizeof(unsigned long) * 8;
    unsigned long *words = (unsigned long *)addr;

    words[nr / bits_per_word] |= (1UL << (nr % bits_per_word));
}
/* Returns the Hamming weight (i.e. the number of bits set) of a 32-bit word. */
static inline unsigned int hweight32(unsigned int w)
{
    /*
     * Each pass adds neighbouring fields pairwise, doubling the field width:
     * 1-bit fields, then 2, 4, 8 and finally 16-bit fields.
     */
    static const unsigned int field_mask[5] = {
        0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF
    };
    unsigned int pass;

    for ( pass = 0; pass < 5; pass++ )
        w = (w & field_mask[pass]) + ((w >> (1u << pass)) & field_mask[pass]);

    return w;
}
/*
 * Count the bits set in the first nr / (bits per unsigned long) whole words
 * of the bitmap at 'addr'. Any bits in a trailing partial word are not
 * counted.
 *
 * Every bit of each unsigned long is counted, so the result is also correct
 * where unsigned long is wider than 32 bits (feeding the word to a 32-bit
 * population count would silently drop the upper half).
 *
 * Returns 0 when nr is zero or negative.
 */
static inline int count_bits ( int nr, volatile void *addr)
{
    const unsigned long bits_per_word = sizeof(unsigned long) * 8;
    unsigned long *p = (unsigned long *)addr;
    unsigned long nr_words, i;
    int count = 0;

    if ( nr <= 0 )
        return 0;

    /* We know that the array is padded to unsigned long. */
    nr_words = (unsigned long)nr / bits_per_word;

    for ( i = 0; i < nr_words; i++ )
    {
        unsigned long word = p[i];

        /* Each pass clears the lowest set bit of the word. */
        for ( ; word != 0; word &= word - 1 )
            count++;
    }

    return count;
}
/*
 * Map index i (0 <= i < nr) to another index in [0, nr).
 *
 * Need a simple permutation function so that we scan pages in a
 * pseudo random order, enabling us to get a better estimate of
 * the domain's page dirtying rate as we go (there are often
 * contiguous ranges of pfns that have similar behaviour, and we
 * want to mix them up).
 *
 * order_nr is the power-of-2 order of nr,
 * e.g. nr->order 15->4 16->4 17->5 (512MB domain, 128k pages, order 17).
 *
 * The index is treated as an order_nr-bit word and rotated left by 10 bits:
 *
 *     QPONMLKJIHGFEDCBA   ->   JIHGFEDCBAQPONMLK
 *
 * When order_nr <= 10 a literal rotate-by-10 would need a negative shift
 * count (undefined behaviour), so the rotation amount is reduced modulo
 * order_nr. For order_nr > 10 the result is unchanged from a plain
 * rotate-by-10.
 *
 * Returns 0 if order_nr or nr is not positive.
 */
static inline int permute( int i, int nr, int order_nr )
{
    unsigned int value, mask, rot;

    if ( (order_nr <= 0) || (nr <= 0) )
        return 0;

    mask  = (1u << order_nr) - 1;
    rot   = 10 % order_nr;
    value = (unsigned int)i;

    /* Re-rotate until the result lands inside [0, nr). */
    do {
        if ( rot != 0 )
            value = (value >> (order_nr - rot)) | (value << rot);
        value &= mask;
    } while ( value >= (unsigned int)nr ); /* this won't ever loop if nr is a power of 2 */

    return (int)value;
}
/*
 * Convert a struct timeval to a single microsecond count.
 *
 * tv_sec is widened to long long before scaling so that the multiplication
 * cannot overflow where the field is only 32 bits wide.
 */
static long long tv_to_us( struct timeval *new )
{
    return ((long long)new->tv_sec * 1000000) + new->tv_usec;
}
/* Return the current gettimeofday() time as a microsecond count. */
static long long llgettimeofday( void )
{
    struct timeval tv;

    gettimeofday(&tv, NULL);

    return tv_to_us(&tv);
}
/*
 * Return (new - old) in microseconds; negative if 'new' is earlier.
 *
 * Both differences are formed in long long so that scaling the seconds
 * cannot overflow where tv_sec is only 32 bits wide (e.g. when 'old' is
 * still a zero-initialised timeval).
 */
static long long tv_delta( struct timeval *new, struct timeval *old )
{
    long long secs  = (long long)new->tv_sec  - (long long)old->tv_sec;
    long long usecs = (long long)new->tv_usec - (long long)old->tv_usec;

    return (secs * 1000000) + usecs;
}
168 #define START_MBIT_RATE 0 //ioctxt->resource
170 static int mbit_rate, ombit_rate = 0;
171 static int burst_time_us = -1;
173 #define MBIT_RATE mbit_rate
174 #define BURST_BUDGET (100*1024)
176 /*
177 1000000/((100)*1024*1024/8/(100*1024))
178 7812
179 1000000/((100)*1024/8/(100))
180 7812
181 1000000/((100)*128/(100))
182 7812
183 100000000/((100)*128)
184 7812
185 100000000/128
186 781250
187 */
188 #define RATE_TO_BTU 781250
189 #define BURST_TIME_US burst_time_us
191 static int
192 ratewrite(int io_fd, void *buf, int n)
193 {
194 static int budget = 0;
195 static struct timeval last_put = { 0 };
196 struct timeval now;
197 struct timespec delay;
198 long long delta;
200 if ( START_MBIT_RATE == 0 )
201 return write(io_fd, buf, n);
203 budget -= n;
204 if ( budget < 0 )
205 {
206 if ( MBIT_RATE != ombit_rate )
207 {
208 BURST_TIME_US = RATE_TO_BTU / MBIT_RATE;
209 ombit_rate = MBIT_RATE;
210 DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
211 MBIT_RATE, BURST_BUDGET, BURST_TIME_US);
212 }
213 if ( last_put.tv_sec == 0 )
214 {
215 budget += BURST_BUDGET;
216 gettimeofday(&last_put, NULL);
217 }
218 else
219 {
220 while ( budget < 0 )
221 {
222 gettimeofday(&now, NULL);
223 delta = tv_delta(&now, &last_put);
224 while ( delta > BURST_TIME_US )
225 {
226 budget += BURST_BUDGET;
227 last_put.tv_usec += BURST_TIME_US;
228 if ( last_put.tv_usec > 1000000 )
229 {
230 last_put.tv_usec -= 1000000;
231 last_put.tv_sec++;
232 }
233 delta -= BURST_TIME_US;
234 }
235 if ( budget > 0 )
236 break;
237 delay.tv_sec = 0;
238 delay.tv_nsec = 1000 * (BURST_TIME_US - delta);
239 while ( delay.tv_nsec > 0 )
240 if ( nanosleep(&delay, &delay) == 0 )
241 break;
242 }
243 }
244 }
245 return write(io_fd, buf, n);
246 }
248 static int print_stats( int xc_handle, u32 domid,
249 int pages_sent, xc_shadow_control_stats_t *stats,
250 int print )
251 {
252 static struct timeval wall_last;
253 static long long d0_cpu_last;
254 static long long d1_cpu_last;
256 struct timeval wall_now;
257 long long wall_delta;
258 long long d0_cpu_now, d0_cpu_delta;
259 long long d1_cpu_now, d1_cpu_delta;
261 gettimeofday(&wall_now, NULL);
263 d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
264 d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
266 if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
267 fprintf(stderr, "ARRHHH!!\n");
269 wall_delta = tv_delta(&wall_now,&wall_last)/1000;
271 if ( wall_delta == 0 ) wall_delta = 1;
273 d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
274 d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
276 if ( print )
277 fprintf(stderr,
278 "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
279 "dirtied %dMb/s %" PRId32 " pages\n",
280 wall_delta,
281 (int)((d0_cpu_delta*100)/wall_delta),
282 (int)((d1_cpu_delta*100)/wall_delta),
283 (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
284 (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
285 stats->dirty_count);
287 if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
288 {
289 mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
290 + 50;
291 if (mbit_rate > MAX_MBIT_RATE)
292 mbit_rate = MAX_MBIT_RATE;
293 }
295 d0_cpu_last = d0_cpu_now;
296 d1_cpu_last = d1_cpu_now;
297 wall_last = wall_now;
299 return 0;
300 }
302 static int analysis_phase( int xc_handle, u32 domid,
303 int nr_pfns, unsigned long *arr, int runs )
304 {
305 long long start, now;
306 xc_shadow_control_stats_t stats;
307 int j;
309 start = llgettimeofday();
311 for ( j = 0; j < runs; j++ )
312 {
313 int i;
315 xc_shadow_control( xc_handle, domid,
316 DOM0_SHADOW_CONTROL_OP_CLEAN,
317 arr, nr_pfns, NULL);
318 fprintf(stderr, "#Flush\n");
319 for ( i = 0; i < 40; i++ )
320 {
321 usleep(50000);
322 now = llgettimeofday();
323 xc_shadow_control( xc_handle, domid,
324 DOM0_SHADOW_CONTROL_OP_PEEK,
325 NULL, 0, &stats);
327 fprintf(stderr, "now= %lld faults= %" PRId32 " dirty= %" PRId32
328 " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n",
329 ((now-start)+500)/1000,
330 stats.fault_count, stats.dirty_count,
331 stats.dirty_net_count, stats.dirty_block_count);
332 }
333 }
335 return -1;
336 }
339 static int suspend_and_state(int xc_handle, int io_fd, int dom,
340 xc_dominfo_t *info,
341 vcpu_guest_context_t *ctxt)
342 {
343 int i = 0;
344 char ans[30];
346 printf("suspend\n");
347 fflush(stdout);
348 if ( fgets(ans, sizeof(ans), stdin) == NULL )
349 {
350 ERR("failed reading suspend reply");
351 return -1;
352 }
353 if ( strncmp(ans, "done\n", 5) )
354 {
355 ERR("suspend reply incorrect: %s", ans);
356 return -1;
357 }
359 retry:
361 if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1)
362 {
363 ERR("Could not get domain info");
364 return -1;
365 }
367 if ( xc_domain_get_vcpu_context(xc_handle, dom, 0 /* XXX */,
368 ctxt) )
369 {
370 ERR("Could not get vcpu context");
371 }
373 if ( info->shutdown && info->shutdown_reason == SHUTDOWN_suspend )
374 {
375 return 0; // success
376 }
378 if ( info->paused )
379 {
380 // try unpausing domain, wait, and retest
381 xc_domain_unpause( xc_handle, dom );
383 ERR("Domain was paused. Wait and re-test.");
384 usleep(10000); // 10ms
386 goto retry;
387 }
390 if( ++i < 100 )
391 {
392 ERR("Retry suspend domain.");
393 usleep(10000); // 10ms
394 goto retry;
395 }
397 ERR("Unable to suspend domain.");
399 return -1;
400 }
402 int xc_linux_save(int xc_handle, int io_fd, u32 dom, u32 max_iters,
403 u32 max_factor, u32 flags)
404 {
405 xc_dominfo_t info;
407 int rc = 1, i, j, k, last_iter, iter = 0;
408 unsigned long mfn;
409 int live = (flags & XCFLAGS_LIVE);
410 int debug = (flags & XCFLAGS_DEBUG);
411 int sent_last_iter, skip_this_iter;
413 /* The new domain's shared-info frame number. */
414 unsigned long shared_info_frame;
416 /* A copy of the CPU context of the guest. */
417 vcpu_guest_context_t ctxt;
419 /* A table containing the type of each PFN (/not/ MFN!). */
420 unsigned long *pfn_type = NULL;
421 unsigned long *pfn_batch = NULL;
423 /* A temporary mapping, and a copy, of one frame of guest memory. */
424 unsigned long page[1024];
426 /* A copy of the pfn-to-mfn table frame list. */
427 unsigned long *live_pfn_to_mfn_frame_list_list = NULL;
428 unsigned long *live_pfn_to_mfn_frame_list = NULL;
429 unsigned long pfn_to_mfn_frame_list[1024];
431 /* Live mapping of the table mapping each PFN to its current MFN. */
432 unsigned long *live_pfn_to_mfn_table = NULL;
433 /* Live mapping of system MFN to PFN table. */
434 unsigned long *live_mfn_to_pfn_table = NULL;
435 unsigned long mfn_to_pfn_table_start_mfn;
437 /* Live mapping of shared info structure */
438 shared_info_t *live_shinfo = NULL;
440 /* base of the region in which domain memory is mapped */
441 unsigned char *region_base = NULL;
443 /* number of pages we're dealing with */
444 unsigned long nr_pfns;
446 /* power of 2 order of nr_pfns */
447 int order_nr;
449 /* bitmap of pages:
450 - that should be sent this iteration (unless later marked as skip);
451 - to skip this iteration because already dirty;
452 - to fixup by sending at the end if not already resent; */
453 unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
455 xc_shadow_control_stats_t stats;
457 int needed_to_fix = 0;
458 int total_sent = 0;
460 MBIT_RATE = START_MBIT_RATE;
463 /* If no explicit control parameters given, use defaults */
464 if( !max_iters )
465 max_iters = DEF_MAX_ITERS;
466 if( !max_factor )
467 max_factor = DEF_MAX_FACTOR;
470 DPRINTF("xc_linux_save start DOM%u live=%s\n", dom, live?"true":"false");
472 if ( mlock(&ctxt, sizeof(ctxt)) )
473 {
474 ERR("Unable to mlock ctxt");
475 return 1;
476 }
478 if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
479 {
480 ERR("Could not get domain info");
481 goto out;
482 }
483 if ( xc_domain_get_vcpu_context(xc_handle, dom, /* FIXME */ 0, &ctxt) )
484 {
485 ERR("Could not get vcpu context");
486 goto out;
487 }
488 shared_info_frame = info.shared_info_frame;
490 /* A cheesy test to see whether the domain contains valid state. */
491 if ( ctxt.ctrlreg[3] == 0 )
492 {
493 ERR("Domain is not in a valid Linux guest OS state");
494 goto out;
495 }
497 nr_pfns = info.max_memkb >> (PAGE_SHIFT - 10);
499 /* cheesy sanity check */
500 if ( nr_pfns > 1024*1024 )
501 {
502 ERR("Invalid state record -- pfn count out of range: %lu", nr_pfns);
503 goto out;
504 }
506 /* Map the shared info frame */
507 live_shinfo = xc_map_foreign_range(
508 xc_handle, dom, PAGE_SIZE, PROT_READ, shared_info_frame);
509 if ( !live_shinfo )
510 {
511 ERR("Couldn't map live_shinfo");
512 goto out;
513 }
515 live_pfn_to_mfn_frame_list_list = xc_map_foreign_range(
516 xc_handle, dom,
517 PAGE_SIZE, PROT_READ, live_shinfo->arch.pfn_to_mfn_frame_list_list);
519 if (!live_pfn_to_mfn_frame_list_list){
520 ERR("Couldn't map pfn_to_mfn_frame_list_list");
521 goto out;
522 }
524 live_pfn_to_mfn_frame_list =
525 xc_map_foreign_batch(xc_handle, dom,
526 PROT_READ,
527 live_pfn_to_mfn_frame_list_list,
528 (nr_pfns+(1024*1024)-1)/(1024*1024) );
530 if ( !live_pfn_to_mfn_frame_list)
531 {
532 ERR("Couldn't map pfn_to_mfn_frame_list");
533 goto out;
534 }
537 /* Map all the frames of the pfn->mfn table. For migrate to succeed,
538 the guest must not change which frames are used for this purpose.
539 (it's not clear why it would want to change them, and we'll be OK
540 from a safety POV anyhow). */
542 live_pfn_to_mfn_table = xc_map_foreign_batch(xc_handle, dom,
543 PROT_READ,
544 live_pfn_to_mfn_frame_list,
545 (nr_pfns+1023)/1024 );
546 if ( !live_pfn_to_mfn_table )
547 {
548 ERR("Couldn't map pfn_to_mfn table");
549 goto out;
550 }
552 /* Setup the mfn_to_pfn table mapping */
553 mfn_to_pfn_table_start_mfn = xc_get_m2p_start_mfn( xc_handle );
555 live_mfn_to_pfn_table =
556 xc_map_foreign_range(xc_handle, DOMID_XEN,
557 PAGE_SIZE*1024, PROT_READ,
558 mfn_to_pfn_table_start_mfn );
560 /* Canonicalise the pfn-to-mfn table frame-number list. */
561 memcpy( pfn_to_mfn_frame_list, live_pfn_to_mfn_frame_list, PAGE_SIZE );
563 for ( i = 0; i < nr_pfns; i += 1024 )
564 {
565 if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) )
566 {
567 ERR("Frame# in pfn-to-mfn frame list is not in pseudophys");
568 goto out;
569 }
570 }
573 /* Domain is still running at this point */
575 if ( live )
576 {
577 if ( xc_shadow_control( xc_handle, dom,
578 DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
579 NULL, 0, NULL ) < 0 )
580 {
581 ERR("Couldn't enable shadow mode");
582 goto out;
583 }
585 last_iter = 0;
586 }
587 else
588 {
589 /* This is a non-live suspend. Issue the call back to get the
590 domain suspended */
592 last_iter = 1;
594 if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) )
595 {
596 ERR("Domain appears not to have suspended");
597 goto out;
598 }
600 }
601 sent_last_iter = 1<<20; /* 4GB of pages */
603 /* calculate the power of 2 order of nr_pfns, e.g.
604 15->4 16->4 17->5 */
605 for ( i = nr_pfns-1, order_nr = 0; i ; i >>= 1, order_nr++ )
606 continue;
608 /* Setup to_send bitmap */
609 {
610 /* size these for a maximal 4GB domain, to make interaction
611 with balloon driver easier. It's only user space memory,
612 after all... (3x 128KB) */
614 int sz = ( 1<<20 ) / 8;
616 to_send = malloc( sz );
617 to_fix = calloc( 1, sz );
618 to_skip = malloc( sz );
620 if ( !to_send || !to_fix || !to_skip )
621 {
622 ERR("Couldn't allocate to_send array");
623 goto out;
624 }
626 memset(to_send, 0xff, sz);
628 if ( mlock(to_send, sz) )
629 {
630 ERR("Unable to mlock to_send");
631 return 1;
632 }
634 /* (to fix is local only) */
636 if ( mlock(to_skip, sz) )
637 {
638 ERR("Unable to mlock to_skip");
639 return 1;
640 }
642 }
644 analysis_phase( xc_handle, dom, nr_pfns, to_skip, 0 );
646 /* We want zeroed memory so use calloc rather than malloc. */
647 pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long));
648 pfn_batch = calloc(BATCH_SIZE, sizeof(unsigned long));
650 if ( (pfn_type == NULL) || (pfn_batch == NULL) )
651 {
652 errno = ENOMEM;
653 goto out;
654 }
656 if ( mlock(pfn_type, BATCH_SIZE * sizeof(unsigned long)) )
657 {
658 ERR("Unable to mlock");
659 goto out;
660 }
663 /*
664 * Quick belt and braces sanity check.
665 */
666 #if DEBUG
667 {
668 int err=0;
669 for ( i = 0; i < nr_pfns; i++ )
670 {
671 mfn = live_pfn_to_mfn_table[i];
673 if( (live_mfn_to_pfn_table[mfn] != i) && (mfn != 0xffffffffUL) )
674 {
675 fprintf(stderr, "i=0x%x mfn=%lx live_mfn_to_pfn_table=%lx\n",
676 i,mfn,live_mfn_to_pfn_table[mfn]);
677 err++;
678 }
679 }
680 fprintf(stderr, "Had %d unexplained entries in p2m table\n",err);
681 }
682 #endif
685 /* Start writing out the saved-domain record. */
687 if ( write(io_fd, &nr_pfns, sizeof(unsigned long)) !=
688 sizeof(unsigned long) )
689 {
690 ERR("write: nr_pfns");
691 goto out;
692 }
694 if ( write(io_fd, pfn_to_mfn_frame_list, PAGE_SIZE) != PAGE_SIZE )
695 {
696 ERR("write: pfn_to_mfn_frame_list");
697 goto out;
698 }
700 print_stats( xc_handle, dom, 0, &stats, 0 );
702 /* Now write out each data page, canonicalising page tables as we go... */
704 for ( ; ; )
705 {
706 unsigned int prev_pc, sent_this_iter, N, batch;
708 iter++;
709 sent_this_iter = 0;
710 skip_this_iter = 0;
711 prev_pc = 0;
712 N=0;
714 DPRINTF("Saving memory pages: iter %d 0%%", iter);
716 while ( N < nr_pfns )
717 {
718 unsigned int this_pc = (N * 100) / nr_pfns;
720 if ( (this_pc - prev_pc) >= 5 )
721 {
722 DPRINTF("\b\b\b\b%3d%%", this_pc);
723 prev_pc = this_pc;
724 }
726 /* slightly wasteful to peek the whole array every time,
727 but this is fast enough for the moment. */
729 if ( !last_iter &&
730 xc_shadow_control(xc_handle, dom,
731 DOM0_SHADOW_CONTROL_OP_PEEK,
732 to_skip, nr_pfns, NULL) != nr_pfns )
733 {
734 ERR("Error peeking shadow bitmap");
735 goto out;
736 }
739 /* load pfn_type[] with the mfn of all the pages we're doing in
740 this batch. */
742 for ( batch = 0; batch < BATCH_SIZE && N < nr_pfns ; N++ )
743 {
744 int n = permute(N, nr_pfns, order_nr );
746 if ( 0 && debug ) {
747 fprintf(stderr,"%d pfn= %08lx mfn= %08lx %d "
748 " [mfn]= %08lx\n",
749 iter, (unsigned long)n, live_pfn_to_mfn_table[n],
750 test_bit(n,to_send),
751 live_mfn_to_pfn_table[live_pfn_to_mfn_table[n]&
752 0xFFFFF]);
753 }
755 if ( !last_iter &&
756 test_bit(n, to_send) &&
757 test_bit(n, to_skip) ) {
758 skip_this_iter++; /* stats keeping */
759 }
761 if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
762 (test_bit(n, to_send) && last_iter) ||
763 (test_bit(n, to_fix) && last_iter)) ) {
764 continue;
765 }
767 /* we get here if:
768 1. page is marked to_send & hasn't already been re-dirtied
769 2. (ignore to_skip in last iteration)
770 3. add in pages that still need fixup (net bufs)
771 */
773 pfn_batch[batch] = n;
774 pfn_type[batch] = live_pfn_to_mfn_table[n];
776 if( ! is_mapped(pfn_type[batch]) )
777 {
778 /* not currently in pseudo-physical map -- set bit
779 in to_fix that we must send this page in last_iter
780 unless it's sent sooner anyhow */
782 set_bit( n, to_fix );
783 if( iter>1 )
784 DPRINTF("netbuf race: iter %d, pfn %x. mfn %lx\n",
785 iter,n,pfn_type[batch]);
786 continue;
787 }
789 if ( last_iter &&
790 test_bit(n, to_fix) &&
791 !test_bit(n, to_send) )
792 {
793 needed_to_fix++;
794 DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
795 iter,n,pfn_type[batch]);
796 }
798 clear_bit(n, to_fix);
800 batch++;
801 }
803 if ( batch == 0 )
804 goto skip; /* vanishingly unlikely... */
806 if ( (region_base = xc_map_foreign_batch(xc_handle, dom,
807 PROT_READ,
808 pfn_type,
809 batch)) == 0 ){
810 ERR("map batch failed");
811 goto out;
812 }
814 if ( xc_get_pfn_type_batch(xc_handle, dom, batch, pfn_type) ){
815 ERR("get_pfn_type_batch failed");
816 goto out;
817 }
819 for ( j = 0; j < batch; j++ )
820 {
821 if ( (pfn_type[j] & LTAB_MASK) == XTAB )
822 {
823 DPRINTF("type fail: page %i mfn %08lx\n",j,pfn_type[j]);
824 continue;
825 }
827 if ( 0 && debug )
828 fprintf(stderr, "%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
829 " sum= %08lx\n",
830 iter,
831 (pfn_type[j] & LTAB_MASK) | pfn_batch[j],
832 pfn_type[j],
833 live_mfn_to_pfn_table[pfn_type[j]&(~LTAB_MASK)],
834 csum_page(region_base + (PAGE_SIZE*j)));
836 /* canonicalise mfn->pfn */
837 pfn_type[j] = (pfn_type[j] & LTAB_MASK) | pfn_batch[j];
838 }
840 if ( write(io_fd, &batch, sizeof(int)) != sizeof(int) )
841 {
842 ERR("Error when writing to state file (2)");
843 goto out;
844 }
846 if ( write(io_fd, pfn_type, sizeof(unsigned long)*j) !=
847 (sizeof(unsigned long) * j) )
848 {
849 ERR("Error when writing to state file (3)");
850 goto out;
851 }
853 /* entering this loop, pfn_type is now in pfns (Not mfns) */
854 for ( j = 0; j < batch; j++ )
855 {
856 /* write out pages in batch */
857 if ( (pfn_type[j] & LTAB_MASK) == XTAB )
858 {
859 DPRINTF("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]);
860 continue;
861 }
863 if ( ((pfn_type[j] & LTABTYPE_MASK) == L1TAB) ||
864 ((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ){
865 memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE);
867 for ( k = 0;
868 k < (((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ?
869 (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) :
870 1024);
871 k++ )
872 {
873 unsigned long pfn;
875 if ( !(page[k] & _PAGE_PRESENT) )
876 continue;
878 mfn = page[k] >> PAGE_SHIFT;
879 pfn = live_mfn_to_pfn_table[mfn];
881 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
882 {
883 /* I don't think this should ever happen */
884 fprintf(stderr, "FNI %d : [%08lx,%d] pte=%08lx, "
885 "mfn=%08lx, pfn=%08lx [mfn]=%08lx\n",
886 j, pfn_type[j], k,
887 page[k], mfn, live_mfn_to_pfn_table[mfn],
888 (live_mfn_to_pfn_table[mfn]<nr_pfns)?
889 live_pfn_to_mfn_table[
890 live_mfn_to_pfn_table[mfn]] :
891 0xdeadbeef);
893 pfn = 0; /* be suspicious */
894 }
896 page[k] &= PAGE_SIZE - 1;
897 page[k] |= pfn << PAGE_SHIFT;
899 #if 0
900 fprintf(stderr,
901 "L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx "
902 "xpfn=%d\n",
903 pfn_type[j]>>28,
904 j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT);
905 #endif
907 } /* end of page table rewrite for loop */
909 if (ratewrite(io_fd, page, PAGE_SIZE) != PAGE_SIZE) {
910 ERR("Error when writing to state file (4)");
911 goto out;
912 }
914 } /* end of it's a PT page */ else { /* normal page */
916 if ( ratewrite(io_fd, region_base + (PAGE_SIZE*j),
917 PAGE_SIZE) != PAGE_SIZE )
918 {
919 ERR("Error when writing to state file (5)");
920 goto out;
921 }
922 }
923 } /* end of the write out for this batch */
925 sent_this_iter += batch;
927 } /* end of this while loop for this iteration */
929 munmap(region_base, batch*PAGE_SIZE);
931 skip:
933 total_sent += sent_this_iter;
935 DPRINTF("\r %d: sent %d, skipped %d, ",
936 iter, sent_this_iter, skip_this_iter );
938 if ( last_iter ) {
939 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
941 DPRINTF("Total pages sent= %d (%.2fx)\n",
942 total_sent, ((float)total_sent)/nr_pfns );
943 DPRINTF("(of which %d were fixups)\n", needed_to_fix );
944 }
946 if (last_iter && debug){
947 int minusone = -1;
948 memset( to_send, 0xff, (nr_pfns+8)/8 );
949 debug = 0;
950 fprintf(stderr, "Entering debug resend-all mode\n");
952 /* send "-1" to put receiver into debug mode */
953 if (write(io_fd, &minusone, sizeof(int)) != sizeof(int)) {
954 ERR("Error when writing to state file (6)");
955 goto out;
956 }
958 continue;
959 }
961 if ( last_iter ) break;
963 if ( live )
964 {
965 if (
966 ( ( sent_this_iter > sent_last_iter ) &&
967 (mbit_rate == MAX_MBIT_RATE ) ) ||
968 (iter >= max_iters) ||
969 (sent_this_iter+skip_this_iter < 50) ||
970 (total_sent > nr_pfns*max_factor) )
971 {
972 DPRINTF("Start last iteration\n");
973 last_iter = 1;
975 if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) )
976 {
977 ERR("Domain appears not to have suspended");
978 goto out;
979 }
981 DPRINTF("SUSPEND shinfo %08lx eip %08u edx %08u\n",
982 info.shared_info_frame,
983 ctxt.user_regs.eip, ctxt.user_regs.edx);
984 }
986 if ( xc_shadow_control( xc_handle, dom,
987 DOM0_SHADOW_CONTROL_OP_CLEAN,
988 to_send, nr_pfns, &stats ) != nr_pfns )
989 {
990 ERR("Error flushing shadow PT");
991 goto out;
992 }
994 sent_last_iter = sent_this_iter;
996 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
998 }
1001 } /* end of while 1 */
1003 DPRINTF("All memory is saved\n");
1005 /* Success! */
1006 rc = 0;
1008 /* Zero terminate */
1009 if ( write(io_fd, &rc, sizeof(int)) != sizeof(int) )
1011 ERR("Error when writing to state file (6)");
1012 goto out;
1015 /* Send through a list of all the PFNs that were not in map at the close */
1017 unsigned int i,j;
1018 unsigned int pfntab[1024];
1020 for ( i = 0, j = 0; i < nr_pfns; i++ )
1021 if ( !is_mapped(live_pfn_to_mfn_table[i]) )
1022 j++;
1024 if ( write(io_fd, &j, sizeof(unsigned int)) != sizeof(unsigned int) )
1026 ERR("Error when writing to state file (6a)");
1027 goto out;
1030 for ( i = 0, j = 0; i < nr_pfns; )
1032 if ( !is_mapped(live_pfn_to_mfn_table[i]) )
1034 pfntab[j++] = i;
1036 i++;
1037 if ( j == 1024 || i == nr_pfns )
1039 if ( write(io_fd, &pfntab, sizeof(unsigned long)*j) !=
1040 (sizeof(unsigned long) * j) )
1042 ERR("Error when writing to state file (6b)");
1043 goto out;
1045 j = 0;
1050 /* Canonicalise the suspend-record frame number. */
1051 if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) )
1053 ERR("Suspend record is not in range of pseudophys map");
1054 goto out;
1057 /* Canonicalise each GDT frame number. */
1058 for ( i = 0; i < ctxt.gdt_ents; i += 512 )
1060 if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) )
1062 ERR("GDT frame is not in range of pseudophys map");
1063 goto out;
1067 /* Canonicalise the page table base pointer. */
1068 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.ctrlreg[3] >> PAGE_SHIFT) )
1070 ERR("PT base is not in range of pseudophys map");
1071 goto out;
1073 ctxt.ctrlreg[3] = live_mfn_to_pfn_table[ctxt.ctrlreg[3] >> PAGE_SHIFT] <<
1074 PAGE_SHIFT;
1076 if ( write(io_fd, &ctxt, sizeof(ctxt)) != sizeof(ctxt) ||
1077 write(io_fd, live_shinfo, PAGE_SIZE) != PAGE_SIZE)
1079 ERR("Error when writing to state file (1)");
1080 goto out;
1083 out:
1085 if ( live_shinfo )
1086 munmap(live_shinfo, PAGE_SIZE);
1088 if ( live_pfn_to_mfn_frame_list )
1089 munmap(live_pfn_to_mfn_frame_list, PAGE_SIZE);
1091 if ( live_pfn_to_mfn_table )
1092 munmap(live_pfn_to_mfn_table, nr_pfns*4);
1094 if ( live_mfn_to_pfn_table )
1095 munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024);
1097 free(pfn_type);
1098 free(pfn_batch);
1099 free(to_send);
1100 free(to_fix);
1101 free(to_skip);
1103 DPRINTF("Save exit rc=%d\n",rc);
1104 return !!rc;
1107 /*
1108 * Local variables:
1109 * mode: C
1110 * c-set-style: "BSD"
1111 * c-basic-offset: 4
1112 * tab-width: 4
1113 * indent-tabs-mode: nil
1114 * End:
1115 */