ia64/xen-unstable

view tools/libxc/xc_linux_save.c @ 6385:f34e732ed4bf

Xenstore testsuite robustness: save output rather than rerun on failure.
"make check" reruns a test which fails with more verbosity. If the test
fails intermittently, that doesn't work well: save the output and simply
dump it if the test fails.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Christian Limpach <Christian.Limpach@cl.cam.ac.uk>
author cl349@firebug.cl.cam.ac.uk
date Tue Aug 23 19:58:59 2005 +0000 (2005-08-23)
parents 522bc50588ed
children fdfd511768a3 2f20c2fce2c5 cc5f88b719d0 fa0754a9f64f
line source
1 /******************************************************************************
2 * xc_linux_save.c
3 *
4 * Save the state of a running Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <inttypes.h>
10 #include <sys/time.h>
11 #include "xc_private.h"
12 #include <xen/linux/suspend.h>
13 #include <xen/io/domain_controller.h>
14 #include <time.h>
/* Number of entries allocated for the per-batch pfn_type/pfn_batch arrays. */
16 #define BATCH_SIZE 1024 /* 1024 pages (4MB) at a time */
/* Upper bound applied to mbit_rate when print_stats() raises it. */
18 #define MAX_MBIT_RATE 500
/* Set to 1 to enable DPRINTF output and the p2m sanity check in the save path. */
20 #define DEBUG 0
/*
 * ERR(): print a printf-style message to stderr and flush it immediately.
 * The "#if 1" keeps it always enabled; the #else arm compiles it away.
 */
22 #if 1
23 #define ERR(_f, _a...) do { fprintf(stderr, _f , ## _a); fflush(stderr); } while (0)
24 #else
25 #define ERR(_f, _a...) ((void)0)
26 #endif
/* DPRINTF(): debug trace to stderr; a no-op expression unless DEBUG is set. */
28 #if DEBUG
29 #define DPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
30 #else
31 #define DPRINTF(_f, _a...) ((void)0)
32 #endif
/*
 * PPRINTF(): progress output to stderr, enabled by PROGRESS.
 * Note: when disabled it expands to nothing at all (unlike DPRINTF, which
 * expands to ((void)0)), so it can only be used as a full statement.
 */
34 #define PROGRESS 0
35 #if PROGRESS
36 #define PPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
37 #else
38 #define PPRINTF(_f, _a...)
39 #endif
41 /*
42 * Returns TRUE if the given machine frame number has a unique mapping
43 * in the guest's pseudophysical map.
44 */
/*
 * Not self-contained: the expansion refers to live_mfn_to_pfn_table,
 * live_pfn_to_mfn_table and nr_pfns, which must be in scope at the call
 * site. The MFN must be below 1024*1024, its M2P entry must be a PFN below
 * nr_pfns, and that PFN's P2M entry must point back at the same MFN.
 * The argument is evaluated more than once.
 */
46 #define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \
47 (((_mfn) < (1024*1024)) && \
48 ((live_mfn_to_pfn_table[_mfn] < nr_pfns) && \
49 (live_pfn_to_mfn_table[live_mfn_to_pfn_table[_mfn]] == (_mfn))))
52 /* Returns TRUE if MFN is successfully converted to a PFN. */
/*
 * _pmfn points at an MFN that is overwritten in place with the matching PFN
 * on success; on failure the pointed-to value is left untouched and the
 * expression yields 0. Uses a GCC statement expression, and declares a local
 * named "mfn" that shadows any variable of that name at the call site.
 */
53 #define translate_mfn_to_pfn(_pmfn) \
54 ({ \
55 unsigned long mfn = *(_pmfn); \
56 int _res = 1; \
57 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) \
58 _res = 0; \
59 else \
60 *(_pmfn) = live_mfn_to_pfn_table[mfn]; \
61 _res; \
62 })
/*
 * True when bit 31 of the value is clear. The save path applies this to
 * P2M entries and treats a set bit as "not currently mapped".
 */
64 #define is_mapped(pfn) (!((pfn) & 0x80000000UL))
/*
 * Return the value (0 or 1) of bit number nr in the bitmap at addr.
 * The bitmap is treated as an array of unsigned long words, with bit 0
 * being the least significant bit of the first word.
 */
static inline int test_bit(int nr, volatile void *addr)
{
    unsigned long *words = (unsigned long *)addr;
    unsigned long word = words[nr / (sizeof *words * 8)];

    return (word >> (nr % (sizeof *words * 8))) & 1;
}
/*
 * Clear bit number nr in the bitmap at addr.
 * The bitmap is treated as an array of unsigned long words, with bit 0
 * being the least significant bit of the first word. Not atomic.
 */
static inline void clear_bit(int nr, volatile void *addr)
{
    unsigned long *words = (unsigned long *)addr;

    /*
     * The mask must be built as unsigned long: with a plain int "1" the
     * shift is undefined for bit positions >= 31, and the int result is
     * widened so that clearing bit 31 also wipes the upper half of a
     * 64-bit word.
     */
    words[nr / (sizeof *words * 8)] &= ~(1UL << (nr % (sizeof *words * 8)));
}
/*
 * Set bit number nr in the bitmap at addr.
 * The bitmap is treated as an array of unsigned long words, with bit 0
 * being the least significant bit of the first word. Not atomic.
 */
static inline void set_bit(int nr, volatile void *addr)
{
    unsigned long *words = (unsigned long *)addr;

    /*
     * The mask must be built as unsigned long: with a plain int "1" the
     * shift is undefined for bit positions >= 31, and (1 << 31) is a
     * negative int that sign-extends, setting every upper bit of a
     * 64-bit word as well.
     */
    words[nr / (sizeof *words * 8)] |= (1UL << (nr % (sizeof *words * 8)));
}
/*
 * Returns the Hamming weight (i.e. the number of bits set) of a 32-bit word.
 * Adjacent bit fields are summed in parallel, doubling the field width at
 * every step (1, 2, 4, 8, 16 bits).
 */
static inline unsigned int hweight32(unsigned int w)
{
    static const unsigned int field_mask[5] = {
        0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF
    };
    unsigned int step;

    for (step = 0; step < 5; step++) {
        unsigned int width = 1u << step;

        w = (w & field_mask[step]) + ((w >> width) & field_mask[step]);
    }
    return w;
}
/*
 * Count the set bits in the first nr bits of the bitmap at addr.
 *
 * Only whole unsigned long words are visited (nr / bits-per-word of them),
 * so bits in a trailing partial word are not counted.
 * NOTE(review): the original comment says the array is padded to unsigned
 * long; confirm whether callers expect the final partial word to be included.
 */
static inline int count_bits(int nr, volatile void *addr)
{
    int i, count = 0;
    unsigned long *p = (unsigned long *)addr;

    for (i = 0; i < nr / (sizeof(unsigned long) * 8); i++, p++) {
        unsigned long word = *p;

        /*
         * hweight32() only looks at 32 bits, so feed it the word in 32-bit
         * pieces; otherwise the upper half of a 64-bit word is ignored.
         * The shift is split in two so it stays defined when unsigned long
         * is itself only 32 bits wide.
         */
        for (; word != 0; word = (word >> 16) >> 16)
            count += hweight32((unsigned int)(word & 0xFFFFFFFFUL));
    }
    return count;
}
/*
 * Map index i onto another index in [0, nr) so that pages are scanned in a
 * pseudo-random order. Contiguous pfn ranges often behave alike, and mixing
 * them up gives a better running estimate of the domain's page-dirtying
 * rate.
 *
 * order_nr is the power-of-2 order of nr (e.g. nr -> order: 15 -> 4,
 * 16 -> 4, 17 -> 5; a 512MB domain has 128k pages, order 17).
 *
 * Each step rotates the order_nr-bit value left by 10 bits; for order 17:
 *
 *     QPONMLKJIHGFEDCBA   ->   GFEDCBAQPONMLKJIH
 *
 * The rotation is repeated until the result is below nr, which never needs
 * a second pass when nr is a power of 2.
 *
 * The shift count (order_nr - 10) means order_nr is expected to be >= 10.
 */
static inline int permute(int i, int nr, int order_nr)
{
    const int low_mask = (1 << order_nr) - 1;
    const int high_shift = order_nr - 10;

    for (;;) {
        i = ((i >> high_shift) | (i << 10)) & low_mask;
        if (i < nr)
            return i;
    }
}
/*
 * Convert a struct timeval to a single microsecond count.
 *
 * The seconds field is widened to long long before scaling: multiplying in
 * the native width of tv_sec overflows where that type is only 32 bits.
 */
static long long tv_to_us( struct timeval *new )
{
    return ((long long)new->tv_sec * 1000000LL) + new->tv_usec;
}
/*
 * Return the current wall-clock time, as reported by gettimeofday(),
 * expressed in microseconds.
 */
static long long llgettimeofday(void)
{
    struct timeval tv;

    gettimeofday(&tv, NULL);

    return tv_to_us(&tv);
}
/*
 * Return (new - old) in microseconds.
 *
 * The seconds difference is widened to long long before scaling: a large
 * gap (e.g. when old is still all-zero) overflows if the multiplication is
 * done in a 32-bit tv_sec type.
 */
static long long tv_delta( struct timeval *new, struct timeval *old )
{
    long long sec_diff  = (long long)new->tv_sec - (long long)old->tv_sec;
    long long usec_diff = (long long)new->tv_usec - (long long)old->tv_usec;

    return (sec_diff * 1000000LL) + usec_diff;
}
/*
 * Initial value assigned to mbit_rate at the start of a save. While this is
 * 0, ratewrite() bypasses rate limiting entirely and writes straight through.
 */
153 #define START_MBIT_RATE 0 //ioctxt->resource
/*
 * mbit_rate:  current rate limit; raised (up to MAX_MBIT_RATE) by print_stats().
 * ombit_rate: the rate for which burst_time_us was last computed, so that
 *             ratewrite() only recomputes it when the rate changes.
 */
155 static int mbit_rate, ombit_rate = 0;
/* Length of one burst slot in microseconds; recomputed by ratewrite(). */
156 static int burst_time_us = -1;
158 #define MBIT_RATE mbit_rate
/* Bytes credited to ratewrite()'s budget for each elapsed burst slot. */
159 #define BURST_BUDGET (100*1024)
161 /*
162 1000000/((100)*1024*1024/8/(100*1024))
163 7812
164 1000000/((100)*1024/8/(100))
165 7812
166 1000000/((100)*128/(100))
167 7812
168 100000000/((100)*128)
169 7812
170 100000000/128
171 781250
172 */
/*
 * Dividing this constant by the rate in Mbit/s gives the burst slot length
 * in microseconds (the worked figures above show 7812us for 100 Mbit/s).
 */
173 #define RATE_TO_BTU 781250
174 #define BURST_TIME_US burst_time_us
176 static int
177 ratewrite(int io_fd, void *buf, int n)
178 {
179 static int budget = 0;
180 static struct timeval last_put = { 0 };
181 struct timeval now;
182 struct timespec delay;
183 long long delta;
185 if (START_MBIT_RATE == 0)
186 return write(io_fd, buf, n);
188 budget -= n;
189 if (budget < 0) {
190 if (MBIT_RATE != ombit_rate) {
191 BURST_TIME_US = RATE_TO_BTU / MBIT_RATE;
192 ombit_rate = MBIT_RATE;
193 DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
194 MBIT_RATE, BURST_BUDGET, BURST_TIME_US);
195 }
196 if (last_put.tv_sec == 0) {
197 budget += BURST_BUDGET;
198 gettimeofday(&last_put, NULL);
199 } else {
200 while (budget < 0) {
201 gettimeofday(&now, NULL);
202 delta = tv_delta(&now, &last_put);
203 while (delta > BURST_TIME_US) {
204 budget += BURST_BUDGET;
205 last_put.tv_usec += BURST_TIME_US;
206 if (last_put.tv_usec > 1000000) {
207 last_put.tv_usec -= 1000000;
208 last_put.tv_sec++;
209 }
210 delta -= BURST_TIME_US;
211 }
212 if (budget > 0)
213 break;
214 delay.tv_sec = 0;
215 delay.tv_nsec = 1000 * (BURST_TIME_US - delta);
216 while (delay.tv_nsec > 0)
217 if (nanosleep(&delay, &delay) == 0)
218 break;
219 }
220 }
221 }
222 return write(io_fd, buf, n);
223 }
225 static int print_stats( int xc_handle, u32 domid,
226 int pages_sent, xc_shadow_control_stats_t *stats,
227 int print )
228 {
229 static struct timeval wall_last;
230 static long long d0_cpu_last;
231 static long long d1_cpu_last;
233 struct timeval wall_now;
234 long long wall_delta;
235 long long d0_cpu_now, d0_cpu_delta;
236 long long d1_cpu_now, d1_cpu_delta;
238 gettimeofday(&wall_now, NULL);
240 d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
241 d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
243 if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
244 fprintf(stderr, "ARRHHH!!\n");
246 wall_delta = tv_delta(&wall_now,&wall_last)/1000;
248 if ( wall_delta == 0 ) wall_delta = 1;
250 d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
251 d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
253 if ( print )
254 fprintf(stderr,
255 "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
256 "dirtied %dMb/s %" PRId32 " pages\n",
257 wall_delta,
258 (int)((d0_cpu_delta*100)/wall_delta),
259 (int)((d1_cpu_delta*100)/wall_delta),
260 (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
261 (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
262 stats->dirty_count);
264 if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
265 mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
266 + 50;
267 if (mbit_rate > MAX_MBIT_RATE)
268 mbit_rate = MAX_MBIT_RATE;
269 }
271 d0_cpu_last = d0_cpu_now;
272 d1_cpu_last = d1_cpu_now;
273 wall_last = wall_now;
275 return 0;
276 }
278 static int analysis_phase( int xc_handle, u32 domid,
279 int nr_pfns, unsigned long *arr, int runs )
280 {
281 long long start, now;
282 xc_shadow_control_stats_t stats;
283 int j;
285 start = llgettimeofday();
287 for (j = 0; j < runs; j++)
288 {
289 int i;
291 xc_shadow_control( xc_handle, domid,
292 DOM0_SHADOW_CONTROL_OP_CLEAN,
293 arr, nr_pfns, NULL);
294 fprintf(stderr, "#Flush\n");
295 for ( i = 0; i < 40; i++ )
296 {
297 usleep(50000);
298 now = llgettimeofday();
299 xc_shadow_control( xc_handle, domid,
300 DOM0_SHADOW_CONTROL_OP_PEEK,
301 NULL, 0, &stats);
303 fprintf(stderr, "now= %lld faults= %" PRId32 " dirty= %" PRId32
304 " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n",
305 ((now-start)+500)/1000,
306 stats.fault_count, stats.dirty_count,
307 stats.dirty_net_count, stats.dirty_block_count);
308 }
309 }
311 return -1;
312 }
315 static int suspend_and_state(int xc_handle, int io_fd, int dom,
316 xc_dominfo_t *info,
317 vcpu_guest_context_t *ctxt)
318 {
319 int i=0;
320 char ans[30];
322 printf("suspend\n");
323 fflush(stdout);
324 if (fgets(ans, sizeof(ans), stdin) == NULL) {
325 ERR("failed reading suspend reply");
326 return -1;
327 }
328 if (strncmp(ans, "done\n", 5)) {
329 ERR("suspend reply incorrect: %s", ans);
330 return -1;
331 }
333 retry:
335 if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1)
336 {
337 ERR("Could not get domain info");
338 return -1;
339 }
341 if ( xc_domain_get_vcpu_context(xc_handle, dom, 0 /* XXX */,
342 ctxt) )
343 {
344 ERR("Could not get vcpu context");
345 }
347 if ( info->shutdown && info->shutdown_reason == SHUTDOWN_suspend )
348 {
349 return 0; // success
350 }
352 if ( info->paused )
353 {
354 // try unpausing domain, wait, and retest
355 xc_domain_unpause( xc_handle, dom );
357 ERR("Domain was paused. Wait and re-test.");
358 usleep(10000); // 10ms
360 goto retry;
361 }
364 if( ++i < 100 )
365 {
366 ERR("Retry suspend domain.");
367 usleep(10000); // 10ms
368 goto retry;
369 }
371 ERR("Unable to suspend domain.");
373 return -1;
374 }
376 int xc_linux_save(int xc_handle, int io_fd, u32 dom)
377 {
378 xc_dominfo_t info;
380 int rc = 1, i, j, k, last_iter, iter = 0;
381 unsigned long mfn;
382 int live = 0; // (ioctxt->flags & XCFLAGS_LIVE);
383 int debug = 0; // (ioctxt->flags & XCFLAGS_DEBUG);
384 int sent_last_iter, skip_this_iter;
386 /* Important tuning parameters */
387 int max_iters = 29; /* limit us to 30 times round loop */
388 int max_factor = 3; /* never send more than 3x nr_pfns */
390 /* The new domain's shared-info frame number. */
391 unsigned long shared_info_frame;
393 /* A copy of the CPU context of the guest. */
394 vcpu_guest_context_t ctxt;
396 /* A table containg the type of each PFN (/not/ MFN!). */
397 unsigned long *pfn_type = NULL;
398 unsigned long *pfn_batch = NULL;
400 /* A temporary mapping, and a copy, of one frame of guest memory. */
401 unsigned long page[1024];
403 /* A copy of the pfn-to-mfn table frame list. */
404 unsigned long *live_pfn_to_mfn_frame_list = NULL;
405 unsigned long pfn_to_mfn_frame_list[1024];
407 /* Live mapping of the table mapping each PFN to its current MFN. */
408 unsigned long *live_pfn_to_mfn_table = NULL;
409 /* Live mapping of system MFN to PFN table. */
410 unsigned long *live_mfn_to_pfn_table = NULL;
411 unsigned long mfn_to_pfn_table_start_mfn;
413 /* Live mapping of shared info structure */
414 shared_info_t *live_shinfo = NULL;
416 /* base of the region in which domain memory is mapped */
417 unsigned char *region_base = NULL;
419 /* A temporary mapping, and a copy, of the guest's suspend record. */
420 suspend_record_t *p_srec = NULL;
422 /* number of pages we're dealing with */
423 unsigned long nr_pfns;
425 /* power of 2 order of nr_pfns */
426 int order_nr;
428 /* bitmap of pages:
429 - that should be sent this iteration (unless later marked as skip);
430 - to skip this iteration because already dirty;
431 - to fixup by sending at the end if not already resent; */
432 unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
434 xc_shadow_control_stats_t stats;
436 int needed_to_fix = 0;
437 int total_sent = 0;
439 MBIT_RATE = START_MBIT_RATE;
441 DPRINTF("xc_linux_save start %d\n", dom);
443 if (mlock(&ctxt, sizeof(ctxt))) {
444 ERR("Unable to mlock ctxt");
445 return 1;
446 }
448 if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1)
449 {
450 ERR("Could not get domain info");
451 goto out;
452 }
453 if ( xc_domain_get_vcpu_context( xc_handle, dom, /* FIXME */ 0,
454 &ctxt) )
455 {
456 ERR("Could not get vcpu context");
457 goto out;
458 }
459 shared_info_frame = info.shared_info_frame;
461 /* A cheesy test to see whether the domain contains valid state. */
462 if ( ctxt.ctrlreg[3] == 0 ){
463 ERR("Domain is not in a valid Linux guest OS state");
464 goto out;
465 }
467 nr_pfns = info.max_memkb >> (PAGE_SHIFT - 10);
469 /* cheesy sanity check */
470 if ( nr_pfns > 1024*1024 )
471 {
472 ERR("Invalid state record -- pfn count out of range: %lu", nr_pfns);
473 goto out;
474 }
476 /* Map the shared info frame */
477 live_shinfo = xc_map_foreign_range(xc_handle, dom,
478 PAGE_SIZE, PROT_READ,
479 shared_info_frame);
481 if (!live_shinfo){
482 ERR("Couldn't map live_shinfo");
483 goto out;
484 }
486 /* the pfn_to_mfn_frame_list fits in a single page */
487 live_pfn_to_mfn_frame_list =
488 xc_map_foreign_range(xc_handle, dom,
489 PAGE_SIZE, PROT_READ,
490 live_shinfo->arch.pfn_to_mfn_frame_list );
492 if (!live_pfn_to_mfn_frame_list){
493 ERR("Couldn't map pfn_to_mfn_frame_list");
494 goto out;
495 }
498 /* Map all the frames of the pfn->mfn table. For migrate to succeed,
499 the guest must not change which frames are used for this purpose.
500 (its not clear why it would want to change them, and we'll be OK
501 from a safety POV anyhow. */
503 live_pfn_to_mfn_table = xc_map_foreign_batch(xc_handle, dom,
504 PROT_READ,
505 live_pfn_to_mfn_frame_list,
506 (nr_pfns+1023)/1024 );
507 if( !live_pfn_to_mfn_table ){
508 ERR("Couldn't map pfn_to_mfn table");
509 goto out;
510 }
512 /* Setup the mfn_to_pfn table mapping */
513 mfn_to_pfn_table_start_mfn = xc_get_m2p_start_mfn( xc_handle );
515 live_mfn_to_pfn_table =
516 xc_map_foreign_range(xc_handle, DOMID_XEN,
517 PAGE_SIZE*1024, PROT_READ,
518 mfn_to_pfn_table_start_mfn );
520 /* Canonicalise the pfn-to-mfn table frame-number list. */
521 memcpy( pfn_to_mfn_frame_list, live_pfn_to_mfn_frame_list, PAGE_SIZE );
523 for ( i = 0; i < nr_pfns; i += 1024 ){
524 if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) ){
525 ERR("Frame# in pfn-to-mfn frame list is not in pseudophys");
526 goto out;
527 }
528 }
531 /* Domain is still running at this point */
533 if( live )
534 {
535 if ( xc_shadow_control( xc_handle, dom,
536 DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
537 NULL, 0, NULL ) < 0 ) {
538 ERR("Couldn't enable shadow mode");
539 goto out;
540 }
542 last_iter = 0;
543 } else{
544 /* This is a non-live suspend. Issue the call back to get the
545 domain suspended */
547 last_iter = 1;
549 if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) )
550 {
551 ERR("Domain appears not to have suspended");
552 goto out;
553 }
555 }
556 sent_last_iter = 1<<20; /* 4GB of pages */
558 /* calculate the power of 2 order of nr_pfns, e.g.
559 15->4 16->4 17->5 */
560 for( i=nr_pfns-1, order_nr=0; i ; i>>=1, order_nr++ );
562 /* Setup to_send bitmap */
563 {
564 /* size these for a maximal 4GB domain, to make interaction
565 with balloon driver easier. It's only user space memory,
566 ater all... (3x 128KB) */
568 int sz = ( 1<<20 ) / 8;
570 to_send = malloc( sz );
571 to_fix = calloc( 1, sz );
572 to_skip = malloc( sz );
574 if (!to_send || !to_fix || !to_skip){
575 ERR("Couldn't allocate to_send array");
576 goto out;
577 }
579 memset( to_send, 0xff, sz );
581 if ( mlock( to_send, sz ) ){
582 ERR("Unable to mlock to_send");
583 return 1;
584 }
586 /* (to fix is local only) */
588 if ( mlock( to_skip, sz ) ){
589 ERR("Unable to mlock to_skip");
590 return 1;
591 }
593 }
595 analysis_phase( xc_handle, dom, nr_pfns, to_skip, 0 );
597 /* We want zeroed memory so use calloc rather than malloc. */
598 pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long));
599 pfn_batch = calloc(BATCH_SIZE, sizeof(unsigned long));
601 if ( (pfn_type == NULL) || (pfn_batch == NULL) ){
602 errno = ENOMEM;
603 goto out;
604 }
606 if ( mlock( pfn_type, BATCH_SIZE * sizeof(unsigned long) ) ){
607 ERR("Unable to mlock");
608 goto out;
609 }
612 /*
613 * Quick belt and braces sanity check.
614 */
615 #if DEBUG
616 {
617 int err=0;
618 for ( i = 0; i < nr_pfns; i++ )
619 {
620 mfn = live_pfn_to_mfn_table[i];
622 if( (live_mfn_to_pfn_table[mfn] != i) && (mfn != 0xffffffffUL) )
623 {
624 fprintf(stderr, "i=0x%x mfn=%lx live_mfn_to_pfn_table=%lx\n",
625 i,mfn,live_mfn_to_pfn_table[mfn]);
626 err++;
627 }
628 }
629 fprintf(stderr, "Had %d unexplained entries in p2m table\n",err);
630 }
631 #endif
634 /* Start writing out the saved-domain record. */
636 if (write(io_fd, &nr_pfns, sizeof(unsigned long)) !=
637 sizeof(unsigned long)) {
638 ERR("write: nr_pfns");
639 goto out;
640 }
641 if (write(io_fd, pfn_to_mfn_frame_list, PAGE_SIZE) != PAGE_SIZE) {
642 ERR("write: pfn_to_mfn_frame_list");
643 goto out;
644 }
646 /* Map the suspend-record MFN to pin it. The page must be owned by
647 dom for this to succeed. */
648 p_srec = xc_map_foreign_range(xc_handle, dom,
649 sizeof(*p_srec), PROT_READ | PROT_WRITE,
650 ctxt.user_regs.esi);
651 if (!p_srec){
652 ERR("Couldn't map suspend record");
653 goto out;
654 }
656 /* Canonicalize store mfn. */
657 if ( !translate_mfn_to_pfn(&p_srec->resume_info.store_mfn) ) {
658 ERR("Store frame is not in range of pseudophys map");
659 goto out;
660 }
662 print_stats( xc_handle, dom, 0, &stats, 0 );
664 /* Now write out each data page, canonicalising page tables as we go... */
666 while(1){
667 unsigned int prev_pc, sent_this_iter, N, batch;
669 iter++;
670 sent_this_iter = 0;
671 skip_this_iter = 0;
672 prev_pc = 0;
673 N=0;
675 DPRINTF("Saving memory pages: iter %d 0%%", iter);
677 while( N < nr_pfns ){
678 unsigned int this_pc = (N * 100) / nr_pfns;
680 if ( (this_pc - prev_pc) >= 5 ){
681 DPRINTF("\b\b\b\b%3d%%", this_pc);
682 prev_pc = this_pc;
683 }
685 /* slightly wasteful to peek the whole array evey time,
686 but this is fast enough for the moment. */
688 if ( !last_iter &&
689 xc_shadow_control(xc_handle, dom,
690 DOM0_SHADOW_CONTROL_OP_PEEK,
691 to_skip, nr_pfns, NULL) != nr_pfns )
692 {
693 ERR("Error peeking shadow bitmap");
694 goto out;
695 }
698 /* load pfn_type[] with the mfn of all the pages we're doing in
699 this batch. */
701 for ( batch = 0; batch < BATCH_SIZE && N < nr_pfns ; N++ )
702 {
703 int n = permute(N, nr_pfns, order_nr );
705 if ( 0 && debug ) {
706 fprintf(stderr,"%d pfn= %08lx mfn= %08lx %d "
707 " [mfn]= %08lx\n",
708 iter, (unsigned long)n, live_pfn_to_mfn_table[n],
709 test_bit(n,to_send),
710 live_mfn_to_pfn_table[live_pfn_to_mfn_table[n]&
711 0xFFFFF]);
712 }
714 if ( !last_iter &&
715 test_bit(n, to_send) &&
716 test_bit(n, to_skip) ) {
717 skip_this_iter++; /* stats keeping */
718 }
720 if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
721 (test_bit(n, to_send) && last_iter) ||
722 (test_bit(n, to_fix) && last_iter)) ) {
723 continue;
724 }
726 /* we get here if:
727 1. page is marked to_send & hasn't already been re-dirtied
728 2. (ignore to_skip in last iteration)
729 3. add in pages that still need fixup (net bufs)
730 */
732 pfn_batch[batch] = n;
733 pfn_type[batch] = live_pfn_to_mfn_table[n];
735 if( ! is_mapped(pfn_type[batch]) )
736 {
737 /* not currently in pusedo-physical map -- set bit
738 in to_fix that we must send this page in last_iter
739 unless its sent sooner anyhow */
741 set_bit( n, to_fix );
742 if( iter>1 )
743 DPRINTF("netbuf race: iter %d, pfn %x. mfn %lx\n",
744 iter,n,pfn_type[batch]);
745 continue;
746 }
748 if ( last_iter &&
749 test_bit(n, to_fix) &&
750 !test_bit(n, to_send) )
751 {
752 needed_to_fix++;
753 DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
754 iter,n,pfn_type[batch]);
755 }
757 clear_bit(n, to_fix);
759 batch++;
760 }
762 // DPRINTF("batch %d:%d (n=%d)\n", iter, batch, n);
764 if ( batch == 0 )
765 goto skip; /* vanishingly unlikely... */
767 if ( (region_base = xc_map_foreign_batch(xc_handle, dom,
768 PROT_READ,
769 pfn_type,
770 batch)) == 0 ){
771 ERR("map batch failed");
772 goto out;
773 }
775 if ( get_pfn_type_batch(xc_handle, dom, batch, pfn_type) ){
776 ERR("get_pfn_type_batch failed");
777 goto out;
778 }
780 for ( j = 0; j < batch; j++ ){
781 if ( (pfn_type[j] & LTAB_MASK) == XTAB ){
782 DPRINTF("type fail: page %i mfn %08lx\n",j,pfn_type[j]);
783 continue;
784 }
786 if ( 0 && debug )
787 fprintf(stderr, "%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
788 " sum= %08lx\n",
789 iter,
790 (pfn_type[j] & LTAB_MASK) | pfn_batch[j],
791 pfn_type[j],
792 live_mfn_to_pfn_table[pfn_type[j]&(~LTAB_MASK)],
793 csum_page(region_base + (PAGE_SIZE*j)));
795 /* canonicalise mfn->pfn */
796 pfn_type[j] = (pfn_type[j] & LTAB_MASK) | pfn_batch[j];
797 }
799 if (write(io_fd, &batch, sizeof(int)) != sizeof(int)) {
800 ERR("Error when writing to state file (2)");
801 goto out;
802 }
804 if (write(io_fd, pfn_type, sizeof(unsigned long)*j) !=
805 sizeof(unsigned long)*j) {
806 ERR("Error when writing to state file (3)");
807 goto out;
808 }
810 /* entering this loop, pfn_type is now in pfns (Not mfns) */
811 for( j = 0; j < batch; j++ ){
812 /* write out pages in batch */
813 if( (pfn_type[j] & LTAB_MASK) == XTAB){
814 DPRINTF("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]);
815 continue;
816 }
818 if ( ((pfn_type[j] & LTABTYPE_MASK) == L1TAB) ||
819 ((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ){
820 memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE);
822 for ( k = 0;
823 k < (((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ?
824 (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) :
825 1024);
826 k++ ){
827 unsigned long pfn;
829 if ( !(page[k] & _PAGE_PRESENT) )
830 continue;
832 mfn = page[k] >> PAGE_SHIFT;
833 pfn = live_mfn_to_pfn_table[mfn];
835 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
836 {
837 /* I don't think this should ever happen */
838 fprintf(stderr, "FNI %d : [%08lx,%d] pte=%08lx, "
839 "mfn=%08lx, pfn=%08lx [mfn]=%08lx\n",
840 j, pfn_type[j], k,
841 page[k], mfn, live_mfn_to_pfn_table[mfn],
842 (live_mfn_to_pfn_table[mfn]<nr_pfns)?
843 live_pfn_to_mfn_table[
844 live_mfn_to_pfn_table[mfn]] :
845 0xdeadbeef);
847 pfn = 0; /* be suspicious */
848 }
850 page[k] &= PAGE_SIZE - 1;
851 page[k] |= pfn << PAGE_SHIFT;
853 #if 0
854 fprintf(stderr,
855 "L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx "
856 "xpfn=%d\n",
857 pfn_type[j]>>28,
858 j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT);
859 #endif
861 } /* end of page table rewrite for loop */
863 if (ratewrite(io_fd, page, PAGE_SIZE) != PAGE_SIZE) {
864 ERR("Error when writing to state file (4)");
865 goto out;
866 }
868 } /* end of it's a PT page */ else { /* normal page */
870 if (ratewrite(io_fd, region_base + (PAGE_SIZE*j),
871 PAGE_SIZE) != PAGE_SIZE) {
872 ERR("Error when writing to state file (5)");
873 goto out;
874 }
875 }
876 } /* end of the write out for this batch */
878 sent_this_iter += batch;
880 } /* end of this while loop for this iteration */
882 munmap(region_base, batch*PAGE_SIZE);
884 skip:
886 total_sent += sent_this_iter;
888 DPRINTF("\r %d: sent %d, skipped %d, ",
889 iter, sent_this_iter, skip_this_iter );
891 if ( last_iter ) {
892 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
894 DPRINTF("Total pages sent= %d (%.2fx)\n",
895 total_sent, ((float)total_sent)/nr_pfns );
896 DPRINTF("(of which %d were fixups)\n", needed_to_fix );
897 }
899 if (last_iter && debug){
900 int minusone = -1;
901 memset( to_send, 0xff, (nr_pfns+8)/8 );
902 debug = 0;
903 fprintf(stderr, "Entering debug resend-all mode\n");
905 /* send "-1" to put receiver into debug mode */
906 if (write(io_fd, &minusone, sizeof(int)) != sizeof(int)) {
907 ERR("Error when writing to state file (6)");
908 goto out;
909 }
911 continue;
912 }
914 if ( last_iter ) break;
916 if ( live )
917 {
918 if (
919 ( ( sent_this_iter > sent_last_iter ) &&
920 (mbit_rate == MAX_MBIT_RATE ) ) ||
921 (iter >= max_iters) ||
922 (sent_this_iter+skip_this_iter < 50) ||
923 (total_sent > nr_pfns*max_factor) )
924 {
925 DPRINTF("Start last iteration\n");
926 last_iter = 1;
928 if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) )
929 {
930 ERR("Domain appears not to have suspended");
931 goto out;
932 }
934 DPRINTF("SUSPEND shinfo %08lx eip %08u esi %08u\n",
935 info.shared_info_frame,
936 ctxt.user_regs.eip, ctxt.user_regs.esi);
937 }
939 if ( xc_shadow_control( xc_handle, dom,
940 DOM0_SHADOW_CONTROL_OP_CLEAN,
941 to_send, nr_pfns, &stats ) != nr_pfns )
942 {
943 ERR("Error flushing shadow PT");
944 goto out;
945 }
947 sent_last_iter = sent_this_iter;
949 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
951 }
954 } /* end of while 1 */
956 DPRINTF("All memory is saved\n");
958 /* Success! */
959 rc = 0;
961 /* Zero terminate */
962 if (write(io_fd, &rc, sizeof(int)) != sizeof(int)) {
963 ERR("Error when writing to state file (6)");
964 goto out;
965 }
967 /* Send through a list of all the PFNs that were not in map at the close */
968 {
969 unsigned int i,j;
970 unsigned int pfntab[1024];
972 for ( i = 0, j = 0; i < nr_pfns; i++ )
973 {
974 if ( ! is_mapped(live_pfn_to_mfn_table[i]) )
975 j++;
976 }
978 if (write(io_fd, &j, sizeof(unsigned int)) != sizeof(unsigned int)) {
979 ERR("Error when writing to state file (6a)");
980 goto out;
981 }
983 for ( i = 0, j = 0; i < nr_pfns; )
984 {
985 if ( ! is_mapped(live_pfn_to_mfn_table[i]) )
986 {
987 pfntab[j++] = i;
988 }
989 i++;
990 if ( j == 1024 || i == nr_pfns )
991 {
992 if (write(io_fd, &pfntab, sizeof(unsigned long)*j) !=
993 sizeof(unsigned long)*j) {
994 ERR("Error when writing to state file (6b)");
995 goto out;
996 }
997 j = 0;
998 }
999 }
1002 if (nr_pfns != p_srec->nr_pfns )
1004 ERR("Suspend record nr_pfns unexpected (%ld != %ld)",
1005 p_srec->nr_pfns, nr_pfns);
1006 goto out;
1009 /* Canonicalise the suspend-record frame number. */
1010 if ( !translate_mfn_to_pfn(&ctxt.user_regs.esi) ){
1011 ERR("Suspend record is not in range of pseudophys map");
1012 goto out;
1015 /* Canonicalise each GDT frame number. */
1016 for ( i = 0; i < ctxt.gdt_ents; i += 512 ) {
1017 if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
1018 ERR("GDT frame is not in range of pseudophys map");
1019 goto out;
1023 /* Canonicalise the page table base pointer. */
1024 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.ctrlreg[3] >> PAGE_SHIFT) ) {
1025 ERR("PT base is not in range of pseudophys map");
1026 goto out;
1028 ctxt.ctrlreg[3] = live_mfn_to_pfn_table[ctxt.ctrlreg[3] >> PAGE_SHIFT] <<
1029 PAGE_SHIFT;
1031 if (write(io_fd, &ctxt, sizeof(ctxt)) != sizeof(ctxt) ||
1032 write(io_fd, live_shinfo, PAGE_SIZE) != PAGE_SIZE) {
1033 ERR("Error when writing to state file (1)");
1034 goto out;
1037 out:
1039 if(live_shinfo)
1040 munmap(live_shinfo, PAGE_SIZE);
1042 if(p_srec)
1043 munmap(p_srec, sizeof(*p_srec));
1045 if(live_pfn_to_mfn_frame_list)
1046 munmap(live_pfn_to_mfn_frame_list, PAGE_SIZE);
1048 if(live_pfn_to_mfn_table)
1049 munmap(live_pfn_to_mfn_table, nr_pfns*4);
1051 if(live_mfn_to_pfn_table)
1052 munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024);
1054 free(pfn_type);
1055 free(pfn_batch);
1056 free(to_send);
1057 free(to_fix);
1058 free(to_skip);
1060 DPRINTF("Save exit rc=%d\n",rc);
1061 return !!rc;