ia64/xen-unstable

view tools/libxc/xc_linux_save.c @ 6552:a9873d384da4

Merge.
author adsharma@los-vmm.sc.intel.com
date Thu Aug 25 12:24:48 2005 -0700 (2005-08-25)
parents 112d44270733 fa0754a9f64f
children dfaf788ab18c
line source
1 /******************************************************************************
2 * xc_linux_save.c
3 *
4 * Save the state of a running Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <inttypes.h>
10 #include <time.h>
11 #include <stdlib.h>
12 #include <unistd.h>
13 #include <sys/time.h>
15 #include "xg_private.h"
17 #include <xen/linux/suspend.h>
18 #include <xen/io/domain_controller.h>
/* Number of pages handled per batch: 1024 pages (4MB) at a time. */
#define BATCH_SIZE 1024

/* Upper bound applied to mbit_rate when it is adjusted in print_stats(). */
#define MAX_MBIT_RATE 500

/* Set non-zero to compile in the DPRINTF() debug traces. */
#define DEBUG 0

/* ERR(): report an error on stderr and flush immediately (always enabled). */
#if 1
#define ERR(_f, _a...) do { fprintf(stderr, _f , ## _a); fflush(stderr); } while (0)
#else
#define ERR(_f, _a...) ((void)0)
#endif

/* DPRINTF(): debug trace on stderr; a no-op expression unless DEBUG is set. */
#if DEBUG
#define DPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
#else
#define DPRINTF(_f, _a...) ((void)0)
#endif

/*
 * PPRINTF(): progress output on stderr; disabled unless PROGRESS is set.
 * The disabled form expands to ((void)0), matching ERR/DPRINTF, so that the
 * macro is a valid expression in either configuration instead of vanishing.
 */
#define PROGRESS 0
#if PROGRESS
#define PPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
#else
#define PPRINTF(_f, _a...) ((void)0)
#endif
/*
 * Returns TRUE if the given machine frame number has a unique mapping
 * in the guest's pseudophysical map.
 *
 * Expands in place: live_mfn_to_pfn_table, live_pfn_to_mfn_table and nr_pfns
 * must be in scope where the macro is used.  _mfn is evaluated several times,
 * so pass an expression without side effects.
 */
#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)                                    \
    (((_mfn) < (1024*1024)) &&                                            \
     ((live_mfn_to_pfn_table[(_mfn)] < nr_pfns) &&                        \
      (live_pfn_to_mfn_table[live_mfn_to_pfn_table[(_mfn)]] == (_mfn))))

/*
 * Returns TRUE if MFN is successfully converted to a PFN.
 *
 * On success the frame number stored at *_pmfn is overwritten with its PFN;
 * on failure *_pmfn is left untouched.  The pointer argument is evaluated
 * exactly once, and the temporaries use prefixed names so they cannot
 * capture a caller variable (e.g. one called 'mfn').
 */
#define translate_mfn_to_pfn(_pmfn)                                       \
({                                                                        \
    __typeof__(_pmfn) _xlat_ptr = (_pmfn);                                \
    unsigned long _xlat_mfn = *_xlat_ptr;                                 \
    int _xlat_ok = 1;                                                     \
    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(_xlat_mfn) )                           \
        _xlat_ok = 0;                                                     \
    else                                                                  \
        *_xlat_ptr = live_mfn_to_pfn_table[_xlat_mfn];                    \
    _xlat_ok;                                                             \
})

/* TRUE when bit 31 of the given table entry is clear. */
#define is_mapped(pfn) (!((pfn) & 0x80000000UL))
/*
 * Return the value (0 or 1) of bit number 'nr' in the bitmap at 'addr'.
 * The bitmap is treated as an array of unsigned long, with bit 0 being the
 * least significant bit of the first word.
 */
static inline int test_bit ( int nr, volatile void * addr)
{
    const size_t bits_per_word = sizeof(unsigned long) * 8;
    unsigned long *words = (unsigned long *)addr;
    unsigned long word = words[nr / bits_per_word];

    return (word >> (nr % bits_per_word)) & 1;
}
/*
 * Clear bit number 'nr' in the bitmap at 'addr' (an array of unsigned long,
 * bit 0 being the least significant bit of the first word).
 *
 * The mask is built from 1UL: with a plain int 1, clearing bit 31 on a
 * 64-bit unsigned long would also wipe bits 32-63 (the int mask is
 * zero-extended after inversion), and shifts of 32 or more are undefined.
 */
static inline void clear_bit ( int nr, volatile void * addr)
{
    ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] &=
        ~(1UL << (nr % (sizeof(unsigned long)*8) ) );
}
/*
 * Set bit number 'nr' in the bitmap at 'addr' (an array of unsigned long,
 * bit 0 being the least significant bit of the first word).
 *
 * The mask is built from 1UL: with a plain int 1, setting bit 31 on a
 * 64-bit unsigned long would sign-extend and also set bits 32-63, and
 * shifts of 32 or more are undefined.
 */
static inline void set_bit ( int nr, volatile void * addr)
{
    ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] |=
        (1UL << (nr % (sizeof(unsigned long)*8) ) );
}
/* Returns the hamming weight (i.e. the number of bits set) in a 32-bit word */
static inline unsigned int hweight32(unsigned int w)
{
    /* Classic parallel bit count: sum adjacent 1-, 2-, 4-, 8-, 16-bit fields. */
    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
}

/*
 * Count the set bits in the first nr/(bits per unsigned long) whole words of
 * the bitmap at 'addr'.  Bits in a trailing partial word are not counted.
 *
 * Each word is folded 32 bits at a time so that the upper half of a 64-bit
 * unsigned long is counted too (passing the word straight to hweight32()
 * would truncate it to its low 32 bits).
 */
static inline int count_bits ( int nr, volatile void *addr)
{
    int i, count = 0;
    unsigned long *p = (unsigned long *)addr;
    /* We know that the array is padded to unsigned long. */
    for ( i = 0; i < nr/(sizeof(unsigned long)*8); i++, p++ )
    {
        unsigned long word = *p;

        while ( word != 0 )
        {
            count += hweight32( (unsigned int)(word & 0xFFFFFFFFUL) );
            /* Two 16-bit shifts: a single shift by 32 would be undefined
               where unsigned long is only 32 bits wide. */
            word = (word >> 16) >> 16;
        }
    }
    return count;
}
/*
 * Map index i (0 <= i < nr) to another index in [0, nr) so that pages are
 * scanned in a pseudo-random order.  This gives a better estimate of the
 * domain's page dirtying rate as we go: there are often contiguous ranges
 * of pfns that have similar behaviour, and we want to mix them up.
 *
 * order_nr is the power-of-2 order of nr (e.g. nr->order 15->4 16->4 17->5;
 * a 512MB domain has 128k pages, order 17).
 *
 * The index is rotated left by 10 bits within an order_nr-bit field:
 *
 *     QPONMLKJIHGFEDCBA   ->   GFEDCBAQPONMLKJIH
 *
 * and the rotation is repeated while the result falls outside [0, nr), which
 * keeps the mapping one-to-one on [0, nr).
 *
 * When order_nr is below 10 the rotation amount is reduced modulo order_nr;
 * using a fixed 10 there would make the shift count negative, which is
 * undefined.  Results for order_nr >= 10 are unchanged.
 */
static inline int permute( int i, int nr, int order_nr )
{
    const int rot  = (order_nr > 0) ? (10 % order_nr) : 0;
    const int mask = (1 << order_nr) - 1;

    do { i = ((i >> (order_nr - rot)) | ( i << rot ) ) & mask; }
    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */

    return i;
}
/*
 * Convert a struct timeval to a count of microseconds.
 *
 * tv_sec is widened to long long before scaling so the multiplication
 * cannot overflow where time_t is only 32 bits wide.
 */
static long long tv_to_us( struct timeval *new )
{
    return ((long long)new->tv_sec * 1000000LL) + new->tv_usec;
}
/*
 * Read the current time with gettimeofday() and return it as a microsecond
 * count (see tv_to_us()).  The return value of gettimeofday() is not checked.
 */
static long long llgettimeofday( void )
{
    struct timeval tv;

    gettimeofday(&tv, NULL);

    return tv_to_us(&tv);
}
/*
 * Return (new - old) in microseconds.
 *
 * The seconds difference is widened to long long before scaling so that
 * intervals longer than ~2147 seconds do not overflow where time_t is only
 * 32 bits wide (e.g. when 'old' is still zero-initialised).
 */
static long long tv_delta( struct timeval *new, struct timeval *old )
{
    return (((long long)new->tv_sec - (long long)old->tv_sec) * 1000000LL ) +
        ((long long)new->tv_usec - (long long)old->tv_usec);
}
/*
 * State and tunables for the write-rate limiter in ratewrite().
 */

/* Initial value assigned to MBIT_RATE; 0 makes ratewrite() a plain write(). */
#define START_MBIT_RATE 0 /* ioctxt->resource */

static int mbit_rate;            /* current rate, in Mbit/s */
static int ombit_rate = 0;       /* rate for which burst_time_us was computed */
static int burst_time_us = -1;   /* time slot per burst, in microseconds */

#define MBIT_RATE mbit_rate

/* Bytes added to the budget for each elapsed burst time slot. */
#define BURST_BUDGET (100*1024)

/*
 * RATE_TO_BTU / rate gives the burst slot time in microseconds.
 * Worked through for a rate of 100:
 *
 *   1000000/((100)*1024*1024/8/(100*1024))  = 7812
 *   1000000/((100)*1024/8/(100))            = 7812
 *   1000000/((100)*128/(100))               = 7812
 *   100000000/((100)*128)                   = 7812
 *   100000000/128                           = 781250
 */
#define RATE_TO_BTU 781250
#define BURST_TIME_US burst_time_us
180 static int
181 ratewrite(int io_fd, void *buf, int n)
182 {
183 static int budget = 0;
184 static struct timeval last_put = { 0 };
185 struct timeval now;
186 struct timespec delay;
187 long long delta;
189 if (START_MBIT_RATE == 0)
190 return write(io_fd, buf, n);
192 budget -= n;
193 if (budget < 0) {
194 if (MBIT_RATE != ombit_rate) {
195 BURST_TIME_US = RATE_TO_BTU / MBIT_RATE;
196 ombit_rate = MBIT_RATE;
197 DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
198 MBIT_RATE, BURST_BUDGET, BURST_TIME_US);
199 }
200 if (last_put.tv_sec == 0) {
201 budget += BURST_BUDGET;
202 gettimeofday(&last_put, NULL);
203 } else {
204 while (budget < 0) {
205 gettimeofday(&now, NULL);
206 delta = tv_delta(&now, &last_put);
207 while (delta > BURST_TIME_US) {
208 budget += BURST_BUDGET;
209 last_put.tv_usec += BURST_TIME_US;
210 if (last_put.tv_usec > 1000000) {
211 last_put.tv_usec -= 1000000;
212 last_put.tv_sec++;
213 }
214 delta -= BURST_TIME_US;
215 }
216 if (budget > 0)
217 break;
218 delay.tv_sec = 0;
219 delay.tv_nsec = 1000 * (BURST_TIME_US - delta);
220 while (delay.tv_nsec > 0)
221 if (nanosleep(&delay, &delay) == 0)
222 break;
223 }
224 }
225 }
226 return write(io_fd, buf, n);
227 }
229 static int print_stats( int xc_handle, u32 domid,
230 int pages_sent, xc_shadow_control_stats_t *stats,
231 int print )
232 {
233 static struct timeval wall_last;
234 static long long d0_cpu_last;
235 static long long d1_cpu_last;
237 struct timeval wall_now;
238 long long wall_delta;
239 long long d0_cpu_now, d0_cpu_delta;
240 long long d1_cpu_now, d1_cpu_delta;
242 gettimeofday(&wall_now, NULL);
244 d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
245 d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
247 if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
248 fprintf(stderr, "ARRHHH!!\n");
250 wall_delta = tv_delta(&wall_now,&wall_last)/1000;
252 if ( wall_delta == 0 ) wall_delta = 1;
254 d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
255 d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
257 if ( print )
258 fprintf(stderr,
259 "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
260 "dirtied %dMb/s %" PRId32 " pages\n",
261 wall_delta,
262 (int)((d0_cpu_delta*100)/wall_delta),
263 (int)((d1_cpu_delta*100)/wall_delta),
264 (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
265 (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
266 stats->dirty_count);
268 if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
269 mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
270 + 50;
271 if (mbit_rate > MAX_MBIT_RATE)
272 mbit_rate = MAX_MBIT_RATE;
273 }
275 d0_cpu_last = d0_cpu_now;
276 d1_cpu_last = d1_cpu_now;
277 wall_last = wall_now;
279 return 0;
280 }
282 static int analysis_phase( int xc_handle, u32 domid,
283 int nr_pfns, unsigned long *arr, int runs )
284 {
285 long long start, now;
286 xc_shadow_control_stats_t stats;
287 int j;
289 start = llgettimeofday();
291 for (j = 0; j < runs; j++)
292 {
293 int i;
295 xc_shadow_control( xc_handle, domid,
296 DOM0_SHADOW_CONTROL_OP_CLEAN,
297 arr, nr_pfns, NULL);
298 fprintf(stderr, "#Flush\n");
299 for ( i = 0; i < 40; i++ )
300 {
301 usleep(50000);
302 now = llgettimeofday();
303 xc_shadow_control( xc_handle, domid,
304 DOM0_SHADOW_CONTROL_OP_PEEK,
305 NULL, 0, &stats);
307 fprintf(stderr, "now= %lld faults= %" PRId32 " dirty= %" PRId32
308 " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n",
309 ((now-start)+500)/1000,
310 stats.fault_count, stats.dirty_count,
311 stats.dirty_net_count, stats.dirty_block_count);
312 }
313 }
315 return -1;
316 }
319 static int suspend_and_state(int xc_handle, int io_fd, int dom,
320 xc_dominfo_t *info,
321 vcpu_guest_context_t *ctxt)
322 {
323 int i=0;
324 char ans[30];
326 printf("suspend\n");
327 fflush(stdout);
328 if (fgets(ans, sizeof(ans), stdin) == NULL) {
329 ERR("failed reading suspend reply");
330 return -1;
331 }
332 if (strncmp(ans, "done\n", 5)) {
333 ERR("suspend reply incorrect: %s", ans);
334 return -1;
335 }
337 retry:
339 if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1)
340 {
341 ERR("Could not get domain info");
342 return -1;
343 }
345 if ( xc_domain_get_vcpu_context(xc_handle, dom, 0 /* XXX */,
346 ctxt) )
347 {
348 ERR("Could not get vcpu context");
349 }
351 if ( info->shutdown && info->shutdown_reason == SHUTDOWN_suspend )
352 {
353 return 0; // success
354 }
356 if ( info->paused )
357 {
358 // try unpausing domain, wait, and retest
359 xc_domain_unpause( xc_handle, dom );
361 ERR("Domain was paused. Wait and re-test.");
362 usleep(10000); // 10ms
364 goto retry;
365 }
368 if( ++i < 100 )
369 {
370 ERR("Retry suspend domain.");
371 usleep(10000); // 10ms
372 goto retry;
373 }
375 ERR("Unable to suspend domain.");
377 return -1;
378 }
380 int xc_linux_save(int xc_handle, int io_fd, u32 dom)
381 {
382 xc_dominfo_t info;
384 int rc = 1, i, j, k, last_iter, iter = 0;
385 unsigned long mfn;
386 int live = 0; // (ioctxt->flags & XCFLAGS_LIVE);
387 int debug = 0; // (ioctxt->flags & XCFLAGS_DEBUG);
388 int sent_last_iter, skip_this_iter;
390 /* Important tuning parameters */
391 int max_iters = 29; /* limit us to 30 times round loop */
392 int max_factor = 3; /* never send more than 3x nr_pfns */
394 /* The new domain's shared-info frame number. */
395 unsigned long shared_info_frame;
397 /* A copy of the CPU context of the guest. */
398 vcpu_guest_context_t ctxt;
400 /* A table containg the type of each PFN (/not/ MFN!). */
401 unsigned long *pfn_type = NULL;
402 unsigned long *pfn_batch = NULL;
404 /* A temporary mapping, and a copy, of one frame of guest memory. */
405 unsigned long page[1024];
407 /* A copy of the pfn-to-mfn table frame list. */
408 unsigned long *live_pfn_to_mfn_frame_list = NULL;
409 unsigned long pfn_to_mfn_frame_list[1024];
411 /* Live mapping of the table mapping each PFN to its current MFN. */
412 unsigned long *live_pfn_to_mfn_table = NULL;
413 /* Live mapping of system MFN to PFN table. */
414 unsigned long *live_mfn_to_pfn_table = NULL;
415 unsigned long mfn_to_pfn_table_start_mfn;
417 /* Live mapping of shared info structure */
418 shared_info_t *live_shinfo = NULL;
420 /* base of the region in which domain memory is mapped */
421 unsigned char *region_base = NULL;
423 /* A temporary mapping, and a copy, of the guest's suspend record. */
424 suspend_record_t *p_srec = NULL;
426 /* number of pages we're dealing with */
427 unsigned long nr_pfns;
429 /* power of 2 order of nr_pfns */
430 int order_nr;
432 /* bitmap of pages:
433 - that should be sent this iteration (unless later marked as skip);
434 - to skip this iteration because already dirty;
435 - to fixup by sending at the end if not already resent; */
436 unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
438 xc_shadow_control_stats_t stats;
440 int needed_to_fix = 0;
441 int total_sent = 0;
443 MBIT_RATE = START_MBIT_RATE;
445 DPRINTF("xc_linux_save start %d\n", dom);
447 if (mlock(&ctxt, sizeof(ctxt))) {
448 ERR("Unable to mlock ctxt");
449 return 1;
450 }
452 if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1)
453 {
454 ERR("Could not get domain info");
455 goto out;
456 }
457 if ( xc_domain_get_vcpu_context( xc_handle, dom, /* FIXME */ 0,
458 &ctxt) )
459 {
460 ERR("Could not get vcpu context");
461 goto out;
462 }
463 shared_info_frame = info.shared_info_frame;
465 /* A cheesy test to see whether the domain contains valid state. */
466 if ( ctxt.ctrlreg[3] == 0 ){
467 ERR("Domain is not in a valid Linux guest OS state");
468 goto out;
469 }
471 nr_pfns = info.max_memkb >> (PAGE_SHIFT - 10);
473 /* cheesy sanity check */
474 if ( nr_pfns > 1024*1024 )
475 {
476 ERR("Invalid state record -- pfn count out of range: %lu", nr_pfns);
477 goto out;
478 }
480 /* Map the shared info frame */
481 live_shinfo = xc_map_foreign_range(xc_handle, dom,
482 PAGE_SIZE, PROT_READ,
483 shared_info_frame);
485 if (!live_shinfo){
486 ERR("Couldn't map live_shinfo");
487 goto out;
488 }
490 /* the pfn_to_mfn_frame_list fits in a single page */
491 live_pfn_to_mfn_frame_list =
492 xc_map_foreign_range(xc_handle, dom,
493 PAGE_SIZE, PROT_READ,
494 live_shinfo->arch.pfn_to_mfn_frame_list );
496 if (!live_pfn_to_mfn_frame_list){
497 ERR("Couldn't map pfn_to_mfn_frame_list");
498 goto out;
499 }
502 /* Map all the frames of the pfn->mfn table. For migrate to succeed,
503 the guest must not change which frames are used for this purpose.
504 (its not clear why it would want to change them, and we'll be OK
505 from a safety POV anyhow. */
507 live_pfn_to_mfn_table = xc_map_foreign_batch(xc_handle, dom,
508 PROT_READ,
509 live_pfn_to_mfn_frame_list,
510 (nr_pfns+1023)/1024 );
511 if( !live_pfn_to_mfn_table ){
512 ERR("Couldn't map pfn_to_mfn table");
513 goto out;
514 }
516 /* Setup the mfn_to_pfn table mapping */
517 mfn_to_pfn_table_start_mfn = xc_get_m2p_start_mfn( xc_handle );
519 live_mfn_to_pfn_table =
520 xc_map_foreign_range(xc_handle, DOMID_XEN,
521 PAGE_SIZE*1024, PROT_READ,
522 mfn_to_pfn_table_start_mfn );
524 /* Canonicalise the pfn-to-mfn table frame-number list. */
525 memcpy( pfn_to_mfn_frame_list, live_pfn_to_mfn_frame_list, PAGE_SIZE );
527 for ( i = 0; i < nr_pfns; i += 1024 ){
528 if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) ){
529 ERR("Frame# in pfn-to-mfn frame list is not in pseudophys");
530 goto out;
531 }
532 }
535 /* Domain is still running at this point */
537 if( live )
538 {
539 if ( xc_shadow_control( xc_handle, dom,
540 DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
541 NULL, 0, NULL ) < 0 ) {
542 ERR("Couldn't enable shadow mode");
543 goto out;
544 }
546 last_iter = 0;
547 } else{
548 /* This is a non-live suspend. Issue the call back to get the
549 domain suspended */
551 last_iter = 1;
553 if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) )
554 {
555 ERR("Domain appears not to have suspended");
556 goto out;
557 }
559 }
560 sent_last_iter = 1<<20; /* 4GB of pages */
562 /* calculate the power of 2 order of nr_pfns, e.g.
563 15->4 16->4 17->5 */
564 for( i=nr_pfns-1, order_nr=0; i ; i>>=1, order_nr++ );
566 /* Setup to_send bitmap */
567 {
568 /* size these for a maximal 4GB domain, to make interaction
569 with balloon driver easier. It's only user space memory,
570 ater all... (3x 128KB) */
572 int sz = ( 1<<20 ) / 8;
574 to_send = malloc( sz );
575 to_fix = calloc( 1, sz );
576 to_skip = malloc( sz );
578 if (!to_send || !to_fix || !to_skip){
579 ERR("Couldn't allocate to_send array");
580 goto out;
581 }
583 memset( to_send, 0xff, sz );
585 if ( mlock( to_send, sz ) ){
586 ERR("Unable to mlock to_send");
587 return 1;
588 }
590 /* (to fix is local only) */
592 if ( mlock( to_skip, sz ) ){
593 ERR("Unable to mlock to_skip");
594 return 1;
595 }
597 }
599 analysis_phase( xc_handle, dom, nr_pfns, to_skip, 0 );
601 /* We want zeroed memory so use calloc rather than malloc. */
602 pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long));
603 pfn_batch = calloc(BATCH_SIZE, sizeof(unsigned long));
605 if ( (pfn_type == NULL) || (pfn_batch == NULL) ){
606 errno = ENOMEM;
607 goto out;
608 }
610 if ( mlock( pfn_type, BATCH_SIZE * sizeof(unsigned long) ) ){
611 ERR("Unable to mlock");
612 goto out;
613 }
616 /*
617 * Quick belt and braces sanity check.
618 */
619 #if DEBUG
620 {
621 int err=0;
622 for ( i = 0; i < nr_pfns; i++ )
623 {
624 mfn = live_pfn_to_mfn_table[i];
626 if( (live_mfn_to_pfn_table[mfn] != i) && (mfn != 0xffffffffUL) )
627 {
628 fprintf(stderr, "i=0x%x mfn=%lx live_mfn_to_pfn_table=%lx\n",
629 i,mfn,live_mfn_to_pfn_table[mfn]);
630 err++;
631 }
632 }
633 fprintf(stderr, "Had %d unexplained entries in p2m table\n",err);
634 }
635 #endif
638 /* Start writing out the saved-domain record. */
640 if (write(io_fd, &nr_pfns, sizeof(unsigned long)) !=
641 sizeof(unsigned long)) {
642 ERR("write: nr_pfns");
643 goto out;
644 }
645 if (write(io_fd, pfn_to_mfn_frame_list, PAGE_SIZE) != PAGE_SIZE) {
646 ERR("write: pfn_to_mfn_frame_list");
647 goto out;
648 }
650 /* Map the suspend-record MFN to pin it. The page must be owned by
651 dom for this to succeed. */
652 p_srec = xc_map_foreign_range(xc_handle, dom,
653 sizeof(*p_srec), PROT_READ | PROT_WRITE,
654 ctxt.user_regs.esi);
655 if (!p_srec){
656 ERR("Couldn't map suspend record");
657 goto out;
658 }
660 /* Canonicalize store mfn. */
661 if ( !translate_mfn_to_pfn(&p_srec->resume_info.store_mfn) ) {
662 ERR("Store frame is not in range of pseudophys map");
663 goto out;
664 }
666 print_stats( xc_handle, dom, 0, &stats, 0 );
668 /* Now write out each data page, canonicalising page tables as we go... */
670 while(1){
671 unsigned int prev_pc, sent_this_iter, N, batch;
673 iter++;
674 sent_this_iter = 0;
675 skip_this_iter = 0;
676 prev_pc = 0;
677 N=0;
679 DPRINTF("Saving memory pages: iter %d 0%%", iter);
681 while( N < nr_pfns ){
682 unsigned int this_pc = (N * 100) / nr_pfns;
684 if ( (this_pc - prev_pc) >= 5 ){
685 DPRINTF("\b\b\b\b%3d%%", this_pc);
686 prev_pc = this_pc;
687 }
689 /* slightly wasteful to peek the whole array evey time,
690 but this is fast enough for the moment. */
692 if ( !last_iter &&
693 xc_shadow_control(xc_handle, dom,
694 DOM0_SHADOW_CONTROL_OP_PEEK,
695 to_skip, nr_pfns, NULL) != nr_pfns )
696 {
697 ERR("Error peeking shadow bitmap");
698 goto out;
699 }
702 /* load pfn_type[] with the mfn of all the pages we're doing in
703 this batch. */
705 for ( batch = 0; batch < BATCH_SIZE && N < nr_pfns ; N++ )
706 {
707 int n = permute(N, nr_pfns, order_nr );
709 if ( 0 && debug ) {
710 fprintf(stderr,"%d pfn= %08lx mfn= %08lx %d "
711 " [mfn]= %08lx\n",
712 iter, (unsigned long)n, live_pfn_to_mfn_table[n],
713 test_bit(n,to_send),
714 live_mfn_to_pfn_table[live_pfn_to_mfn_table[n]&
715 0xFFFFF]);
716 }
718 if ( !last_iter &&
719 test_bit(n, to_send) &&
720 test_bit(n, to_skip) ) {
721 skip_this_iter++; /* stats keeping */
722 }
724 if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
725 (test_bit(n, to_send) && last_iter) ||
726 (test_bit(n, to_fix) && last_iter)) ) {
727 continue;
728 }
730 /* we get here if:
731 1. page is marked to_send & hasn't already been re-dirtied
732 2. (ignore to_skip in last iteration)
733 3. add in pages that still need fixup (net bufs)
734 */
736 pfn_batch[batch] = n;
737 pfn_type[batch] = live_pfn_to_mfn_table[n];
739 if( ! is_mapped(pfn_type[batch]) )
740 {
741 /* not currently in pusedo-physical map -- set bit
742 in to_fix that we must send this page in last_iter
743 unless its sent sooner anyhow */
745 set_bit( n, to_fix );
746 if( iter>1 )
747 DPRINTF("netbuf race: iter %d, pfn %x. mfn %lx\n",
748 iter,n,pfn_type[batch]);
749 continue;
750 }
752 if ( last_iter &&
753 test_bit(n, to_fix) &&
754 !test_bit(n, to_send) )
755 {
756 needed_to_fix++;
757 DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
758 iter,n,pfn_type[batch]);
759 }
761 clear_bit(n, to_fix);
763 batch++;
764 }
766 // DPRINTF("batch %d:%d (n=%d)\n", iter, batch, n);
768 if ( batch == 0 )
769 goto skip; /* vanishingly unlikely... */
771 if ( (region_base = xc_map_foreign_batch(xc_handle, dom,
772 PROT_READ,
773 pfn_type,
774 batch)) == 0 ){
775 ERR("map batch failed");
776 goto out;
777 }
779 if ( xc_get_pfn_type_batch(xc_handle, dom, batch, pfn_type) ){
780 ERR("get_pfn_type_batch failed");
781 goto out;
782 }
784 for ( j = 0; j < batch; j++ ){
785 if ( (pfn_type[j] & LTAB_MASK) == XTAB ){
786 DPRINTF("type fail: page %i mfn %08lx\n",j,pfn_type[j]);
787 continue;
788 }
790 if ( 0 && debug )
791 fprintf(stderr, "%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
792 " sum= %08lx\n",
793 iter,
794 (pfn_type[j] & LTAB_MASK) | pfn_batch[j],
795 pfn_type[j],
796 live_mfn_to_pfn_table[pfn_type[j]&(~LTAB_MASK)],
797 csum_page(region_base + (PAGE_SIZE*j)));
799 /* canonicalise mfn->pfn */
800 pfn_type[j] = (pfn_type[j] & LTAB_MASK) | pfn_batch[j];
801 }
803 if (write(io_fd, &batch, sizeof(int)) != sizeof(int)) {
804 ERR("Error when writing to state file (2)");
805 goto out;
806 }
808 if (write(io_fd, pfn_type, sizeof(unsigned long)*j) !=
809 sizeof(unsigned long)*j) {
810 ERR("Error when writing to state file (3)");
811 goto out;
812 }
814 /* entering this loop, pfn_type is now in pfns (Not mfns) */
815 for( j = 0; j < batch; j++ ){
816 /* write out pages in batch */
817 if( (pfn_type[j] & LTAB_MASK) == XTAB){
818 DPRINTF("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]);
819 continue;
820 }
822 if ( ((pfn_type[j] & LTABTYPE_MASK) == L1TAB) ||
823 ((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ){
824 memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE);
826 for ( k = 0;
827 k < (((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ?
828 (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) :
829 1024);
830 k++ ){
831 unsigned long pfn;
833 if ( !(page[k] & _PAGE_PRESENT) )
834 continue;
836 mfn = page[k] >> PAGE_SHIFT;
837 pfn = live_mfn_to_pfn_table[mfn];
839 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
840 {
841 /* I don't think this should ever happen */
842 fprintf(stderr, "FNI %d : [%08lx,%d] pte=%08lx, "
843 "mfn=%08lx, pfn=%08lx [mfn]=%08lx\n",
844 j, pfn_type[j], k,
845 page[k], mfn, live_mfn_to_pfn_table[mfn],
846 (live_mfn_to_pfn_table[mfn]<nr_pfns)?
847 live_pfn_to_mfn_table[
848 live_mfn_to_pfn_table[mfn]] :
849 0xdeadbeef);
851 pfn = 0; /* be suspicious */
852 }
854 page[k] &= PAGE_SIZE - 1;
855 page[k] |= pfn << PAGE_SHIFT;
857 #if 0
858 fprintf(stderr,
859 "L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx "
860 "xpfn=%d\n",
861 pfn_type[j]>>28,
862 j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT);
863 #endif
865 } /* end of page table rewrite for loop */
867 if (ratewrite(io_fd, page, PAGE_SIZE) != PAGE_SIZE) {
868 ERR("Error when writing to state file (4)");
869 goto out;
870 }
872 } /* end of it's a PT page */ else { /* normal page */
874 if (ratewrite(io_fd, region_base + (PAGE_SIZE*j),
875 PAGE_SIZE) != PAGE_SIZE) {
876 ERR("Error when writing to state file (5)");
877 goto out;
878 }
879 }
880 } /* end of the write out for this batch */
882 sent_this_iter += batch;
884 } /* end of this while loop for this iteration */
886 munmap(region_base, batch*PAGE_SIZE);
888 skip:
890 total_sent += sent_this_iter;
892 DPRINTF("\r %d: sent %d, skipped %d, ",
893 iter, sent_this_iter, skip_this_iter );
895 if ( last_iter ) {
896 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
898 DPRINTF("Total pages sent= %d (%.2fx)\n",
899 total_sent, ((float)total_sent)/nr_pfns );
900 DPRINTF("(of which %d were fixups)\n", needed_to_fix );
901 }
903 if (last_iter && debug){
904 int minusone = -1;
905 memset( to_send, 0xff, (nr_pfns+8)/8 );
906 debug = 0;
907 fprintf(stderr, "Entering debug resend-all mode\n");
909 /* send "-1" to put receiver into debug mode */
910 if (write(io_fd, &minusone, sizeof(int)) != sizeof(int)) {
911 ERR("Error when writing to state file (6)");
912 goto out;
913 }
915 continue;
916 }
918 if ( last_iter ) break;
920 if ( live )
921 {
922 if (
923 ( ( sent_this_iter > sent_last_iter ) &&
924 (mbit_rate == MAX_MBIT_RATE ) ) ||
925 (iter >= max_iters) ||
926 (sent_this_iter+skip_this_iter < 50) ||
927 (total_sent > nr_pfns*max_factor) )
928 {
929 DPRINTF("Start last iteration\n");
930 last_iter = 1;
932 if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) )
933 {
934 ERR("Domain appears not to have suspended");
935 goto out;
936 }
938 DPRINTF("SUSPEND shinfo %08lx eip %08u esi %08u\n",
939 info.shared_info_frame,
940 ctxt.user_regs.eip, ctxt.user_regs.esi);
941 }
943 if ( xc_shadow_control( xc_handle, dom,
944 DOM0_SHADOW_CONTROL_OP_CLEAN,
945 to_send, nr_pfns, &stats ) != nr_pfns )
946 {
947 ERR("Error flushing shadow PT");
948 goto out;
949 }
951 sent_last_iter = sent_this_iter;
953 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
955 }
958 } /* end of while 1 */
960 DPRINTF("All memory is saved\n");
962 /* Success! */
963 rc = 0;
965 /* Zero terminate */
966 if (write(io_fd, &rc, sizeof(int)) != sizeof(int)) {
967 ERR("Error when writing to state file (6)");
968 goto out;
969 }
971 /* Send through a list of all the PFNs that were not in map at the close */
972 {
973 unsigned int i,j;
974 unsigned int pfntab[1024];
976 for ( i = 0, j = 0; i < nr_pfns; i++ )
977 {
978 if ( ! is_mapped(live_pfn_to_mfn_table[i]) )
979 j++;
980 }
982 if (write(io_fd, &j, sizeof(unsigned int)) != sizeof(unsigned int)) {
983 ERR("Error when writing to state file (6a)");
984 goto out;
985 }
987 for ( i = 0, j = 0; i < nr_pfns; )
988 {
989 if ( ! is_mapped(live_pfn_to_mfn_table[i]) )
990 {
991 pfntab[j++] = i;
992 }
993 i++;
994 if ( j == 1024 || i == nr_pfns )
995 {
996 if (write(io_fd, &pfntab, sizeof(unsigned long)*j) !=
997 sizeof(unsigned long)*j) {
998 ERR("Error when writing to state file (6b)");
999 goto out;
1001 j = 0;
1006 if (nr_pfns != p_srec->nr_pfns )
1008 ERR("Suspend record nr_pfns unexpected (%ld != %ld)",
1009 p_srec->nr_pfns, nr_pfns);
1010 goto out;
1013 /* Canonicalise the suspend-record frame number. */
1014 if ( !translate_mfn_to_pfn(&ctxt.user_regs.esi) ){
1015 ERR("Suspend record is not in range of pseudophys map");
1016 goto out;
1019 /* Canonicalise each GDT frame number. */
1020 for ( i = 0; i < ctxt.gdt_ents; i += 512 ) {
1021 if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
1022 ERR("GDT frame is not in range of pseudophys map");
1023 goto out;
1027 /* Canonicalise the page table base pointer. */
1028 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.ctrlreg[3] >> PAGE_SHIFT) ) {
1029 ERR("PT base is not in range of pseudophys map");
1030 goto out;
1032 ctxt.ctrlreg[3] = live_mfn_to_pfn_table[ctxt.ctrlreg[3] >> PAGE_SHIFT] <<
1033 PAGE_SHIFT;
1035 if (write(io_fd, &ctxt, sizeof(ctxt)) != sizeof(ctxt) ||
1036 write(io_fd, live_shinfo, PAGE_SIZE) != PAGE_SIZE) {
1037 ERR("Error when writing to state file (1)");
1038 goto out;
1041 out:
1043 if(live_shinfo)
1044 munmap(live_shinfo, PAGE_SIZE);
1046 if(p_srec)
1047 munmap(p_srec, sizeof(*p_srec));
1049 if(live_pfn_to_mfn_frame_list)
1050 munmap(live_pfn_to_mfn_frame_list, PAGE_SIZE);
1052 if(live_pfn_to_mfn_table)
1053 munmap(live_pfn_to_mfn_table, nr_pfns*4);
1055 if(live_mfn_to_pfn_table)
1056 munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024);
1058 free(pfn_type);
1059 free(pfn_batch);
1060 free(to_send);
1061 free(to_fix);
1062 free(to_skip);
1064 DPRINTF("Save exit rc=%d\n",rc);
1065 return !!rc;