ia64/xen-unstable

view tools/libxc/xc_linux_save.c @ 6671:7a36f58f64ee

merge?
author kaf24@firebug.cl.cam.ac.uk
date Wed Sep 07 09:50:57 2005 +0000 (2005-09-07)
parents fb90dd31c6d7 ff14bb5600c9
children a75b08af8d19
line source
1 /******************************************************************************
2 * xc_linux_save.c
3 *
4 * Save the state of a running Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <inttypes.h>
10 #include <time.h>
11 #include <stdlib.h>
12 #include <unistd.h>
13 #include <sys/time.h>
15 #include "xg_private.h"
17 #include <xen/io/domain_controller.h>
/* Pages are processed in batches of this many: 1024 pages (4MB) at a time. */
#define BATCH_SIZE 1024

/* Ceiling that print_stats() applies to the adaptive mbit_rate estimate. */
#define MAX_MBIT_RATE 500

/*
 * Default values for the important tuning parameters.  A caller can
 * override either one by passing a non-zero replacement value to
 * xc_linux_save().
 *
 * XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
 */
#define DEF_MAX_ITERS 29    /* limit us to 30 times round loop */
#define DEF_MAX_FACTOR 3    /* never send more than 3x nr_pfns */

/* Flag bits controlling the behaviour of xc_linux_save(). */
#define XCFLAGS_LIVE 1
#define XCFLAGS_DEBUG 2
/* Set to 1 to compile in the DPRINTF() debug traces. */
#define DEBUG 0

/*
 * ERR(): report an error on stderr and flush immediately.  Currently always
 * compiled in; the #else arm keeps a no-op variant available.
 */
#if 1
#define ERR(_f, _a...) do { fprintf(stderr, _f , ## _a); fflush(stderr); } while (0)
#else
#define ERR(_f, _a...) ((void)0)
#endif

/* DPRINTF(): debug trace on stderr, compiled out unless DEBUG is non-zero. */
#if DEBUG
#define DPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
#else
#define DPRINTF(_f, _a...) ((void)0)
#endif

/*
 * PPRINTF(): progress trace on stderr, compiled out unless PROGRESS is
 * non-zero.  The disabled form expands to ((void)0), matching ERR() and
 * DPRINTF(), so the macro is a valid expression in either configuration
 * (an empty expansion would break uses such as "(PPRINTF(...), x)").
 */
#define PROGRESS 0
#if PROGRESS
#define PPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
#else
#define PPRINTF(_f, _a...) ((void)0)
#endif
/*
 * Returns TRUE if the given machine frame number has a unique mapping
 * in the guest's pseudophysical map.
 *
 * Relies on live_mfn_to_pfn_table, live_pfn_to_mfn_table and nr_pfns being
 * in scope at the point of use.  The argument is evaluated more than once,
 * so it must be free of side effects.
 */
#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)                                    \
    (((_mfn) < (1024*1024)) &&                                            \
     ((live_mfn_to_pfn_table[_mfn] < nr_pfns) &&                          \
      (live_pfn_to_mfn_table[live_mfn_to_pfn_table[_mfn]] == (_mfn))))

/*
 * Returns TRUE if MFN is successfully converted to a PFN.
 *
 * _pmfn points at a machine frame number.  On success the pointee is
 * overwritten with the corresponding pseudophysical frame number; on
 * failure it is left untouched.
 *
 * The temporaries use macro-private names so that an argument which itself
 * mentions a caller variable called 'mfn' (e.g. &mfn) is not captured by the
 * macro's own local.
 */
#define translate_mfn_to_pfn(_pmfn)                                       \
({                                                                        \
    unsigned long _xlat_mfn = *(_pmfn);                                   \
    int _xlat_ok = MFN_IS_IN_PSEUDOPHYS_MAP(_xlat_mfn);                   \
    if ( _xlat_ok )                                                       \
        *(_pmfn) = live_mfn_to_pfn_table[_xlat_mfn];                      \
    _xlat_ok;                                                             \
})

/* A pfn-to-mfn table entry counts as mapped when bit 31 is clear. */
#define is_mapped(pfn) (!((pfn) & 0x80000000UL))
/*
 * Return 1 if bit number 'nr' is set in the bitmap at 'addr', else 0.
 * The bitmap is treated as an array of unsigned long words.
 */
static inline int test_bit ( int nr, volatile void * addr)
{
    unsigned long *words = (unsigned long *)addr;
    unsigned long word = words[nr / (sizeof(unsigned long) * 8)];

    return (word >> (nr % (sizeof(unsigned long) * 8))) & 1;
}
/*
 * Clear bit number 'nr' in the bitmap at 'addr'.
 * The bitmap is treated as an array of unsigned long words.
 *
 * The mask is built from 1UL so the shift is carried out at the full width
 * of a word; shifting a plain int is undefined (and clears the wrong bits)
 * for bit positions >= 31 within a 64-bit word.
 */
static inline void clear_bit ( int nr, volatile void * addr)
{
    ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] &=
        ~(1UL << (nr % (sizeof(unsigned long)*8) ) );
}
/*
 * Set bit number 'nr' in the bitmap at 'addr'.
 * The bitmap is treated as an array of unsigned long words.
 *
 * The mask is built from 1UL so the shift is carried out at the full width
 * of a word; shifting a plain int is undefined (and sets the wrong bits)
 * for bit positions >= 31 within a 64-bit word.
 */
static inline void set_bit ( int nr, volatile void * addr)
{
    ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] |=
        (1UL << (nr % (sizeof(unsigned long)*8) ) );
}
/* Returns the hamming weight (i.e. the number of bits set) in a 32-bit word */
static inline unsigned int hweight32(unsigned int w)
{
    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
}

/*
 * Count the bits set among the first 'nr' bits of the bitmap at 'addr'.
 * The bitmap is treated as an array of unsigned long words.
 *
 * Each word is fed to hweight32() in 32-bit chunks so that the upper half
 * of a 64-bit word is counted too, and a trailing partial word is masked to
 * the bits below 'nr' instead of being ignored.  Returns 0 for nr <= 0.
 */
static inline int count_bits ( int nr, volatile void *addr)
{
    const int bits_per_word = (int)(sizeof(unsigned long) * 8);
    const unsigned long *p = (const unsigned long *)addr;
    int full_words, tail_bits, i, count = 0;
    unsigned long w;

    if ( nr <= 0 )
        return 0;

    full_words = nr / bits_per_word;
    tail_bits  = nr % bits_per_word;

    for ( i = 0; i < full_words; i++, p++ )
    {
        for ( w = *p; w != 0; w = (w >> 16) >> 16 )
            count += hweight32( (unsigned int)(w & 0xFFFFFFFFUL) );
    }

    if ( tail_bits != 0 )
    {
        /* tail_bits < bits_per_word, so the shift below is well defined. */
        w = *p & ((1UL << tail_bits) - 1);
        for ( ; w != 0; w = (w >> 16) >> 16 )
            count += hweight32( (unsigned int)(w & 0xFFFFFFFFUL) );
    }

    return count;
}
/*
 * Map index 'i' (0 <= i < nr) to another index in [0, nr).
 *
 * Need a simple permutation function so that we scan pages in a
 * pseudo random order, enabling us to get a better estimate of
 * the domain's page dirtying rate as we go (there are often
 * contiguous ranges of pfns that have similar behaviour, and we
 * want to mix them up).
 *
 * 'order_nr' is the power-of-2 order of nr, e.g. nr->order 15->4 16->4
 * 17->5 (a 512MB domain has 128k pages, order 17).
 *
 * The index is rotated left by 10 bits within an order_nr-bit field:
 *
 *     QPONMLKJIHGFEDCBA   ->   GFEDCBAQPONMLKJIH
 *
 * and the rotation is repeated until the result falls below nr.
 *
 * The rotation distance is reduced modulo order_nr, which leaves the result
 * unchanged for order_nr >= 10 and avoids the negative shift count the
 * expression "i >> (order_nr-10)" produces for smaller domains.  An
 * order_nr of zero or less yields the identity mapping.
 */
static inline int permute( int i, int nr, int order_nr )
{
    unsigned int mask, rot, v;

    if ( order_nr <= 0 )
        return i;

    mask = (1u << order_nr) - 1;
    rot  = 10u % (unsigned int)order_nr;
    v    = (unsigned int)i;

    do { v = ((v >> ((unsigned int)order_nr - rot)) | (v << rot)) & mask; }
    while ( v >= (unsigned int)nr ); /* this won't ever loop if nr is a power of 2 */

    return (int)v;
}
/*
 * Convert a struct timeval to a microsecond count.
 *
 * tv_sec is widened to long long before scaling so the multiplication
 * cannot overflow where tv_sec is a 32-bit type.
 */
static long long tv_to_us( struct timeval *new )
{
    return ((long long)new->tv_sec * 1000000LL) + new->tv_usec;
}
/* Return the current gettimeofday() reading converted by tv_to_us(). */
static long long llgettimeofday( void )
{
    struct timeval tv;

    gettimeofday(&tv, NULL);

    return tv_to_us(&tv);
}
/*
 * Return (new - old) in microseconds.
 *
 * The seconds difference is widened to long long before scaling so the
 * multiplication cannot overflow where tv_sec is a 32-bit type.
 */
static long long tv_delta( struct timeval *new, struct timeval *old )
{
    return (((long long)new->tv_sec - (long long)old->tv_sec) * 1000000LL) +
        ((long long)new->tv_usec - (long long)old->tv_usec);
}
/* Initial rate in Mbit/s; ratewrite() does no throttling while this is 0. */
#define START_MBIT_RATE 0 //ioctxt->resource

/* Current rate estimate, and the rate for which burst_time_us was computed. */
static int mbit_rate;
static int ombit_rate = 0;

/* Length of one burst slot in microseconds (recomputed by ratewrite()). */
static int burst_time_us = -1;

#define MBIT_RATE mbit_rate

/* Bytes credited to the writer per burst slot. */
#define BURST_BUDGET (100*1024)

/*
 * Slot time for a given rate R (Mbit/s), in microseconds, is the time
 * needed to send BURST_BUDGET bytes at that rate:
 *
 *     1000000 / (R*1024*1024/8 / (100*1024))
 *   = 1000000 / (R*128/100)
 *   = 100000000 / (R*128)
 *   = 781250 / R
 *
 * e.g. R = 100 gives 7812us.
 */
#define RATE_TO_BTU 781250
#define BURST_TIME_US burst_time_us
197 static int
198 ratewrite(int io_fd, void *buf, int n)
199 {
200 static int budget = 0;
201 static struct timeval last_put = { 0 };
202 struct timeval now;
203 struct timespec delay;
204 long long delta;
206 if (START_MBIT_RATE == 0)
207 return write(io_fd, buf, n);
209 budget -= n;
210 if (budget < 0) {
211 if (MBIT_RATE != ombit_rate) {
212 BURST_TIME_US = RATE_TO_BTU / MBIT_RATE;
213 ombit_rate = MBIT_RATE;
214 DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
215 MBIT_RATE, BURST_BUDGET, BURST_TIME_US);
216 }
217 if (last_put.tv_sec == 0) {
218 budget += BURST_BUDGET;
219 gettimeofday(&last_put, NULL);
220 } else {
221 while (budget < 0) {
222 gettimeofday(&now, NULL);
223 delta = tv_delta(&now, &last_put);
224 while (delta > BURST_TIME_US) {
225 budget += BURST_BUDGET;
226 last_put.tv_usec += BURST_TIME_US;
227 if (last_put.tv_usec > 1000000) {
228 last_put.tv_usec -= 1000000;
229 last_put.tv_sec++;
230 }
231 delta -= BURST_TIME_US;
232 }
233 if (budget > 0)
234 break;
235 delay.tv_sec = 0;
236 delay.tv_nsec = 1000 * (BURST_TIME_US - delta);
237 while (delay.tv_nsec > 0)
238 if (nanosleep(&delay, &delay) == 0)
239 break;
240 }
241 }
242 }
243 return write(io_fd, buf, n);
244 }
246 static int print_stats( int xc_handle, u32 domid,
247 int pages_sent, xc_shadow_control_stats_t *stats,
248 int print )
249 {
250 static struct timeval wall_last;
251 static long long d0_cpu_last;
252 static long long d1_cpu_last;
254 struct timeval wall_now;
255 long long wall_delta;
256 long long d0_cpu_now, d0_cpu_delta;
257 long long d1_cpu_now, d1_cpu_delta;
259 gettimeofday(&wall_now, NULL);
261 d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
262 d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
264 if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
265 fprintf(stderr, "ARRHHH!!\n");
267 wall_delta = tv_delta(&wall_now,&wall_last)/1000;
269 if ( wall_delta == 0 ) wall_delta = 1;
271 d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
272 d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
274 if ( print )
275 fprintf(stderr,
276 "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
277 "dirtied %dMb/s %" PRId32 " pages\n",
278 wall_delta,
279 (int)((d0_cpu_delta*100)/wall_delta),
280 (int)((d1_cpu_delta*100)/wall_delta),
281 (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
282 (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
283 stats->dirty_count);
285 if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
286 mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
287 + 50;
288 if (mbit_rate > MAX_MBIT_RATE)
289 mbit_rate = MAX_MBIT_RATE;
290 }
292 d0_cpu_last = d0_cpu_now;
293 d1_cpu_last = d1_cpu_now;
294 wall_last = wall_now;
296 return 0;
297 }
299 static int analysis_phase( int xc_handle, u32 domid,
300 int nr_pfns, unsigned long *arr, int runs )
301 {
302 long long start, now;
303 xc_shadow_control_stats_t stats;
304 int j;
306 start = llgettimeofday();
308 for (j = 0; j < runs; j++)
309 {
310 int i;
312 xc_shadow_control( xc_handle, domid,
313 DOM0_SHADOW_CONTROL_OP_CLEAN,
314 arr, nr_pfns, NULL);
315 fprintf(stderr, "#Flush\n");
316 for ( i = 0; i < 40; i++ )
317 {
318 usleep(50000);
319 now = llgettimeofday();
320 xc_shadow_control( xc_handle, domid,
321 DOM0_SHADOW_CONTROL_OP_PEEK,
322 NULL, 0, &stats);
324 fprintf(stderr, "now= %lld faults= %" PRId32 " dirty= %" PRId32
325 " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n",
326 ((now-start)+500)/1000,
327 stats.fault_count, stats.dirty_count,
328 stats.dirty_net_count, stats.dirty_block_count);
329 }
330 }
332 return -1;
333 }
336 static int suspend_and_state(int xc_handle, int io_fd, int dom,
337 xc_dominfo_t *info,
338 vcpu_guest_context_t *ctxt)
339 {
340 int i = 0;
341 char ans[30];
343 printf("suspend\n");
344 fflush(stdout);
345 if (fgets(ans, sizeof(ans), stdin) == NULL) {
346 ERR("failed reading suspend reply");
347 return -1;
348 }
349 if (strncmp(ans, "done\n", 5)) {
350 ERR("suspend reply incorrect: %s", ans);
351 return -1;
352 }
354 retry:
356 if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1)
357 {
358 ERR("Could not get domain info");
359 return -1;
360 }
362 if ( xc_domain_get_vcpu_context(xc_handle, dom, 0 /* XXX */,
363 ctxt) )
364 {
365 ERR("Could not get vcpu context");
366 }
368 if ( info->shutdown && info->shutdown_reason == SHUTDOWN_suspend )
369 {
370 return 0; // success
371 }
373 if ( info->paused )
374 {
375 // try unpausing domain, wait, and retest
376 xc_domain_unpause( xc_handle, dom );
378 ERR("Domain was paused. Wait and re-test.");
379 usleep(10000); // 10ms
381 goto retry;
382 }
385 if( ++i < 100 )
386 {
387 ERR("Retry suspend domain.");
388 usleep(10000); // 10ms
389 goto retry;
390 }
392 ERR("Unable to suspend domain.");
394 return -1;
395 }
397 int xc_linux_save(int xc_handle, int io_fd, u32 dom, u32 max_iters,
398 u32 max_factor, u32 flags)
399 {
400 xc_dominfo_t info;
402 int rc = 1, i, j, k, last_iter, iter = 0;
403 unsigned long mfn;
404 int live = (flags & XCFLAGS_LIVE);
405 int debug = (flags & XCFLAGS_DEBUG);
406 int sent_last_iter, skip_this_iter;
408 /* The new domain's shared-info frame number. */
409 unsigned long shared_info_frame;
411 /* A copy of the CPU context of the guest. */
412 vcpu_guest_context_t ctxt;
414 /* A table containg the type of each PFN (/not/ MFN!). */
415 unsigned long *pfn_type = NULL;
416 unsigned long *pfn_batch = NULL;
418 /* A temporary mapping, and a copy, of one frame of guest memory. */
419 unsigned long page[1024];
421 /* A copy of the pfn-to-mfn table frame list. */
422 unsigned long *live_pfn_to_mfn_frame_list_list = NULL;
423 unsigned long *live_pfn_to_mfn_frame_list = NULL;
424 unsigned long pfn_to_mfn_frame_list[1024];
426 /* Live mapping of the table mapping each PFN to its current MFN. */
427 unsigned long *live_pfn_to_mfn_table = NULL;
428 /* Live mapping of system MFN to PFN table. */
429 unsigned long *live_mfn_to_pfn_table = NULL;
430 unsigned long mfn_to_pfn_table_start_mfn;
432 /* Live mapping of shared info structure */
433 shared_info_t *live_shinfo = NULL;
435 /* base of the region in which domain memory is mapped */
436 unsigned char *region_base = NULL;
438 /* number of pages we're dealing with */
439 unsigned long nr_pfns;
441 /* power of 2 order of nr_pfns */
442 int order_nr;
444 /* bitmap of pages:
445 - that should be sent this iteration (unless later marked as skip);
446 - to skip this iteration because already dirty;
447 - to fixup by sending at the end if not already resent; */
448 unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
450 xc_shadow_control_stats_t stats;
452 int needed_to_fix = 0;
453 int total_sent = 0;
455 MBIT_RATE = START_MBIT_RATE;
458 /* If no explicit control parameters given, use defaults */
459 if(!max_iters)
460 max_iters = DEF_MAX_ITERS;
461 if(!max_factor)
462 max_factor = DEF_MAX_FACTOR;
465 DPRINTF("xc_linux_save start DOM%u live=%s\n", dom, live?"true":"false");
467 if (mlock(&ctxt, sizeof(ctxt))) {
468 ERR("Unable to mlock ctxt");
469 return 1;
470 }
472 if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1)
473 {
474 ERR("Could not get domain info");
475 goto out;
476 }
477 if ( xc_domain_get_vcpu_context( xc_handle, dom, /* FIXME */ 0,
478 &ctxt) )
479 {
480 ERR("Could not get vcpu context");
481 goto out;
482 }
483 shared_info_frame = info.shared_info_frame;
485 /* A cheesy test to see whether the domain contains valid state. */
486 if ( ctxt.ctrlreg[3] == 0 ){
487 ERR("Domain is not in a valid Linux guest OS state");
488 goto out;
489 }
491 nr_pfns = info.max_memkb >> (PAGE_SHIFT - 10);
493 /* cheesy sanity check */
494 if ( nr_pfns > 1024*1024 )
495 {
496 ERR("Invalid state record -- pfn count out of range: %lu", nr_pfns);
497 goto out;
498 }
500 /* Map the shared info frame */
501 live_shinfo = xc_map_foreign_range(xc_handle, dom,
502 PAGE_SIZE, PROT_READ,
503 shared_info_frame);
505 if (!live_shinfo){
506 ERR("Couldn't map live_shinfo");
507 goto out;
508 }
510 live_pfn_to_mfn_frame_list_list = xc_map_foreign_range(xc_handle, dom,
511 PAGE_SIZE, PROT_READ,
512 live_shinfo->arch.pfn_to_mfn_frame_list_list);
514 if (!live_pfn_to_mfn_frame_list_list){
515 ERR("Couldn't map pfn_to_mfn_frame_list_list");
516 goto out;
517 }
519 live_pfn_to_mfn_frame_list =
520 xc_map_foreign_batch(xc_handle, dom,
521 PROT_READ,
522 live_pfn_to_mfn_frame_list_list,
523 (nr_pfns+(1024*1024)-1)/(1024*1024) );
525 if (!live_pfn_to_mfn_frame_list){
526 ERR("Couldn't map pfn_to_mfn_frame_list");
527 goto out;
528 }
531 /* Map all the frames of the pfn->mfn table. For migrate to succeed,
532 the guest must not change which frames are used for this purpose.
533 (its not clear why it would want to change them, and we'll be OK
534 from a safety POV anyhow. */
536 live_pfn_to_mfn_table = xc_map_foreign_batch(xc_handle, dom,
537 PROT_READ,
538 live_pfn_to_mfn_frame_list,
539 (nr_pfns+1023)/1024 );
540 if( !live_pfn_to_mfn_table ){
541 ERR("Couldn't map pfn_to_mfn table");
542 goto out;
543 }
545 /* Setup the mfn_to_pfn table mapping */
546 mfn_to_pfn_table_start_mfn = xc_get_m2p_start_mfn( xc_handle );
548 live_mfn_to_pfn_table =
549 xc_map_foreign_range(xc_handle, DOMID_XEN,
550 PAGE_SIZE*1024, PROT_READ,
551 mfn_to_pfn_table_start_mfn );
553 /* Canonicalise the pfn-to-mfn table frame-number list. */
554 memcpy( pfn_to_mfn_frame_list, live_pfn_to_mfn_frame_list, PAGE_SIZE );
556 for ( i = 0; i < nr_pfns; i += 1024 ){
557 if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) ){
558 ERR("Frame# in pfn-to-mfn frame list is not in pseudophys");
559 goto out;
560 }
561 }
564 /* Domain is still running at this point */
566 if( live )
567 {
568 if ( xc_shadow_control( xc_handle, dom,
569 DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
570 NULL, 0, NULL ) < 0 ) {
571 ERR("Couldn't enable shadow mode");
572 goto out;
573 }
575 last_iter = 0;
576 } else{
577 /* This is a non-live suspend. Issue the call back to get the
578 domain suspended */
580 last_iter = 1;
582 if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) )
583 {
584 ERR("Domain appears not to have suspended");
585 goto out;
586 }
588 }
589 sent_last_iter = 1<<20; /* 4GB of pages */
591 /* calculate the power of 2 order of nr_pfns, e.g.
592 15->4 16->4 17->5 */
593 for( i=nr_pfns-1, order_nr=0; i ; i>>=1, order_nr++ );
595 /* Setup to_send bitmap */
596 {
597 /* size these for a maximal 4GB domain, to make interaction
598 with balloon driver easier. It's only user space memory,
599 ater all... (3x 128KB) */
601 int sz = ( 1<<20 ) / 8;
603 to_send = malloc( sz );
604 to_fix = calloc( 1, sz );
605 to_skip = malloc( sz );
607 if (!to_send || !to_fix || !to_skip){
608 ERR("Couldn't allocate to_send array");
609 goto out;
610 }
612 memset( to_send, 0xff, sz );
614 if ( mlock( to_send, sz ) ){
615 ERR("Unable to mlock to_send");
616 return 1;
617 }
619 /* (to fix is local only) */
621 if ( mlock( to_skip, sz ) ){
622 ERR("Unable to mlock to_skip");
623 return 1;
624 }
626 }
628 analysis_phase( xc_handle, dom, nr_pfns, to_skip, 0 );
630 /* We want zeroed memory so use calloc rather than malloc. */
631 pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long));
632 pfn_batch = calloc(BATCH_SIZE, sizeof(unsigned long));
634 if ( (pfn_type == NULL) || (pfn_batch == NULL) ){
635 errno = ENOMEM;
636 goto out;
637 }
639 if ( mlock( pfn_type, BATCH_SIZE * sizeof(unsigned long) ) ){
640 ERR("Unable to mlock");
641 goto out;
642 }
645 /*
646 * Quick belt and braces sanity check.
647 */
648 #if DEBUG
649 {
650 int err=0;
651 for ( i = 0; i < nr_pfns; i++ )
652 {
653 mfn = live_pfn_to_mfn_table[i];
655 if( (live_mfn_to_pfn_table[mfn] != i) && (mfn != 0xffffffffUL) )
656 {
657 fprintf(stderr, "i=0x%x mfn=%lx live_mfn_to_pfn_table=%lx\n",
658 i,mfn,live_mfn_to_pfn_table[mfn]);
659 err++;
660 }
661 }
662 fprintf(stderr, "Had %d unexplained entries in p2m table\n",err);
663 }
664 #endif
667 /* Start writing out the saved-domain record. */
669 if (write(io_fd, &nr_pfns, sizeof(unsigned long)) !=
670 sizeof(unsigned long)) {
671 ERR("write: nr_pfns");
672 goto out;
673 }
674 if (write(io_fd, pfn_to_mfn_frame_list, PAGE_SIZE) != PAGE_SIZE) {
675 ERR("write: pfn_to_mfn_frame_list");
676 goto out;
677 }
679 print_stats( xc_handle, dom, 0, &stats, 0 );
681 /* Now write out each data page, canonicalising page tables as we go... */
683 while(1){
684 unsigned int prev_pc, sent_this_iter, N, batch;
686 iter++;
687 sent_this_iter = 0;
688 skip_this_iter = 0;
689 prev_pc = 0;
690 N=0;
692 DPRINTF("Saving memory pages: iter %d 0%%", iter);
694 while( N < nr_pfns ){
695 unsigned int this_pc = (N * 100) / nr_pfns;
697 if ( (this_pc - prev_pc) >= 5 ){
698 DPRINTF("\b\b\b\b%3d%%", this_pc);
699 prev_pc = this_pc;
700 }
702 /* slightly wasteful to peek the whole array evey time,
703 but this is fast enough for the moment. */
705 if ( !last_iter &&
706 xc_shadow_control(xc_handle, dom,
707 DOM0_SHADOW_CONTROL_OP_PEEK,
708 to_skip, nr_pfns, NULL) != nr_pfns )
709 {
710 ERR("Error peeking shadow bitmap");
711 goto out;
712 }
715 /* load pfn_type[] with the mfn of all the pages we're doing in
716 this batch. */
718 for ( batch = 0; batch < BATCH_SIZE && N < nr_pfns ; N++ )
719 {
720 int n = permute(N, nr_pfns, order_nr );
722 if ( 0 && debug ) {
723 fprintf(stderr,"%d pfn= %08lx mfn= %08lx %d "
724 " [mfn]= %08lx\n",
725 iter, (unsigned long)n, live_pfn_to_mfn_table[n],
726 test_bit(n,to_send),
727 live_mfn_to_pfn_table[live_pfn_to_mfn_table[n]&
728 0xFFFFF]);
729 }
731 if ( !last_iter &&
732 test_bit(n, to_send) &&
733 test_bit(n, to_skip) ) {
734 skip_this_iter++; /* stats keeping */
735 }
737 if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
738 (test_bit(n, to_send) && last_iter) ||
739 (test_bit(n, to_fix) && last_iter)) ) {
740 continue;
741 }
743 /* we get here if:
744 1. page is marked to_send & hasn't already been re-dirtied
745 2. (ignore to_skip in last iteration)
746 3. add in pages that still need fixup (net bufs)
747 */
749 pfn_batch[batch] = n;
750 pfn_type[batch] = live_pfn_to_mfn_table[n];
752 if( ! is_mapped(pfn_type[batch]) )
753 {
754 /* not currently in pusedo-physical map -- set bit
755 in to_fix that we must send this page in last_iter
756 unless its sent sooner anyhow */
758 set_bit( n, to_fix );
759 if( iter>1 )
760 DPRINTF("netbuf race: iter %d, pfn %x. mfn %lx\n",
761 iter,n,pfn_type[batch]);
762 continue;
763 }
765 if ( last_iter &&
766 test_bit(n, to_fix) &&
767 !test_bit(n, to_send) )
768 {
769 needed_to_fix++;
770 DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
771 iter,n,pfn_type[batch]);
772 }
774 clear_bit(n, to_fix);
776 batch++;
777 }
779 if ( batch == 0 )
780 goto skip; /* vanishingly unlikely... */
782 if ( (region_base = xc_map_foreign_batch(xc_handle, dom,
783 PROT_READ,
784 pfn_type,
785 batch)) == 0 ){
786 ERR("map batch failed");
787 goto out;
788 }
790 if ( xc_get_pfn_type_batch(xc_handle, dom, batch, pfn_type) ){
791 ERR("get_pfn_type_batch failed");
792 goto out;
793 }
795 for ( j = 0; j < batch; j++ ){
796 if ( (pfn_type[j] & LTAB_MASK) == XTAB ){
797 DPRINTF("type fail: page %i mfn %08lx\n",j,pfn_type[j]);
798 continue;
799 }
801 if ( 0 && debug )
802 fprintf(stderr, "%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
803 " sum= %08lx\n",
804 iter,
805 (pfn_type[j] & LTAB_MASK) | pfn_batch[j],
806 pfn_type[j],
807 live_mfn_to_pfn_table[pfn_type[j]&(~LTAB_MASK)],
808 csum_page(region_base + (PAGE_SIZE*j)));
810 /* canonicalise mfn->pfn */
811 pfn_type[j] = (pfn_type[j] & LTAB_MASK) | pfn_batch[j];
812 }
814 if (write(io_fd, &batch, sizeof(int)) != sizeof(int)) {
815 ERR("Error when writing to state file (2)");
816 goto out;
817 }
819 if (write(io_fd, pfn_type, sizeof(unsigned long)*j) !=
820 sizeof(unsigned long)*j) {
821 ERR("Error when writing to state file (3)");
822 goto out;
823 }
825 /* entering this loop, pfn_type is now in pfns (Not mfns) */
826 for( j = 0; j < batch; j++ ){
827 /* write out pages in batch */
828 if( (pfn_type[j] & LTAB_MASK) == XTAB){
829 DPRINTF("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]);
830 continue;
831 }
833 if ( ((pfn_type[j] & LTABTYPE_MASK) == L1TAB) ||
834 ((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ){
835 memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE);
837 for ( k = 0;
838 k < (((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ?
839 (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) :
840 1024);
841 k++ ){
842 unsigned long pfn;
844 if ( !(page[k] & _PAGE_PRESENT) )
845 continue;
847 mfn = page[k] >> PAGE_SHIFT;
848 pfn = live_mfn_to_pfn_table[mfn];
850 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
851 {
852 /* I don't think this should ever happen */
853 fprintf(stderr, "FNI %d : [%08lx,%d] pte=%08lx, "
854 "mfn=%08lx, pfn=%08lx [mfn]=%08lx\n",
855 j, pfn_type[j], k,
856 page[k], mfn, live_mfn_to_pfn_table[mfn],
857 (live_mfn_to_pfn_table[mfn]<nr_pfns)?
858 live_pfn_to_mfn_table[
859 live_mfn_to_pfn_table[mfn]] :
860 0xdeadbeef);
862 pfn = 0; /* be suspicious */
863 }
865 page[k] &= PAGE_SIZE - 1;
866 page[k] |= pfn << PAGE_SHIFT;
868 #if 0
869 fprintf(stderr,
870 "L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx "
871 "xpfn=%d\n",
872 pfn_type[j]>>28,
873 j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT);
874 #endif
876 } /* end of page table rewrite for loop */
878 if (ratewrite(io_fd, page, PAGE_SIZE) != PAGE_SIZE) {
879 ERR("Error when writing to state file (4)");
880 goto out;
881 }
883 } /* end of it's a PT page */ else { /* normal page */
885 if (ratewrite(io_fd, region_base + (PAGE_SIZE*j),
886 PAGE_SIZE) != PAGE_SIZE) {
887 ERR("Error when writing to state file (5)");
888 goto out;
889 }
890 }
891 } /* end of the write out for this batch */
893 sent_this_iter += batch;
895 } /* end of this while loop for this iteration */
897 munmap(region_base, batch*PAGE_SIZE);
899 skip:
901 total_sent += sent_this_iter;
903 DPRINTF("\r %d: sent %d, skipped %d, ",
904 iter, sent_this_iter, skip_this_iter );
906 if ( last_iter ) {
907 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
909 DPRINTF("Total pages sent= %d (%.2fx)\n",
910 total_sent, ((float)total_sent)/nr_pfns );
911 DPRINTF("(of which %d were fixups)\n", needed_to_fix );
912 }
914 if (last_iter && debug){
915 int minusone = -1;
916 memset( to_send, 0xff, (nr_pfns+8)/8 );
917 debug = 0;
918 fprintf(stderr, "Entering debug resend-all mode\n");
920 /* send "-1" to put receiver into debug mode */
921 if (write(io_fd, &minusone, sizeof(int)) != sizeof(int)) {
922 ERR("Error when writing to state file (6)");
923 goto out;
924 }
926 continue;
927 }
929 if ( last_iter ) break;
931 if ( live )
932 {
933 if (
934 ( ( sent_this_iter > sent_last_iter ) &&
935 (mbit_rate == MAX_MBIT_RATE ) ) ||
936 (iter >= max_iters) ||
937 (sent_this_iter+skip_this_iter < 50) ||
938 (total_sent > nr_pfns*max_factor) )
939 {
940 DPRINTF("Start last iteration\n");
941 last_iter = 1;
943 if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) )
944 {
945 ERR("Domain appears not to have suspended");
946 goto out;
947 }
949 DPRINTF("SUSPEND shinfo %08lx eip %08u esi %08u\n",
950 info.shared_info_frame,
951 ctxt.user_regs.eip, ctxt.user_regs.esi);
952 }
954 if ( xc_shadow_control( xc_handle, dom,
955 DOM0_SHADOW_CONTROL_OP_CLEAN,
956 to_send, nr_pfns, &stats ) != nr_pfns )
957 {
958 ERR("Error flushing shadow PT");
959 goto out;
960 }
962 sent_last_iter = sent_this_iter;
964 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
966 }
969 } /* end of while 1 */
971 DPRINTF("All memory is saved\n");
973 /* Success! */
974 rc = 0;
976 /* Zero terminate */
977 if (write(io_fd, &rc, sizeof(int)) != sizeof(int)) {
978 ERR("Error when writing to state file (6)");
979 goto out;
980 }
982 /* Send through a list of all the PFNs that were not in map at the close */
983 {
984 unsigned int i,j;
985 unsigned int pfntab[1024];
987 for ( i = 0, j = 0; i < nr_pfns; i++ )
988 {
989 if ( ! is_mapped(live_pfn_to_mfn_table[i]) )
990 j++;
991 }
993 if (write(io_fd, &j, sizeof(unsigned int)) != sizeof(unsigned int)) {
994 ERR("Error when writing to state file (6a)");
995 goto out;
996 }
998 for ( i = 0, j = 0; i < nr_pfns; )
999 {
1000 if ( ! is_mapped(live_pfn_to_mfn_table[i]) )
1002 pfntab[j++] = i;
1004 i++;
1005 if ( j == 1024 || i == nr_pfns )
1007 if (write(io_fd, &pfntab, sizeof(unsigned long)*j) !=
1008 sizeof(unsigned long)*j) {
1009 ERR("Error when writing to state file (6b)");
1010 goto out;
1012 j = 0;
1017 /* Canonicalise the suspend-record frame number. */
1018 if ( !translate_mfn_to_pfn(&ctxt.user_regs.esi) ){
1019 ERR("Suspend record is not in range of pseudophys map");
1020 goto out;
1023 /* Canonicalise each GDT frame number. */
1024 for ( i = 0; i < ctxt.gdt_ents; i += 512 ) {
1025 if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
1026 ERR("GDT frame is not in range of pseudophys map");
1027 goto out;
1031 /* Canonicalise the page table base pointer. */
1032 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.ctrlreg[3] >> PAGE_SHIFT) ) {
1033 ERR("PT base is not in range of pseudophys map");
1034 goto out;
1036 ctxt.ctrlreg[3] = live_mfn_to_pfn_table[ctxt.ctrlreg[3] >> PAGE_SHIFT] <<
1037 PAGE_SHIFT;
1039 if (write(io_fd, &ctxt, sizeof(ctxt)) != sizeof(ctxt) ||
1040 write(io_fd, live_shinfo, PAGE_SIZE) != PAGE_SIZE) {
1041 ERR("Error when writing to state file (1)");
1042 goto out;
1045 out:
1047 if(live_shinfo)
1048 munmap(live_shinfo, PAGE_SIZE);
1050 if(live_pfn_to_mfn_frame_list)
1051 munmap(live_pfn_to_mfn_frame_list, PAGE_SIZE);
1053 if(live_pfn_to_mfn_table)
1054 munmap(live_pfn_to_mfn_table, nr_pfns*4);
1056 if(live_mfn_to_pfn_table)
1057 munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024);
1059 free(pfn_type);
1060 free(pfn_batch);
1061 free(to_send);
1062 free(to_fix);
1063 free(to_skip);
1065 DPRINTF("Save exit rc=%d\n",rc);
1066 return !!rc;