ia64/xen-unstable

view tools/libxc/xc_linux_save.c @ 6946:e703abaf6e3d

Add behaviour to the remove methods to remove the transaction's path itself. This allows us to write Remove(path) to remove the specified path rather than having to slice the path ourselves.
author emellor@ewan
date Sun Sep 18 14:42:13 2005 +0100 (2005-09-18)
parents 3233e7ecfa9f
children 619e3d6f01b3 3133e64d0462
line source
1 /******************************************************************************
2 * xc_linux_save.c
3 *
4 * Save the state of a running Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <inttypes.h>
10 #include <time.h>
11 #include <stdlib.h>
12 #include <unistd.h>
13 #include <sys/time.h>
15 #include "xg_private.h"
#define BATCH_SIZE 1024   /* 1024 pages (4MB) at a time */

#define MAX_MBIT_RATE 500

/*
** Default values for important tuning parameters. Can override by passing
** non-zero replacement values to xc_linux_save().
**
** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
**
*/
#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop */
#define DEF_MAX_FACTOR   3   /* never send more than 3x nr_pfns */

/* Flags to control behaviour of xc_linux_save */
#define XCFLAGS_LIVE      1
#define XCFLAGS_DEBUG     2

#define DEBUG    0

/*
 * Logging helpers. Every disabled variant expands to ((void)0) so that a
 * call is a valid expression/statement whether or not it is compiled in.
 */

/* Error messages: written to stderr and flushed immediately. */
#if 1
#define ERR(_f, _a...) do { fprintf(stderr, _f , ## _a); fflush(stderr); } while (0)
#else
#define ERR(_f, _a...) ((void)0)
#endif

/* Debug tracing: only compiled in when DEBUG is non-zero. */
#if DEBUG
#define DPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
#else
#define DPRINTF(_f, _a...) ((void)0)
#endif

/* Progress output: only compiled in when PROGRESS is non-zero. */
#define PROGRESS 0
#if PROGRESS
#define PPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
#else
#define PPRINTF(_f, _a...) ((void)0)
#endif
/*
 * Returns TRUE if the given machine frame number has a unique mapping
 * in the guest's pseudophysical map.
 *
 * Expects live_mfn_to_pfn_table, live_pfn_to_mfn_table and nr_pfns to be
 * in scope where the macro is used. The argument is evaluated several
 * times, so it must not have side effects.
 */
#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)                                    \
    (((_mfn) < (1024*1024)) &&                                            \
     ((live_mfn_to_pfn_table[_mfn] < nr_pfns) &&                          \
      (live_pfn_to_mfn_table[live_mfn_to_pfn_table[_mfn]] == (_mfn))))


/*
 * Returns TRUE if MFN is successfully converted to a PFN.
 *
 * On success the value pointed to by _pmfn is overwritten with the PFN;
 * on failure it is left untouched. The pointer argument is evaluated
 * exactly once, and the temporaries use names that cannot capture a
 * caller variable such as 'mfn'.
 */
#define translate_mfn_to_pfn(_pmfn)                                       \
({                                                                        \
    __typeof__(_pmfn) _xlat_ptr = (_pmfn);                                \
    unsigned long _xlat_mfn = *_xlat_ptr;                                 \
    int _xlat_ok = MFN_IS_IN_PSEUDOPHYS_MAP(_xlat_mfn) ? 1 : 0;           \
    if ( _xlat_ok )                                                       \
        *_xlat_ptr = live_mfn_to_pfn_table[_xlat_mfn];                    \
    _xlat_ok;                                                             \
})

/* A p2m entry counts as mapped when bit 31 of the entry is clear. */
#define is_mapped(pfn) (!((pfn) & 0x80000000UL))
/*
 * Return 1 if bit number nr is set in the bitmap at addr, else 0.
 * The bitmap is treated as an array of unsigned long words.
 */
static inline int test_bit ( int nr, volatile void * addr)
{
    const size_t bits_per_word = sizeof(unsigned long) * 8;
    unsigned long *words = (unsigned long *)addr;
    unsigned long word = words[nr / bits_per_word];

    return (word >> (nr % bits_per_word)) & 1;
}
/*
 * Clear bit number nr in the bitmap at addr (an array of unsigned long).
 *
 * The mask is built from 1UL: an int-typed 1 would be sign-extended when
 * widened to unsigned long (wiping the upper half of the word for bit 31)
 * and shifting it by 32 or more is undefined.
 */
static inline void clear_bit ( int nr, volatile void * addr)
{
    const size_t bits_per_word = sizeof(unsigned long) * 8;
    unsigned long *words = (unsigned long *)addr;

    words[nr / bits_per_word] &= ~(1UL << (nr % bits_per_word));
}
/*
 * Set bit number nr in the bitmap at addr (an array of unsigned long).
 *
 * The mask is built from 1UL: an int-typed (1 << 31) is negative and
 * would set every higher bit when widened to unsigned long, and shifting
 * an int by 32 or more is undefined.
 */
static inline void set_bit ( int nr, volatile void * addr)
{
    const size_t bits_per_word = sizeof(unsigned long) * 8;
    unsigned long *words = (unsigned long *)addr;

    words[nr / bits_per_word] |= (1UL << (nr % bits_per_word));
}
/* Returns the hamming weight (i.e. the number of bits set) in a 32-bit word */
static inline unsigned int hweight32(unsigned int w)
{
    /*
     * Sum adjacent bit fields in parallel, doubling the field width at
     * each step: 1-bit fields -> 2 -> 4 -> 8 -> 16 -> one 32-bit total.
     */
    static const unsigned int field_mask[5] = {
        0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF
    };
    unsigned int step, width;

    for ( step = 0, width = 1; step < 5; step++, width <<= 1 )
        w = (w & field_mask[step]) + ((w >> width) & field_mask[step]);

    return w;
}
/*
 * Count the bits set in the first nr bits of the bitmap at addr.
 *
 * Only whole unsigned long words are examined: nr is rounded down to a
 * multiple of the word size, so callers rely on the array being padded to
 * unsigned long. Every bit of each word is counted regardless of how wide
 * unsigned long is. A non-positive nr yields 0.
 */
static inline int count_bits ( int nr, volatile void *addr)
{
    const size_t bits_per_word = sizeof(unsigned long) * 8;
    const unsigned long *p = (const unsigned long *)addr;
    size_t nwords = (nr > 0) ? ((size_t)nr / bits_per_word) : 0;
    size_t i;
    int count = 0;

    for ( i = 0; i < nwords; i++ )
    {
        unsigned long w = p[i];

        /* Each pass clears the lowest set bit. */
        while ( w != 0 )
        {
            w &= w - 1;
            count++;
        }
    }

    return count;
}
/*
 * Map index i in [0, nr) to another index in [0, nr).
 *
 * We need a simple permutation function so that we scan pages in a
 * pseudo random order, enabling us to get a better estimate of the
 * domain's page dirtying rate as we go (there are often contiguous
 * ranges of pfns that have similar behaviour, and we want to mix them up).
 *
 * order_nr is the power-of-2 order of nr, e.g. nr->order 15->4 16->4 17->5;
 * a 512MB domain has 128k pages, order 17.
 *
 * The index is rotated by 10 bits within an order_nr-bit field. For
 * order 17 the top ten bits move to the bottom and the low seven move up:
 *
 *     QPONMLKJIHGFEDCBA  ->  GFEDCBAQPONMLKJIH
 *
 * The rotation is repeated until the result is below nr.
 *
 * When order_nr is 10 or less there is no room for a 10-bit rotation (the
 * shift count would be zero or negative, the latter being undefined), so
 * the index is returned unchanged.
 */
static inline int permute( int i, int nr, int order_nr )
{
    const int rot = 10;

    if ( order_nr <= rot )
        return i;

    do { i = ((i>>(order_nr-rot)) | ( i<<rot ) ) & ((1<<order_nr)-1); }
    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */

    return i;
}
/*
 * Convert a struct timeval to a microsecond count.
 * The seconds field is widened to long long before scaling so the
 * multiplication cannot overflow where tv_sec is a 32-bit type.
 */
static long long tv_to_us( struct timeval *new )
{
    return ((long long)new->tv_sec * 1000000LL) + new->tv_usec;
}
/*
 * Return the current time of day as a microsecond count.
 * The result of gettimeofday() is not checked.
 */
static long long llgettimeofday( void )
{
    struct timeval tv;

    gettimeofday(&tv, NULL);

    return tv_to_us(&tv);
}
/*
 * Return (new - old) in microseconds.
 * The seconds difference is widened to long long before scaling so the
 * multiplication cannot overflow where tv_sec is a 32-bit type.
 */
static long long tv_delta( struct timeval *new, struct timeval *old )
{
    long long secs  = (long long)new->tv_sec - (long long)old->tv_sec;
    long long usecs = (long long)new->tv_usec - (long long)old->tv_usec;

    return (secs * 1000000LL) + usecs;
}
#define START_MBIT_RATE 0      //ioctxt->resource

/* Current target rate, and the rate burst_time_us was last derived from. */
static int mbit_rate, ombit_rate = 0;
/* Length of one burst slot in microseconds; recomputed when the rate changes. */
static int burst_time_us = -1;

#define MBIT_RATE mbit_rate
#define BURST_BUDGET (100*1024)

/*
 * Derivation of RATE_TO_BTU, the time in microseconds needed to send one
 * BURST_BUDGET at 1 mbit/s (worked here for a rate of 100):
 *
 *   1000000/((100)*1024*1024/8/(100*1024))  =  7812
 *   1000000/((100)*1024/8/(100))            =  7812
 *   1000000/((100)*128/(100))               =  7812
 *   100000000/((100)*128)                   =  7812
 *   100000000/128                           =  781250
 */
#define RATE_TO_BTU 781250
#define BURST_TIME_US burst_time_us

/*
 * write() wrapper that throttles output to MBIT_RATE.
 *
 * Each call debits n bytes from a running budget. Once the budget goes
 * negative, one BURST_BUDGET is credited for every BURST_TIME_US that has
 * elapsed since last_put, and the call sleeps until the budget is no
 * longer negative. Throttling is bypassed entirely while START_MBIT_RATE
 * is 0.
 *
 * Returns whatever write(io_fd, buf, n) returns.
 */
static int
ratewrite(int io_fd, void *buf, int n)
{
    static int budget = 0;
    static struct timeval last_put = { 0 };
    struct timeval now;
    struct timespec delay;
    long long delta;

    if (START_MBIT_RATE == 0)
        return write(io_fd, buf, n);

    budget -= n;
    if (budget < 0) {
        /* Re-derive the slot length whenever the target rate has changed. */
        if (MBIT_RATE != ombit_rate) {
            BURST_TIME_US = RATE_TO_BTU / MBIT_RATE;
            ombit_rate = MBIT_RATE;
            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
                    MBIT_RATE, BURST_BUDGET, BURST_TIME_US);
        }
        if (last_put.tv_sec == 0) {
            /* First throttled write: grant one burst and start the clock. */
            budget += BURST_BUDGET;
            gettimeofday(&last_put, NULL);
        } else {
            while (budget < 0) {
                gettimeofday(&now, NULL);
                delta = tv_delta(&now, &last_put);
                /* Credit one burst per whole slot elapsed. */
                while (delta > BURST_TIME_US) {
                    budget += BURST_BUDGET;
                    last_put.tv_usec += BURST_TIME_US;
                    /* Carry into seconds; tv_usec must stay below 1000000. */
                    if (last_put.tv_usec >= 1000000) {
                        last_put.tv_usec -= 1000000;
                        last_put.tv_sec++;
                    }
                    delta -= BURST_TIME_US;
                }
                if (budget > 0)
                    break;
                /* Sleep out the rest of the current slot. */
                delay.tv_sec = 0;
                delay.tv_nsec = 1000 * (BURST_TIME_US - delta);
                /* nanosleep() updates delay with the time remaining if it
                   returns early, so retry until it completes. */
                while (delay.tv_nsec > 0)
                    if (nanosleep(&delay, &delay) == 0)
                        break;
            }
        }
    }
    return write(io_fd, buf, n);
}
244 static int print_stats( int xc_handle, u32 domid,
245 int pages_sent, xc_shadow_control_stats_t *stats,
246 int print )
247 {
248 static struct timeval wall_last;
249 static long long d0_cpu_last;
250 static long long d1_cpu_last;
252 struct timeval wall_now;
253 long long wall_delta;
254 long long d0_cpu_now, d0_cpu_delta;
255 long long d1_cpu_now, d1_cpu_delta;
257 gettimeofday(&wall_now, NULL);
259 d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
260 d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
262 if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
263 fprintf(stderr, "ARRHHH!!\n");
265 wall_delta = tv_delta(&wall_now,&wall_last)/1000;
267 if ( wall_delta == 0 ) wall_delta = 1;
269 d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
270 d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
272 if ( print )
273 fprintf(stderr,
274 "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
275 "dirtied %dMb/s %" PRId32 " pages\n",
276 wall_delta,
277 (int)((d0_cpu_delta*100)/wall_delta),
278 (int)((d1_cpu_delta*100)/wall_delta),
279 (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
280 (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
281 stats->dirty_count);
283 if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
284 mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
285 + 50;
286 if (mbit_rate > MAX_MBIT_RATE)
287 mbit_rate = MAX_MBIT_RATE;
288 }
290 d0_cpu_last = d0_cpu_now;
291 d1_cpu_last = d1_cpu_now;
292 wall_last = wall_now;
294 return 0;
295 }
297 static int analysis_phase( int xc_handle, u32 domid,
298 int nr_pfns, unsigned long *arr, int runs )
299 {
300 long long start, now;
301 xc_shadow_control_stats_t stats;
302 int j;
304 start = llgettimeofday();
306 for (j = 0; j < runs; j++)
307 {
308 int i;
310 xc_shadow_control( xc_handle, domid,
311 DOM0_SHADOW_CONTROL_OP_CLEAN,
312 arr, nr_pfns, NULL);
313 fprintf(stderr, "#Flush\n");
314 for ( i = 0; i < 40; i++ )
315 {
316 usleep(50000);
317 now = llgettimeofday();
318 xc_shadow_control( xc_handle, domid,
319 DOM0_SHADOW_CONTROL_OP_PEEK,
320 NULL, 0, &stats);
322 fprintf(stderr, "now= %lld faults= %" PRId32 " dirty= %" PRId32
323 " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n",
324 ((now-start)+500)/1000,
325 stats.fault_count, stats.dirty_count,
326 stats.dirty_net_count, stats.dirty_block_count);
327 }
328 }
330 return -1;
331 }
334 static int suspend_and_state(int xc_handle, int io_fd, int dom,
335 xc_dominfo_t *info,
336 vcpu_guest_context_t *ctxt)
337 {
338 int i = 0;
339 char ans[30];
341 printf("suspend\n");
342 fflush(stdout);
343 if (fgets(ans, sizeof(ans), stdin) == NULL) {
344 ERR("failed reading suspend reply");
345 return -1;
346 }
347 if (strncmp(ans, "done\n", 5)) {
348 ERR("suspend reply incorrect: %s", ans);
349 return -1;
350 }
352 retry:
354 if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1)
355 {
356 ERR("Could not get domain info");
357 return -1;
358 }
360 if ( xc_domain_get_vcpu_context(xc_handle, dom, 0 /* XXX */,
361 ctxt) )
362 {
363 ERR("Could not get vcpu context");
364 }
366 if ( info->shutdown && info->shutdown_reason == SHUTDOWN_suspend )
367 {
368 return 0; // success
369 }
371 if ( info->paused )
372 {
373 // try unpausing domain, wait, and retest
374 xc_domain_unpause( xc_handle, dom );
376 ERR("Domain was paused. Wait and re-test.");
377 usleep(10000); // 10ms
379 goto retry;
380 }
383 if( ++i < 100 )
384 {
385 ERR("Retry suspend domain.");
386 usleep(10000); // 10ms
387 goto retry;
388 }
390 ERR("Unable to suspend domain.");
392 return -1;
393 }
395 int xc_linux_save(int xc_handle, int io_fd, u32 dom, u32 max_iters,
396 u32 max_factor, u32 flags)
397 {
398 xc_dominfo_t info;
400 int rc = 1, i, j, k, last_iter, iter = 0;
401 unsigned long mfn;
402 int live = (flags & XCFLAGS_LIVE);
403 int debug = (flags & XCFLAGS_DEBUG);
404 int sent_last_iter, skip_this_iter;
406 /* The new domain's shared-info frame number. */
407 unsigned long shared_info_frame;
409 /* A copy of the CPU context of the guest. */
410 vcpu_guest_context_t ctxt;
412 /* A table containg the type of each PFN (/not/ MFN!). */
413 unsigned long *pfn_type = NULL;
414 unsigned long *pfn_batch = NULL;
416 /* A temporary mapping, and a copy, of one frame of guest memory. */
417 unsigned long page[1024];
419 /* A copy of the pfn-to-mfn table frame list. */
420 unsigned long *live_pfn_to_mfn_frame_list_list = NULL;
421 unsigned long *live_pfn_to_mfn_frame_list = NULL;
422 unsigned long pfn_to_mfn_frame_list[1024];
424 /* Live mapping of the table mapping each PFN to its current MFN. */
425 unsigned long *live_pfn_to_mfn_table = NULL;
426 /* Live mapping of system MFN to PFN table. */
427 unsigned long *live_mfn_to_pfn_table = NULL;
428 unsigned long mfn_to_pfn_table_start_mfn;
430 /* Live mapping of shared info structure */
431 shared_info_t *live_shinfo = NULL;
433 /* base of the region in which domain memory is mapped */
434 unsigned char *region_base = NULL;
436 /* number of pages we're dealing with */
437 unsigned long nr_pfns;
439 /* power of 2 order of nr_pfns */
440 int order_nr;
442 /* bitmap of pages:
443 - that should be sent this iteration (unless later marked as skip);
444 - to skip this iteration because already dirty;
445 - to fixup by sending at the end if not already resent; */
446 unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
448 xc_shadow_control_stats_t stats;
450 int needed_to_fix = 0;
451 int total_sent = 0;
453 MBIT_RATE = START_MBIT_RATE;
456 /* If no explicit control parameters given, use defaults */
457 if(!max_iters)
458 max_iters = DEF_MAX_ITERS;
459 if(!max_factor)
460 max_factor = DEF_MAX_FACTOR;
463 DPRINTF("xc_linux_save start DOM%u live=%s\n", dom, live?"true":"false");
465 if (mlock(&ctxt, sizeof(ctxt))) {
466 ERR("Unable to mlock ctxt");
467 return 1;
468 }
470 if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1)
471 {
472 ERR("Could not get domain info");
473 goto out;
474 }
475 if ( xc_domain_get_vcpu_context( xc_handle, dom, /* FIXME */ 0,
476 &ctxt) )
477 {
478 ERR("Could not get vcpu context");
479 goto out;
480 }
481 shared_info_frame = info.shared_info_frame;
483 /* A cheesy test to see whether the domain contains valid state. */
484 if ( ctxt.ctrlreg[3] == 0 ){
485 ERR("Domain is not in a valid Linux guest OS state");
486 goto out;
487 }
489 nr_pfns = info.max_memkb >> (PAGE_SHIFT - 10);
491 /* cheesy sanity check */
492 if ( nr_pfns > 1024*1024 )
493 {
494 ERR("Invalid state record -- pfn count out of range: %lu", nr_pfns);
495 goto out;
496 }
498 /* Map the shared info frame */
499 live_shinfo = xc_map_foreign_range(xc_handle, dom,
500 PAGE_SIZE, PROT_READ,
501 shared_info_frame);
503 if (!live_shinfo){
504 ERR("Couldn't map live_shinfo");
505 goto out;
506 }
508 live_pfn_to_mfn_frame_list_list = xc_map_foreign_range(xc_handle, dom,
509 PAGE_SIZE, PROT_READ,
510 live_shinfo->arch.pfn_to_mfn_frame_list_list);
512 if (!live_pfn_to_mfn_frame_list_list){
513 ERR("Couldn't map pfn_to_mfn_frame_list_list");
514 goto out;
515 }
517 live_pfn_to_mfn_frame_list =
518 xc_map_foreign_batch(xc_handle, dom,
519 PROT_READ,
520 live_pfn_to_mfn_frame_list_list,
521 (nr_pfns+(1024*1024)-1)/(1024*1024) );
523 if (!live_pfn_to_mfn_frame_list){
524 ERR("Couldn't map pfn_to_mfn_frame_list");
525 goto out;
526 }
529 /* Map all the frames of the pfn->mfn table. For migrate to succeed,
530 the guest must not change which frames are used for this purpose.
531 (its not clear why it would want to change them, and we'll be OK
532 from a safety POV anyhow. */
534 live_pfn_to_mfn_table = xc_map_foreign_batch(xc_handle, dom,
535 PROT_READ,
536 live_pfn_to_mfn_frame_list,
537 (nr_pfns+1023)/1024 );
538 if( !live_pfn_to_mfn_table ){
539 ERR("Couldn't map pfn_to_mfn table");
540 goto out;
541 }
543 /* Setup the mfn_to_pfn table mapping */
544 mfn_to_pfn_table_start_mfn = xc_get_m2p_start_mfn( xc_handle );
546 live_mfn_to_pfn_table =
547 xc_map_foreign_range(xc_handle, DOMID_XEN,
548 PAGE_SIZE*1024, PROT_READ,
549 mfn_to_pfn_table_start_mfn );
551 /* Canonicalise the pfn-to-mfn table frame-number list. */
552 memcpy( pfn_to_mfn_frame_list, live_pfn_to_mfn_frame_list, PAGE_SIZE );
554 for ( i = 0; i < nr_pfns; i += 1024 ){
555 if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) ){
556 ERR("Frame# in pfn-to-mfn frame list is not in pseudophys");
557 goto out;
558 }
559 }
562 /* Domain is still running at this point */
564 if( live )
565 {
566 if ( xc_shadow_control( xc_handle, dom,
567 DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
568 NULL, 0, NULL ) < 0 ) {
569 ERR("Couldn't enable shadow mode");
570 goto out;
571 }
573 last_iter = 0;
574 } else{
575 /* This is a non-live suspend. Issue the call back to get the
576 domain suspended */
578 last_iter = 1;
580 if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) )
581 {
582 ERR("Domain appears not to have suspended");
583 goto out;
584 }
586 }
587 sent_last_iter = 1<<20; /* 4GB of pages */
589 /* calculate the power of 2 order of nr_pfns, e.g.
590 15->4 16->4 17->5 */
591 for( i=nr_pfns-1, order_nr=0; i ; i>>=1, order_nr++ );
593 /* Setup to_send bitmap */
594 {
595 /* size these for a maximal 4GB domain, to make interaction
596 with balloon driver easier. It's only user space memory,
597 ater all... (3x 128KB) */
599 int sz = ( 1<<20 ) / 8;
601 to_send = malloc( sz );
602 to_fix = calloc( 1, sz );
603 to_skip = malloc( sz );
605 if (!to_send || !to_fix || !to_skip){
606 ERR("Couldn't allocate to_send array");
607 goto out;
608 }
610 memset( to_send, 0xff, sz );
612 if ( mlock( to_send, sz ) ){
613 ERR("Unable to mlock to_send");
614 return 1;
615 }
617 /* (to fix is local only) */
619 if ( mlock( to_skip, sz ) ){
620 ERR("Unable to mlock to_skip");
621 return 1;
622 }
624 }
626 analysis_phase( xc_handle, dom, nr_pfns, to_skip, 0 );
628 /* We want zeroed memory so use calloc rather than malloc. */
629 pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long));
630 pfn_batch = calloc(BATCH_SIZE, sizeof(unsigned long));
632 if ( (pfn_type == NULL) || (pfn_batch == NULL) ){
633 errno = ENOMEM;
634 goto out;
635 }
637 if ( mlock( pfn_type, BATCH_SIZE * sizeof(unsigned long) ) ){
638 ERR("Unable to mlock");
639 goto out;
640 }
643 /*
644 * Quick belt and braces sanity check.
645 */
646 #if DEBUG
647 {
648 int err=0;
649 for ( i = 0; i < nr_pfns; i++ )
650 {
651 mfn = live_pfn_to_mfn_table[i];
653 if( (live_mfn_to_pfn_table[mfn] != i) && (mfn != 0xffffffffUL) )
654 {
655 fprintf(stderr, "i=0x%x mfn=%lx live_mfn_to_pfn_table=%lx\n",
656 i,mfn,live_mfn_to_pfn_table[mfn]);
657 err++;
658 }
659 }
660 fprintf(stderr, "Had %d unexplained entries in p2m table\n",err);
661 }
662 #endif
665 /* Start writing out the saved-domain record. */
667 if (write(io_fd, &nr_pfns, sizeof(unsigned long)) !=
668 sizeof(unsigned long)) {
669 ERR("write: nr_pfns");
670 goto out;
671 }
672 if (write(io_fd, pfn_to_mfn_frame_list, PAGE_SIZE) != PAGE_SIZE) {
673 ERR("write: pfn_to_mfn_frame_list");
674 goto out;
675 }
677 print_stats( xc_handle, dom, 0, &stats, 0 );
679 /* Now write out each data page, canonicalising page tables as we go... */
681 while(1){
682 unsigned int prev_pc, sent_this_iter, N, batch;
684 iter++;
685 sent_this_iter = 0;
686 skip_this_iter = 0;
687 prev_pc = 0;
688 N=0;
690 DPRINTF("Saving memory pages: iter %d 0%%", iter);
692 while( N < nr_pfns ){
693 unsigned int this_pc = (N * 100) / nr_pfns;
695 if ( (this_pc - prev_pc) >= 5 ){
696 DPRINTF("\b\b\b\b%3d%%", this_pc);
697 prev_pc = this_pc;
698 }
700 /* slightly wasteful to peek the whole array evey time,
701 but this is fast enough for the moment. */
703 if ( !last_iter &&
704 xc_shadow_control(xc_handle, dom,
705 DOM0_SHADOW_CONTROL_OP_PEEK,
706 to_skip, nr_pfns, NULL) != nr_pfns )
707 {
708 ERR("Error peeking shadow bitmap");
709 goto out;
710 }
713 /* load pfn_type[] with the mfn of all the pages we're doing in
714 this batch. */
716 for ( batch = 0; batch < BATCH_SIZE && N < nr_pfns ; N++ )
717 {
718 int n = permute(N, nr_pfns, order_nr );
720 if ( 0 && debug ) {
721 fprintf(stderr,"%d pfn= %08lx mfn= %08lx %d "
722 " [mfn]= %08lx\n",
723 iter, (unsigned long)n, live_pfn_to_mfn_table[n],
724 test_bit(n,to_send),
725 live_mfn_to_pfn_table[live_pfn_to_mfn_table[n]&
726 0xFFFFF]);
727 }
729 if ( !last_iter &&
730 test_bit(n, to_send) &&
731 test_bit(n, to_skip) ) {
732 skip_this_iter++; /* stats keeping */
733 }
735 if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
736 (test_bit(n, to_send) && last_iter) ||
737 (test_bit(n, to_fix) && last_iter)) ) {
738 continue;
739 }
741 /* we get here if:
742 1. page is marked to_send & hasn't already been re-dirtied
743 2. (ignore to_skip in last iteration)
744 3. add in pages that still need fixup (net bufs)
745 */
747 pfn_batch[batch] = n;
748 pfn_type[batch] = live_pfn_to_mfn_table[n];
750 if( ! is_mapped(pfn_type[batch]) )
751 {
752 /* not currently in pusedo-physical map -- set bit
753 in to_fix that we must send this page in last_iter
754 unless its sent sooner anyhow */
756 set_bit( n, to_fix );
757 if( iter>1 )
758 DPRINTF("netbuf race: iter %d, pfn %x. mfn %lx\n",
759 iter,n,pfn_type[batch]);
760 continue;
761 }
763 if ( last_iter &&
764 test_bit(n, to_fix) &&
765 !test_bit(n, to_send) )
766 {
767 needed_to_fix++;
768 DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
769 iter,n,pfn_type[batch]);
770 }
772 clear_bit(n, to_fix);
774 batch++;
775 }
777 if ( batch == 0 )
778 goto skip; /* vanishingly unlikely... */
780 if ( (region_base = xc_map_foreign_batch(xc_handle, dom,
781 PROT_READ,
782 pfn_type,
783 batch)) == 0 ){
784 ERR("map batch failed");
785 goto out;
786 }
788 if ( xc_get_pfn_type_batch(xc_handle, dom, batch, pfn_type) ){
789 ERR("get_pfn_type_batch failed");
790 goto out;
791 }
793 for ( j = 0; j < batch; j++ ){
794 if ( (pfn_type[j] & LTAB_MASK) == XTAB ){
795 DPRINTF("type fail: page %i mfn %08lx\n",j,pfn_type[j]);
796 continue;
797 }
799 if ( 0 && debug )
800 fprintf(stderr, "%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
801 " sum= %08lx\n",
802 iter,
803 (pfn_type[j] & LTAB_MASK) | pfn_batch[j],
804 pfn_type[j],
805 live_mfn_to_pfn_table[pfn_type[j]&(~LTAB_MASK)],
806 csum_page(region_base + (PAGE_SIZE*j)));
808 /* canonicalise mfn->pfn */
809 pfn_type[j] = (pfn_type[j] & LTAB_MASK) | pfn_batch[j];
810 }
812 if (write(io_fd, &batch, sizeof(int)) != sizeof(int)) {
813 ERR("Error when writing to state file (2)");
814 goto out;
815 }
817 if (write(io_fd, pfn_type, sizeof(unsigned long)*j) !=
818 sizeof(unsigned long)*j) {
819 ERR("Error when writing to state file (3)");
820 goto out;
821 }
823 /* entering this loop, pfn_type is now in pfns (Not mfns) */
824 for( j = 0; j < batch; j++ ){
825 /* write out pages in batch */
826 if( (pfn_type[j] & LTAB_MASK) == XTAB){
827 DPRINTF("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]);
828 continue;
829 }
831 if ( ((pfn_type[j] & LTABTYPE_MASK) == L1TAB) ||
832 ((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ){
833 memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE);
835 for ( k = 0;
836 k < (((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ?
837 (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) :
838 1024);
839 k++ ){
840 unsigned long pfn;
842 if ( !(page[k] & _PAGE_PRESENT) )
843 continue;
845 mfn = page[k] >> PAGE_SHIFT;
846 pfn = live_mfn_to_pfn_table[mfn];
848 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
849 {
850 /* I don't think this should ever happen */
851 fprintf(stderr, "FNI %d : [%08lx,%d] pte=%08lx, "
852 "mfn=%08lx, pfn=%08lx [mfn]=%08lx\n",
853 j, pfn_type[j], k,
854 page[k], mfn, live_mfn_to_pfn_table[mfn],
855 (live_mfn_to_pfn_table[mfn]<nr_pfns)?
856 live_pfn_to_mfn_table[
857 live_mfn_to_pfn_table[mfn]] :
858 0xdeadbeef);
860 pfn = 0; /* be suspicious */
861 }
863 page[k] &= PAGE_SIZE - 1;
864 page[k] |= pfn << PAGE_SHIFT;
866 #if 0
867 fprintf(stderr,
868 "L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx "
869 "xpfn=%d\n",
870 pfn_type[j]>>28,
871 j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT);
872 #endif
874 } /* end of page table rewrite for loop */
876 if (ratewrite(io_fd, page, PAGE_SIZE) != PAGE_SIZE) {
877 ERR("Error when writing to state file (4)");
878 goto out;
879 }
881 } /* end of it's a PT page */ else { /* normal page */
883 if (ratewrite(io_fd, region_base + (PAGE_SIZE*j),
884 PAGE_SIZE) != PAGE_SIZE) {
885 ERR("Error when writing to state file (5)");
886 goto out;
887 }
888 }
889 } /* end of the write out for this batch */
891 sent_this_iter += batch;
893 } /* end of this while loop for this iteration */
895 munmap(region_base, batch*PAGE_SIZE);
897 skip:
899 total_sent += sent_this_iter;
901 DPRINTF("\r %d: sent %d, skipped %d, ",
902 iter, sent_this_iter, skip_this_iter );
904 if ( last_iter ) {
905 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
907 DPRINTF("Total pages sent= %d (%.2fx)\n",
908 total_sent, ((float)total_sent)/nr_pfns );
909 DPRINTF("(of which %d were fixups)\n", needed_to_fix );
910 }
912 if (last_iter && debug){
913 int minusone = -1;
914 memset( to_send, 0xff, (nr_pfns+8)/8 );
915 debug = 0;
916 fprintf(stderr, "Entering debug resend-all mode\n");
918 /* send "-1" to put receiver into debug mode */
919 if (write(io_fd, &minusone, sizeof(int)) != sizeof(int)) {
920 ERR("Error when writing to state file (6)");
921 goto out;
922 }
924 continue;
925 }
927 if ( last_iter ) break;
929 if ( live )
930 {
931 if (
932 ( ( sent_this_iter > sent_last_iter ) &&
933 (mbit_rate == MAX_MBIT_RATE ) ) ||
934 (iter >= max_iters) ||
935 (sent_this_iter+skip_this_iter < 50) ||
936 (total_sent > nr_pfns*max_factor) )
937 {
938 DPRINTF("Start last iteration\n");
939 last_iter = 1;
941 if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) )
942 {
943 ERR("Domain appears not to have suspended");
944 goto out;
945 }
947 DPRINTF("SUSPEND shinfo %08lx eip %08u esi %08u\n",
948 info.shared_info_frame,
949 ctxt.user_regs.eip, ctxt.user_regs.esi);
950 }
952 if ( xc_shadow_control( xc_handle, dom,
953 DOM0_SHADOW_CONTROL_OP_CLEAN,
954 to_send, nr_pfns, &stats ) != nr_pfns )
955 {
956 ERR("Error flushing shadow PT");
957 goto out;
958 }
960 sent_last_iter = sent_this_iter;
962 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
964 }
967 } /* end of while 1 */
969 DPRINTF("All memory is saved\n");
971 /* Success! */
972 rc = 0;
974 /* Zero terminate */
975 if (write(io_fd, &rc, sizeof(int)) != sizeof(int)) {
976 ERR("Error when writing to state file (6)");
977 goto out;
978 }
980 /* Send through a list of all the PFNs that were not in map at the close */
981 {
982 unsigned int i,j;
983 unsigned int pfntab[1024];
985 for ( i = 0, j = 0; i < nr_pfns; i++ )
986 {
987 if ( ! is_mapped(live_pfn_to_mfn_table[i]) )
988 j++;
989 }
991 if (write(io_fd, &j, sizeof(unsigned int)) != sizeof(unsigned int)) {
992 ERR("Error when writing to state file (6a)");
993 goto out;
994 }
996 for ( i = 0, j = 0; i < nr_pfns; )
997 {
998 if ( ! is_mapped(live_pfn_to_mfn_table[i]) )
999 {
1000 pfntab[j++] = i;
1002 i++;
1003 if ( j == 1024 || i == nr_pfns )
1005 if (write(io_fd, &pfntab, sizeof(unsigned long)*j) !=
1006 sizeof(unsigned long)*j) {
1007 ERR("Error when writing to state file (6b)");
1008 goto out;
1010 j = 0;
1015 /* Canonicalise the suspend-record frame number. */
1016 if ( !translate_mfn_to_pfn(&ctxt.user_regs.esi) ){
1017 ERR("Suspend record is not in range of pseudophys map");
1018 goto out;
1021 /* Canonicalise each GDT frame number. */
1022 for ( i = 0; i < ctxt.gdt_ents; i += 512 ) {
1023 if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
1024 ERR("GDT frame is not in range of pseudophys map");
1025 goto out;
1029 /* Canonicalise the page table base pointer. */
1030 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.ctrlreg[3] >> PAGE_SHIFT) ) {
1031 ERR("PT base is not in range of pseudophys map");
1032 goto out;
1034 ctxt.ctrlreg[3] = live_mfn_to_pfn_table[ctxt.ctrlreg[3] >> PAGE_SHIFT] <<
1035 PAGE_SHIFT;
1037 if (write(io_fd, &ctxt, sizeof(ctxt)) != sizeof(ctxt) ||
1038 write(io_fd, live_shinfo, PAGE_SIZE) != PAGE_SIZE) {
1039 ERR("Error when writing to state file (1)");
1040 goto out;
1043 out:
1045 if(live_shinfo)
1046 munmap(live_shinfo, PAGE_SIZE);
1048 if(live_pfn_to_mfn_frame_list)
1049 munmap(live_pfn_to_mfn_frame_list, PAGE_SIZE);
1051 if(live_pfn_to_mfn_table)
1052 munmap(live_pfn_to_mfn_table, nr_pfns*4);
1054 if(live_mfn_to_pfn_table)
1055 munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024);
1057 free(pfn_type);
1058 free(pfn_batch);
1059 free(to_send);
1060 free(to_fix);
1061 free(to_skip);
1063 DPRINTF("Save exit rc=%d\n",rc);
1064 return !!rc;