direct-io.hg

view tools/libxc/xc_linux_save.c @ 2821:724449a888fe

bitkeeper revision 1.1159.1.332 (41874e954CLIDA2J3phVFD2RnzVTpA)

Clean up public XenLinux header files. Now accessible from userspace as
#include <xen/linux...>
Got rid of the linux-xen-sparse symlink as it's no longer needed.
author kaf24@freefall.cl.cam.ac.uk
date Tue Nov 02 09:08:37 2004 +0000 (2004-11-02)
parents 3f929065a1d1
children 5195a9576f40 c00fbb136368 dda5ab69e74a
line source
1 /******************************************************************************
2 * xc_linux_save.c
3 *
4 * Save the state of a running Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <sys/time.h>
10 #include "xc_private.h"
11 #include <xen/linux/suspend.h>
12 #include <time.h>
/* Pages are processed in batches of this many: 1024 pages (4MB) at a time. */
#define BATCH_SIZE 1024

/* Upper bound (mbit/s) that the adaptive transmit rate is clamped to. */
#define MAX_MBIT_RATE 500

/* Compile-time tracing switches: DEBUG drives DPRINTF, DDEBUG drives DDPRINTF. */
#define DEBUG 0
#define DDEBUG 0

/* General debug tracing; expands to a no-op unless DEBUG is non-zero. */
#if !DEBUG
#define DPRINTF(fmt, args...) ((void)0)
#else
#define DPRINTF(fmt, args...) printf ( fmt , ## args )
#endif

/* More verbose tracing; expands to a no-op unless DDEBUG is non-zero. */
#if !DDEBUG
#define DDPRINTF(fmt, args...) ((void)0)
#else
#define DDPRINTF(fmt, args...) printf ( fmt , ## args )
#endif
/*
 * TRUE when machine frame '_frame' has a unique mapping in the guest's
 * pseudophysical map: its machine-to-phys entry must be a pfn below nr_pfns,
 * and that pfn's phys-to-machine entry must point straight back at '_frame'.
 *
 * Relies on live_mfn_to_pfn_table, live_pfn_to_mfn_table and nr_pfns being
 * visible at the point of use. The argument is evaluated several times.
 */
#define MFN_IS_IN_PSEUDOPHYS_MAP(_frame)                                      \
    (((_frame) < (1024*1024)) &&                                              \
     (live_mfn_to_pfn_table[(_frame)] < nr_pfns) &&                           \
     ((_frame) == live_pfn_to_mfn_table[live_mfn_to_pfn_table[(_frame)]]))
/*
 * Converts the MFN stored at *(_pmfn) into its PFN, in place.
 * Evaluates to 1 when the MFN is in the pseudophysical map and was rewritten,
 * and to 0 (leaving *(_pmfn) untouched) when it is not.
 * Note that '_pmfn' is evaluated more than once.
 */
#define translate_mfn_to_pfn(_pmfn)                                   \
({                                                                    \
    unsigned long _xlat_mfn = *(_pmfn);                               \
    int _xlat_ok = MFN_IS_IN_PSEUDOPHYS_MAP(_xlat_mfn) ? 1 : 0;       \
    if ( _xlat_ok )                                                   \
        *(_pmfn) = live_mfn_to_pfn_table[_xlat_mfn];                  \
    _xlat_ok;                                                         \
})
/* A table entry counts as mapped when its bit 31 (0x80000000) is clear. */
#define is_mapped(pfn) (((pfn) & 0x80000000UL) == 0)
/*
 * Reports whether bit 'nr' is set in the bitmap at 'addr', which is treated
 * as an array of unsigned long words. Returns 1 if set, 0 otherwise.
 */
static inline int test_bit ( int nr, volatile void * addr)
{
    unsigned long *words = (unsigned long *)addr;
    unsigned long word = words[nr / (sizeof(*words) * 8)];

    return (word >> (nr % (sizeof(*words) * 8))) & 1;
}
/*
 * Clears bit 'nr' in the bitmap at 'addr' (an array of unsigned long words).
 * Not atomic: this is a plain read-modify-write of the containing word.
 *
 * The mask is built from 1UL so it has the full width of the word. An int
 * '1' would sign-extend for bit 31 (wiping bits 31..63 of a 64-bit word) and
 * is undefined for bit positions of 32 and above.
 */
static inline void clear_bit ( int nr, volatile void * addr)
{
    unsigned long *words = (unsigned long *)addr;

    words[nr / (sizeof(*words) * 8)] &=
        ~(1UL << (nr % (sizeof(*words) * 8)));
}
/*
 * Sets bit 'nr' in the bitmap at 'addr' (an array of unsigned long words).
 * Not atomic: this is a plain read-modify-write of the containing word.
 *
 * The mask is built from 1UL so it has the full width of the word. An int
 * '1' would sign-extend for bit 31 (also setting bits 32..63 of a 64-bit
 * word) and is undefined for bit positions of 32 and above.
 */
static inline void set_bit ( int nr, volatile void * addr)
{
    unsigned long *words = (unsigned long *)addr;

    words[nr / (sizeof(*words) * 8)] |=
        (1UL << (nr % (sizeof(*words) * 8)));
}
/*
 * Returns the hamming weight (i.e. the number of bits set) of a 32-bit word.
 * Each step sums neighbouring bit-fields in parallel, doubling the field
 * width (1, 2, 4, 8, 16 bits) until a single total remains.
 */
static inline unsigned int hweight32(unsigned int w)
{
    w = (w & 0x55555555) + ((w >> 1) & 0x55555555);
    w = (w & 0x33333333) + ((w >> 2) & 0x33333333);
    w = (w & 0x0F0F0F0F) + ((w >> 4) & 0x0F0F0F0F);
    w = (w & 0x00FF00FF) + ((w >> 8) & 0x00FF00FF);
    w = (w & 0x0000FFFF) + ((w >> 16) & 0x0000FFFF);
    return w;
}
/*
 * Counts the set bits in the bitmap at 'addr', which is treated as an array
 * of unsigned long words.
 *
 * Only the nr / (bits per word) whole words are examined; a trailing partial
 * word is NOT counted, matching the original behaviour. A non-positive 'nr'
 * yields 0.
 *
 * Each word is fed to hweight32() in 32-bit chunks, so that the upper half
 * of a 64-bit unsigned long is not silently dropped.
 */
static inline int count_bits ( int nr, volatile void *addr)
{
    int i, count = 0;
    unsigned long *p = (unsigned long *)addr;
    const int nr_words = nr / (int)(sizeof(unsigned long)*8);

    for ( i = 0; i < nr_words; i++, p++ )
    {
        unsigned long w = *p;

        while ( w != 0 )
        {
            count += hweight32((unsigned int)(w & 0xFFFFFFFFUL));
            /* Shift in two steps: a single >>32 is undefined when
               unsigned long is only 32 bits wide. */
            w = (w >> 16) >> 16;
        }
    }

    return count;
}
/*
 * Maps index 'i' (0 <= i < nr) to another index in [0, nr), so that pages
 * are scanned in a pseudo random order. This gives a better estimate of the
 * domain's page dirtying rate as we go: there are often contiguous ranges
 * of pfns with similar behaviour, and we want to mix them up.
 *
 * 'order_nr' is the power-of-2 order of nr, e.g. nr->order 15->4 16->4 17->5.
 * (A 512MB domain has 128k pages, order 17.)
 *
 * The mapping rotates the order_nr-bit index left by 10 bits. For order 17:
 *
 *     QPONMLKJIHGFEDCBA      input
 *            QPONMLKJIH      i >> (order_nr-10)
 *     GFEDCBA                i << 10
 *
 * If the rotated value is >= nr the rotation is applied again until it
 * lands in range; this never loops when nr is a power of 2.
 *
 * For order_nr < 10 the original shift count (order_nr-10) was negative,
 * which is undefined behaviour. The rotation distance is now reduced modulo
 * order_nr in that case, and order_nr <= 0 returns 'i' unchanged. Results
 * for order_nr >= 10 are identical to the original.
 */
static inline int permute( int i, int nr, int order_nr )
{
    int rot, mask;

    if ( order_nr <= 0 )
        return i;

    rot  = (order_nr >= 10) ? 10 : (10 % order_nr);
    mask = (1 << order_nr) - 1;

    do { i = ((i >> (order_nr - rot)) | (i << rot)) & mask; }
    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */

    return i;
}
/*
 * Converts a timeval into a single count of microseconds.
 * tv_sec is widened to long long before scaling, so the multiplication
 * cannot overflow where time_t/long is only 32 bits wide.
 */
static long long tv_to_us( struct timeval *new )
{
    return ((long long)new->tv_sec * 1000000) + new->tv_usec;
}
/* Returns the current wall-clock time (gettimeofday) in microseconds. */
static long long llgettimeofday()
{
    struct timeval tv;

    gettimeofday(&tv, NULL);

    return tv_to_us(&tv);
}
/*
 * Returns (new - old) in microseconds; negative if 'new' precedes 'old'.
 * The seconds difference is widened to long long before scaling, so spans
 * longer than ~35 minutes do not overflow where long is 32 bits wide.
 */
static long long tv_delta( struct timeval *new, struct timeval *old )
{
    return (((long long)new->tv_sec - (long long)old->tv_sec) * 1000000) +
        (new->tv_usec - old->tv_usec);
}
/* Initial rate: read from the 'resource' field of the 'ioctxt' in scope. */
#define START_MBIT_RATE ioctxt->resource

/* Current rate, and the rate for which burst_time_us was last computed. */
static int mbit_rate;
static int ombit_rate = 0;

/* Length of one burst slot in microseconds (-1 until first computed). */
static int burst_time_us = -1;

#define MBIT_RATE mbit_rate

/* Bytes of budget granted per burst slot. */
#define BURST_BUDGET (100*1024)

/*
 * Derivation of RATE_TO_BTU, worked for a rate of 100 with the burst budget
 * of 100*1024 bytes; every form below evaluates to 7812:
 *
 *   1000000/((100)*1024*1024/8/(100*1024))
 *   1000000/((100)*1024/8/(100))
 *   1000000/((100)*128/(100))
 *   100000000/((100)*128)
 *
 * so the slot time for a given rate is (100000000/128) / rate, and
 *
 *   100000000/128 = 781250
 */
#define RATE_TO_BTU 781250

#define BURST_TIME_US burst_time_us
168 static int xcio_ratewrite(XcIOContext *ioctxt, void *buf, int n){
169 static int budget = 0;
170 static struct timeval last_put = { 0 };
171 struct timeval now;
172 struct timespec delay;
173 long long delta;
174 int rc;
176 if (START_MBIT_RATE == 0)
177 return xcio_write(ioctxt, buf, n);
179 budget -= n;
180 if (budget < 0) {
181 if (MBIT_RATE != ombit_rate) {
182 BURST_TIME_US = RATE_TO_BTU / MBIT_RATE;
183 ombit_rate = MBIT_RATE;
184 xcio_info(ioctxt,
185 "rate limit: %d mbit/s burst budget %d slot time %d\n",
186 MBIT_RATE, BURST_BUDGET, BURST_TIME_US);
187 }
188 if (last_put.tv_sec == 0) {
189 budget += BURST_BUDGET;
190 gettimeofday(&last_put, NULL);
191 } else {
192 while (budget < 0) {
193 gettimeofday(&now, NULL);
194 delta = tv_delta(&now, &last_put);
195 while (delta > BURST_TIME_US) {
196 budget += BURST_BUDGET;
197 last_put.tv_usec += BURST_TIME_US;
198 if (last_put.tv_usec > 1000000) {
199 last_put.tv_usec -= 1000000;
200 last_put.tv_sec++;
201 }
202 delta -= BURST_TIME_US;
203 }
204 if (budget > 0)
205 break;
206 delay.tv_sec = 0;
207 delay.tv_nsec = 1000 * (BURST_TIME_US - delta);
208 while (delay.tv_nsec > 0)
209 if (nanosleep(&delay, &delay) == 0)
210 break;
211 }
212 }
213 }
214 rc = IOStream_write(ioctxt->io, buf, n);
215 return (rc == n ? 0 : rc);
216 }
218 static int print_stats( int xc_handle, u32 domid,
219 int pages_sent, xc_shadow_control_stats_t *stats,
220 int print )
221 {
222 static struct timeval wall_last;
223 static long long d0_cpu_last;
224 static long long d1_cpu_last;
226 struct timeval wall_now;
227 long long wall_delta;
228 long long d0_cpu_now, d0_cpu_delta;
229 long long d1_cpu_now, d1_cpu_delta;
231 gettimeofday(&wall_now, NULL);
233 d0_cpu_now = xc_domain_get_cpu_usage( xc_handle, 0 )/1000;
234 d1_cpu_now = xc_domain_get_cpu_usage( xc_handle, domid )/1000;
236 if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
237 printf("ARRHHH!!\n");
239 wall_delta = tv_delta(&wall_now,&wall_last)/1000;
241 if ( wall_delta == 0 ) wall_delta = 1;
243 d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
244 d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
246 if ( print )
247 printf("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
248 "dirtied %dMb/s %ld pages\n",
249 wall_delta,
250 (int)((d0_cpu_delta*100)/wall_delta),
251 (int)((d1_cpu_delta*100)/wall_delta),
252 (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
253 (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
254 stats->dirty_count);
256 if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
257 mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
258 + 50;
259 if (mbit_rate > MAX_MBIT_RATE)
260 mbit_rate = MAX_MBIT_RATE;
261 }
263 d0_cpu_last = d0_cpu_now;
264 d1_cpu_last = d1_cpu_now;
265 wall_last = wall_now;
267 return 0;
268 }
270 /** Write the vmconfig string.
271 * It is stored as a 4-byte count 'n' followed by n bytes.
272 *
273 * @param ioctxt i/o context
274 * @return 0 on success, non-zero on error.
275 */
276 static int write_vmconfig(XcIOContext *ioctxt){
277 int err = -1;
278 if(xcio_write(ioctxt, &ioctxt->vmconfig_n, sizeof(ioctxt->vmconfig_n))) goto exit;
279 if(xcio_write(ioctxt, ioctxt->vmconfig, ioctxt->vmconfig_n)) goto exit;
280 err = 0;
281 exit:
282 return err;
283 }
285 static int analysis_phase( int xc_handle, u32 domid,
286 int nr_pfns, unsigned long *arr, int runs )
287 {
288 long long start, now;
289 xc_shadow_control_stats_t stats;
290 int j;
292 start = llgettimeofday();
294 for (j = 0; j < runs; j++)
295 {
296 int i;
298 xc_shadow_control( xc_handle, domid,
299 DOM0_SHADOW_CONTROL_OP_CLEAN,
300 arr, nr_pfns, NULL);
301 printf("#Flush\n");
302 for ( i = 0; i < 40; i++ )
303 {
304 usleep(50000);
305 now = llgettimeofday();
306 xc_shadow_control( xc_handle, domid,
307 DOM0_SHADOW_CONTROL_OP_PEEK,
308 NULL, 0, &stats);
310 printf("now= %lld faults= %ld dirty= %ld dirty_net= %ld "
311 "dirty_block= %ld\n",
312 ((now-start)+500)/1000,
313 stats.fault_count, stats.dirty_count,
314 stats.dirty_net_count, stats.dirty_block_count);
315 }
316 }
318 return -1;
319 }
322 int suspend_and_state(int xc_handle, XcIOContext *ioctxt,
323 xc_domaininfo_t *info,
324 full_execution_context_t *ctxt)
325 {
326 int i=0;
328 xcio_suspend_domain(ioctxt);
330 retry:
332 if ( xc_domain_getfullinfo(xc_handle, ioctxt->domain, info, ctxt) )
333 {
334 xcio_error(ioctxt, "Could not get full domain info");
335 return -1;
336 }
338 if ( (info->flags &
339 (DOMFLAGS_SHUTDOWN | (SHUTDOWN_suspend<<DOMFLAGS_SHUTDOWNSHIFT))) ==
340 (DOMFLAGS_SHUTDOWN | (SHUTDOWN_suspend<<DOMFLAGS_SHUTDOWNSHIFT)) )
341 {
342 return 0; // success
343 }
345 if ( info->flags & DOMFLAGS_PAUSED )
346 {
347 // try unpausing domain, wait, and retest
348 xc_domain_unpause( xc_handle, ioctxt->domain );
350 xcio_error(ioctxt, "Domain was paused. Wait and re-test. (%lx)",
351 info->flags);
352 usleep(10000); // 10ms
354 goto retry;
355 }
358 if( ++i < 100 )
359 {
360 xcio_error(ioctxt, "Retry suspend domain (%lx)",
361 info->flags);
362 usleep(10000); // 10ms
363 goto retry;
364 }
366 xcio_error(ioctxt, "Unable to suspend domain. (%lx)",
367 info->flags);
369 return -1;
370 }
372 int xc_linux_save(int xc_handle, XcIOContext *ioctxt)
373 {
374 xc_domaininfo_t info;
376 int rc = 1, i, j, k, last_iter, iter = 0;
377 unsigned long mfn;
378 u32 domid = ioctxt->domain;
379 int live = (ioctxt->flags & XCFLAGS_LIVE);
380 int debug = (ioctxt->flags & XCFLAGS_DEBUG);
381 int sent_last_iter, skip_this_iter;
383 /* Important tuning parameters */
384 int max_iters = 29; /* limit us to 30 times round loop */
385 int max_factor = 3; /* never send more than 3x nr_pfns */
387 /* The new domain's shared-info frame number. */
388 unsigned long shared_info_frame;
390 /* A copy of the CPU context of the guest. */
391 full_execution_context_t ctxt;
393 /* A table containing the type of each PFN (/not/ MFN!). */
394 unsigned long *pfn_type = NULL;
395 unsigned long *pfn_batch = NULL;
397 /* A temporary mapping, and a copy, of one frame of guest memory. */
398 unsigned long page[1024];
400 /* A copy of the pfn-to-mfn table frame list. */
401 unsigned long *live_pfn_to_mfn_frame_list = NULL;
402 unsigned long pfn_to_mfn_frame_list[1024];
404 /* Live mapping of the table mapping each PFN to its current MFN. */
405 unsigned long *live_pfn_to_mfn_table = NULL;
406 /* Live mapping of system MFN to PFN table. */
407 unsigned long *live_mfn_to_pfn_table = NULL;
408 unsigned long mfn_to_pfn_table_start_mfn;
410 /* Live mapping of shared info structure */
411 shared_info_t *live_shinfo = NULL;
413 /* base of the region in which domain memory is mapped */
414 unsigned char *region_base = NULL;
416 /* A temporary mapping, and a copy, of the guest's suspend record. */
417 suspend_record_t *p_srec = NULL;
419 /* number of pages we're dealing with */
420 unsigned long nr_pfns;
422 /* power of 2 order of nr_pfns */
423 int order_nr;
425 /* bitmap of pages:
426 - that should be sent this iteration (unless later marked as skip);
427 - to skip this iteration because already dirty;
428 - to fixup by sending at the end if not already resent; */
429 unsigned long *to_send, *to_skip, *to_fix;
431 xc_shadow_control_stats_t stats;
433 int needed_to_fix = 0;
434 int total_sent = 0;
436 MBIT_RATE = START_MBIT_RATE;
438 xcio_info(ioctxt, "xc_linux_save start %d\n", domid);
440 if (mlock(&ctxt, sizeof(ctxt))) {
441 xcio_perror(ioctxt, "Unable to mlock ctxt");
442 return 1;
443 }
445 if ( xc_domain_getfullinfo( xc_handle, domid, &info, &ctxt) )
446 {
447 xcio_error(ioctxt, "Could not get full domain info");
448 goto out;
449 }
450 shared_info_frame = info.shared_info_frame;
452 /* A cheesy test to see whether the domain contains valid state. */
453 if ( ctxt.pt_base == 0 ){
454 xcio_error(ioctxt, "Domain is not in a valid Linux guest OS state");
455 goto out;
456 }
458 nr_pfns = info.max_pages;
460 /* cheesy sanity check */
461 if ( nr_pfns > 1024*1024 ){
462 xcio_error(ioctxt, "Invalid state record -- pfn count out of range: %lu", nr_pfns);
463 goto out;
464 }
467 /* Map the shared info frame */
468 live_shinfo = xc_map_foreign_range(xc_handle, domid,
469 PAGE_SIZE, PROT_READ,
470 shared_info_frame);
472 if (!live_shinfo){
473 xcio_error(ioctxt, "Couldn't map live_shinfo");
474 goto out;
475 }
477 /* the pfn_to_mfn_frame_list fits in a single page */
478 live_pfn_to_mfn_frame_list =
479 xc_map_foreign_range(xc_handle, domid,
480 PAGE_SIZE, PROT_READ,
481 live_shinfo->arch.pfn_to_mfn_frame_list );
483 if (!live_pfn_to_mfn_frame_list){
484 xcio_error(ioctxt, "Couldn't map pfn_to_mfn_frame_list");
485 goto out;
486 }
489 /* Map all the frames of the pfn->mfn table. For migrate to succeed,
490 the guest must not change which frames are used for this purpose.
491 (its not clear why it would want to change them, and we'll be OK
492 from a safety POV anyhow. */
494 live_pfn_to_mfn_table = xc_map_foreign_batch(xc_handle, domid,
495 PROT_READ,
496 live_pfn_to_mfn_frame_list,
497 (nr_pfns+1023)/1024 );
498 if( !live_pfn_to_mfn_table ){
499 xcio_perror(ioctxt, "Couldn't map pfn_to_mfn table");
500 goto out;
501 }
503 /* Setup the mfn_to_pfn table mapping */
504 mfn_to_pfn_table_start_mfn = xc_get_m2p_start_mfn( xc_handle );
506 live_mfn_to_pfn_table =
507 xc_map_foreign_range(xc_handle, DOMID_XEN,
508 PAGE_SIZE*1024, PROT_READ,
509 mfn_to_pfn_table_start_mfn );
511 /* Canonicalise the pfn-to-mfn table frame-number list. */
512 memcpy( pfn_to_mfn_frame_list, live_pfn_to_mfn_frame_list, PAGE_SIZE );
514 for ( i = 0; i < nr_pfns; i += 1024 ){
515 if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) ){
516 xcio_error(ioctxt, "Frame # in pfn-to-mfn frame list is not in pseudophys");
517 goto out;
518 }
519 }
522 /* Domain is still running at this point */
524 if( live )
525 {
526 if ( xc_shadow_control( xc_handle, domid,
527 DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
528 NULL, 0, NULL ) < 0 ) {
529 xcio_error(ioctxt, "Couldn't enable shadow mode");
530 goto out;
531 }
533 last_iter = 0;
534 } else{
535 /* This is a non-live suspend. Issue the call back to get the
536 domain suspended */
538 last_iter = 1;
540 if ( suspend_and_state( xc_handle, ioctxt, &info, &ctxt) )
541 {
542 xcio_error(ioctxt, "Domain appears not to have suspended: %lx",
543 info.flags);
544 goto out;
545 }
547 }
548 sent_last_iter = 1<<20; /* 4GB of pages */
550 /* calculate the power of 2 order of nr_pfns, e.g.
551 15->4 16->4 17->5 */
552 for( i=nr_pfns-1, order_nr=0; i ; i>>=1, order_nr++ );
554 /* Setup to_send bitmap */
555 {
556 /* size these for a maximal 4GB domain, to make interaction
557 with balloon driver easier. It's only user space memory,
558 after all... (3x 128KB) */
560 int sz = ( 1<<20 ) / 8;
562 to_send = malloc( sz );
563 to_fix = calloc( 1, sz );
564 to_skip = malloc( sz );
566 if (!to_send || !to_fix || !to_skip){
567 xcio_error(ioctxt, "Couldn't allocate to_send array");
568 goto out;
569 }
571 memset( to_send, 0xff, sz );
573 if ( mlock( to_send, sz ) ){
574 xcio_perror(ioctxt, "Unable to mlock to_send");
575 return 1;
576 }
578 /* (to fix is local only) */
580 if ( mlock( to_skip, sz ) ){
581 xcio_perror(ioctxt, "Unable to mlock to_skip");
582 return 1;
583 }
585 }
587 analysis_phase( xc_handle, domid, nr_pfns, to_skip, 0 );
589 /* We want zeroed memory so use calloc rather than malloc. */
590 pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long));
591 pfn_batch = calloc(BATCH_SIZE, sizeof(unsigned long));
593 if ( (pfn_type == NULL) || (pfn_batch == NULL) ){
594 errno = ENOMEM;
595 goto out;
596 }
598 if ( mlock( pfn_type, BATCH_SIZE * sizeof(unsigned long) ) ){
599 xcio_error(ioctxt, "Unable to mlock");
600 goto out;
601 }
604 /*
605 * Quick belt and braces sanity check.
606 */
607 #if DEBUG
608 {
609 int err=0;
610 for ( i = 0; i < nr_pfns; i++ )
611 {
612 mfn = live_pfn_to_mfn_table[i];
614 if( (live_mfn_to_pfn_table[mfn] != i) && (mfn != 0xffffffffUL) )
615 {
616 printf("i=0x%x mfn=%lx live_mfn_to_pfn_table=%lx\n",
617 i,mfn,live_mfn_to_pfn_table[mfn]);
618 err++;
619 }
620 }
621 printf("Had %d unexplained entries in p2m table\n",err);
622 }
623 #endif
626 /* Start writing out the saved-domain record. */
628 if ( xcio_write(ioctxt, "LinuxGuestRecord", 16) ||
629 xcio_write(ioctxt, &nr_pfns, sizeof(unsigned long)) ||
630 xcio_write(ioctxt, pfn_to_mfn_frame_list, PAGE_SIZE) ){
631 xcio_error(ioctxt, "Error writing header");
632 goto out;
633 }
634 if(write_vmconfig(ioctxt)){
635 xcio_error(ioctxt, "Error writing vmconfig");
636 goto out;
637 }
639 print_stats( xc_handle, domid, 0, &stats, 0 );
641 /* Now write out each data page, canonicalising page tables as we go... */
643 while(1){
644 unsigned int prev_pc, sent_this_iter, N, batch;
646 iter++;
647 sent_this_iter = 0;
648 skip_this_iter = 0;
649 prev_pc = 0;
650 N=0;
652 xcio_info(ioctxt, "Saving memory pages: iter %d 0%%", iter);
654 while( N < nr_pfns ){
655 unsigned int this_pc = (N * 100) / nr_pfns;
657 if ( (this_pc - prev_pc) >= 5 ){
658 xcio_info(ioctxt, "\b\b\b\b%3d%%", this_pc);
659 prev_pc = this_pc;
660 }
662 /* slightly wasteful to peek the whole array every time,
663 but this is fast enough for the moment. */
665 if ( !last_iter &&
666 xc_shadow_control(xc_handle, domid,
667 DOM0_SHADOW_CONTROL_OP_PEEK,
668 to_skip, nr_pfns, NULL) != nr_pfns )
669 {
670 xcio_error(ioctxt, "Error peeking shadow bitmap");
671 goto out;
672 }
675 /* load pfn_type[] with the mfn of all the pages we're doing in
676 this batch. */
678 for ( batch = 0; batch < BATCH_SIZE && N < nr_pfns ; N++ )
679 {
680 int n = permute(N, nr_pfns, order_nr );
682 if ( 0 && debug ) {
683 fprintf(stderr,"%d pfn= %08lx mfn= %08lx %d "
684 " [mfn]= %08lx\n",
685 iter, (unsigned long)n, live_pfn_to_mfn_table[n],
686 test_bit(n,to_send),
687 live_mfn_to_pfn_table[live_pfn_to_mfn_table[n]&
688 0xFFFFF]);
689 }
691 if ( !last_iter &&
692 test_bit(n, to_send) &&
693 test_bit(n, to_skip) ) {
694 skip_this_iter++; /* stats keeping */
695 }
697 if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
698 (test_bit(n, to_send) && last_iter) ||
699 (test_bit(n, to_fix) && last_iter)) ) {
700 continue;
701 }
703 /* we get here if:
704 1. page is marked to_send & hasn't already been re-dirtied
705 2. (ignore to_skip in last iteration)
706 3. add in pages that still need fixup (net bufs)
707 */
709 pfn_batch[batch] = n;
710 pfn_type[batch] = live_pfn_to_mfn_table[n];
712 if( ! is_mapped(pfn_type[batch]) )
713 {
714 /* not currently in pseudo-physical map -- set bit
715 in to_fix that we must send this page in last_iter
716 unless its sent sooner anyhow */
718 set_bit( n, to_fix );
719 if( iter>1 )
720 DDPRINTF("netbuf race: iter %d, pfn %x. mfn %lx\n",
721 iter,n,pfn_type[batch]);
722 continue;
723 }
725 if ( last_iter &&
726 test_bit(n, to_fix) &&
727 !test_bit(n, to_send) )
728 {
729 needed_to_fix++;
730 DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
731 iter,n,pfn_type[batch]);
732 }
734 clear_bit(n, to_fix);
736 batch++;
737 }
739 // DDPRINTF("batch %d:%d (n=%d)\n", iter, batch, n);
741 if ( batch == 0 )
742 goto skip; /* vanishingly unlikely... */
744 if ( (region_base = xc_map_foreign_batch(xc_handle, domid,
745 PROT_READ,
746 pfn_type,
747 batch)) == 0 ){
748 xcio_perror(ioctxt, "map batch failed");
749 goto out;
750 }
752 if ( get_pfn_type_batch(xc_handle, domid, batch, pfn_type) ){
753 xcio_error(ioctxt, "get_pfn_type_batch failed");
754 goto out;
755 }
757 for ( j = 0; j < batch; j++ ){
758 if ( (pfn_type[j] & LTAB_MASK) == XTAB ){
759 DDPRINTF("type fail: page %i mfn %08lx\n",j,pfn_type[j]);
760 continue;
761 }
763 if ( 0 && debug )
764 fprintf(stderr, "%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
765 " sum= %08lx\n",
766 iter,
767 (pfn_type[j] & LTAB_MASK) | pfn_batch[j],
768 pfn_type[j],
769 live_mfn_to_pfn_table[pfn_type[j]&(~LTAB_MASK)],
770 csum_page(region_base + (PAGE_SIZE*j)));
772 /* canonicalise mfn->pfn */
773 pfn_type[j] = (pfn_type[j] & LTAB_MASK) | pfn_batch[j];
774 }
776 if ( xcio_write(ioctxt, &batch, sizeof(int) ) ){
777 xcio_error(ioctxt, "Error when writing to state file (2)");
778 goto out;
779 }
781 if ( xcio_write(ioctxt, pfn_type, sizeof(unsigned long)*j ) ){
782 xcio_error(ioctxt, "Error when writing to state file (3)");
783 goto out;
784 }
786 /* entering this loop, pfn_type is now in pfns (Not mfns) */
787 for( j = 0; j < batch; j++ ){
788 /* write out pages in batch */
789 if( (pfn_type[j] & LTAB_MASK) == XTAB){
790 DDPRINTF("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]);
791 continue;
792 }
794 if ( ((pfn_type[j] & LTABTYPE_MASK) == L1TAB) ||
795 ((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ){
796 memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE);
798 for ( k = 0;
799 k < (((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ?
800 (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) :
801 1024);
802 k++ ){
803 unsigned long pfn;
805 if ( !(page[k] & _PAGE_PRESENT) )
806 continue;
808 mfn = page[k] >> PAGE_SHIFT;
809 pfn = live_mfn_to_pfn_table[mfn];
811 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
812 {
813 /* I don't think this should ever happen */
814 printf("FNI %d : [%08lx,%d] pte=%08lx, "
815 "mfn=%08lx, pfn=%08lx [mfn]=%08lx\n",
816 j, pfn_type[j], k,
817 page[k], mfn, live_mfn_to_pfn_table[mfn],
818 (live_mfn_to_pfn_table[mfn]<nr_pfns)?
819 live_pfn_to_mfn_table[
820 live_mfn_to_pfn_table[mfn]] :
821 0xdeadbeef);
823 pfn = 0; /* be suspicious */
824 }
826 page[k] &= PAGE_SIZE - 1;
827 page[k] |= pfn << PAGE_SHIFT;
829 #if 0
830 printf("L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx "
831 "xpfn=%d\n",
832 pfn_type[j]>>28,
833 j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT);
834 #endif
836 } /* end of page table rewrite for loop */
838 if ( xcio_ratewrite(ioctxt, page, PAGE_SIZE) ){
839 xcio_error(ioctxt, "Error when writing to state file (4)");
840 goto out;
841 }
843 } /* end of it's a PT page */ else { /* normal page */
845 if ( xcio_ratewrite(ioctxt, region_base + (PAGE_SIZE*j),
846 PAGE_SIZE) ){
847 xcio_error(ioctxt, "Error when writing to state file (5)");
848 goto out;
849 }
850 }
851 } /* end of the write out for this batch */
853 sent_this_iter += batch;
855 } /* end of this while loop for this iteration */
857 munmap(region_base, batch*PAGE_SIZE);
859 skip:
861 total_sent += sent_this_iter;
863 xcio_info(ioctxt, "\r %d: sent %d, skipped %d, ",
864 iter, sent_this_iter, skip_this_iter );
866 if ( last_iter ) {
867 print_stats( xc_handle, domid, sent_this_iter, &stats, 1);
869 xcio_info(ioctxt, "Total pages sent= %d (%.2fx)\n",
870 total_sent, ((float)total_sent)/nr_pfns );
871 xcio_info(ioctxt, "(of which %d were fixups)\n", needed_to_fix );
872 }
874 if (last_iter && debug){
875 int minusone = -1;
876 memset( to_send, 0xff, (nr_pfns+8)/8 );
877 debug = 0;
878 printf("Entering debug resend-all mode\n");
880 /* send "-1" to put receiver into debug mode */
881 if ( xcio_write(ioctxt, &minusone, sizeof(int)) )
882 {
883 xcio_error(ioctxt, "Error when writing to state file (6)");
884 goto out;
885 }
887 continue;
888 }
890 if ( last_iter ) break;
892 if ( live )
893 {
894 if (
895 ( ( sent_this_iter > sent_last_iter ) &&
896 (mbit_rate == MAX_MBIT_RATE ) ) ||
897 (iter >= max_iters) ||
898 (sent_this_iter+skip_this_iter < 50) ||
899 (total_sent > nr_pfns*max_factor) )
900 {
901 DPRINTF("Start last iteration\n");
902 last_iter = 1;
904 if ( suspend_and_state( xc_handle, ioctxt, &info, &ctxt) )
905 {
906 xcio_error(ioctxt, "Domain appears not to have suspended: %lx",
907 info.flags);
908 goto out;
909 }
911 xcio_info(ioctxt,
912 "SUSPEND flags %08lx shinfo %08lx eip %08lx "
913 "esi %08lx\n",info.flags,
914 info.shared_info_frame,
915 ctxt.cpu_ctxt.eip, ctxt.cpu_ctxt.esi );
916 }
918 if ( xc_shadow_control( xc_handle, domid,
919 DOM0_SHADOW_CONTROL_OP_CLEAN,
920 to_send, nr_pfns, &stats ) != nr_pfns )
921 {
922 xcio_error(ioctxt, "Error flushing shadow PT");
923 goto out;
924 }
926 sent_last_iter = sent_this_iter;
928 print_stats( xc_handle, domid, sent_this_iter, &stats, 1);
930 }
933 } /* end of while 1 */
935 DPRINTF("All memory is saved\n");
937 /* Success! */
938 rc = 0;
940 /* Zero terminate */
941 if ( xcio_write(ioctxt, &rc, sizeof(int)) )
942 {
943 xcio_error(ioctxt, "Error when writing to state file (6)");
944 goto out;
945 }
947 /* Send through a list of all the PFNs that were not in map at the close */
948 {
949 unsigned int i,j;
950 unsigned int pfntab[1024];
952 for ( i = 0, j = 0; i < nr_pfns; i++ )
953 {
954 if ( ! is_mapped(live_pfn_to_mfn_table[i]) )
955 j++;
956 }
958 if ( xcio_write(ioctxt, &j, sizeof(unsigned int)) )
959 {
960 xcio_error(ioctxt, "Error when writing to state file (6a)");
961 goto out;
962 }
964 for ( i = 0, j = 0; i < nr_pfns; )
965 {
966 if ( ! is_mapped(live_pfn_to_mfn_table[i]) )
967 {
968 pfntab[j++] = i;
969 }
970 i++;
971 if ( j == 1024 || i == nr_pfns )
972 {
973 if ( xcio_write(ioctxt, &pfntab, sizeof(unsigned long)*j) )
974 {
975 xcio_error(ioctxt, "Error when writing to state file (6b)");
976 goto out;
977 }
978 j = 0;
979 }
980 }
981 }
983 /* Map the suspend-record MFN to pin it. The page must be owned by
984 domid for this to succeed. */
985 p_srec = xc_map_foreign_range(xc_handle, domid,
986 sizeof(*p_srec), PROT_READ,
987 ctxt.cpu_ctxt.esi);
988 if (!p_srec){
989 xcio_error(ioctxt, "Couldn't map suspend record");
990 goto out;
991 }
993 if (nr_pfns != p_srec->nr_pfns )
994 {
995 xcio_error(ioctxt, "Suspend record nr_pfns unexpected (%ld != %ld)",
996 p_srec->nr_pfns, nr_pfns);
997 goto out;
998 }
1000 /* Canonicalise the suspend-record frame number. */
1001 if ( !translate_mfn_to_pfn(&ctxt.cpu_ctxt.esi) ){
1002 xcio_error(ioctxt, "Suspend record is not in range of pseudophys map");
1003 goto out;
1006 /* Canonicalise each GDT frame number. */
1007 for ( i = 0; i < ctxt.gdt_ents; i += 512 ) {
1008 if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
1009 xcio_error(ioctxt, "GDT frame is not in range of pseudophys map");
1010 goto out;
1014 /* Canonicalise the page table base pointer. */
1015 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.pt_base >> PAGE_SHIFT) ) {
1016 xcio_error(ioctxt, "PT base is not in range of pseudophys map");
1017 goto out;
1019 ctxt.pt_base = live_mfn_to_pfn_table[ctxt.pt_base >> PAGE_SHIFT] <<
1020 PAGE_SHIFT;
1022 if ( xcio_write(ioctxt, &ctxt, sizeof(ctxt)) ||
1023 xcio_write(ioctxt, live_shinfo, PAGE_SIZE) ) {
1024 xcio_error(ioctxt, "Error when writing to state file (1)");
1025 goto out;
1028 out:
1030 if ( live_shinfo ) munmap(live_shinfo, PAGE_SIZE);
1031 if ( p_srec ) munmap(p_srec, sizeof(*p_srec));
1032 if ( live_pfn_to_mfn_frame_list ) munmap(live_pfn_to_mfn_frame_list, PAGE_SIZE);
1033 if ( live_pfn_to_mfn_table ) munmap(live_pfn_to_mfn_table, nr_pfns*4 );
1034 if ( live_mfn_to_pfn_table ) munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024 );
1036 if ( pfn_type != NULL ) free(pfn_type);
1037 DPRINTF("Save exit rc=%d\n",rc);
1038 return !!rc;