direct-io.hg

view tools/libxc/xc_linux_save.c @ 3266:5195a9576f40

bitkeeper revision 1.1159.187.61 (41badd42WPbpyTHpQn9bVGYapDfdOQ)

Type decl cleanups from Charles Coffing.
author kaf24@scramble.cl.cam.ac.uk
date Sat Dec 11 11:42:58 2004 +0000 (2004-12-11)
parents 724449a888fe
children c00fbb136368
line source
1 /******************************************************************************
2 * xc_linux_save.c
3 *
4 * Save the state of a running Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <inttypes.h>
10 #include <sys/time.h>
11 #include "xc_private.h"
12 #include <xen/linux/suspend.h>
13 #include <time.h>
/* Pages are mapped and written out in batches of this many (1024 pages = 4MB). */
#define BATCH_SIZE 1024

/* Ceiling applied to the adaptive transmit rate, in mbit/s. */
#define MAX_MBIT_RATE 500

/* Compile-time trace switches: set to 1 to turn the matching macro on. */
#define DEBUG  0
#define DDEBUG 0

/* DPRINTF: printf-style trace output; expands to a no-op unless DEBUG. */
#if DEBUG
#define DPRINTF(...) printf(__VA_ARGS__)
#else
#define DPRINTF(...) ((void)0)
#endif

/* DDPRINTF: more verbose trace output; expands to a no-op unless DDEBUG. */
#if DDEBUG
#define DDPRINTF(...) printf(__VA_ARGS__)
#else
#define DDPRINTF(...) ((void)0)
#endif
/*
 * MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)
 *
 * Non-zero iff the machine frame number has a unique mapping in the
 * guest's pseudophysical map, i.e. all of the following hold:
 *   - it is below 1024*1024;
 *   - its machine-to-pseudophys entry is below nr_pfns;
 *   - the pseudophys-to-machine entry for that pfn points back at it.
 *
 * Expands in the caller's scope: `live_mfn_to_pfn_table',
 * `live_pfn_to_mfn_table' and `nr_pfns' must be visible there.
 * The argument is evaluated more than once.
 */
#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)                                   \
    ( ((_mfn) < (1024*1024))                                          && \
      (live_mfn_to_pfn_table[_mfn] < nr_pfns)                         && \
      (live_pfn_to_mfn_table[live_mfn_to_pfn_table[_mfn]] == (_mfn)) )
/*
 * translate_mfn_to_pfn(_pmfn)
 *
 * Rewrites the machine frame number stored at *_pmfn with its
 * pseudophysical frame number, taken from live_mfn_to_pfn_table.
 *
 * Evaluates to 1 if the MFN passed MFN_IS_IN_PSEUDOPHYS_MAP() and was
 * converted, or to 0 if it did not (in which case *_pmfn is untouched).
 *
 * Uses a GNU statement expression.  The temporaries carry a `_xlat_'
 * prefix so they cannot capture a caller variable: a plain `mfn' local
 * here would shadow the caller's own `mfn' and make
 * translate_mfn_to_pfn(&mfn) read an uninitialised value.
 */
#define translate_mfn_to_pfn(_pmfn)                                      \
({                                                                       \
    unsigned long _xlat_mfn = *(_pmfn);                                  \
    int _xlat_ok = MFN_IS_IN_PSEUDOPHYS_MAP(_xlat_mfn) ? 1 : 0;          \
    if ( _xlat_ok )                                                      \
        *(_pmfn) = live_mfn_to_pfn_table[_xlat_mfn];                     \
    _xlat_ok;                                                            \
})
/*
 * is_mapped(pfn): non-zero when bit 31 of the given value is clear.
 * It is applied to entries read from live_pfn_to_mfn_table; an entry with
 * bit 31 set is treated as "not currently in the pseudo-physical map".
 */
#define is_mapped(pfn) (((pfn) & 0x80000000UL) == 0)
/*
 * Read one bit of a bitmap stored as an array of unsigned long.
 *
 * nr:   index of the bit, counted from bit 0 of the first word.
 * addr: start of the bitmap.
 *
 * Returns 1 if the bit is set, 0 otherwise.
 */
static inline int test_bit ( int nr, volatile void * addr)
{
    unsigned long *words = (unsigned long *)addr;
    unsigned long word = words[nr / (sizeof(unsigned long) * 8)];

    return (word >> (nr % (sizeof(unsigned long) * 8))) & 1;
}
/*
 * Clear one bit of a bitmap stored as an array of unsigned long.
 *
 * nr:   index of the bit, counted from bit 0 of the first word.
 * addr: start of the bitmap.
 *
 * The mask is built from 1UL rather than the int constant 1: shifting an
 * int by 31 or more is undefined, and the resulting mask would also wipe
 * out unrelated bits of a 64-bit word once it was widened.
 */
static inline void clear_bit ( int nr, volatile void * addr)
{
    unsigned long *words = (unsigned long *)addr;

    words[nr / (sizeof(unsigned long) * 8)] &=
        ~(1UL << (nr % (sizeof(unsigned long) * 8)));
}
/*
 * Set one bit of a bitmap stored as an array of unsigned long.
 *
 * nr:   index of the bit, counted from bit 0 of the first word.
 * addr: start of the bitmap.
 *
 * The mask is built from 1UL rather than the int constant 1: shifting an
 * int by 31 or more is undefined, so bits in the upper part of a word
 * could not be set reliably.
 */
static inline void set_bit ( int nr, volatile void * addr)
{
    unsigned long *words = (unsigned long *)addr;

    words[nr / (sizeof(unsigned long) * 8)] |=
        (1UL << (nr % (sizeof(unsigned long) * 8)));
}
/*
 * Hamming weight (number of set bits) of a 32-bit word.
 *
 * Adjacent bit fields are summed pairwise, doubling the field width at
 * each step (1, 2, 4, 8 and finally 16 bits), so the last step leaves the
 * total count in the low half of the word.
 */
static inline unsigned int hweight32(unsigned int w)
{
    static const struct {
        unsigned int shift;
        unsigned int mask;
    } fold[] = {
        {  1, 0x55555555 },
        {  2, 0x33333333 },
        {  4, 0x0F0F0F0F },
        {  8, 0x00FF00FF },
        { 16, 0x0000FFFF },
    };
    unsigned int step;

    for ( step = 0; step < sizeof(fold) / sizeof(fold[0]); step++ )
        w = (w & fold[step].mask) + ((w >> fold[step].shift) & fold[step].mask);

    return w;
}
/*
 * Count the set bits in the first nr bits of a bitmap stored as an array
 * of unsigned long.
 *
 * nr:   number of bits to consider.  Only whole words are examined, so a
 *       trailing partial word (nr not a multiple of the word size) is
 *       not counted.  A non-positive nr yields 0.
 * addr: start of the bitmap.
 *
 * Returns the number of set bits found.
 *
 * Each word is counted at its full width.  Passing a word to a 32-bit
 * population count would silently drop the upper half wherever
 * unsigned long is 64 bits wide.
 */
static inline int count_bits ( int nr, volatile void *addr)
{
    const int bits_per_word = (int)(sizeof(unsigned long) * 8);
    const unsigned long *p = (const unsigned long *)addr;
    int i, count = 0;

    for ( i = 0; i < nr / bits_per_word; i++ )
    {
        unsigned long w = p[i];

        /* Each pass clears the lowest set bit of w. */
        for ( ; w != 0; w &= w - 1 )
            count++;
    }

    return count;
}
/*
 * Map a page index onto a pseudo-randomly permuted index.
 *
 * Pages are scanned in a scrambled order so that the estimate of the
 * domain's page-dirtying rate improves as the scan proceeds: contiguous
 * ranges of pfns often behave alike, and this mixes them up.
 *
 * i:        index to permute, expected to satisfy 0 <= i < nr.
 * nr:       number of indices.
 * order_nr: power-of-2 order of nr, e.g. nr->order 15->4 16->4 17->5
 *           (a 512MB domain has 128k pages, order 17).
 *
 * The index is rotated left by 10 bits within an order_nr-bit field,
 * e.g. for order 17:
 *
 *     QPONMLKJIHGFEDCBA  ->  GFEDCBAQPONMLKJIH
 *
 * and the rotation is repeated until the result is below nr (it never
 * repeats when nr is a power of 2).
 *
 * Returns the permuted index.  When order_nr is below 10 the rotation
 * would need a negative shift count, which is undefined, so such small
 * ranges are left in their natural order.
 */
static inline int permute( int i, int nr, int order_nr )
{
    unsigned int v = (unsigned int)i;

    if ( order_nr < 10 )
        return i;

    do {
        v = ((v >> (order_nr - 10)) | (v << 10)) & ((1U << order_nr) - 1);
    } while ( v >= (unsigned int)nr );

    return (int)v;
}
/*
 * Convert a struct timeval to a single count of microseconds.
 *
 * The seconds field is widened to long long before scaling, so the
 * multiplication cannot overflow where tv_sec is only 32 bits wide.
 */
static long long tv_to_us( struct timeval *new )
{
    return ((long long)new->tv_sec * 1000000) + new->tv_usec;
}
/*
 * Current wall-clock time from gettimeofday(), expressed in microseconds
 * via tv_to_us().  The return value of gettimeofday() is not checked.
 */
static long long llgettimeofday()
{
    struct timeval tv;

    gettimeofday(&tv, NULL);

    return tv_to_us(&tv);
}
/*
 * Difference new - old between two timevals, in microseconds.  The result
 * is negative when `new' is earlier than `old'.
 *
 * The seconds difference is widened to long long before scaling, so the
 * multiplication cannot overflow where tv_sec is only 32 bits wide.
 */
static long long tv_delta( struct timeval *new, struct timeval *old )
{
    return ((long long)(new->tv_sec - old->tv_sec) * 1000000) +
        (new->tv_usec - old->tv_usec);
}
/*
 * Transmit rate limiting state.
 *
 * START_MBIT_RATE is the starting rate, read from the I/O context that is
 * in scope at the point of use (it needs a variable named `ioctxt').
 * mbit_rate is the rate currently in force, ombit_rate the rate for which
 * burst_time_us was last computed.
 */
#define START_MBIT_RATE ioctxt->resource

static int mbit_rate;
static int ombit_rate = 0;
static int burst_time_us = -1;

#define MBIT_RATE     mbit_rate
#define BURST_TIME_US burst_time_us

/* Bytes that may be written per time slot. */
#define BURST_BUDGET  (100*1024)

/*
 * Slot length in microseconds for a rate of 1 mbit/s; divide by the rate
 * to get the slot length at that rate.  Derivation, shown for 100 mbit/s:
 *
 *   1000000 / ((100)*1024*1024/8 / (100*1024))  = 7812
 *   1000000 / ((100)*1024/8 / (100))            = 7812
 *   1000000 / ((100)*128 / (100))               = 7812
 *   100000000 / ((100)*128)                     = 7812
 *   100000000 / 128                             = 781250
 */
#define RATE_TO_BTU   781250
169 static int xcio_ratewrite(XcIOContext *ioctxt, void *buf, int n){
170 static int budget = 0;
171 static struct timeval last_put = { 0 };
172 struct timeval now;
173 struct timespec delay;
174 long long delta;
175 int rc;
177 if (START_MBIT_RATE == 0)
178 return xcio_write(ioctxt, buf, n);
180 budget -= n;
181 if (budget < 0) {
182 if (MBIT_RATE != ombit_rate) {
183 BURST_TIME_US = RATE_TO_BTU / MBIT_RATE;
184 ombit_rate = MBIT_RATE;
185 xcio_info(ioctxt,
186 "rate limit: %d mbit/s burst budget %d slot time %d\n",
187 MBIT_RATE, BURST_BUDGET, BURST_TIME_US);
188 }
189 if (last_put.tv_sec == 0) {
190 budget += BURST_BUDGET;
191 gettimeofday(&last_put, NULL);
192 } else {
193 while (budget < 0) {
194 gettimeofday(&now, NULL);
195 delta = tv_delta(&now, &last_put);
196 while (delta > BURST_TIME_US) {
197 budget += BURST_BUDGET;
198 last_put.tv_usec += BURST_TIME_US;
199 if (last_put.tv_usec > 1000000) {
200 last_put.tv_usec -= 1000000;
201 last_put.tv_sec++;
202 }
203 delta -= BURST_TIME_US;
204 }
205 if (budget > 0)
206 break;
207 delay.tv_sec = 0;
208 delay.tv_nsec = 1000 * (BURST_TIME_US - delta);
209 while (delay.tv_nsec > 0)
210 if (nanosleep(&delay, &delay) == 0)
211 break;
212 }
213 }
214 }
215 rc = IOStream_write(ioctxt->io, buf, n);
216 return (rc == n ? 0 : rc);
217 }
219 static int print_stats( int xc_handle, u32 domid,
220 int pages_sent, xc_shadow_control_stats_t *stats,
221 int print )
222 {
223 static struct timeval wall_last;
224 static long long d0_cpu_last;
225 static long long d1_cpu_last;
227 struct timeval wall_now;
228 long long wall_delta;
229 long long d0_cpu_now, d0_cpu_delta;
230 long long d1_cpu_now, d1_cpu_delta;
232 gettimeofday(&wall_now, NULL);
234 d0_cpu_now = xc_domain_get_cpu_usage( xc_handle, 0 )/1000;
235 d1_cpu_now = xc_domain_get_cpu_usage( xc_handle, domid )/1000;
237 if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
238 printf("ARRHHH!!\n");
240 wall_delta = tv_delta(&wall_now,&wall_last)/1000;
242 if ( wall_delta == 0 ) wall_delta = 1;
244 d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
245 d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
247 if ( print )
248 printf("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
249 "dirtied %dMb/s %" PRId32 " pages\n",
250 wall_delta,
251 (int)((d0_cpu_delta*100)/wall_delta),
252 (int)((d1_cpu_delta*100)/wall_delta),
253 (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
254 (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
255 stats->dirty_count);
257 if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
258 mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
259 + 50;
260 if (mbit_rate > MAX_MBIT_RATE)
261 mbit_rate = MAX_MBIT_RATE;
262 }
264 d0_cpu_last = d0_cpu_now;
265 d1_cpu_last = d1_cpu_now;
266 wall_last = wall_now;
268 return 0;
269 }
271 /** Write the vmconfig string.
272 * It is stored as a 4-byte count 'n' followed by n bytes.
273 *
274 * @param ioctxt i/o context
275 * @return 0 on success, non-zero on error.
276 */
277 static int write_vmconfig(XcIOContext *ioctxt){
278 int err = -1;
279 if(xcio_write(ioctxt, &ioctxt->vmconfig_n, sizeof(ioctxt->vmconfig_n))) goto exit;
280 if(xcio_write(ioctxt, ioctxt->vmconfig, ioctxt->vmconfig_n)) goto exit;
281 err = 0;
282 exit:
283 return err;
284 }
286 static int analysis_phase( int xc_handle, u32 domid,
287 int nr_pfns, unsigned long *arr, int runs )
288 {
289 long long start, now;
290 xc_shadow_control_stats_t stats;
291 int j;
293 start = llgettimeofday();
295 for (j = 0; j < runs; j++)
296 {
297 int i;
299 xc_shadow_control( xc_handle, domid,
300 DOM0_SHADOW_CONTROL_OP_CLEAN,
301 arr, nr_pfns, NULL);
302 printf("#Flush\n");
303 for ( i = 0; i < 40; i++ )
304 {
305 usleep(50000);
306 now = llgettimeofday();
307 xc_shadow_control( xc_handle, domid,
308 DOM0_SHADOW_CONTROL_OP_PEEK,
309 NULL, 0, &stats);
311 printf("now= %lld faults= %" PRId32 " dirty= %" PRId32
312 " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n",
313 ((now-start)+500)/1000,
314 stats.fault_count, stats.dirty_count,
315 stats.dirty_net_count, stats.dirty_block_count);
316 }
317 }
319 return -1;
320 }
323 int suspend_and_state(int xc_handle, XcIOContext *ioctxt,
324 xc_domaininfo_t *info,
325 full_execution_context_t *ctxt)
326 {
327 int i=0;
329 xcio_suspend_domain(ioctxt);
331 retry:
333 if ( xc_domain_getfullinfo(xc_handle, ioctxt->domain, info, ctxt) )
334 {
335 xcio_error(ioctxt, "Could not get full domain info");
336 return -1;
337 }
339 if ( (info->flags &
340 (DOMFLAGS_SHUTDOWN | (SHUTDOWN_suspend<<DOMFLAGS_SHUTDOWNSHIFT))) ==
341 (DOMFLAGS_SHUTDOWN | (SHUTDOWN_suspend<<DOMFLAGS_SHUTDOWNSHIFT)) )
342 {
343 return 0; // success
344 }
346 if ( info->flags & DOMFLAGS_PAUSED )
347 {
348 // try unpausing domain, wait, and retest
349 xc_domain_unpause( xc_handle, ioctxt->domain );
351 xcio_error(ioctxt, "Domain was paused. Wait and re-test. (%lx)",
352 info->flags);
353 usleep(10000); // 10ms
355 goto retry;
356 }
359 if( ++i < 100 )
360 {
361 xcio_error(ioctxt, "Retry suspend domain (%lx)",
362 info->flags);
363 usleep(10000); // 10ms
364 goto retry;
365 }
367 xcio_error(ioctxt, "Unable to suspend domain. (%lx)",
368 info->flags);
370 return -1;
371 }
373 int xc_linux_save(int xc_handle, XcIOContext *ioctxt)
374 {
375 xc_domaininfo_t info;
377 int rc = 1, i, j, k, last_iter, iter = 0;
378 unsigned long mfn;
379 u32 domid = ioctxt->domain;
380 int live = (ioctxt->flags & XCFLAGS_LIVE);
381 int debug = (ioctxt->flags & XCFLAGS_DEBUG);
382 int sent_last_iter, skip_this_iter;
384 /* Important tuning parameters */
385 int max_iters = 29; /* limit us to 30 times round loop */
386 int max_factor = 3; /* never send more than 3x nr_pfns */
388 /* The new domain's shared-info frame number. */
389 unsigned long shared_info_frame;
391 /* A copy of the CPU context of the guest. */
392 full_execution_context_t ctxt;
394 /* A table containing the type of each PFN (/not/ MFN!). */
395 unsigned long *pfn_type = NULL;
396 unsigned long *pfn_batch = NULL;
398 /* A temporary mapping, and a copy, of one frame of guest memory. */
399 unsigned long page[1024];
401 /* A copy of the pfn-to-mfn table frame list. */
402 unsigned long *live_pfn_to_mfn_frame_list = NULL;
403 unsigned long pfn_to_mfn_frame_list[1024];
405 /* Live mapping of the table mapping each PFN to its current MFN. */
406 unsigned long *live_pfn_to_mfn_table = NULL;
407 /* Live mapping of system MFN to PFN table. */
408 unsigned long *live_mfn_to_pfn_table = NULL;
409 unsigned long mfn_to_pfn_table_start_mfn;
411 /* Live mapping of shared info structure */
412 shared_info_t *live_shinfo = NULL;
414 /* base of the region in which domain memory is mapped */
415 unsigned char *region_base = NULL;
417 /* A temporary mapping, and a copy, of the guest's suspend record. */
418 suspend_record_t *p_srec = NULL;
420 /* number of pages we're dealing with */
421 unsigned long nr_pfns;
423 /* power of 2 order of nr_pfns */
424 int order_nr;
426 /* bitmap of pages:
427 - that should be sent this iteration (unless later marked as skip);
428 - to skip this iteration because already dirty;
429 - to fixup by sending at the end if not already resent; */
430 unsigned long *to_send, *to_skip, *to_fix;
432 xc_shadow_control_stats_t stats;
434 int needed_to_fix = 0;
435 int total_sent = 0;
437 MBIT_RATE = START_MBIT_RATE;
439 xcio_info(ioctxt, "xc_linux_save start %d\n", domid);
441 if (mlock(&ctxt, sizeof(ctxt))) {
442 xcio_perror(ioctxt, "Unable to mlock ctxt");
443 return 1;
444 }
446 if ( xc_domain_getfullinfo( xc_handle, domid, &info, &ctxt) )
447 {
448 xcio_error(ioctxt, "Could not get full domain info");
449 goto out;
450 }
451 shared_info_frame = info.shared_info_frame;
453 /* A cheesy test to see whether the domain contains valid state. */
454 if ( ctxt.pt_base == 0 ){
455 xcio_error(ioctxt, "Domain is not in a valid Linux guest OS state");
456 goto out;
457 }
459 nr_pfns = info.max_pages;
461 /* cheesy sanity check */
462 if ( nr_pfns > 1024*1024 ){
463 xcio_error(ioctxt, "Invalid state record -- pfn count out of range: %lu", nr_pfns);
464 goto out;
465 }
468 /* Map the shared info frame */
469 live_shinfo = xc_map_foreign_range(xc_handle, domid,
470 PAGE_SIZE, PROT_READ,
471 shared_info_frame);
473 if (!live_shinfo){
474 xcio_error(ioctxt, "Couldn't map live_shinfo");
475 goto out;
476 }
478 /* the pfn_to_mfn_frame_list fits in a single page */
479 live_pfn_to_mfn_frame_list =
480 xc_map_foreign_range(xc_handle, domid,
481 PAGE_SIZE, PROT_READ,
482 live_shinfo->arch.pfn_to_mfn_frame_list );
484 if (!live_pfn_to_mfn_frame_list){
485 xcio_error(ioctxt, "Couldn't map pfn_to_mfn_frame_list");
486 goto out;
487 }
490 /* Map all the frames of the pfn->mfn table. For migrate to succeed,
491 the guest must not change which frames are used for this purpose.
492 (it's not clear why it would want to change them, and we'll be OK
493 from a safety POV anyhow.) */
495 live_pfn_to_mfn_table = xc_map_foreign_batch(xc_handle, domid,
496 PROT_READ,
497 live_pfn_to_mfn_frame_list,
498 (nr_pfns+1023)/1024 );
499 if( !live_pfn_to_mfn_table ){
500 xcio_perror(ioctxt, "Couldn't map pfn_to_mfn table");
501 goto out;
502 }
504 /* Setup the mfn_to_pfn table mapping */
505 mfn_to_pfn_table_start_mfn = xc_get_m2p_start_mfn( xc_handle );
507 live_mfn_to_pfn_table =
508 xc_map_foreign_range(xc_handle, DOMID_XEN,
509 PAGE_SIZE*1024, PROT_READ,
510 mfn_to_pfn_table_start_mfn );
512 /* Canonicalise the pfn-to-mfn table frame-number list. */
513 memcpy( pfn_to_mfn_frame_list, live_pfn_to_mfn_frame_list, PAGE_SIZE );
515 for ( i = 0; i < nr_pfns; i += 1024 ){
516 if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) ){
517 xcio_error(ioctxt, "Frame # in pfn-to-mfn frame list is not in pseudophys");
518 goto out;
519 }
520 }
523 /* Domain is still running at this point */
525 if( live )
526 {
527 if ( xc_shadow_control( xc_handle, domid,
528 DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
529 NULL, 0, NULL ) < 0 ) {
530 xcio_error(ioctxt, "Couldn't enable shadow mode");
531 goto out;
532 }
534 last_iter = 0;
535 } else{
536 /* This is a non-live suspend. Issue the call back to get the
537 domain suspended */
539 last_iter = 1;
541 if ( suspend_and_state( xc_handle, ioctxt, &info, &ctxt) )
542 {
543 xcio_error(ioctxt, "Domain appears not to have suspended: %lx",
544 info.flags);
545 goto out;
546 }
548 }
549 sent_last_iter = 1<<20; /* 4GB of pages */
551 /* calculate the power of 2 order of nr_pfns, e.g.
552 15->4 16->4 17->5 */
553 for( i=nr_pfns-1, order_nr=0; i ; i>>=1, order_nr++ );
555 /* Setup to_send bitmap */
556 {
557 /* size these for a maximal 4GB domain, to make interaction
558 with balloon driver easier. It's only user space memory,
559 after all... (3x 128KB) */
561 int sz = ( 1<<20 ) / 8;
563 to_send = malloc( sz );
564 to_fix = calloc( 1, sz );
565 to_skip = malloc( sz );
567 if (!to_send || !to_fix || !to_skip){
568 xcio_error(ioctxt, "Couldn't allocate to_send array");
569 goto out;
570 }
572 memset( to_send, 0xff, sz );
574 if ( mlock( to_send, sz ) ){
575 xcio_perror(ioctxt, "Unable to mlock to_send");
576 return 1;
577 }
579 /* (to fix is local only) */
581 if ( mlock( to_skip, sz ) ){
582 xcio_perror(ioctxt, "Unable to mlock to_skip");
583 return 1;
584 }
586 }
588 analysis_phase( xc_handle, domid, nr_pfns, to_skip, 0 );
590 /* We want zeroed memory so use calloc rather than malloc. */
591 pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long));
592 pfn_batch = calloc(BATCH_SIZE, sizeof(unsigned long));
594 if ( (pfn_type == NULL) || (pfn_batch == NULL) ){
595 errno = ENOMEM;
596 goto out;
597 }
599 if ( mlock( pfn_type, BATCH_SIZE * sizeof(unsigned long) ) ){
600 xcio_error(ioctxt, "Unable to mlock");
601 goto out;
602 }
605 /*
606 * Quick belt and braces sanity check.
607 */
608 #if DEBUG
609 {
610 int err=0;
611 for ( i = 0; i < nr_pfns; i++ )
612 {
613 mfn = live_pfn_to_mfn_table[i];
615 if( (live_mfn_to_pfn_table[mfn] != i) && (mfn != 0xffffffffUL) )
616 {
617 printf("i=0x%x mfn=%lx live_mfn_to_pfn_table=%lx\n",
618 i,mfn,live_mfn_to_pfn_table[mfn]);
619 err++;
620 }
621 }
622 printf("Had %d unexplained entries in p2m table\n",err);
623 }
624 #endif
627 /* Start writing out the saved-domain record. */
629 if ( xcio_write(ioctxt, "LinuxGuestRecord", 16) ||
630 xcio_write(ioctxt, &nr_pfns, sizeof(unsigned long)) ||
631 xcio_write(ioctxt, pfn_to_mfn_frame_list, PAGE_SIZE) ){
632 xcio_error(ioctxt, "Error writing header");
633 goto out;
634 }
635 if(write_vmconfig(ioctxt)){
636 xcio_error(ioctxt, "Error writing vmconfig");
637 goto out;
638 }
640 print_stats( xc_handle, domid, 0, &stats, 0 );
642 /* Now write out each data page, canonicalising page tables as we go... */
644 while(1){
645 unsigned int prev_pc, sent_this_iter, N, batch;
647 iter++;
648 sent_this_iter = 0;
649 skip_this_iter = 0;
650 prev_pc = 0;
651 N=0;
653 xcio_info(ioctxt, "Saving memory pages: iter %d 0%%", iter);
655 while( N < nr_pfns ){
656 unsigned int this_pc = (N * 100) / nr_pfns;
658 if ( (this_pc - prev_pc) >= 5 ){
659 xcio_info(ioctxt, "\b\b\b\b%3d%%", this_pc);
660 prev_pc = this_pc;
661 }
663 /* slightly wasteful to peek the whole array every time,
664 but this is fast enough for the moment. */
666 if ( !last_iter &&
667 xc_shadow_control(xc_handle, domid,
668 DOM0_SHADOW_CONTROL_OP_PEEK,
669 to_skip, nr_pfns, NULL) != nr_pfns )
670 {
671 xcio_error(ioctxt, "Error peeking shadow bitmap");
672 goto out;
673 }
676 /* load pfn_type[] with the mfn of all the pages we're doing in
677 this batch. */
679 for ( batch = 0; batch < BATCH_SIZE && N < nr_pfns ; N++ )
680 {
681 int n = permute(N, nr_pfns, order_nr );
683 if ( 0 && debug ) {
684 fprintf(stderr,"%d pfn= %08lx mfn= %08lx %d "
685 " [mfn]= %08lx\n",
686 iter, (unsigned long)n, live_pfn_to_mfn_table[n],
687 test_bit(n,to_send),
688 live_mfn_to_pfn_table[live_pfn_to_mfn_table[n]&
689 0xFFFFF]);
690 }
692 if ( !last_iter &&
693 test_bit(n, to_send) &&
694 test_bit(n, to_skip) ) {
695 skip_this_iter++; /* stats keeping */
696 }
698 if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
699 (test_bit(n, to_send) && last_iter) ||
700 (test_bit(n, to_fix) && last_iter)) ) {
701 continue;
702 }
704 /* we get here if:
705 1. page is marked to_send & hasn't already been re-dirtied
706 2. (ignore to_skip in last iteration)
707 3. add in pages that still need fixup (net bufs)
708 */
710 pfn_batch[batch] = n;
711 pfn_type[batch] = live_pfn_to_mfn_table[n];
713 if( ! is_mapped(pfn_type[batch]) )
714 {
715 /* not currently in pseudo-physical map -- set bit
716 in to_fix that we must send this page in last_iter
717 unless its sent sooner anyhow */
719 set_bit( n, to_fix );
720 if( iter>1 )
721 DDPRINTF("netbuf race: iter %d, pfn %x. mfn %lx\n",
722 iter,n,pfn_type[batch]);
723 continue;
724 }
726 if ( last_iter &&
727 test_bit(n, to_fix) &&
728 !test_bit(n, to_send) )
729 {
730 needed_to_fix++;
731 DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
732 iter,n,pfn_type[batch]);
733 }
735 clear_bit(n, to_fix);
737 batch++;
738 }
740 // DDPRINTF("batch %d:%d (n=%d)\n", iter, batch, n);
742 if ( batch == 0 )
743 goto skip; /* vanishingly unlikely... */
745 if ( (region_base = xc_map_foreign_batch(xc_handle, domid,
746 PROT_READ,
747 pfn_type,
748 batch)) == 0 ){
749 xcio_perror(ioctxt, "map batch failed");
750 goto out;
751 }
753 if ( get_pfn_type_batch(xc_handle, domid, batch, pfn_type) ){
754 xcio_error(ioctxt, "get_pfn_type_batch failed");
755 goto out;
756 }
758 for ( j = 0; j < batch; j++ ){
759 if ( (pfn_type[j] & LTAB_MASK) == XTAB ){
760 DDPRINTF("type fail: page %i mfn %08lx\n",j,pfn_type[j]);
761 continue;
762 }
764 if ( 0 && debug )
765 fprintf(stderr, "%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
766 " sum= %08lx\n",
767 iter,
768 (pfn_type[j] & LTAB_MASK) | pfn_batch[j],
769 pfn_type[j],
770 live_mfn_to_pfn_table[pfn_type[j]&(~LTAB_MASK)],
771 csum_page(region_base + (PAGE_SIZE*j)));
773 /* canonicalise mfn->pfn */
774 pfn_type[j] = (pfn_type[j] & LTAB_MASK) | pfn_batch[j];
775 }
777 if ( xcio_write(ioctxt, &batch, sizeof(int) ) ){
778 xcio_error(ioctxt, "Error when writing to state file (2)");
779 goto out;
780 }
782 if ( xcio_write(ioctxt, pfn_type, sizeof(unsigned long)*j ) ){
783 xcio_error(ioctxt, "Error when writing to state file (3)");
784 goto out;
785 }
787 /* entering this loop, pfn_type is now in pfns (Not mfns) */
788 for( j = 0; j < batch; j++ ){
789 /* write out pages in batch */
790 if( (pfn_type[j] & LTAB_MASK) == XTAB){
791 DDPRINTF("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]);
792 continue;
793 }
795 if ( ((pfn_type[j] & LTABTYPE_MASK) == L1TAB) ||
796 ((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ){
797 memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE);
799 for ( k = 0;
800 k < (((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ?
801 (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) :
802 1024);
803 k++ ){
804 unsigned long pfn;
806 if ( !(page[k] & _PAGE_PRESENT) )
807 continue;
809 mfn = page[k] >> PAGE_SHIFT;
810 pfn = live_mfn_to_pfn_table[mfn];
812 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
813 {
814 /* I don't think this should ever happen */
815 printf("FNI %d : [%08lx,%d] pte=%08lx, "
816 "mfn=%08lx, pfn=%08lx [mfn]=%08lx\n",
817 j, pfn_type[j], k,
818 page[k], mfn, live_mfn_to_pfn_table[mfn],
819 (live_mfn_to_pfn_table[mfn]<nr_pfns)?
820 live_pfn_to_mfn_table[
821 live_mfn_to_pfn_table[mfn]] :
822 0xdeadbeef);
824 pfn = 0; /* be suspicious */
825 }
827 page[k] &= PAGE_SIZE - 1;
828 page[k] |= pfn << PAGE_SHIFT;
830 #if 0
831 printf("L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx "
832 "xpfn=%d\n",
833 pfn_type[j]>>28,
834 j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT);
835 #endif
837 } /* end of page table rewrite for loop */
839 if ( xcio_ratewrite(ioctxt, page, PAGE_SIZE) ){
840 xcio_error(ioctxt, "Error when writing to state file (4)");
841 goto out;
842 }
844 } /* end of it's a PT page */ else { /* normal page */
846 if ( xcio_ratewrite(ioctxt, region_base + (PAGE_SIZE*j),
847 PAGE_SIZE) ){
848 xcio_error(ioctxt, "Error when writing to state file (5)");
849 goto out;
850 }
851 }
852 } /* end of the write out for this batch */
854 sent_this_iter += batch;
856 } /* end of this while loop for this iteration */
858 munmap(region_base, batch*PAGE_SIZE);
860 skip:
862 total_sent += sent_this_iter;
864 xcio_info(ioctxt, "\r %d: sent %d, skipped %d, ",
865 iter, sent_this_iter, skip_this_iter );
867 if ( last_iter ) {
868 print_stats( xc_handle, domid, sent_this_iter, &stats, 1);
870 xcio_info(ioctxt, "Total pages sent= %d (%.2fx)\n",
871 total_sent, ((float)total_sent)/nr_pfns );
872 xcio_info(ioctxt, "(of which %d were fixups)\n", needed_to_fix );
873 }
875 if (last_iter && debug){
876 int minusone = -1;
877 memset( to_send, 0xff, (nr_pfns+8)/8 );
878 debug = 0;
879 printf("Entering debug resend-all mode\n");
881 /* send "-1" to put receiver into debug mode */
882 if ( xcio_write(ioctxt, &minusone, sizeof(int)) )
883 {
884 xcio_error(ioctxt, "Error when writing to state file (6)");
885 goto out;
886 }
888 continue;
889 }
891 if ( last_iter ) break;
893 if ( live )
894 {
895 if (
896 ( ( sent_this_iter > sent_last_iter ) &&
897 (mbit_rate == MAX_MBIT_RATE ) ) ||
898 (iter >= max_iters) ||
899 (sent_this_iter+skip_this_iter < 50) ||
900 (total_sent > nr_pfns*max_factor) )
901 {
902 DPRINTF("Start last iteration\n");
903 last_iter = 1;
905 if ( suspend_and_state( xc_handle, ioctxt, &info, &ctxt) )
906 {
907 xcio_error(ioctxt, "Domain appears not to have suspended: %lx",
908 info.flags);
909 goto out;
910 }
912 xcio_info(ioctxt,
913 "SUSPEND flags %08lx shinfo %08lx eip %08lx "
914 "esi %08lx\n",info.flags,
915 info.shared_info_frame,
916 ctxt.cpu_ctxt.eip, ctxt.cpu_ctxt.esi );
917 }
919 if ( xc_shadow_control( xc_handle, domid,
920 DOM0_SHADOW_CONTROL_OP_CLEAN,
921 to_send, nr_pfns, &stats ) != nr_pfns )
922 {
923 xcio_error(ioctxt, "Error flushing shadow PT");
924 goto out;
925 }
927 sent_last_iter = sent_this_iter;
929 print_stats( xc_handle, domid, sent_this_iter, &stats, 1);
931 }
934 } /* end of while 1 */
936 DPRINTF("All memory is saved\n");
938 /* Success! */
939 rc = 0;
941 /* Zero terminate */
942 if ( xcio_write(ioctxt, &rc, sizeof(int)) )
943 {
944 xcio_error(ioctxt, "Error when writing to state file (6)");
945 goto out;
946 }
948 /* Send through a list of all the PFNs that were not in map at the close */
949 {
950 unsigned int i,j;
951 unsigned int pfntab[1024];
953 for ( i = 0, j = 0; i < nr_pfns; i++ )
954 {
955 if ( ! is_mapped(live_pfn_to_mfn_table[i]) )
956 j++;
957 }
959 if ( xcio_write(ioctxt, &j, sizeof(unsigned int)) )
960 {
961 xcio_error(ioctxt, "Error when writing to state file (6a)");
962 goto out;
963 }
965 for ( i = 0, j = 0; i < nr_pfns; )
966 {
967 if ( ! is_mapped(live_pfn_to_mfn_table[i]) )
968 {
969 pfntab[j++] = i;
970 }
971 i++;
972 if ( j == 1024 || i == nr_pfns )
973 {
974 if ( xcio_write(ioctxt, &pfntab, sizeof(unsigned long)*j) )
975 {
976 xcio_error(ioctxt, "Error when writing to state file (6b)");
977 goto out;
978 }
979 j = 0;
980 }
981 }
982 }
984 /* Map the suspend-record MFN to pin it. The page must be owned by
985 domid for this to succeed. */
986 p_srec = xc_map_foreign_range(xc_handle, domid,
987 sizeof(*p_srec), PROT_READ,
988 ctxt.cpu_ctxt.esi);
989 if (!p_srec){
990 xcio_error(ioctxt, "Couldn't map suspend record");
991 goto out;
992 }
994 if (nr_pfns != p_srec->nr_pfns )
995 {
996 xcio_error(ioctxt, "Suspend record nr_pfns unexpected (%ld != %ld)",
997 p_srec->nr_pfns, nr_pfns);
998 goto out;
999 }
1001 /* Canonicalise the suspend-record frame number. */
1002 if ( !translate_mfn_to_pfn(&ctxt.cpu_ctxt.esi) ){
1003 xcio_error(ioctxt, "Suspend record is not in range of pseudophys map");
1004 goto out;
1007 /* Canonicalise each GDT frame number. */
1008 for ( i = 0; i < ctxt.gdt_ents; i += 512 ) {
1009 if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
1010 xcio_error(ioctxt, "GDT frame is not in range of pseudophys map");
1011 goto out;
1015 /* Canonicalise the page table base pointer. */
1016 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.pt_base >> PAGE_SHIFT) ) {
1017 xcio_error(ioctxt, "PT base is not in range of pseudophys map");
1018 goto out;
1020 ctxt.pt_base = live_mfn_to_pfn_table[ctxt.pt_base >> PAGE_SHIFT] <<
1021 PAGE_SHIFT;
1023 if ( xcio_write(ioctxt, &ctxt, sizeof(ctxt)) ||
1024 xcio_write(ioctxt, live_shinfo, PAGE_SIZE) ) {
1025 xcio_error(ioctxt, "Error when writing to state file (1)");
1026 goto out;
1029 out:
1031 if ( live_shinfo ) munmap(live_shinfo, PAGE_SIZE);
1032 if ( p_srec ) munmap(p_srec, sizeof(*p_srec));
1033 if ( live_pfn_to_mfn_frame_list ) munmap(live_pfn_to_mfn_frame_list, PAGE_SIZE);
1034 if ( live_pfn_to_mfn_table ) munmap(live_pfn_to_mfn_table, nr_pfns*4 );
1035 if ( live_mfn_to_pfn_table ) munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024 );
1037 if ( pfn_type != NULL ) free(pfn_type);
1038 DPRINTF("Save exit rc=%d\n",rc);
1039 return !!rc;