ia64/xen-unstable

view tools/libxc/xc_linux_save.c @ 4895:24dfd18ea63e

bitkeeper revision 1.1159.258.120 (42848bfe8kMyWWcBA64rq7h7l7AyoA)

Shadow code bug fix (found by Ian) that was breaking refcounts, and subsequently
causing migration problems.
author mafetter@fleming.research
date Fri May 13 11:14:06 2005 +0000 (2005-05-13)
parents b5bf5415d400
children d56c3246d889 18d709f72233
line source
1 /******************************************************************************
2 * xc_linux_save.c
3 *
4 * Save the state of a running Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <inttypes.h>
10 #include <sys/time.h>
11 #include "xc_private.h"
12 #include <xen/linux/suspend.h>
13 #include <time.h>
/* Number of pages handled per batch: 1024 pages (4MB) at a time. */
#define BATCH_SIZE 1024

/* Upper bound for the adaptive transmit rate limit, in Mbit/s. */
#define MAX_MBIT_RATE 500

/* Compile-time switches for normal (DEBUG) and very verbose (DDEBUG) tracing. */
#define DEBUG  0
#define DDEBUG 0

/* printf-style trace that compiles to a no-op unless DEBUG is non-zero. */
#if DEBUG
#define DPRINTF(...) printf(__VA_ARGS__)
#else
#define DPRINTF(...) ((void)0)
#endif

/* printf-style trace that compiles to a no-op unless DDEBUG is non-zero. */
#if DDEBUG
#define DDPRINTF(...) printf(__VA_ARGS__)
#else
#define DDPRINTF(...) ((void)0)
#endif
/*
 * Returns TRUE if the given machine frame number has a unique mapping
 * in the guest's pseudophysical map.
 *
 * Expands in the caller's scope and relies on the caller's variables
 * live_mfn_to_pfn_table, live_pfn_to_mfn_table and nr_pfns.  The argument
 * is evaluated more than once, so it must not have side effects.
 */
#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)                                     \
    (((_mfn) < (1024*1024)) &&                                             \
     ((live_mfn_to_pfn_table[_mfn] < nr_pfns) &&                           \
      (live_pfn_to_mfn_table[live_mfn_to_pfn_table[_mfn]] == (_mfn))))

/*
 * Replaces the MFN stored at *_pmfn with its PFN.
 * Returns TRUE if MFN is successfully converted to a PFN; on failure the
 * value at *_pmfn is left untouched.
 *
 * The temporaries carry a prefix so that an argument such as '&mfn' does not
 * get captured by the macro's own local (which would read an uninitialised
 * variable).
 */
#define translate_mfn_to_pfn(_pmfn)                                        \
({                                                                         \
    unsigned long _xlat_mfn = *(_pmfn);                                    \
    int _xlat_ok = 1;                                                      \
    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(_xlat_mfn) )                            \
        _xlat_ok = 0;                                                      \
    else                                                                   \
        *(_pmfn) = live_mfn_to_pfn_table[_xlat_mfn];                       \
    _xlat_ok;                                                              \
})
/* TRUE when bit 31 of the table entry is clear, i.e. the entry is not
   flagged as unmapped. */
#define is_mapped(pfn) (((pfn) & 0x80000000UL) == 0)
/*
 * Return 1 if bit 'nr' is set in the bitmap at 'addr', else 0.
 * The bitmap is treated as an array of unsigned long words, bit 0 being the
 * least significant bit of the first word.
 */
static inline int test_bit ( int nr, volatile void * addr)
{
    unsigned long *words = (unsigned long *)addr;
    unsigned long word = words[nr / (sizeof(unsigned long) * 8)];

    return (word >> (nr % (sizeof(unsigned long) * 8))) & 1;
}
/*
 * Clear bit 'nr' in the bitmap at 'addr'.
 * The bitmap is treated as an array of unsigned long words, bit 0 being the
 * least significant bit of the first word.
 *
 * The mask is built from 1UL so that it has the width of the word: an int
 * mask would sign-extend for bit 31 (clearing bits 31..63 of a 64-bit word)
 * and shifting an int by 32 or more is undefined.
 */
static inline void clear_bit ( int nr, volatile void * addr)
{
    ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] &=
        ~(1UL << (nr % (sizeof(unsigned long)*8) ) );
}
/*
 * Set bit 'nr' in the bitmap at 'addr'.
 * The bitmap is treated as an array of unsigned long words, bit 0 being the
 * least significant bit of the first word.
 *
 * The mask is built from 1UL so that it has the width of the word: an int
 * mask would sign-extend for bit 31 (setting bits 31..63 of a 64-bit word)
 * and shifting an int by 32 or more is undefined.
 */
static inline void set_bit ( int nr, volatile void * addr)
{
    ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] |=
        (1UL << (nr % (sizeof(unsigned long)*8) ) );
}
/* Returns the hamming weight (i.e. the number of bits set) in a 32-bit word */
static inline unsigned int hweight32(unsigned int w)
{
    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
}

/*
 * Count the bits set in the first 'nr' bits of the bitmap at 'addr'.
 *
 * Only complete unsigned long words are examined: nr is rounded down to a
 * whole number of words, so bits in a trailing partial word are not counted.
 * A non-positive nr yields 0.
 */
static inline int count_bits ( int nr, volatile void *addr)
{
    int i, count = 0;
    int nr_words = (nr > 0) ? (int)(nr / (sizeof(unsigned long)*8)) : 0;
    unsigned long *p = (unsigned long *)addr;

    for ( i = 0; i < nr_words; i++, p++ )
    {
        unsigned long word = *p;

        /* hweight32() only sees 32 bits, so feed it each half of the word.
           The upper half is extracted with two 16-bit shifts so that the
           expression stays well defined (and zero) when long is 32 bits. */
        count += hweight32( (unsigned int)(word & 0xFFFFFFFFUL) );
        count += hweight32( (unsigned int)(((word >> 16) >> 16) & 0xFFFFFFFFUL) );
    }
    return count;
}
/*
 * Map index i (0 <= i < nr) to another index in [0, nr) such that the
 * mapping is a permutation of that range.
 *
 * order_nr is the number of bits needed to represent nr-1 (the power of 2
 * order of nr), as computed by the caller.
 */
static inline int permute( int i, int nr, int order_nr )
{
    /* Need a simple permutation function so that we scan pages in a
       pseudo random order, enabling us to get a better estimate of
       the domain's page dirtying rate as we go (there are often
       contiguous ranges of pfns that have similar behaviour, and we
       want to mix them up. */

    /* e.g. nr->order 15->4 16->4 17->5 */
    /* 512MB domain, 128k pages, order 17 */

    /*
      QPONMLKJIHGFEDCBA
             QPONMLKJIH
      GFEDCBA
     */

    /*
      QPONMLKJIHGFEDCBA
                  EDCBA
             QPONM
      LKJIHGF
      */

    /* The value is rotated left by 10 bits within an order_nr-bit word.
       Reducing the rotation modulo order_nr keeps both shift counts
       non-negative for small domains (order_nr < 10) while leaving the
       result unchanged for order_nr >= 10. */
    const int rot  = (order_nr > 0) ? (10 % order_nr) : 0;
    const int mask = (1 << order_nr) - 1;

    do { i = ((i >> (order_nr - rot)) | (i << rot)) & mask; }
    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */

    return i;
}
/*
 * Convert a timeval to a count of microseconds.
 *
 * The seconds field is widened before scaling so that the multiplication is
 * done in long long even where time_t is only 32 bits wide.
 */
static long long tv_to_us( struct timeval *new )
{
    return ((long long)new->tv_sec * 1000000LL) + new->tv_usec;
}
/* Return the current time of day, as reported by gettimeofday(), converted
   to microseconds by tv_to_us(). */
static long long llgettimeofday()
{
    struct timeval tv;

    gettimeofday(&tv, NULL);

    return tv_to_us(&tv);
}
/*
 * Return (new - old) in microseconds; negative if 'new' is earlier.
 *
 * The differences are widened before scaling so that the multiplication is
 * done in long long even where time_t is only 32 bits wide.
 */
static long long tv_delta( struct timeval *new, struct timeval *old )
{
    return (((long long)new->tv_sec - (long long)old->tv_sec) * 1000000LL) +
        ((long long)new->tv_usec - (long long)old->tv_usec);
}
/* Initial rate limit, read from the i/o context in scope at the point of
   use (a variable named 'ioctxt' must be visible). */
#define START_MBIT_RATE ioctxt->resource

/* Current rate limit, and the rate for which burst_time_us was last computed. */
static int mbit_rate;
static int ombit_rate = 0;

/* Length of one burst slot in microseconds; -1 until first computed. */
static int burst_time_us = -1;

#define MBIT_RATE mbit_rate

/* Bytes that may be written per burst slot. */
#define BURST_BUDGET (100*1024)

/*
 * RATE_TO_BTU / rate gives the slot length in microseconds for a given
 * rate.  Derivation, worked for a rate of 100:
 *
 *   1000000/((100)*1024*1024/8/(100*1024))  = 7812
 *   1000000/((100)*1024/8/(100))            = 7812
 *   1000000/((100)*128/(100))               = 7812
 *   100000000/((100)*128)                   = 7812
 *   100000000/128                           = 781250
 */
#define RATE_TO_BTU 781250
#define BURST_TIME_US burst_time_us
169 static int xcio_ratewrite(XcIOContext *ioctxt, void *buf, int n){
170 static int budget = 0;
171 static struct timeval last_put = { 0 };
172 struct timeval now;
173 struct timespec delay;
174 long long delta;
176 if (START_MBIT_RATE == 0)
177 return xcio_write(ioctxt, buf, n);
179 budget -= n;
180 if (budget < 0) {
181 if (MBIT_RATE != ombit_rate) {
182 BURST_TIME_US = RATE_TO_BTU / MBIT_RATE;
183 ombit_rate = MBIT_RATE;
184 xcio_info(ioctxt,
185 "rate limit: %d mbit/s burst budget %d slot time %d\n",
186 MBIT_RATE, BURST_BUDGET, BURST_TIME_US);
187 }
188 if (last_put.tv_sec == 0) {
189 budget += BURST_BUDGET;
190 gettimeofday(&last_put, NULL);
191 } else {
192 while (budget < 0) {
193 gettimeofday(&now, NULL);
194 delta = tv_delta(&now, &last_put);
195 while (delta > BURST_TIME_US) {
196 budget += BURST_BUDGET;
197 last_put.tv_usec += BURST_TIME_US;
198 if (last_put.tv_usec > 1000000) {
199 last_put.tv_usec -= 1000000;
200 last_put.tv_sec++;
201 }
202 delta -= BURST_TIME_US;
203 }
204 if (budget > 0)
205 break;
206 delay.tv_sec = 0;
207 delay.tv_nsec = 1000 * (BURST_TIME_US - delta);
208 while (delay.tv_nsec > 0)
209 if (nanosleep(&delay, &delay) == 0)
210 break;
211 }
212 }
213 }
214 return xcio_write(ioctxt, buf, n);
215 }
217 static int print_stats( int xc_handle, u32 domid,
218 int pages_sent, xc_shadow_control_stats_t *stats,
219 int print )
220 {
221 static struct timeval wall_last;
222 static long long d0_cpu_last;
223 static long long d1_cpu_last;
225 struct timeval wall_now;
226 long long wall_delta;
227 long long d0_cpu_now, d0_cpu_delta;
228 long long d1_cpu_now, d1_cpu_delta;
230 gettimeofday(&wall_now, NULL);
232 d0_cpu_now = xc_domain_get_cpu_usage( xc_handle, 0 )/1000;
233 d1_cpu_now = xc_domain_get_cpu_usage( xc_handle, domid )/1000;
235 if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
236 printf("ARRHHH!!\n");
238 wall_delta = tv_delta(&wall_now,&wall_last)/1000;
240 if ( wall_delta == 0 ) wall_delta = 1;
242 d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
243 d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
245 if ( print )
246 printf("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
247 "dirtied %dMb/s %" PRId32 " pages\n",
248 wall_delta,
249 (int)((d0_cpu_delta*100)/wall_delta),
250 (int)((d1_cpu_delta*100)/wall_delta),
251 (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
252 (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
253 stats->dirty_count);
255 if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
256 mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
257 + 50;
258 if (mbit_rate > MAX_MBIT_RATE)
259 mbit_rate = MAX_MBIT_RATE;
260 }
262 d0_cpu_last = d0_cpu_now;
263 d1_cpu_last = d1_cpu_now;
264 wall_last = wall_now;
266 return 0;
267 }
269 /** Write the vmconfig string.
270 * It is stored as a 4-byte count 'n' followed by n bytes.
271 *
272 * @param ioctxt i/o context
273 * @return 0 on success, non-zero on error.
274 */
275 static int write_vmconfig(XcIOContext *ioctxt){
276 int err = -1;
277 if(xcio_write(ioctxt, &ioctxt->vmconfig_n, sizeof(ioctxt->vmconfig_n))) goto exit;
278 if(xcio_write(ioctxt, ioctxt->vmconfig, ioctxt->vmconfig_n)) goto exit;
279 err = 0;
280 exit:
281 return err;
282 }
284 static int analysis_phase( int xc_handle, u32 domid,
285 int nr_pfns, unsigned long *arr, int runs )
286 {
287 long long start, now;
288 xc_shadow_control_stats_t stats;
289 int j;
291 start = llgettimeofday();
293 for (j = 0; j < runs; j++)
294 {
295 int i;
297 xc_shadow_control( xc_handle, domid,
298 DOM0_SHADOW_CONTROL_OP_CLEAN,
299 arr, nr_pfns, NULL);
300 printf("#Flush\n");
301 for ( i = 0; i < 40; i++ )
302 {
303 usleep(50000);
304 now = llgettimeofday();
305 xc_shadow_control( xc_handle, domid,
306 DOM0_SHADOW_CONTROL_OP_PEEK,
307 NULL, 0, &stats);
309 printf("now= %lld faults= %" PRId32 " dirty= %" PRId32
310 " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n",
311 ((now-start)+500)/1000,
312 stats.fault_count, stats.dirty_count,
313 stats.dirty_net_count, stats.dirty_block_count);
314 }
315 }
317 return -1;
318 }
321 int suspend_and_state(int xc_handle, XcIOContext *ioctxt,
322 xc_domaininfo_t *info,
323 full_execution_context_t *ctxt)
324 {
325 int i=0;
327 xcio_suspend_domain(ioctxt);
329 retry:
331 if ( xc_domain_getfullinfo(xc_handle, ioctxt->domain, info, ctxt) )
332 {
333 xcio_error(ioctxt, "Could not get full domain info");
334 return -1;
335 }
337 if ( (info->flags &
338 (DOMFLAGS_SHUTDOWN | (SHUTDOWN_suspend<<DOMFLAGS_SHUTDOWNSHIFT))) ==
339 (DOMFLAGS_SHUTDOWN | (SHUTDOWN_suspend<<DOMFLAGS_SHUTDOWNSHIFT)) )
340 {
341 return 0; // success
342 }
344 if ( info->flags & DOMFLAGS_PAUSED )
345 {
346 // try unpausing domain, wait, and retest
347 xc_domain_unpause( xc_handle, ioctxt->domain );
349 xcio_error(ioctxt, "Domain was paused. Wait and re-test. (%lx)",
350 info->flags);
351 usleep(10000); // 10ms
353 goto retry;
354 }
357 if( ++i < 100 )
358 {
359 xcio_error(ioctxt, "Retry suspend domain (%lx)",
360 info->flags);
361 usleep(10000); // 10ms
362 goto retry;
363 }
365 xcio_error(ioctxt, "Unable to suspend domain. (%lx)",
366 info->flags);
368 return -1;
369 }
371 int xc_linux_save(int xc_handle, XcIOContext *ioctxt)
372 {
373 xc_domaininfo_t info;
375 int rc = 1, i, j, k, last_iter, iter = 0;
376 unsigned long mfn;
377 u32 domid = ioctxt->domain;
378 int live = (ioctxt->flags & XCFLAGS_LIVE);
379 int debug = (ioctxt->flags & XCFLAGS_DEBUG);
380 int sent_last_iter, skip_this_iter;
382 /* Important tuning parameters */
383 int max_iters = 29; /* limit us to 30 times round loop */
384 int max_factor = 3; /* never send more than 3x nr_pfns */
386 /* The new domain's shared-info frame number. */
387 unsigned long shared_info_frame;
389 /* A copy of the CPU context of the guest. */
390 full_execution_context_t ctxt;
392 /* A table containg the type of each PFN (/not/ MFN!). */
393 unsigned long *pfn_type = NULL;
394 unsigned long *pfn_batch = NULL;
396 /* A temporary mapping, and a copy, of one frame of guest memory. */
397 unsigned long page[1024];
399 /* A copy of the pfn-to-mfn table frame list. */
400 unsigned long *live_pfn_to_mfn_frame_list = NULL;
401 unsigned long pfn_to_mfn_frame_list[1024];
403 /* Live mapping of the table mapping each PFN to its current MFN. */
404 unsigned long *live_pfn_to_mfn_table = NULL;
405 /* Live mapping of system MFN to PFN table. */
406 unsigned long *live_mfn_to_pfn_table = NULL;
407 unsigned long mfn_to_pfn_table_start_mfn;
409 /* Live mapping of shared info structure */
410 shared_info_t *live_shinfo = NULL;
412 /* base of the region in which domain memory is mapped */
413 unsigned char *region_base = NULL;
415 /* A temporary mapping, and a copy, of the guest's suspend record. */
416 suspend_record_t *p_srec = NULL;
418 /* number of pages we're dealing with */
419 unsigned long nr_pfns;
421 /* power of 2 order of nr_pfns */
422 int order_nr;
424 /* bitmap of pages:
425 - that should be sent this iteration (unless later marked as skip);
426 - to skip this iteration because already dirty;
427 - to fixup by sending at the end if not already resent; */
428 unsigned long *to_send, *to_skip, *to_fix;
430 xc_shadow_control_stats_t stats;
432 int needed_to_fix = 0;
433 int total_sent = 0;
435 MBIT_RATE = START_MBIT_RATE;
437 xcio_info(ioctxt, "xc_linux_save start %d\n", domid);
439 if (mlock(&ctxt, sizeof(ctxt))) {
440 xcio_perror(ioctxt, "Unable to mlock ctxt");
441 return 1;
442 }
444 if ( xc_domain_getfullinfo( xc_handle, domid, &info, &ctxt) )
445 {
446 xcio_error(ioctxt, "Could not get full domain info");
447 goto out;
448 }
449 shared_info_frame = info.shared_info_frame;
451 /* A cheesy test to see whether the domain contains valid state. */
452 if ( ctxt.pt_base == 0 ){
453 xcio_error(ioctxt, "Domain is not in a valid Linux guest OS state");
454 goto out;
455 }
457 nr_pfns = info.max_pages;
459 /* cheesy sanity check */
460 if ( nr_pfns > 1024*1024 ){
461 xcio_error(ioctxt, "Invalid state record -- pfn count out of range: %lu", nr_pfns);
462 goto out;
463 }
466 /* Map the shared info frame */
467 live_shinfo = xc_map_foreign_range(xc_handle, domid,
468 PAGE_SIZE, PROT_READ,
469 shared_info_frame);
471 if (!live_shinfo){
472 xcio_error(ioctxt, "Couldn't map live_shinfo");
473 goto out;
474 }
476 /* the pfn_to_mfn_frame_list fits in a single page */
477 live_pfn_to_mfn_frame_list =
478 xc_map_foreign_range(xc_handle, domid,
479 PAGE_SIZE, PROT_READ,
480 live_shinfo->arch.pfn_to_mfn_frame_list );
482 if (!live_pfn_to_mfn_frame_list){
483 xcio_error(ioctxt, "Couldn't map pfn_to_mfn_frame_list");
484 goto out;
485 }
488 /* Map all the frames of the pfn->mfn table. For migrate to succeed,
489 the guest must not change which frames are used for this purpose.
490 (its not clear why it would want to change them, and we'll be OK
491 from a safety POV anyhow. */
493 live_pfn_to_mfn_table = xc_map_foreign_batch(xc_handle, domid,
494 PROT_READ,
495 live_pfn_to_mfn_frame_list,
496 (nr_pfns+1023)/1024 );
497 if( !live_pfn_to_mfn_table ){
498 xcio_perror(ioctxt, "Couldn't map pfn_to_mfn table");
499 goto out;
500 }
502 /* Setup the mfn_to_pfn table mapping */
503 mfn_to_pfn_table_start_mfn = xc_get_m2p_start_mfn( xc_handle );
505 live_mfn_to_pfn_table =
506 xc_map_foreign_range(xc_handle, DOMID_XEN,
507 PAGE_SIZE*1024, PROT_READ,
508 mfn_to_pfn_table_start_mfn );
510 /* Canonicalise the pfn-to-mfn table frame-number list. */
511 memcpy( pfn_to_mfn_frame_list, live_pfn_to_mfn_frame_list, PAGE_SIZE );
513 for ( i = 0; i < nr_pfns; i += 1024 ){
514 if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) ){
515 xcio_error(ioctxt, "Frame # in pfn-to-mfn frame list is not in pseudophys");
516 goto out;
517 }
518 }
521 /* Domain is still running at this point */
523 if( live )
524 {
525 if ( xc_shadow_control( xc_handle, domid,
526 DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
527 NULL, 0, NULL ) < 0 ) {
528 xcio_error(ioctxt, "Couldn't enable shadow mode");
529 goto out;
530 }
532 last_iter = 0;
533 } else{
534 /* This is a non-live suspend. Issue the call back to get the
535 domain suspended */
537 last_iter = 1;
539 if ( suspend_and_state( xc_handle, ioctxt, &info, &ctxt) )
540 {
541 xcio_error(ioctxt, "Domain appears not to have suspended: %lx",
542 info.flags);
543 goto out;
544 }
546 }
547 sent_last_iter = 1<<20; /* 4GB of pages */
549 /* calculate the power of 2 order of nr_pfns, e.g.
550 15->4 16->4 17->5 */
551 for( i=nr_pfns-1, order_nr=0; i ; i>>=1, order_nr++ );
553 /* Setup to_send bitmap */
554 {
555 /* size these for a maximal 4GB domain, to make interaction
556 with balloon driver easier. It's only user space memory,
557 ater all... (3x 128KB) */
559 int sz = ( 1<<20 ) / 8;
561 to_send = malloc( sz );
562 to_fix = calloc( 1, sz );
563 to_skip = malloc( sz );
565 if (!to_send || !to_fix || !to_skip){
566 xcio_error(ioctxt, "Couldn't allocate to_send array");
567 goto out;
568 }
570 memset( to_send, 0xff, sz );
572 if ( mlock( to_send, sz ) ){
573 xcio_perror(ioctxt, "Unable to mlock to_send");
574 return 1;
575 }
577 /* (to fix is local only) */
579 if ( mlock( to_skip, sz ) ){
580 xcio_perror(ioctxt, "Unable to mlock to_skip");
581 return 1;
582 }
584 }
586 analysis_phase( xc_handle, domid, nr_pfns, to_skip, 0 );
588 /* We want zeroed memory so use calloc rather than malloc. */
589 pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long));
590 pfn_batch = calloc(BATCH_SIZE, sizeof(unsigned long));
592 if ( (pfn_type == NULL) || (pfn_batch == NULL) ){
593 errno = ENOMEM;
594 goto out;
595 }
597 if ( mlock( pfn_type, BATCH_SIZE * sizeof(unsigned long) ) ){
598 xcio_error(ioctxt, "Unable to mlock");
599 goto out;
600 }
603 /*
604 * Quick belt and braces sanity check.
605 */
606 #if DEBUG
607 {
608 int err=0;
609 for ( i = 0; i < nr_pfns; i++ )
610 {
611 mfn = live_pfn_to_mfn_table[i];
613 if( (live_mfn_to_pfn_table[mfn] != i) && (mfn != 0xffffffffUL) )
614 {
615 printf("i=0x%x mfn=%lx live_mfn_to_pfn_table=%lx\n",
616 i,mfn,live_mfn_to_pfn_table[mfn]);
617 err++;
618 }
619 }
620 printf("Had %d unexplained entries in p2m table\n",err);
621 }
622 #endif
625 /* Start writing out the saved-domain record. */
627 if ( xcio_write(ioctxt, "LinuxGuestRecord", 16) ||
628 xcio_write(ioctxt, &nr_pfns, sizeof(unsigned long)) ||
629 xcio_write(ioctxt, pfn_to_mfn_frame_list, PAGE_SIZE) ){
630 xcio_error(ioctxt, "Error writing header");
631 goto out;
632 }
633 if(write_vmconfig(ioctxt)){
634 xcio_error(ioctxt, "Error writing vmconfig");
635 goto out;
636 }
638 print_stats( xc_handle, domid, 0, &stats, 0 );
640 /* Now write out each data page, canonicalising page tables as we go... */
642 while(1){
643 unsigned int prev_pc, sent_this_iter, N, batch;
645 iter++;
646 sent_this_iter = 0;
647 skip_this_iter = 0;
648 prev_pc = 0;
649 N=0;
651 xcio_info(ioctxt, "Saving memory pages: iter %d 0%%", iter);
653 while( N < nr_pfns ){
654 unsigned int this_pc = (N * 100) / nr_pfns;
656 if ( (this_pc - prev_pc) >= 5 ){
657 xcio_info(ioctxt, "\b\b\b\b%3d%%", this_pc);
658 prev_pc = this_pc;
659 }
661 /* slightly wasteful to peek the whole array evey time,
662 but this is fast enough for the moment. */
664 if ( !last_iter &&
665 xc_shadow_control(xc_handle, domid,
666 DOM0_SHADOW_CONTROL_OP_PEEK,
667 to_skip, nr_pfns, NULL) != nr_pfns )
668 {
669 xcio_error(ioctxt, "Error peeking shadow bitmap");
670 goto out;
671 }
674 /* load pfn_type[] with the mfn of all the pages we're doing in
675 this batch. */
677 for ( batch = 0; batch < BATCH_SIZE && N < nr_pfns ; N++ )
678 {
679 int n = permute(N, nr_pfns, order_nr );
681 if ( 0 && debug ) {
682 fprintf(stderr,"%d pfn= %08lx mfn= %08lx %d "
683 " [mfn]= %08lx\n",
684 iter, (unsigned long)n, live_pfn_to_mfn_table[n],
685 test_bit(n,to_send),
686 live_mfn_to_pfn_table[live_pfn_to_mfn_table[n]&
687 0xFFFFF]);
688 }
690 if ( !last_iter &&
691 test_bit(n, to_send) &&
692 test_bit(n, to_skip) ) {
693 skip_this_iter++; /* stats keeping */
694 }
696 if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
697 (test_bit(n, to_send) && last_iter) ||
698 (test_bit(n, to_fix) && last_iter)) ) {
699 continue;
700 }
702 /* we get here if:
703 1. page is marked to_send & hasn't already been re-dirtied
704 2. (ignore to_skip in last iteration)
705 3. add in pages that still need fixup (net bufs)
706 */
708 pfn_batch[batch] = n;
709 pfn_type[batch] = live_pfn_to_mfn_table[n];
711 if( ! is_mapped(pfn_type[batch]) )
712 {
713 /* not currently in pusedo-physical map -- set bit
714 in to_fix that we must send this page in last_iter
715 unless its sent sooner anyhow */
717 set_bit( n, to_fix );
718 if( iter>1 )
719 DDPRINTF("netbuf race: iter %d, pfn %x. mfn %lx\n",
720 iter,n,pfn_type[batch]);
721 continue;
722 }
724 if ( last_iter &&
725 test_bit(n, to_fix) &&
726 !test_bit(n, to_send) )
727 {
728 needed_to_fix++;
729 DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
730 iter,n,pfn_type[batch]);
731 }
733 clear_bit(n, to_fix);
735 batch++;
736 }
738 // DDPRINTF("batch %d:%d (n=%d)\n", iter, batch, n);
740 if ( batch == 0 )
741 goto skip; /* vanishingly unlikely... */
743 if ( (region_base = xc_map_foreign_batch(xc_handle, domid,
744 PROT_READ,
745 pfn_type,
746 batch)) == 0 ){
747 xcio_perror(ioctxt, "map batch failed");
748 goto out;
749 }
751 if ( get_pfn_type_batch(xc_handle, domid, batch, pfn_type) ){
752 xcio_error(ioctxt, "get_pfn_type_batch failed");
753 goto out;
754 }
756 for ( j = 0; j < batch; j++ ){
757 if ( (pfn_type[j] & LTAB_MASK) == XTAB ){
758 DDPRINTF("type fail: page %i mfn %08lx\n",j,pfn_type[j]);
759 continue;
760 }
762 if ( 0 && debug )
763 fprintf(stderr, "%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
764 " sum= %08lx\n",
765 iter,
766 (pfn_type[j] & LTAB_MASK) | pfn_batch[j],
767 pfn_type[j],
768 live_mfn_to_pfn_table[pfn_type[j]&(~LTAB_MASK)],
769 csum_page(region_base + (PAGE_SIZE*j)));
771 /* canonicalise mfn->pfn */
772 pfn_type[j] = (pfn_type[j] & LTAB_MASK) | pfn_batch[j];
773 }
775 if ( xcio_write(ioctxt, &batch, sizeof(int) ) ){
776 xcio_error(ioctxt, "Error when writing to state file (2)");
777 goto out;
778 }
780 if ( xcio_write(ioctxt, pfn_type, sizeof(unsigned long)*j ) ){
781 xcio_error(ioctxt, "Error when writing to state file (3)");
782 goto out;
783 }
785 /* entering this loop, pfn_type is now in pfns (Not mfns) */
786 for( j = 0; j < batch; j++ ){
787 /* write out pages in batch */
788 if( (pfn_type[j] & LTAB_MASK) == XTAB){
789 DDPRINTF("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]);
790 continue;
791 }
793 if ( ((pfn_type[j] & LTABTYPE_MASK) == L1TAB) ||
794 ((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ){
795 memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE);
797 for ( k = 0;
798 k < (((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ?
799 (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) :
800 1024);
801 k++ ){
802 unsigned long pfn;
804 if ( !(page[k] & _PAGE_PRESENT) )
805 continue;
807 mfn = page[k] >> PAGE_SHIFT;
808 pfn = live_mfn_to_pfn_table[mfn];
810 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
811 {
812 /* I don't think this should ever happen */
813 printf("FNI %d : [%08lx,%d] pte=%08lx, "
814 "mfn=%08lx, pfn=%08lx [mfn]=%08lx\n",
815 j, pfn_type[j], k,
816 page[k], mfn, live_mfn_to_pfn_table[mfn],
817 (live_mfn_to_pfn_table[mfn]<nr_pfns)?
818 live_pfn_to_mfn_table[
819 live_mfn_to_pfn_table[mfn]] :
820 0xdeadbeef);
822 pfn = 0; /* be suspicious */
823 }
825 page[k] &= PAGE_SIZE - 1;
826 page[k] |= pfn << PAGE_SHIFT;
828 #if 0
829 printf("L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx "
830 "xpfn=%d\n",
831 pfn_type[j]>>28,
832 j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT);
833 #endif
835 } /* end of page table rewrite for loop */
837 if ( xcio_ratewrite(ioctxt, page, PAGE_SIZE) ){
838 xcio_error(ioctxt, "Error when writing to state file (4)");
839 goto out;
840 }
842 } /* end of it's a PT page */ else { /* normal page */
844 if ( xcio_ratewrite(ioctxt, region_base + (PAGE_SIZE*j),
845 PAGE_SIZE) ){
846 xcio_error(ioctxt, "Error when writing to state file (5)");
847 goto out;
848 }
849 }
850 } /* end of the write out for this batch */
852 sent_this_iter += batch;
854 } /* end of this while loop for this iteration */
856 munmap(region_base, batch*PAGE_SIZE);
858 skip:
860 total_sent += sent_this_iter;
862 xcio_info(ioctxt, "\r %d: sent %d, skipped %d, ",
863 iter, sent_this_iter, skip_this_iter );
865 if ( last_iter ) {
866 print_stats( xc_handle, domid, sent_this_iter, &stats, 1);
868 xcio_info(ioctxt, "Total pages sent= %d (%.2fx)\n",
869 total_sent, ((float)total_sent)/nr_pfns );
870 xcio_info(ioctxt, "(of which %d were fixups)\n", needed_to_fix );
871 }
873 if (last_iter && debug){
874 int minusone = -1;
875 memset( to_send, 0xff, (nr_pfns+8)/8 );
876 debug = 0;
877 printf("Entering debug resend-all mode\n");
879 /* send "-1" to put receiver into debug mode */
880 if ( xcio_write(ioctxt, &minusone, sizeof(int)) )
881 {
882 xcio_error(ioctxt, "Error when writing to state file (6)");
883 goto out;
884 }
886 continue;
887 }
889 if ( last_iter ) break;
891 if ( live )
892 {
893 if (
894 ( ( sent_this_iter > sent_last_iter ) &&
895 (mbit_rate == MAX_MBIT_RATE ) ) ||
896 (iter >= max_iters) ||
897 (sent_this_iter+skip_this_iter < 50) ||
898 (total_sent > nr_pfns*max_factor) )
899 {
900 DPRINTF("Start last iteration\n");
901 last_iter = 1;
903 if ( suspend_and_state( xc_handle, ioctxt, &info, &ctxt) )
904 {
905 xcio_error(ioctxt, "Domain appears not to have suspended: %lx",
906 info.flags);
907 goto out;
908 }
910 xcio_info(ioctxt,
911 "SUSPEND flags %08lx shinfo %08lx eip %08lx "
912 "esi %08lx\n",info.flags,
913 info.shared_info_frame,
914 ctxt.cpu_ctxt.eip, ctxt.cpu_ctxt.esi );
915 }
917 if ( xc_shadow_control( xc_handle, domid,
918 DOM0_SHADOW_CONTROL_OP_CLEAN,
919 to_send, nr_pfns, &stats ) != nr_pfns )
920 {
921 xcio_error(ioctxt, "Error flushing shadow PT");
922 goto out;
923 }
925 sent_last_iter = sent_this_iter;
927 print_stats( xc_handle, domid, sent_this_iter, &stats, 1);
929 }
932 } /* end of while 1 */
934 DPRINTF("All memory is saved\n");
936 /* Success! */
937 rc = 0;
939 /* Zero terminate */
940 if ( xcio_write(ioctxt, &rc, sizeof(int)) )
941 {
942 xcio_error(ioctxt, "Error when writing to state file (6)");
943 goto out;
944 }
946 /* Send through a list of all the PFNs that were not in map at the close */
947 {
948 unsigned int i,j;
949 unsigned int pfntab[1024];
951 for ( i = 0, j = 0; i < nr_pfns; i++ )
952 {
953 if ( ! is_mapped(live_pfn_to_mfn_table[i]) )
954 j++;
955 }
957 if ( xcio_write(ioctxt, &j, sizeof(unsigned int)) )
958 {
959 xcio_error(ioctxt, "Error when writing to state file (6a)");
960 goto out;
961 }
963 for ( i = 0, j = 0; i < nr_pfns; )
964 {
965 if ( ! is_mapped(live_pfn_to_mfn_table[i]) )
966 {
967 pfntab[j++] = i;
968 }
969 i++;
970 if ( j == 1024 || i == nr_pfns )
971 {
972 if ( xcio_write(ioctxt, &pfntab, sizeof(unsigned long)*j) )
973 {
974 xcio_error(ioctxt, "Error when writing to state file (6b)");
975 goto out;
976 }
977 j = 0;
978 }
979 }
980 }
982 /* Map the suspend-record MFN to pin it. The page must be owned by
983 domid for this to succeed. */
984 p_srec = xc_map_foreign_range(xc_handle, domid,
985 sizeof(*p_srec), PROT_READ,
986 ctxt.cpu_ctxt.esi);
987 if (!p_srec){
988 xcio_error(ioctxt, "Couldn't map suspend record");
989 goto out;
990 }
992 if (nr_pfns != p_srec->nr_pfns )
993 {
994 xcio_error(ioctxt, "Suspend record nr_pfns unexpected (%ld != %ld)",
995 p_srec->nr_pfns, nr_pfns);
996 goto out;
997 }
999 /* Canonicalise the suspend-record frame number. */
1000 if ( !translate_mfn_to_pfn(&ctxt.cpu_ctxt.esi) ){
1001 xcio_error(ioctxt, "Suspend record is not in range of pseudophys map");
1002 goto out;
1005 /* Canonicalise each GDT frame number. */
1006 for ( i = 0; i < ctxt.gdt_ents; i += 512 ) {
1007 if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
1008 xcio_error(ioctxt, "GDT frame is not in range of pseudophys map");
1009 goto out;
1013 /* Canonicalise the page table base pointer. */
1014 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.pt_base >> PAGE_SHIFT) ) {
1015 xcio_error(ioctxt, "PT base is not in range of pseudophys map");
1016 goto out;
1018 ctxt.pt_base = live_mfn_to_pfn_table[ctxt.pt_base >> PAGE_SHIFT] <<
1019 PAGE_SHIFT;
1021 if ( xcio_write(ioctxt, &ctxt, sizeof(ctxt)) ||
1022 xcio_write(ioctxt, live_shinfo, PAGE_SIZE) ) {
1023 xcio_error(ioctxt, "Error when writing to state file (1)");
1024 goto out;
1027 out:
1029 if ( live_shinfo ) munmap(live_shinfo, PAGE_SIZE);
1030 if ( p_srec ) munmap(p_srec, sizeof(*p_srec));
1031 if ( live_pfn_to_mfn_frame_list ) munmap(live_pfn_to_mfn_frame_list, PAGE_SIZE);
1032 if ( live_pfn_to_mfn_table ) munmap(live_pfn_to_mfn_table, nr_pfns*4 );
1033 if ( live_mfn_to_pfn_table ) munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024 );
1035 if ( pfn_type != NULL ) free(pfn_type);
1036 DPRINTF("Save exit rc=%d\n",rc);
1037 return !!rc;