direct-io.hg

view tools/libxc/xc_linux_save.c @ 3435:0fd048d86eed

bitkeeper revision 1.1159.220.3 (41e670c37jmaTxUns3KlvsbVRCg-UA)

The getdomaininfo hypercall now listens to the exec_domain parameter
that was already passed to it, and performs some basic sanity checking.

Added exec_domain (aka vcpu) parameters to xc_domain_getfullinfo()
and xc_domain_get_cpu_usage().
author mafetter@fleming.research
date Thu Jan 13 12:59:47 2005 +0000 (2005-01-13)
parents 9a92e168d231
children 6096356005ba
line source
1 /******************************************************************************
2 * xc_linux_save.c
3 *
4 * Save the state of a running Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <inttypes.h>
10 #include <sys/time.h>
11 #include "xc_private.h"
12 #include <xen/linux/suspend.h>
13 #include <xen/io/domain_controller.h>
14 #include <time.h>
16 #define BATCH_SIZE 1024 /* 1024 pages (4MB) at a time */
18 #define MAX_MBIT_RATE 500
20 #define DEBUG 0
21 #define DDEBUG 0
23 #if DEBUG
24 #define DPRINTF(_f, _a...) printf ( _f , ## _a )
25 #else
26 #define DPRINTF(_f, _a...) ((void)0)
27 #endif
29 #if DDEBUG
30 #define DDPRINTF(_f, _a...) printf ( _f , ## _a )
31 #else
32 #define DDPRINTF(_f, _a...) ((void)0)
33 #endif
/*
 * Returns TRUE if the given machine frame number has a unique mapping
 * in the guest's pseudophysical map.
 *
 * The expansion refers to the caller's live_mfn_to_pfn_table,
 * live_pfn_to_mfn_table and nr_pfns, which must be in scope at the use
 * site. The argument is evaluated several times, so it must not have
 * side effects.
 */
#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)                                  \
    (((_mfn) < (1024*1024)) &&                                          \
     ((live_mfn_to_pfn_table[_mfn] < nr_pfns) &&                        \
      (live_pfn_to_mfn_table[live_mfn_to_pfn_table[_mfn]] == (_mfn))))

/*
 * Returns TRUE if MFN is successfully converted to a PFN.
 *
 * On success the frame number pointed to by _pmfn is overwritten with the
 * matching PFN; on failure it is left untouched. The pointer expression
 * is evaluated exactly once. The temporaries carry a trailing underscore
 * so that a caller may safely pass the address of its own variable named
 * `mfn` (the previous expansion declared a local called `mfn`, which
 * shadowed such a variable and read it uninitialised).
 */
#define translate_mfn_to_pfn(_pmfn)                                     \
({                                                                      \
    __typeof__(_pmfn) xlate_slot_ = (_pmfn);                            \
    unsigned long xlate_mfn_ = *xlate_slot_;                            \
    int xlate_ok_ = MFN_IS_IN_PSEUDOPHYS_MAP(xlate_mfn_);               \
    if ( xlate_ok_ )                                                    \
        *xlate_slot_ = live_mfn_to_pfn_table[xlate_mfn_];               \
    xlate_ok_;                                                          \
})

/* A pfn-to-mfn entry counts as mapped when its top bit (bit 31) is clear. */
#define is_mapped(pfn) (!((pfn) & 0x80000000UL))
/*
 * Report whether bit number nr is set in the bitmap at addr.
 *
 * The bitmap is treated as an array of unsigned long words, with bit 0
 * being the least significant bit of the first word.
 * Returns 1 if the bit is set, 0 otherwise.
 */
static inline int test_bit ( int nr, volatile void * addr)
{
    unsigned long *words = (unsigned long *)addr;
    unsigned long word = words[nr / (sizeof(unsigned long) * 8)];

    /* Bring the requested bit down to position 0 and isolate it. */
    word >>= nr % (sizeof(unsigned long) * 8);
    return word & 1;
}
/*
 * Clear bit number nr in the bitmap at addr.
 *
 * The bitmap is treated as an array of unsigned long words, with bit 0
 * being the least significant bit of the first word. The update is a
 * plain read-modify-write; it is not atomic.
 */
static inline void clear_bit ( int nr, volatile void * addr)
{
    unsigned long *words = (unsigned long *)addr;

    /* The mask must be built as unsigned long: an int-typed "1 << k" is
       undefined for k >= 31 and, once widened, would also wipe the upper
       half of a 64-bit word. */
    words[nr / (sizeof(unsigned long) * 8)] &=
        ~(1UL << (nr % (sizeof(unsigned long) * 8)));
}
/*
 * Set bit number nr in the bitmap at addr.
 *
 * The bitmap is treated as an array of unsigned long words, with bit 0
 * being the least significant bit of the first word. The update is a
 * plain read-modify-write; it is not atomic.
 */
static inline void set_bit ( int nr, volatile void * addr)
{
    unsigned long *words = (unsigned long *)addr;

    /* The mask must be built as unsigned long: an int-typed "1 << k" is
       undefined for k >= 31 and, once sign-extended, would also set the
       upper half of a 64-bit word. */
    words[nr / (sizeof(unsigned long) * 8)] |=
        (1UL << (nr % (sizeof(unsigned long) * 8)));
}
/* Returns the hamming weight (i.e. the number of bits set) in a 32-bit word */
static inline unsigned int hweight32(unsigned int w)
{
    /* Classic parallel bit count: sum adjacent 1-, 2-, 4-, 8- and 16-bit
       fields in turn. */
    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
}

/*
 * Count the bits set in the first nr bits of the bitmap at addr.
 *
 * Only whole unsigned long words are examined: if nr is not a multiple of
 * the word size, the bits in the trailing partial word are not counted.
 * Returns the number of set bits found.
 */
static inline int count_bits ( int nr, volatile void *addr)
{
    int i, count = 0;
    unsigned long *p = (unsigned long *)addr;
    /* We know that the array is padded to unsigned long. */
    for(i=0;i<nr/(sizeof(unsigned long)*8);i++,p++)
    {
        unsigned long word = *p;

        /* Feed the word to hweight32 in 32-bit pieces so that the upper
           half of a 64-bit word is counted as well. The shift is split in
           two so it stays defined when unsigned long is only 32 bits. */
        while ( word != 0 )
        {
            count += hweight32( (unsigned int)(word & 0xFFFFFFFFUL) );
            word = (word >> 16) >> 16;
        }
    }
    return count;
}
/*
 * Map index i (0 <= i < nr) to another index in [0, nr) such that the
 * mapping is a permutation of that range.
 *
 * order_nr is the number of bits needed to represent nr-1. The index is
 * rotated left by 10 bits within an order_nr-bit field; results that land
 * outside [0, nr) are rotated again until they fall inside the range.
 */
static inline int permute( int i, int nr, int order_nr )
{
    /* Need a simple permutation function so that we scan pages in a
       pseudo random order, enabling us to get a better estimate of
       the domain's page dirtying rate as we go (there are often
       contiguous ranges of pfns that have similar behaviour, and we
       want to mix them up. */

    /* e.g. nr->order 15->4 16->4 17->5 */
    /* 512MB domain, 128k pages, order 17 */

    /*
      QPONMLKJIHGFEDCBA
             QPONMLKJIH
      GFEDCBA
     */

    /*
      QPONMLKJIHGFEDCBA
                  EDCBA
             QPONM
      LKJIHGF
     */

    int rot = 10;

    /* For fields narrower than 10 bits a 10-bit rotation would need a
       negative shift count, which is undefined. Rotate by the equivalent
       amount modulo the field width instead (a zero-width field has
       nothing to rotate). */
    if ( order_nr < rot )
        rot = (order_nr > 0) ? (rot % order_nr) : 0;

    do { i = ((i>>(order_nr-rot)) | ( i<<rot ) ) & ((1<<order_nr)-1); }
    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */

    return i;
}
/*
 * Convert a timeval to a single count of microseconds.
 *
 * The seconds field is widened to long long before scaling so that the
 * multiplication cannot overflow where time_t/long is only 32 bits.
 */
static long long tv_to_us( struct timeval *new )
{
    return ((long long)new->tv_sec * 1000000) + new->tv_usec;
}
/*
 * Read the current time with gettimeofday() and return it as a
 * microsecond count, using tv_to_us() for the conversion.
 */
static long long llgettimeofday()
{
    struct timeval tv;

    gettimeofday(&tv, NULL);

    return tv_to_us(&tv);
}
/*
 * Return the interval new - old in microseconds (negative if new is
 * earlier than old).
 *
 * The seconds difference is computed and scaled in long long so that
 * intervals longer than about 35 minutes do not overflow where
 * time_t/long is only 32 bits.
 */
static long long tv_delta( struct timeval *new, struct timeval *old )
{
    long long secs  = (long long)new->tv_sec  - (long long)old->tv_sec;
    long long usecs = (long long)new->tv_usec - (long long)old->tv_usec;

    return (secs * 1000000) + usecs;
}
147 #define START_MBIT_RATE ioctxt->resource
149 static int mbit_rate, ombit_rate = 0;
150 static int burst_time_us = -1;
152 #define MBIT_RATE mbit_rate
153 #define BURST_BUDGET (100*1024)
155 /*
156 1000000/((100)*1024*1024/8/(100*1024))
157 7812
158 1000000/((100)*1024/8/(100))
159 7812
160 1000000/((100)*128/(100))
161 7812
162 100000000/((100)*128)
163 7812
164 100000000/128
165 781250
166 */
167 #define RATE_TO_BTU 781250
168 #define BURST_TIME_US burst_time_us
170 static int xcio_ratewrite(XcIOContext *ioctxt, void *buf, int n){
171 static int budget = 0;
172 static struct timeval last_put = { 0 };
173 struct timeval now;
174 struct timespec delay;
175 long long delta;
176 int rc;
178 if (START_MBIT_RATE == 0)
179 return xcio_write(ioctxt, buf, n);
181 budget -= n;
182 if (budget < 0) {
183 if (MBIT_RATE != ombit_rate) {
184 BURST_TIME_US = RATE_TO_BTU / MBIT_RATE;
185 ombit_rate = MBIT_RATE;
186 xcio_info(ioctxt,
187 "rate limit: %d mbit/s burst budget %d slot time %d\n",
188 MBIT_RATE, BURST_BUDGET, BURST_TIME_US);
189 }
190 if (last_put.tv_sec == 0) {
191 budget += BURST_BUDGET;
192 gettimeofday(&last_put, NULL);
193 } else {
194 while (budget < 0) {
195 gettimeofday(&now, NULL);
196 delta = tv_delta(&now, &last_put);
197 while (delta > BURST_TIME_US) {
198 budget += BURST_BUDGET;
199 last_put.tv_usec += BURST_TIME_US;
200 if (last_put.tv_usec > 1000000) {
201 last_put.tv_usec -= 1000000;
202 last_put.tv_sec++;
203 }
204 delta -= BURST_TIME_US;
205 }
206 if (budget > 0)
207 break;
208 delay.tv_sec = 0;
209 delay.tv_nsec = 1000 * (BURST_TIME_US - delta);
210 while (delay.tv_nsec > 0)
211 if (nanosleep(&delay, &delay) == 0)
212 break;
213 }
214 }
215 }
216 rc = IOStream_write(ioctxt->io, buf, n);
217 return (rc == n ? 0 : rc);
218 }
220 static int print_stats( int xc_handle, u32 domid,
221 int pages_sent, xc_shadow_control_stats_t *stats,
222 int print )
223 {
224 static struct timeval wall_last;
225 static long long d0_cpu_last;
226 static long long d1_cpu_last;
228 struct timeval wall_now;
229 long long wall_delta;
230 long long d0_cpu_now, d0_cpu_delta;
231 long long d1_cpu_now, d1_cpu_delta;
233 gettimeofday(&wall_now, NULL);
235 d0_cpu_now = xc_domain_get_cpu_usage( xc_handle, 0, /* FIXME */ 0 )/1000;
236 d1_cpu_now = xc_domain_get_cpu_usage( xc_handle, domid, /* FIXME */ 0 )/1000;
238 if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
239 printf("ARRHHH!!\n");
241 wall_delta = tv_delta(&wall_now,&wall_last)/1000;
243 if ( wall_delta == 0 ) wall_delta = 1;
245 d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
246 d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
248 if ( print )
249 printf("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
250 "dirtied %dMb/s %" PRId32 " pages\n",
251 wall_delta,
252 (int)((d0_cpu_delta*100)/wall_delta),
253 (int)((d1_cpu_delta*100)/wall_delta),
254 (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
255 (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
256 stats->dirty_count);
258 if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
259 mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
260 + 50;
261 if (mbit_rate > MAX_MBIT_RATE)
262 mbit_rate = MAX_MBIT_RATE;
263 }
265 d0_cpu_last = d0_cpu_now;
266 d1_cpu_last = d1_cpu_now;
267 wall_last = wall_now;
269 return 0;
270 }
272 /** Write the vmconfig string.
273 * It is stored as a 4-byte count 'n' followed by n bytes.
274 *
275 * @param ioctxt i/o context
276 * @return 0 on success, non-zero on error.
277 */
278 static int write_vmconfig(XcIOContext *ioctxt){
279 int err = -1;
280 if(xcio_write(ioctxt, &ioctxt->vmconfig_n, sizeof(ioctxt->vmconfig_n))) goto exit;
281 if(xcio_write(ioctxt, ioctxt->vmconfig, ioctxt->vmconfig_n)) goto exit;
282 err = 0;
283 exit:
284 return err;
285 }
287 static int analysis_phase( int xc_handle, u32 domid,
288 int nr_pfns, unsigned long *arr, int runs )
289 {
290 long long start, now;
291 xc_shadow_control_stats_t stats;
292 int j;
294 start = llgettimeofday();
296 for (j = 0; j < runs; j++)
297 {
298 int i;
300 xc_shadow_control( xc_handle, domid,
301 DOM0_SHADOW_CONTROL_OP_CLEAN,
302 arr, nr_pfns, NULL);
303 printf("#Flush\n");
304 for ( i = 0; i < 40; i++ )
305 {
306 usleep(50000);
307 now = llgettimeofday();
308 xc_shadow_control( xc_handle, domid,
309 DOM0_SHADOW_CONTROL_OP_PEEK,
310 NULL, 0, &stats);
312 printf("now= %lld faults= %" PRId32 " dirty= %" PRId32
313 " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n",
314 ((now-start)+500)/1000,
315 stats.fault_count, stats.dirty_count,
316 stats.dirty_net_count, stats.dirty_block_count);
317 }
318 }
320 return -1;
321 }
324 int suspend_and_state(int xc_handle, XcIOContext *ioctxt,
325 xc_domaininfo_t *info,
326 full_execution_context_t *ctxt)
327 {
328 int i=0;
330 xcio_suspend_domain(ioctxt);
332 retry:
334 if ( xc_domain_getfullinfo(xc_handle, ioctxt->domain, /* FIXME */ 0, info, ctxt) )
335 {
336 xcio_error(ioctxt, "Could not get full domain info");
337 return -1;
338 }
340 if ( (info->flags &
341 (DOMFLAGS_SHUTDOWN | (SHUTDOWN_suspend<<DOMFLAGS_SHUTDOWNSHIFT))) ==
342 (DOMFLAGS_SHUTDOWN | (SHUTDOWN_suspend<<DOMFLAGS_SHUTDOWNSHIFT)) )
343 {
344 return 0; // success
345 }
347 if ( info->flags & DOMFLAGS_PAUSED )
348 {
349 // try unpausing domain, wait, and retest
350 xc_domain_unpause( xc_handle, ioctxt->domain );
352 xcio_error(ioctxt, "Domain was paused. Wait and re-test. (%lx)",
353 info->flags);
354 usleep(10000); // 10ms
356 goto retry;
357 }
360 if( ++i < 100 )
361 {
362 xcio_error(ioctxt, "Retry suspend domain (%lx)",
363 info->flags);
364 usleep(10000); // 10ms
365 goto retry;
366 }
368 xcio_error(ioctxt, "Unable to suspend domain. (%lx)",
369 info->flags);
371 return -1;
372 }
374 int xc_linux_save(int xc_handle, XcIOContext *ioctxt)
375 {
376 xc_domaininfo_t info;
378 int rc = 1, i, j, k, last_iter, iter = 0;
379 unsigned long mfn;
380 u32 domid = ioctxt->domain;
381 int live = (ioctxt->flags & XCFLAGS_LIVE);
382 int debug = (ioctxt->flags & XCFLAGS_DEBUG);
383 int sent_last_iter, skip_this_iter;
385 /* Important tuning parameters */
386 int max_iters = 29; /* limit us to 30 times round loop */
387 int max_factor = 3; /* never send more than 3x nr_pfns */
389 /* The new domain's shared-info frame number. */
390 unsigned long shared_info_frame;
392 /* A copy of the CPU context of the guest. */
393 full_execution_context_t ctxt;
395 /* A table containg the type of each PFN (/not/ MFN!). */
396 unsigned long *pfn_type = NULL;
397 unsigned long *pfn_batch = NULL;
399 /* A temporary mapping, and a copy, of one frame of guest memory. */
400 unsigned long page[1024];
402 /* A copy of the pfn-to-mfn table frame list. */
403 unsigned long *live_pfn_to_mfn_frame_list = NULL;
404 unsigned long pfn_to_mfn_frame_list[1024];
406 /* Live mapping of the table mapping each PFN to its current MFN. */
407 unsigned long *live_pfn_to_mfn_table = NULL;
408 /* Live mapping of system MFN to PFN table. */
409 unsigned long *live_mfn_to_pfn_table = NULL;
410 unsigned long mfn_to_pfn_table_start_mfn;
412 /* Live mapping of shared info structure */
413 shared_info_t *live_shinfo = NULL;
415 /* base of the region in which domain memory is mapped */
416 unsigned char *region_base = NULL;
418 /* A temporary mapping, and a copy, of the guest's suspend record. */
419 suspend_record_t *p_srec = NULL;
421 /* number of pages we're dealing with */
422 unsigned long nr_pfns;
424 /* power of 2 order of nr_pfns */
425 int order_nr;
427 /* bitmap of pages:
428 - that should be sent this iteration (unless later marked as skip);
429 - to skip this iteration because already dirty;
430 - to fixup by sending at the end if not already resent; */
431 unsigned long *to_send, *to_skip, *to_fix;
433 xc_shadow_control_stats_t stats;
435 int needed_to_fix = 0;
436 int total_sent = 0;
438 MBIT_RATE = START_MBIT_RATE;
440 xcio_info(ioctxt, "xc_linux_save start %d\n", domid);
442 if (mlock(&ctxt, sizeof(ctxt))) {
443 xcio_perror(ioctxt, "Unable to mlock ctxt");
444 return 1;
445 }
447 if ( xc_domain_getfullinfo( xc_handle, domid, /* FIXME */ 0, &info, &ctxt) )
448 {
449 xcio_error(ioctxt, "Could not get full domain info");
450 goto out;
451 }
452 shared_info_frame = info.shared_info_frame;
454 /* A cheesy test to see whether the domain contains valid state. */
455 if ( ctxt.pt_base == 0 ){
456 xcio_error(ioctxt, "Domain is not in a valid Linux guest OS state");
457 goto out;
458 }
460 nr_pfns = info.max_pages;
462 /* cheesy sanity check */
463 if ( nr_pfns > 1024*1024 ){
464 xcio_error(ioctxt, "Invalid state record -- pfn count out of range: %lu", nr_pfns);
465 goto out;
466 }
469 /* Map the shared info frame */
470 live_shinfo = xc_map_foreign_range(xc_handle, domid,
471 PAGE_SIZE, PROT_READ,
472 shared_info_frame);
474 if (!live_shinfo){
475 xcio_error(ioctxt, "Couldn't map live_shinfo");
476 goto out;
477 }
479 /* the pfn_to_mfn_frame_list fits in a single page */
480 live_pfn_to_mfn_frame_list =
481 xc_map_foreign_range(xc_handle, domid,
482 PAGE_SIZE, PROT_READ,
483 live_shinfo->arch.pfn_to_mfn_frame_list );
485 if (!live_pfn_to_mfn_frame_list){
486 xcio_error(ioctxt, "Couldn't map pfn_to_mfn_frame_list");
487 goto out;
488 }
491 /* Map all the frames of the pfn->mfn table. For migrate to succeed,
492 the guest must not change which frames are used for this purpose.
493 (its not clear why it would want to change them, and we'll be OK
494 from a safety POV anyhow. */
496 live_pfn_to_mfn_table = xc_map_foreign_batch(xc_handle, domid,
497 PROT_READ,
498 live_pfn_to_mfn_frame_list,
499 (nr_pfns+1023)/1024 );
500 if( !live_pfn_to_mfn_table ){
501 xcio_perror(ioctxt, "Couldn't map pfn_to_mfn table");
502 goto out;
503 }
505 /* Setup the mfn_to_pfn table mapping */
506 mfn_to_pfn_table_start_mfn = xc_get_m2p_start_mfn( xc_handle );
508 live_mfn_to_pfn_table =
509 xc_map_foreign_range(xc_handle, DOMID_XEN,
510 PAGE_SIZE*1024, PROT_READ,
511 mfn_to_pfn_table_start_mfn );
513 /* Canonicalise the pfn-to-mfn table frame-number list. */
514 memcpy( pfn_to_mfn_frame_list, live_pfn_to_mfn_frame_list, PAGE_SIZE );
516 for ( i = 0; i < nr_pfns; i += 1024 ){
517 if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) ){
518 xcio_error(ioctxt, "Frame # in pfn-to-mfn frame list is not in pseudophys");
519 goto out;
520 }
521 }
524 /* Domain is still running at this point */
526 if( live )
527 {
528 if ( xc_shadow_control( xc_handle, domid,
529 DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
530 NULL, 0, NULL ) < 0 ) {
531 xcio_error(ioctxt, "Couldn't enable shadow mode");
532 goto out;
533 }
535 last_iter = 0;
536 } else{
537 /* This is a non-live suspend. Issue the call back to get the
538 domain suspended */
540 last_iter = 1;
542 if ( suspend_and_state( xc_handle, ioctxt, &info, &ctxt) )
543 {
544 xcio_error(ioctxt, "Domain appears not to have suspended: %lx",
545 info.flags);
546 goto out;
547 }
549 }
550 sent_last_iter = 1<<20; /* 4GB of pages */
552 /* calculate the power of 2 order of nr_pfns, e.g.
553 15->4 16->4 17->5 */
554 for( i=nr_pfns-1, order_nr=0; i ; i>>=1, order_nr++ );
556 /* Setup to_send bitmap */
557 {
558 /* size these for a maximal 4GB domain, to make interaction
559 with balloon driver easier. It's only user space memory,
560 ater all... (3x 128KB) */
562 int sz = ( 1<<20 ) / 8;
564 to_send = malloc( sz );
565 to_fix = calloc( 1, sz );
566 to_skip = malloc( sz );
568 if (!to_send || !to_fix || !to_skip){
569 xcio_error(ioctxt, "Couldn't allocate to_send array");
570 goto out;
571 }
573 memset( to_send, 0xff, sz );
575 if ( mlock( to_send, sz ) ){
576 xcio_perror(ioctxt, "Unable to mlock to_send");
577 return 1;
578 }
580 /* (to fix is local only) */
582 if ( mlock( to_skip, sz ) ){
583 xcio_perror(ioctxt, "Unable to mlock to_skip");
584 return 1;
585 }
587 }
589 analysis_phase( xc_handle, domid, nr_pfns, to_skip, 0 );
591 /* We want zeroed memory so use calloc rather than malloc. */
592 pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long));
593 pfn_batch = calloc(BATCH_SIZE, sizeof(unsigned long));
595 if ( (pfn_type == NULL) || (pfn_batch == NULL) ){
596 errno = ENOMEM;
597 goto out;
598 }
600 if ( mlock( pfn_type, BATCH_SIZE * sizeof(unsigned long) ) ){
601 xcio_error(ioctxt, "Unable to mlock");
602 goto out;
603 }
606 /*
607 * Quick belt and braces sanity check.
608 */
609 #if DEBUG
610 {
611 int err=0;
612 for ( i = 0; i < nr_pfns; i++ )
613 {
614 mfn = live_pfn_to_mfn_table[i];
616 if( (live_mfn_to_pfn_table[mfn] != i) && (mfn != 0xffffffffUL) )
617 {
618 printf("i=0x%x mfn=%lx live_mfn_to_pfn_table=%lx\n",
619 i,mfn,live_mfn_to_pfn_table[mfn]);
620 err++;
621 }
622 }
623 printf("Had %d unexplained entries in p2m table\n",err);
624 }
625 #endif
628 /* Start writing out the saved-domain record. */
630 if ( xcio_write(ioctxt, "LinuxGuestRecord", 16) ||
631 xcio_write(ioctxt, &nr_pfns, sizeof(unsigned long)) ||
632 xcio_write(ioctxt, pfn_to_mfn_frame_list, PAGE_SIZE) ){
633 xcio_error(ioctxt, "Error writing header");
634 goto out;
635 }
636 if(write_vmconfig(ioctxt)){
637 xcio_error(ioctxt, "Error writing vmconfig");
638 goto out;
639 }
641 print_stats( xc_handle, domid, 0, &stats, 0 );
643 /* Now write out each data page, canonicalising page tables as we go... */
645 while(1){
646 unsigned int prev_pc, sent_this_iter, N, batch;
648 iter++;
649 sent_this_iter = 0;
650 skip_this_iter = 0;
651 prev_pc = 0;
652 N=0;
654 xcio_info(ioctxt, "Saving memory pages: iter %d 0%%", iter);
656 while( N < nr_pfns ){
657 unsigned int this_pc = (N * 100) / nr_pfns;
659 if ( (this_pc - prev_pc) >= 5 ){
660 xcio_info(ioctxt, "\b\b\b\b%3d%%", this_pc);
661 prev_pc = this_pc;
662 }
664 /* slightly wasteful to peek the whole array evey time,
665 but this is fast enough for the moment. */
667 if ( !last_iter &&
668 xc_shadow_control(xc_handle, domid,
669 DOM0_SHADOW_CONTROL_OP_PEEK,
670 to_skip, nr_pfns, NULL) != nr_pfns )
671 {
672 xcio_error(ioctxt, "Error peeking shadow bitmap");
673 goto out;
674 }
677 /* load pfn_type[] with the mfn of all the pages we're doing in
678 this batch. */
680 for ( batch = 0; batch < BATCH_SIZE && N < nr_pfns ; N++ )
681 {
682 int n = permute(N, nr_pfns, order_nr );
684 if ( 0 && debug ) {
685 fprintf(stderr,"%d pfn= %08lx mfn= %08lx %d "
686 " [mfn]= %08lx\n",
687 iter, (unsigned long)n, live_pfn_to_mfn_table[n],
688 test_bit(n,to_send),
689 live_mfn_to_pfn_table[live_pfn_to_mfn_table[n]&
690 0xFFFFF]);
691 }
693 if ( !last_iter &&
694 test_bit(n, to_send) &&
695 test_bit(n, to_skip) ) {
696 skip_this_iter++; /* stats keeping */
697 }
699 if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
700 (test_bit(n, to_send) && last_iter) ||
701 (test_bit(n, to_fix) && last_iter)) ) {
702 continue;
703 }
705 /* we get here if:
706 1. page is marked to_send & hasn't already been re-dirtied
707 2. (ignore to_skip in last iteration)
708 3. add in pages that still need fixup (net bufs)
709 */
711 pfn_batch[batch] = n;
712 pfn_type[batch] = live_pfn_to_mfn_table[n];
714 if( ! is_mapped(pfn_type[batch]) )
715 {
716 /* not currently in pusedo-physical map -- set bit
717 in to_fix that we must send this page in last_iter
718 unless its sent sooner anyhow */
720 set_bit( n, to_fix );
721 if( iter>1 )
722 DDPRINTF("netbuf race: iter %d, pfn %x. mfn %lx\n",
723 iter,n,pfn_type[batch]);
724 continue;
725 }
727 if ( last_iter &&
728 test_bit(n, to_fix) &&
729 !test_bit(n, to_send) )
730 {
731 needed_to_fix++;
732 DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
733 iter,n,pfn_type[batch]);
734 }
736 clear_bit(n, to_fix);
738 batch++;
739 }
741 // DDPRINTF("batch %d:%d (n=%d)\n", iter, batch, n);
743 if ( batch == 0 )
744 goto skip; /* vanishingly unlikely... */
746 if ( (region_base = xc_map_foreign_batch(xc_handle, domid,
747 PROT_READ,
748 pfn_type,
749 batch)) == 0 ){
750 xcio_perror(ioctxt, "map batch failed");
751 goto out;
752 }
754 if ( get_pfn_type_batch(xc_handle, domid, batch, pfn_type) ){
755 xcio_error(ioctxt, "get_pfn_type_batch failed");
756 goto out;
757 }
759 for ( j = 0; j < batch; j++ ){
760 if ( (pfn_type[j] & LTAB_MASK) == XTAB ){
761 DDPRINTF("type fail: page %i mfn %08lx\n",j,pfn_type[j]);
762 continue;
763 }
765 if ( 0 && debug )
766 fprintf(stderr, "%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
767 " sum= %08lx\n",
768 iter,
769 (pfn_type[j] & LTAB_MASK) | pfn_batch[j],
770 pfn_type[j],
771 live_mfn_to_pfn_table[pfn_type[j]&(~LTAB_MASK)],
772 csum_page(region_base + (PAGE_SIZE*j)));
774 /* canonicalise mfn->pfn */
775 pfn_type[j] = (pfn_type[j] & LTAB_MASK) | pfn_batch[j];
776 }
778 if ( xcio_write(ioctxt, &batch, sizeof(int) ) ){
779 xcio_error(ioctxt, "Error when writing to state file (2)");
780 goto out;
781 }
783 if ( xcio_write(ioctxt, pfn_type, sizeof(unsigned long)*j ) ){
784 xcio_error(ioctxt, "Error when writing to state file (3)");
785 goto out;
786 }
788 /* entering this loop, pfn_type is now in pfns (Not mfns) */
789 for( j = 0; j < batch; j++ ){
790 /* write out pages in batch */
791 if( (pfn_type[j] & LTAB_MASK) == XTAB){
792 DDPRINTF("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]);
793 continue;
794 }
796 if ( ((pfn_type[j] & LTABTYPE_MASK) == L1TAB) ||
797 ((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ){
798 memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE);
800 for ( k = 0;
801 k < (((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ?
802 (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) :
803 1024);
804 k++ ){
805 unsigned long pfn;
807 if ( !(page[k] & _PAGE_PRESENT) )
808 continue;
810 mfn = page[k] >> PAGE_SHIFT;
811 pfn = live_mfn_to_pfn_table[mfn];
813 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
814 {
815 /* I don't think this should ever happen */
816 printf("FNI %d : [%08lx,%d] pte=%08lx, "
817 "mfn=%08lx, pfn=%08lx [mfn]=%08lx\n",
818 j, pfn_type[j], k,
819 page[k], mfn, live_mfn_to_pfn_table[mfn],
820 (live_mfn_to_pfn_table[mfn]<nr_pfns)?
821 live_pfn_to_mfn_table[
822 live_mfn_to_pfn_table[mfn]] :
823 0xdeadbeef);
825 pfn = 0; /* be suspicious */
826 }
828 page[k] &= PAGE_SIZE - 1;
829 page[k] |= pfn << PAGE_SHIFT;
831 #if 0
832 printf("L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx "
833 "xpfn=%d\n",
834 pfn_type[j]>>28,
835 j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT);
836 #endif
838 } /* end of page table rewrite for loop */
840 if ( xcio_ratewrite(ioctxt, page, PAGE_SIZE) ){
841 xcio_error(ioctxt, "Error when writing to state file (4)");
842 goto out;
843 }
845 } /* end of it's a PT page */ else { /* normal page */
847 if ( xcio_ratewrite(ioctxt, region_base + (PAGE_SIZE*j),
848 PAGE_SIZE) ){
849 xcio_error(ioctxt, "Error when writing to state file (5)");
850 goto out;
851 }
852 }
853 } /* end of the write out for this batch */
855 sent_this_iter += batch;
857 } /* end of this while loop for this iteration */
859 munmap(region_base, batch*PAGE_SIZE);
861 skip:
863 total_sent += sent_this_iter;
865 xcio_info(ioctxt, "\r %d: sent %d, skipped %d, ",
866 iter, sent_this_iter, skip_this_iter );
868 if ( last_iter ) {
869 print_stats( xc_handle, domid, sent_this_iter, &stats, 1);
871 xcio_info(ioctxt, "Total pages sent= %d (%.2fx)\n",
872 total_sent, ((float)total_sent)/nr_pfns );
873 xcio_info(ioctxt, "(of which %d were fixups)\n", needed_to_fix );
874 }
876 if (last_iter && debug){
877 int minusone = -1;
878 memset( to_send, 0xff, (nr_pfns+8)/8 );
879 debug = 0;
880 printf("Entering debug resend-all mode\n");
882 /* send "-1" to put receiver into debug mode */
883 if ( xcio_write(ioctxt, &minusone, sizeof(int)) )
884 {
885 xcio_error(ioctxt, "Error when writing to state file (6)");
886 goto out;
887 }
889 continue;
890 }
892 if ( last_iter ) break;
894 if ( live )
895 {
896 if (
897 ( ( sent_this_iter > sent_last_iter ) &&
898 (mbit_rate == MAX_MBIT_RATE ) ) ||
899 (iter >= max_iters) ||
900 (sent_this_iter+skip_this_iter < 50) ||
901 (total_sent > nr_pfns*max_factor) )
902 {
903 DPRINTF("Start last iteration\n");
904 last_iter = 1;
906 if ( suspend_and_state( xc_handle, ioctxt, &info, &ctxt) )
907 {
908 xcio_error(ioctxt, "Domain appears not to have suspended: %lx",
909 info.flags);
910 goto out;
911 }
913 xcio_info(ioctxt,
914 "SUSPEND flags %08lx shinfo %08lx eip %08lx "
915 "esi %08lx\n",info.flags,
916 info.shared_info_frame,
917 ctxt.cpu_ctxt.eip, ctxt.cpu_ctxt.esi );
918 }
920 if ( xc_shadow_control( xc_handle, domid,
921 DOM0_SHADOW_CONTROL_OP_CLEAN,
922 to_send, nr_pfns, &stats ) != nr_pfns )
923 {
924 xcio_error(ioctxt, "Error flushing shadow PT");
925 goto out;
926 }
928 sent_last_iter = sent_this_iter;
930 print_stats( xc_handle, domid, sent_this_iter, &stats, 1);
932 }
935 } /* end of while 1 */
937 DPRINTF("All memory is saved\n");
939 /* Success! */
940 rc = 0;
942 /* Zero terminate */
943 if ( xcio_write(ioctxt, &rc, sizeof(int)) )
944 {
945 xcio_error(ioctxt, "Error when writing to state file (6)");
946 goto out;
947 }
949 /* Send through a list of all the PFNs that were not in map at the close */
950 {
951 unsigned int i,j;
952 unsigned int pfntab[1024];
954 for ( i = 0, j = 0; i < nr_pfns; i++ )
955 {
956 if ( ! is_mapped(live_pfn_to_mfn_table[i]) )
957 j++;
958 }
960 if ( xcio_write(ioctxt, &j, sizeof(unsigned int)) )
961 {
962 xcio_error(ioctxt, "Error when writing to state file (6a)");
963 goto out;
964 }
966 for ( i = 0, j = 0; i < nr_pfns; )
967 {
968 if ( ! is_mapped(live_pfn_to_mfn_table[i]) )
969 {
970 pfntab[j++] = i;
971 }
972 i++;
973 if ( j == 1024 || i == nr_pfns )
974 {
975 if ( xcio_write(ioctxt, &pfntab, sizeof(unsigned long)*j) )
976 {
977 xcio_error(ioctxt, "Error when writing to state file (6b)");
978 goto out;
979 }
980 j = 0;
981 }
982 }
983 }
985 /* Map the suspend-record MFN to pin it. The page must be owned by
986 domid for this to succeed. */
987 p_srec = xc_map_foreign_range(xc_handle, domid,
988 sizeof(*p_srec), PROT_READ,
989 ctxt.cpu_ctxt.esi);
990 if (!p_srec){
991 xcio_error(ioctxt, "Couldn't map suspend record");
992 goto out;
993 }
995 if (nr_pfns != p_srec->nr_pfns )
996 {
997 xcio_error(ioctxt, "Suspend record nr_pfns unexpected (%ld != %ld)",
998 p_srec->nr_pfns, nr_pfns);
999 goto out;
1002 /* Canonicalise the suspend-record frame number. */
1003 if ( !translate_mfn_to_pfn(&ctxt.cpu_ctxt.esi) ){
1004 xcio_error(ioctxt, "Suspend record is not in range of pseudophys map");
1005 goto out;
1008 /* Canonicalise each GDT frame number. */
1009 for ( i = 0; i < ctxt.gdt_ents; i += 512 ) {
1010 if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
1011 xcio_error(ioctxt, "GDT frame is not in range of pseudophys map");
1012 goto out;
1016 /* Canonicalise the page table base pointer. */
1017 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.pt_base >> PAGE_SHIFT) ) {
1018 xcio_error(ioctxt, "PT base is not in range of pseudophys map");
1019 goto out;
1021 ctxt.pt_base = live_mfn_to_pfn_table[ctxt.pt_base >> PAGE_SHIFT] <<
1022 PAGE_SHIFT;
1024 if ( xcio_write(ioctxt, &ctxt, sizeof(ctxt)) ||
1025 xcio_write(ioctxt, live_shinfo, PAGE_SIZE) ) {
1026 xcio_error(ioctxt, "Error when writing to state file (1)");
1027 goto out;
1030 out:
1032 if ( live_shinfo ) munmap(live_shinfo, PAGE_SIZE);
1033 if ( p_srec ) munmap(p_srec, sizeof(*p_srec));
1034 if ( live_pfn_to_mfn_frame_list ) munmap(live_pfn_to_mfn_frame_list, PAGE_SIZE);
1035 if ( live_pfn_to_mfn_table ) munmap(live_pfn_to_mfn_table, nr_pfns*4 );
1036 if ( live_mfn_to_pfn_table ) munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024 );
1038 if ( pfn_type != NULL ) free(pfn_type);
1039 DPRINTF("Save exit rc=%d\n",rc);
1040 return !!rc;