ia64/xen-unstable

view tools/libxc/xc_linux_save.c @ 5787:50da9b240a1f

Recently some people have complained that they cannot set maxmem for a
domU. The problem is that some code wrongly treats dom.max_memkb as being
in bytes rather than in KB. This patch fixes the problem.

Signed-off-by: Nguyen Anh Quynh <aquynh@gmail.com>
author kaf24@firebug.cl.cam.ac.uk
date Fri Jul 15 08:30:22 2005 +0000 (2005-07-15)
parents 48dd03e4b388
children a83ac0806d6b
line source
1 /******************************************************************************
2 * xc_linux_save.c
3 *
4 * Save the state of a running Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <inttypes.h>
10 #include <sys/time.h>
11 #include "xc_private.h"
12 #include <xen/linux/suspend.h>
13 #include <xen/io/domain_controller.h>
14 #include <time.h>
/* Number of pages handled per batch: 1024 pages (4MB) at a time. */
#define BATCH_SIZE 1024

/* Ceiling applied to the adaptive mbit_rate value. */
#define MAX_MBIT_RATE 500

/* Set to 1 to compile in DPRINTF() tracing. */
#define DEBUG 0

/* ERR(): printf-style error report on stderr (currently always enabled). */
#if 1
#define ERR(_f, _a...) fprintf ( stderr, _f , ## _a )
#else
#define ERR(_f, _a...) ((void)0)
#endif

/* DPRINTF(): printf-style debug trace on stderr, compiled out unless DEBUG. */
#if DEBUG
#define DPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
#else
#define DPRINTF(_f, _a...) ((void)0)
#endif

/*
 * PPRINTF(): printf-style progress output on stderr, compiled out unless
 * PROGRESS.  The disabled form expands to ((void)0), like ERR/DPRINTF, so
 * that the macro is a valid expression in either configuration.
 */
#define PROGRESS 0
#if PROGRESS
#define PPRINTF(_f, _a...) fprintf ( stderr, _f , ## _a )
#else
#define PPRINTF(_f, _a...) ((void)0)
#endif
/*
 * Evaluates to non-zero when machine frame number _mfn has a unique mapping
 * in the guest's pseudophysical map, i.e. when all of the following hold:
 *   - _mfn is below 1024*1024;
 *   - the pfn recorded for it in live_mfn_to_pfn_table is below nr_pfns;
 *   - live_pfn_to_mfn_table maps that pfn straight back to _mfn.
 *
 * The macro reads live_mfn_to_pfn_table, live_pfn_to_mfn_table and nr_pfns
 * from the scope in which it is expanded, and evaluates _mfn several times.
 */
#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)                                   \
    ( ((_mfn) < (1024*1024)) &&                                          \
      (live_mfn_to_pfn_table[(_mfn)] < nr_pfns) &&                       \
      (live_pfn_to_mfn_table[live_mfn_to_pfn_table[(_mfn)]] == (_mfn)) )
/*
 * Rewrites the machine frame number stored at *_pmfn as its pseudophysical
 * frame number, looked up in live_mfn_to_pfn_table.  Evaluates to 1 when the
 * conversion was done, or to 0 (leaving *_pmfn untouched) when the frame is
 * not in the pseudophysical map.
 */
#define translate_mfn_to_pfn(_pmfn)                          \
({                                                           \
    unsigned long _m = *(_pmfn);                             \
    int _ok = MFN_IS_IN_PSEUDOPHYS_MAP(_m) ? 1 : 0;          \
    if ( _ok )                                               \
        *(_pmfn) = live_mfn_to_pfn_table[_m];                \
    _ok;                                                     \
})
/* Non-zero when bit 31 (0x80000000) of a pfn-to-mfn table entry is clear. */
#define is_mapped(pfn) (((pfn) & 0x80000000UL) == 0)
/*
 * Returns 1 if bit number nr is set in the bitmap at addr, otherwise 0.
 * The bitmap is addressed as an array of unsigned long words, with bit 0
 * being the least significant bit of the first word.
 */
static inline int test_bit ( int nr, volatile void * addr)
{
    const unsigned long bits_per_word = sizeof(unsigned long) * 8;
    unsigned long *words = (unsigned long *)addr;
    unsigned long word = words[nr / bits_per_word];

    return (word >> (nr % bits_per_word)) & 1;
}
/*
 * Clears bit number nr in the bitmap at addr.  The bitmap is addressed as an
 * array of unsigned long words, bit 0 being the least significant bit of the
 * first word.
 *
 * The mask is built from 1UL so that it has the full width of a bitmap word:
 * an int-typed mask is undefined for bit positions >= 32 and, after sign
 * extension, would wipe every bit from 31 upwards when unsigned long is
 * 64 bits wide.
 */
static inline void clear_bit ( int nr, volatile void * addr)
{
    const unsigned long bits_per_word = sizeof(unsigned long) * 8;

    ((unsigned long*)addr)[nr / bits_per_word] &=
        ~(1UL << (nr % bits_per_word));
}
/*
 * Sets bit number nr in the bitmap at addr.  The bitmap is addressed as an
 * array of unsigned long words, bit 0 being the least significant bit of the
 * first word.
 *
 * The mask is built from 1UL so that it has the full width of a bitmap word:
 * an int-typed mask is undefined for bit positions >= 32 and, after sign
 * extension, would set every bit from 31 upwards when unsigned long is
 * 64 bits wide.
 */
static inline void set_bit ( int nr, volatile void * addr)
{
    const unsigned long bits_per_word = sizeof(unsigned long) * 8;

    ((unsigned long*)addr)[nr / bits_per_word] |=
        (1UL << (nr % bits_per_word));
}
/*
 * Hamming weight of w: the number of bits that are set in it.  Adjacent
 * bit fields are summed pairwise, doubling the field width at each step
 * (1, 2, 4, 8 and finally 16 bits).
 */
static inline unsigned int hweight32(unsigned int w)
{
    static const unsigned int field_mask[5] = {
        0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF
    };
    unsigned int step;

    for ( step = 0; step < 5; step++ )
        w = (w & field_mask[step]) + ((w >> (1u << step)) & field_mask[step]);

    return w;
}
/*
 * Returns how many of the first nr bits are set in the bitmap at addr.  The
 * bitmap is addressed as an array of unsigned long words, bit 0 being the
 * least significant bit of the first word.
 *
 * Every word is counted over its full width, whatever the size of unsigned
 * long.  When nr is not a multiple of the word size, the low (nr % word
 * size) bits of the following word are counted as well, so that word must be
 * readable (the bitmap is expected to be padded to a whole unsigned long).
 * A non-positive nr yields 0.
 */
static inline int count_bits ( int nr, volatile void *addr)
{
    const int bits_per_word = (int)(sizeof(unsigned long) * 8);
    const unsigned long *words = (const unsigned long *)addr;
    int full_words, tail_bits, i, count = 0;
    unsigned long w;

    if ( nr <= 0 )
        return 0;

    full_words = nr / bits_per_word;
    tail_bits  = nr % bits_per_word;

    for ( i = 0; i < full_words; i++ )
    {
        /* Each pass of the inner loop clears the lowest set bit of w. */
        for ( w = words[i]; w != 0; w &= w - 1 )
            count++;
    }

    if ( tail_bits != 0 )
    {
        w = words[full_words] & ((1UL << tail_bits) - 1);
        for ( ; w != 0; w &= w - 1 )
            count++;
    }

    return count;
}
/*
 * Simple permutation of the range [0, nr), used so that pages are scanned in
 * a pseudo-random order.  This gives a better estimate of the domain's page
 * dirtying rate as we go: there are often contiguous ranges of pfns with
 * similar behaviour, and we want to mix them up.
 *
 *   i        - index to permute, 0 <= i < nr
 *   nr       - size of the range
 *   order_nr - power-of-2 order of nr, i.e. nr <= (1 << order_nr);
 *              e.g. nr->order 15->4 16->4 17->5.  A 512MB domain has 128k
 *              pages, order 17.  Must be below the bit width of unsigned int.
 *
 * The index is rotated left by 10 bits within an order_nr-bit field:
 *
 *     QPONMLKJIHGFEDCBA   ->   GFEDCBAQPONMLKJIH        (order 17)
 *
 * and the rotation is repeated while the result falls outside [0, nr).  The
 * loop never repeats when nr is a power of 2.
 *
 * For fields narrower than 10 bits the rotation amount is reduced modulo
 * order_nr, which keeps the mapping a permutation and avoids shifting by a
 * negative count.  For order_nr <= 0 the index is returned unchanged.
 */
static inline int permute( int i, int nr, int order_nr )
{
    unsigned int mask, rot, v;

    if ( order_nr <= 0 )
        return i;

    mask = (1u << order_nr) - 1;
    rot  = 10u % (unsigned int)order_nr;
    v    = (unsigned int)i;

    do {
        if ( rot == 0 )
            v &= mask;   /* rotating by a whole field width is the identity */
        else
            v = ((v << rot) | (v >> ((unsigned int)order_nr - rot))) & mask;
    } while ( (int)v >= nr );

    return (int)v;
}
/*
 * Converts a struct timeval to a single count of microseconds.
 *
 * tv_sec is widened to long long before the multiplication so that the
 * product cannot overflow on platforms where time_t is only 32 bits wide.
 */
static long long tv_to_us( struct timeval *new )
{
    return ((long long)new->tv_sec * 1000000) + new->tv_usec;
}
/* Current time as reported by gettimeofday(), in microseconds. */
static long long llgettimeofday(void)
{
    struct timeval tv;

    gettimeofday(&tv, NULL);

    return tv_to_us(&tv);
}
/*
 * Returns the time elapsed from *old to *new, in microseconds (negative if
 * *new is earlier than *old).
 *
 * The seconds difference is widened to long long before being scaled, so
 * that large intervals do not overflow on platforms where time_t is only
 * 32 bits wide.
 */
static long long tv_delta( struct timeval *new, struct timeval *old )
{
    long long secs  = (long long)new->tv_sec - (long long)old->tv_sec;
    long long usecs = (long long)new->tv_usec - (long long)old->tv_usec;

    return (secs * 1000000) + usecs;
}
/*
 * Initial transmit rate.  ratewrite() writes straight through, without any
 * rate limiting, while this is 0.  (Formerly ioctxt->resource.)
 */
#define START_MBIT_RATE 0

/* Current rate limit, and the rate for which burst_time_us was computed. */
static int mbit_rate;
static int ombit_rate = 0;

/* Length of one burst slot in microseconds; -1 until first computed. */
static int burst_time_us = -1;

#define MBIT_RATE mbit_rate

/* Budget added per burst slot (100*1024). */
#define BURST_BUDGET (100*1024)

/*
 * Burst slot length for a given rate:
 *
 *     burst_time_us = 1000000 / (rate*1024*1024/8 / BURST_BUDGET)
 *                   = 1000000 / (rate*128/100)
 *                   = 781250 / rate
 *
 * e.g. rate = 100 gives a slot of 7812us.
 */
#define RATE_TO_BTU 781250
#define BURST_TIME_US burst_time_us
176 static int
177 ratewrite(int io_fd, void *buf, int n)
178 {
179 static int budget = 0;
180 static struct timeval last_put = { 0 };
181 struct timeval now;
182 struct timespec delay;
183 long long delta;
185 if (START_MBIT_RATE == 0)
186 return write(io_fd, buf, n);
188 budget -= n;
189 if (budget < 0) {
190 if (MBIT_RATE != ombit_rate) {
191 BURST_TIME_US = RATE_TO_BTU / MBIT_RATE;
192 ombit_rate = MBIT_RATE;
193 DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
194 MBIT_RATE, BURST_BUDGET, BURST_TIME_US);
195 }
196 if (last_put.tv_sec == 0) {
197 budget += BURST_BUDGET;
198 gettimeofday(&last_put, NULL);
199 } else {
200 while (budget < 0) {
201 gettimeofday(&now, NULL);
202 delta = tv_delta(&now, &last_put);
203 while (delta > BURST_TIME_US) {
204 budget += BURST_BUDGET;
205 last_put.tv_usec += BURST_TIME_US;
206 if (last_put.tv_usec > 1000000) {
207 last_put.tv_usec -= 1000000;
208 last_put.tv_sec++;
209 }
210 delta -= BURST_TIME_US;
211 }
212 if (budget > 0)
213 break;
214 delay.tv_sec = 0;
215 delay.tv_nsec = 1000 * (BURST_TIME_US - delta);
216 while (delay.tv_nsec > 0)
217 if (nanosleep(&delay, &delay) == 0)
218 break;
219 }
220 }
221 }
222 return write(io_fd, buf, n);
223 }
225 static int print_stats( int xc_handle, u32 domid,
226 int pages_sent, xc_shadow_control_stats_t *stats,
227 int print )
228 {
229 static struct timeval wall_last;
230 static long long d0_cpu_last;
231 static long long d1_cpu_last;
233 struct timeval wall_now;
234 long long wall_delta;
235 long long d0_cpu_now, d0_cpu_delta;
236 long long d1_cpu_now, d1_cpu_delta;
238 gettimeofday(&wall_now, NULL);
240 d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
241 d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
243 if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
244 fprintf(stderr, "ARRHHH!!\n");
246 wall_delta = tv_delta(&wall_now,&wall_last)/1000;
248 if ( wall_delta == 0 ) wall_delta = 1;
250 d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
251 d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
253 if ( print )
254 fprintf(stderr,
255 "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
256 "dirtied %dMb/s %" PRId32 " pages\n",
257 wall_delta,
258 (int)((d0_cpu_delta*100)/wall_delta),
259 (int)((d1_cpu_delta*100)/wall_delta),
260 (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
261 (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
262 stats->dirty_count);
264 if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
265 mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
266 + 50;
267 if (mbit_rate > MAX_MBIT_RATE)
268 mbit_rate = MAX_MBIT_RATE;
269 }
271 d0_cpu_last = d0_cpu_now;
272 d1_cpu_last = d1_cpu_now;
273 wall_last = wall_now;
275 return 0;
276 }
278 static int analysis_phase( int xc_handle, u32 domid,
279 int nr_pfns, unsigned long *arr, int runs )
280 {
281 long long start, now;
282 xc_shadow_control_stats_t stats;
283 int j;
285 start = llgettimeofday();
287 for (j = 0; j < runs; j++)
288 {
289 int i;
291 xc_shadow_control( xc_handle, domid,
292 DOM0_SHADOW_CONTROL_OP_CLEAN,
293 arr, nr_pfns, NULL);
294 fprintf(stderr, "#Flush\n");
295 for ( i = 0; i < 40; i++ )
296 {
297 usleep(50000);
298 now = llgettimeofday();
299 xc_shadow_control( xc_handle, domid,
300 DOM0_SHADOW_CONTROL_OP_PEEK,
301 NULL, 0, &stats);
303 fprintf(stderr, "now= %lld faults= %" PRId32 " dirty= %" PRId32
304 " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n",
305 ((now-start)+500)/1000,
306 stats.fault_count, stats.dirty_count,
307 stats.dirty_net_count, stats.dirty_block_count);
308 }
309 }
311 return -1;
312 }
315 int suspend_and_state(int xc_handle, int io_fd, int dom,
316 xc_dominfo_t *info,
317 vcpu_guest_context_t *ctxt)
318 {
319 int i=0;
320 char ans[30];
322 printf("suspend\n");
323 fflush(stdout);
324 if (fgets(ans, sizeof(ans), stdin) == NULL) {
325 ERR("failed reading suspend reply");
326 return -1;
327 }
328 if (strncmp(ans, "done\n", 5)) {
329 ERR("suspend reply incorrect: %s", ans);
330 return -1;
331 }
333 retry:
335 if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1)
336 {
337 ERR("Could not get domain info");
338 return -1;
339 }
341 if ( xc_domain_get_vcpu_context(xc_handle, dom, 0 /* XXX */,
342 ctxt) )
343 {
344 ERR("Could not get vcpu context");
345 }
347 if ( info->shutdown && info->shutdown_reason == SHUTDOWN_suspend )
348 {
349 return 0; // success
350 }
352 if ( info->paused )
353 {
354 // try unpausing domain, wait, and retest
355 xc_domain_unpause( xc_handle, dom );
357 ERR("Domain was paused. Wait and re-test.");
358 usleep(10000); // 10ms
360 goto retry;
361 }
364 if( ++i < 100 )
365 {
366 ERR("Retry suspend domain.");
367 usleep(10000); // 10ms
368 goto retry;
369 }
371 ERR("Unable to suspend domain.");
373 return -1;
374 }
376 int xc_linux_save(int xc_handle, int io_fd, u32 dom)
377 {
378 xc_dominfo_t info;
380 int rc = 1, i, j, k, last_iter, iter = 0;
381 unsigned long mfn;
382 int live = 0; // (ioctxt->flags & XCFLAGS_LIVE);
383 int debug = 0; // (ioctxt->flags & XCFLAGS_DEBUG);
384 int sent_last_iter, skip_this_iter;
386 /* Important tuning parameters */
387 int max_iters = 29; /* limit us to 30 times round loop */
388 int max_factor = 3; /* never send more than 3x nr_pfns */
390 /* The new domain's shared-info frame number. */
391 unsigned long shared_info_frame;
393 /* A copy of the CPU context of the guest. */
394 vcpu_guest_context_t ctxt;
396 /* A table containg the type of each PFN (/not/ MFN!). */
397 unsigned long *pfn_type = NULL;
398 unsigned long *pfn_batch = NULL;
400 /* A temporary mapping, and a copy, of one frame of guest memory. */
401 unsigned long page[1024];
403 /* A copy of the pfn-to-mfn table frame list. */
404 unsigned long *live_pfn_to_mfn_frame_list = NULL;
405 unsigned long pfn_to_mfn_frame_list[1024];
407 /* Live mapping of the table mapping each PFN to its current MFN. */
408 unsigned long *live_pfn_to_mfn_table = NULL;
409 /* Live mapping of system MFN to PFN table. */
410 unsigned long *live_mfn_to_pfn_table = NULL;
411 unsigned long mfn_to_pfn_table_start_mfn;
413 /* Live mapping of shared info structure */
414 shared_info_t *live_shinfo = NULL;
416 /* base of the region in which domain memory is mapped */
417 unsigned char *region_base = NULL;
419 /* A temporary mapping, and a copy, of the guest's suspend record. */
420 suspend_record_t *p_srec = NULL;
422 /* number of pages we're dealing with */
423 unsigned long nr_pfns;
425 /* power of 2 order of nr_pfns */
426 int order_nr;
428 /* bitmap of pages:
429 - that should be sent this iteration (unless later marked as skip);
430 - to skip this iteration because already dirty;
431 - to fixup by sending at the end if not already resent; */
432 unsigned long *to_send, *to_skip, *to_fix;
434 xc_shadow_control_stats_t stats;
436 int needed_to_fix = 0;
437 int total_sent = 0;
439 MBIT_RATE = START_MBIT_RATE;
441 DPRINTF("xc_linux_save start %d\n", dom);
443 if (mlock(&ctxt, sizeof(ctxt))) {
444 ERR("Unable to mlock ctxt");
445 return 1;
446 }
448 if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1)
449 {
450 ERR("Could not get domain info");
451 goto out;
452 }
453 if ( xc_domain_get_vcpu_context( xc_handle, dom, /* FIXME */ 0,
454 &ctxt) )
455 {
456 ERR("Could not get vcpu context");
457 goto out;
458 }
459 shared_info_frame = info.shared_info_frame;
461 /* A cheesy test to see whether the domain contains valid state. */
462 if ( ctxt.ctrlreg[3] == 0 ){
463 ERR("Domain is not in a valid Linux guest OS state");
464 goto out;
465 }
467 nr_pfns = info.max_memkb >> (PAGE_SHIFT - 10);
469 /* cheesy sanity check */
470 if ( nr_pfns > 1024*1024 )
471 {
472 ERR("Invalid state record -- pfn count out of range: %lu", nr_pfns);
473 goto out;
474 }
476 /* Map the shared info frame */
477 live_shinfo = xc_map_foreign_range(xc_handle, dom,
478 PAGE_SIZE, PROT_READ,
479 shared_info_frame);
481 if (!live_shinfo){
482 ERR("Couldn't map live_shinfo");
483 goto out;
484 }
486 /* the pfn_to_mfn_frame_list fits in a single page */
487 live_pfn_to_mfn_frame_list =
488 xc_map_foreign_range(xc_handle, dom,
489 PAGE_SIZE, PROT_READ,
490 live_shinfo->arch.pfn_to_mfn_frame_list );
492 if (!live_pfn_to_mfn_frame_list){
493 ERR("Couldn't map pfn_to_mfn_frame_list");
494 goto out;
495 }
498 /* Map all the frames of the pfn->mfn table. For migrate to succeed,
499 the guest must not change which frames are used for this purpose.
500 (its not clear why it would want to change them, and we'll be OK
501 from a safety POV anyhow. */
503 live_pfn_to_mfn_table = xc_map_foreign_batch(xc_handle, dom,
504 PROT_READ,
505 live_pfn_to_mfn_frame_list,
506 (nr_pfns+1023)/1024 );
507 if( !live_pfn_to_mfn_table ){
508 ERR("Couldn't map pfn_to_mfn table");
509 goto out;
510 }
512 /* Setup the mfn_to_pfn table mapping */
513 mfn_to_pfn_table_start_mfn = xc_get_m2p_start_mfn( xc_handle );
515 live_mfn_to_pfn_table =
516 xc_map_foreign_range(xc_handle, DOMID_XEN,
517 PAGE_SIZE*1024, PROT_READ,
518 mfn_to_pfn_table_start_mfn );
520 /* Canonicalise the pfn-to-mfn table frame-number list. */
521 memcpy( pfn_to_mfn_frame_list, live_pfn_to_mfn_frame_list, PAGE_SIZE );
523 for ( i = 0; i < nr_pfns; i += 1024 ){
524 if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) ){
525 ERR("Frame# in pfn-to-mfn frame list is not in pseudophys");
526 goto out;
527 }
528 }
531 /* Domain is still running at this point */
533 if( live )
534 {
535 if ( xc_shadow_control( xc_handle, dom,
536 DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
537 NULL, 0, NULL ) < 0 ) {
538 ERR("Couldn't enable shadow mode");
539 goto out;
540 }
542 last_iter = 0;
543 } else{
544 /* This is a non-live suspend. Issue the call back to get the
545 domain suspended */
547 last_iter = 1;
549 if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) )
550 {
551 ERR("Domain appears not to have suspended");
552 goto out;
553 }
555 }
556 sent_last_iter = 1<<20; /* 4GB of pages */
558 /* calculate the power of 2 order of nr_pfns, e.g.
559 15->4 16->4 17->5 */
560 for( i=nr_pfns-1, order_nr=0; i ; i>>=1, order_nr++ );
562 /* Setup to_send bitmap */
563 {
564 /* size these for a maximal 4GB domain, to make interaction
565 with balloon driver easier. It's only user space memory,
566 ater all... (3x 128KB) */
568 int sz = ( 1<<20 ) / 8;
570 to_send = malloc( sz );
571 to_fix = calloc( 1, sz );
572 to_skip = malloc( sz );
574 if (!to_send || !to_fix || !to_skip){
575 ERR("Couldn't allocate to_send array");
576 goto out;
577 }
579 memset( to_send, 0xff, sz );
581 if ( mlock( to_send, sz ) ){
582 ERR("Unable to mlock to_send");
583 return 1;
584 }
586 /* (to fix is local only) */
588 if ( mlock( to_skip, sz ) ){
589 ERR("Unable to mlock to_skip");
590 return 1;
591 }
593 }
595 analysis_phase( xc_handle, dom, nr_pfns, to_skip, 0 );
597 /* We want zeroed memory so use calloc rather than malloc. */
598 pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long));
599 pfn_batch = calloc(BATCH_SIZE, sizeof(unsigned long));
601 if ( (pfn_type == NULL) || (pfn_batch == NULL) ){
602 errno = ENOMEM;
603 goto out;
604 }
606 if ( mlock( pfn_type, BATCH_SIZE * sizeof(unsigned long) ) ){
607 ERR("Unable to mlock");
608 goto out;
609 }
612 /*
613 * Quick belt and braces sanity check.
614 */
615 #if DEBUG
616 {
617 int err=0;
618 for ( i = 0; i < nr_pfns; i++ )
619 {
620 mfn = live_pfn_to_mfn_table[i];
622 if( (live_mfn_to_pfn_table[mfn] != i) && (mfn != 0xffffffffUL) )
623 {
624 fprintf(stderr, "i=0x%x mfn=%lx live_mfn_to_pfn_table=%lx\n",
625 i,mfn,live_mfn_to_pfn_table[mfn]);
626 err++;
627 }
628 }
629 fprintf(stderr, "Had %d unexplained entries in p2m table\n",err);
630 }
631 #endif
634 /* Start writing out the saved-domain record. */
636 if (write(io_fd, &nr_pfns, sizeof(unsigned long)) !=
637 sizeof(unsigned long)) {
638 ERR("write: nr_pfns");
639 goto out;
640 }
641 if (write(io_fd, pfn_to_mfn_frame_list, PAGE_SIZE) != PAGE_SIZE) {
642 ERR("write: pfn_to_mfn_frame_list");
643 goto out;
644 }
646 print_stats( xc_handle, dom, 0, &stats, 0 );
648 /* Now write out each data page, canonicalising page tables as we go... */
650 while(1){
651 unsigned int prev_pc, sent_this_iter, N, batch;
653 iter++;
654 sent_this_iter = 0;
655 skip_this_iter = 0;
656 prev_pc = 0;
657 N=0;
659 DPRINTF("Saving memory pages: iter %d 0%%", iter);
661 while( N < nr_pfns ){
662 unsigned int this_pc = (N * 100) / nr_pfns;
664 if ( (this_pc - prev_pc) >= 5 ){
665 DPRINTF("\b\b\b\b%3d%%", this_pc);
666 prev_pc = this_pc;
667 }
669 /* slightly wasteful to peek the whole array evey time,
670 but this is fast enough for the moment. */
672 if ( !last_iter &&
673 xc_shadow_control(xc_handle, dom,
674 DOM0_SHADOW_CONTROL_OP_PEEK,
675 to_skip, nr_pfns, NULL) != nr_pfns )
676 {
677 ERR("Error peeking shadow bitmap");
678 goto out;
679 }
682 /* load pfn_type[] with the mfn of all the pages we're doing in
683 this batch. */
685 for ( batch = 0; batch < BATCH_SIZE && N < nr_pfns ; N++ )
686 {
687 int n = permute(N, nr_pfns, order_nr );
689 if ( 0 && debug ) {
690 fprintf(stderr,"%d pfn= %08lx mfn= %08lx %d "
691 " [mfn]= %08lx\n",
692 iter, (unsigned long)n, live_pfn_to_mfn_table[n],
693 test_bit(n,to_send),
694 live_mfn_to_pfn_table[live_pfn_to_mfn_table[n]&
695 0xFFFFF]);
696 }
698 if ( !last_iter &&
699 test_bit(n, to_send) &&
700 test_bit(n, to_skip) ) {
701 skip_this_iter++; /* stats keeping */
702 }
704 if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
705 (test_bit(n, to_send) && last_iter) ||
706 (test_bit(n, to_fix) && last_iter)) ) {
707 continue;
708 }
710 /* we get here if:
711 1. page is marked to_send & hasn't already been re-dirtied
712 2. (ignore to_skip in last iteration)
713 3. add in pages that still need fixup (net bufs)
714 */
716 pfn_batch[batch] = n;
717 pfn_type[batch] = live_pfn_to_mfn_table[n];
719 if( ! is_mapped(pfn_type[batch]) )
720 {
721 /* not currently in pusedo-physical map -- set bit
722 in to_fix that we must send this page in last_iter
723 unless its sent sooner anyhow */
725 set_bit( n, to_fix );
726 if( iter>1 )
727 DPRINTF("netbuf race: iter %d, pfn %x. mfn %lx\n",
728 iter,n,pfn_type[batch]);
729 continue;
730 }
732 if ( last_iter &&
733 test_bit(n, to_fix) &&
734 !test_bit(n, to_send) )
735 {
736 needed_to_fix++;
737 DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
738 iter,n,pfn_type[batch]);
739 }
741 clear_bit(n, to_fix);
743 batch++;
744 }
746 // DPRINTF("batch %d:%d (n=%d)\n", iter, batch, n);
748 if ( batch == 0 )
749 goto skip; /* vanishingly unlikely... */
751 if ( (region_base = xc_map_foreign_batch(xc_handle, dom,
752 PROT_READ,
753 pfn_type,
754 batch)) == 0 ){
755 ERR("map batch failed");
756 goto out;
757 }
759 if ( get_pfn_type_batch(xc_handle, dom, batch, pfn_type) ){
760 ERR("get_pfn_type_batch failed");
761 goto out;
762 }
764 for ( j = 0; j < batch; j++ ){
765 if ( (pfn_type[j] & LTAB_MASK) == XTAB ){
766 DPRINTF("type fail: page %i mfn %08lx\n",j,pfn_type[j]);
767 continue;
768 }
770 if ( 0 && debug )
771 fprintf(stderr, "%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
772 " sum= %08lx\n",
773 iter,
774 (pfn_type[j] & LTAB_MASK) | pfn_batch[j],
775 pfn_type[j],
776 live_mfn_to_pfn_table[pfn_type[j]&(~LTAB_MASK)],
777 csum_page(region_base + (PAGE_SIZE*j)));
779 /* canonicalise mfn->pfn */
780 pfn_type[j] = (pfn_type[j] & LTAB_MASK) | pfn_batch[j];
781 }
783 if (write(io_fd, &batch, sizeof(int)) != sizeof(int)) {
784 ERR("Error when writing to state file (2)");
785 goto out;
786 }
788 if (write(io_fd, pfn_type, sizeof(unsigned long)*j) !=
789 sizeof(unsigned long)*j) {
790 ERR("Error when writing to state file (3)");
791 goto out;
792 }
794 /* entering this loop, pfn_type is now in pfns (Not mfns) */
795 for( j = 0; j < batch; j++ ){
796 /* write out pages in batch */
797 if( (pfn_type[j] & LTAB_MASK) == XTAB){
798 DPRINTF("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]);
799 continue;
800 }
802 if ( ((pfn_type[j] & LTABTYPE_MASK) == L1TAB) ||
803 ((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ){
804 memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE);
806 for ( k = 0;
807 k < (((pfn_type[j] & LTABTYPE_MASK) == L2TAB) ?
808 (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) :
809 1024);
810 k++ ){
811 unsigned long pfn;
813 if ( !(page[k] & _PAGE_PRESENT) )
814 continue;
816 mfn = page[k] >> PAGE_SHIFT;
817 pfn = live_mfn_to_pfn_table[mfn];
819 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
820 {
821 /* I don't think this should ever happen */
822 fprintf(stderr, "FNI %d : [%08lx,%d] pte=%08lx, "
823 "mfn=%08lx, pfn=%08lx [mfn]=%08lx\n",
824 j, pfn_type[j], k,
825 page[k], mfn, live_mfn_to_pfn_table[mfn],
826 (live_mfn_to_pfn_table[mfn]<nr_pfns)?
827 live_pfn_to_mfn_table[
828 live_mfn_to_pfn_table[mfn]] :
829 0xdeadbeef);
831 pfn = 0; /* be suspicious */
832 }
834 page[k] &= PAGE_SIZE - 1;
835 page[k] |= pfn << PAGE_SHIFT;
837 #if 0
838 fprintf(stderr,
839 "L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx "
840 "xpfn=%d\n",
841 pfn_type[j]>>28,
842 j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT);
843 #endif
845 } /* end of page table rewrite for loop */
847 if (ratewrite(io_fd, page, PAGE_SIZE) != PAGE_SIZE) {
848 ERR("Error when writing to state file (4)");
849 goto out;
850 }
852 } /* end of it's a PT page */ else { /* normal page */
854 if (ratewrite(io_fd, region_base + (PAGE_SIZE*j),
855 PAGE_SIZE) != PAGE_SIZE) {
856 ERR("Error when writing to state file (5)");
857 goto out;
858 }
859 }
860 } /* end of the write out for this batch */
862 sent_this_iter += batch;
864 } /* end of this while loop for this iteration */
866 munmap(region_base, batch*PAGE_SIZE);
868 skip:
870 total_sent += sent_this_iter;
872 DPRINTF("\r %d: sent %d, skipped %d, ",
873 iter, sent_this_iter, skip_this_iter );
875 if ( last_iter ) {
876 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
878 DPRINTF("Total pages sent= %d (%.2fx)\n",
879 total_sent, ((float)total_sent)/nr_pfns );
880 DPRINTF("(of which %d were fixups)\n", needed_to_fix );
881 }
883 if (last_iter && debug){
884 int minusone = -1;
885 memset( to_send, 0xff, (nr_pfns+8)/8 );
886 debug = 0;
887 fprintf(stderr, "Entering debug resend-all mode\n");
889 /* send "-1" to put receiver into debug mode */
890 if (write(io_fd, &minusone, sizeof(int)) != sizeof(int)) {
891 ERR("Error when writing to state file (6)");
892 goto out;
893 }
895 continue;
896 }
898 if ( last_iter ) break;
900 if ( live )
901 {
902 if (
903 ( ( sent_this_iter > sent_last_iter ) &&
904 (mbit_rate == MAX_MBIT_RATE ) ) ||
905 (iter >= max_iters) ||
906 (sent_this_iter+skip_this_iter < 50) ||
907 (total_sent > nr_pfns*max_factor) )
908 {
909 DPRINTF("Start last iteration\n");
910 last_iter = 1;
912 if ( suspend_and_state( xc_handle, io_fd, dom, &info, &ctxt) )
913 {
914 ERR("Domain appears not to have suspended");
915 goto out;
916 }
918 DPRINTF("SUSPEND shinfo %08lx eip %08u esi %08u\n",
919 info.shared_info_frame,
920 ctxt.user_regs.eip, ctxt.user_regs.esi);
921 }
923 if ( xc_shadow_control( xc_handle, dom,
924 DOM0_SHADOW_CONTROL_OP_CLEAN,
925 to_send, nr_pfns, &stats ) != nr_pfns )
926 {
927 ERR("Error flushing shadow PT");
928 goto out;
929 }
931 sent_last_iter = sent_this_iter;
933 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
935 }
938 } /* end of while 1 */
940 DPRINTF("All memory is saved\n");
942 /* Success! */
943 rc = 0;
945 /* Zero terminate */
946 if (write(io_fd, &rc, sizeof(int)) != sizeof(int)) {
947 ERR("Error when writing to state file (6)");
948 goto out;
949 }
951 /* Send through a list of all the PFNs that were not in map at the close */
952 {
953 unsigned int i,j;
954 unsigned int pfntab[1024];
956 for ( i = 0, j = 0; i < nr_pfns; i++ )
957 {
958 if ( ! is_mapped(live_pfn_to_mfn_table[i]) )
959 j++;
960 }
962 if (write(io_fd, &j, sizeof(unsigned int)) != sizeof(unsigned int)) {
963 ERR("Error when writing to state file (6a)");
964 goto out;
965 }
967 for ( i = 0, j = 0; i < nr_pfns; )
968 {
969 if ( ! is_mapped(live_pfn_to_mfn_table[i]) )
970 {
971 pfntab[j++] = i;
972 }
973 i++;
974 if ( j == 1024 || i == nr_pfns )
975 {
976 if (write(io_fd, &pfntab, sizeof(unsigned long)*j) !=
977 sizeof(unsigned long)*j) {
978 ERR("Error when writing to state file (6b)");
979 goto out;
980 }
981 j = 0;
982 }
983 }
984 }
986 /* Map the suspend-record MFN to pin it. The page must be owned by
987 dom for this to succeed. */
988 p_srec = xc_map_foreign_range(xc_handle, dom,
989 sizeof(*p_srec), PROT_READ,
990 ctxt.user_regs.esi);
991 if (!p_srec){
992 ERR("Couldn't map suspend record");
993 goto out;
994 }
996 if (nr_pfns != p_srec->nr_pfns )
997 {
998 ERR("Suspend record nr_pfns unexpected (%ld != %ld)",
999 p_srec->nr_pfns, nr_pfns);
1000 goto out;
1003 /* Canonicalise the suspend-record frame number. */
1004 if ( !translate_mfn_to_pfn(&ctxt.user_regs.esi) ){
1005 ERR("Suspend record is not in range of pseudophys map");
1006 goto out;
1009 /* Canonicalise each GDT frame number. */
1010 for ( i = 0; i < ctxt.gdt_ents; i += 512 ) {
1011 if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
1012 ERR("GDT frame is not in range of pseudophys map");
1013 goto out;
1017 /* Canonicalise the page table base pointer. */
1018 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.ctrlreg[3] >> PAGE_SHIFT) ) {
1019 ERR("PT base is not in range of pseudophys map");
1020 goto out;
1022 ctxt.ctrlreg[3] = live_mfn_to_pfn_table[ctxt.ctrlreg[3] >> PAGE_SHIFT] <<
1023 PAGE_SHIFT;
1025 if (write(io_fd, &ctxt, sizeof(ctxt)) != sizeof(ctxt) ||
1026 write(io_fd, live_shinfo, PAGE_SIZE) != PAGE_SIZE) {
1027 ERR("Error when writing to state file (1)");
1028 goto out;
1031 out:
1033 if(live_shinfo)
1034 munmap(live_shinfo, PAGE_SIZE);
1036 if(p_srec)
1037 munmap(p_srec, sizeof(*p_srec));
1039 if(live_pfn_to_mfn_frame_list)
1040 munmap(live_pfn_to_mfn_frame_list, PAGE_SIZE);
1042 if(live_pfn_to_mfn_table)
1043 munmap(live_pfn_to_mfn_table, nr_pfns*4);
1045 if(live_mfn_to_pfn_table)
1046 munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024);
1048 if (pfn_type != NULL)
1049 free(pfn_type);
1051 DPRINTF("Save exit rc=%d\n",rc);
1052 return !!rc;