ia64/xen-unstable

view tools/libxc/xc_hvm_save.c @ 14203:beabac411220

[HVM] Save/restore: remove bogus "valid state" check
Signed-off-by: Tim Deegan <Tim.Deegan@xensource.com>
author Tim Deegan <Tim.Deegan@xensource.com>
date Thu Mar 01 14:00:05 2007 +0000 (2007-03-01)
parents e21834bc78f2
children eedbddf55e51
/******************************************************************************
 * xc_hvm_save.c
 *
 * Save the state of a running HVM guest.
 *
 * Copyright (c) 2003, K A Fraser.
 * Copyright (c) 2006 Intel Corporation
 * rewritten for HVM guests by Zhai Edwin <edwin.zhai@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 *
 */

#include <inttypes.h>
#include <time.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>
#include <xen/hvm/e820.h>

#include "xc_private.h"
#include "xg_private.h"
#include "xg_save_restore.h"

/*
** Default values for important tuning parameters. Can override by passing
** non-zero replacement values to xc_hvm_save().
**
** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
**
*/
#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop */
#define DEF_MAX_FACTOR   3   /* never send more than 3x nr_pfns */

/* max mfn of the whole machine */
static unsigned long max_mfn;

/* virtual starting address of the hypervisor */
static unsigned long hvirt_start;

/* #levels of page tables used by the current guest */
static unsigned int pt_levels;

/* total number of pages used by the current guest */
static unsigned long max_pfn;

/*
** During (live) save/migrate, we maintain a number of bitmaps to track
** which pages we have to send and to skip.
*/

#define BITS_PER_LONG (sizeof(unsigned long) * 8)
#define BITMAP_SIZE   ((max_pfn + BITS_PER_LONG - 1) / 8)

#define BITMAP_ENTRY(_nr,_bmap) \
   ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]

#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
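
/* Note: BITMAP_SIZE is in bytes.  Adding (BITS_PER_LONG - 1) bits before
 * dividing by 8 rounds the allocation up so that it always covers the last
 * long that BITMAP_ENTRY() can index for PFNs below max_pfn. */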

static inline int test_bit (int nr, volatile void * addr)
{
    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
}

static inline void clear_bit (int nr, volatile void * addr)
{
    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
}

static inline int permute( int i, int nr, int order_nr )
{
    /* Need a simple permutation function so that we scan pages in a
       pseudo random order, enabling us to get a better estimate of
       the domain's page dirtying rate as we go (there are often
       contiguous ranges of pfns that have similar behaviour, and we
       want to mix them up). */

    /* e.g. nr->order 15->4 16->4 17->5 */
    /* 512MB domain, 128k pages, order 17 */

    /*
      QPONMLKJIHGFEDCBA
             QPONMLKJIH
      GFEDCBA
     */

    /*
      QPONMLKJIHGFEDCBA
                  EDCBA
            QPONM
      LKJIHGF
     */
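
    /* Each step below rotates the low order_nr bits of i left by 10: the top
       10 bits wrap round to the bottom, as sketched in the diagrams above.
       The rotation is a bijection on [0, 2^order_nr), so re-applying it until
       the value drops below nr yields a permutation of [0, nr).  (order_nr is
       assumed to be at least 10 here; for a real guest it is much larger.) */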

    do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */

    return i;
}

static uint64_t tv_to_us(struct timeval *new)
{
    return (new->tv_sec * 1000000) + new->tv_usec;
}

static uint64_t llgettimeofday(void)
{
    struct timeval now;
    gettimeofday(&now, NULL);
    return tv_to_us(&now);
}

static uint64_t tv_delta(struct timeval *new, struct timeval *old)
{
    return ((new->tv_sec - old->tv_sec)*1000000 ) +
        (new->tv_usec - old->tv_usec);
}

#define RATE_IS_MAX() (0)
#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
#define initialize_mbit_rate()
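
/* Rate limiting is not wired up for the HVM path: RATE_IS_MAX() is always
 * false and ratewrite() is a plain write().  Note that write_exact() below
 * returns 1 on success and 0 on failure (the opposite sense to write()) and
 * does not retry short writes. */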

static inline ssize_t write_exact(int fd, void *buf, size_t count)
{
    if ( write(fd, buf, count) != count )
        return 0;
    return 1;
}

static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
                       xc_shadow_op_stats_t *stats, int print)
{
    static struct timeval wall_last;
    static long long      d0_cpu_last;
    static long long      d1_cpu_last;

    struct timeval        wall_now;
    long long             wall_delta;
    long long             d0_cpu_now, d0_cpu_delta;
    long long             d1_cpu_now, d1_cpu_delta;

    gettimeofday(&wall_now, NULL);

    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;

    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
        DPRINTF("ARRHHH!!\n");

    wall_delta = tv_delta(&wall_now,&wall_last)/1000;

    if (wall_delta == 0) wall_delta = 1;

    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
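
    /* wall_delta is in ms, so bytes / (wall_delta * (1000/8)) below gives
       megabits per second, and the CPU deltas (also in ms) give percentages. */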
    if (print)
        DPRINTF(
                "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
                "dirtied %dMb/s %" PRId32 " pages\n",
                wall_delta,
                (int)((d0_cpu_delta*100)/wall_delta),
                (int)((d1_cpu_delta*100)/wall_delta),
                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
                stats->dirty_count);

    d0_cpu_last = d0_cpu_now;
    d1_cpu_last = d1_cpu_now;
    wall_last   = wall_now;

    return 0;
}
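
/* With runs > 0, repeatedly clean and then re-sample the shadow dirty bitmap
 * to log the guest's fault/dirty rate over time.  The only caller below passes
 * runs == 0, so by default this does nothing beyond returning -1. */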
static int analysis_phase(int xc_handle, uint32_t domid, int max_pfn,
                          unsigned long *arr, int runs)
{
    long long start, now;
    xc_shadow_op_stats_t stats;
    int j;

    start = llgettimeofday();

    for (j = 0; j < runs; j++) {
        int i;

        xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
                          arr, max_pfn, NULL, 0, NULL);
        DPRINTF("#Flush\n");
        for ( i = 0; i < 40; i++ ) {
            usleep(50000);
            now = llgettimeofday();
            xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
                              NULL, 0, NULL, 0, &stats);

            DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
                    ((now-start)+500)/1000,
                    stats.fault_count, stats.dirty_count);
        }
    }

    return -1;
}
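
/* Ask the caller-supplied suspend callback to suspend the domain, then poll
 * its state until it reports SHUTDOWN_suspend.  A paused domain is unpaused
 * and re-checked without counting against the limit; otherwise we retry for
 * up to 100 passes at 10ms intervals before giving up. */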
static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
                             int dom, xc_dominfo_t *info,
                             vcpu_guest_context_t *ctxt)
{
    int i = 0;

    if (!(*suspend)(dom)) {
        ERROR("Suspend request failed");
        return -1;
    }

 retry:

    if (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) {
        ERROR("Could not get domain info");
        return -1;
    }

    if ( xc_vcpu_getcontext(xc_handle, dom, 0 /* XXX */, ctxt))
        ERROR("Could not get vcpu context");

    if (info->shutdown && info->shutdown_reason == SHUTDOWN_suspend)
        return 0; // success

    if (info->paused) {
        // try unpausing domain, wait, and retest
        xc_domain_unpause( xc_handle, dom );

        ERROR("Domain was paused. Wait and re-test.");
        usleep(10000); // 10ms
        goto retry;
    }

    if( ++i < 100 ) {
        ERROR("Retry suspend domain.");
        usleep(10000); // 10ms
        goto retry;
    }

    ERROR("Unable to suspend domain.");

    return -1;
}

int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
                uint32_t max_factor, uint32_t flags, int (*suspend)(int))
{
    xc_dominfo_t info;

    int rc = 1, i, j, last_iter, iter = 0;
    int live  = (flags & XCFLAGS_LIVE);
    int debug = (flags & XCFLAGS_DEBUG);
    int sent_last_iter, skip_this_iter;

    /* The new domain's shared-info frame number. */
    unsigned long shared_info_frame;

    /* A copy of the CPU context of the guest. */
    vcpu_guest_context_t ctxt;

    /* A table containing the PFNs (/not/ MFNs!) to map. */
    xen_pfn_t *pfn_batch = NULL;

    /* A copy of the HVM domain context buffer. */
    uint32_t hvm_buf_size;
    uint8_t *hvm_buf = NULL;

    /* Live mapping of the shared info structure. */
    shared_info_t *live_shinfo = NULL;

    /* Base of the region in which domain memory is mapped. */
    unsigned char *region_base = NULL;

    uint32_t rec_size, nr_vcpus;

    /* Power-of-2 order of max_pfn. */
    int order_nr;

    /* Bitmaps of pages:
       - that should be sent this iteration (unless later marked as skip);
       - to skip this iteration because already dirty. */
    unsigned long *to_send = NULL, *to_skip = NULL;

    xc_shadow_op_stats_t stats;

    unsigned long total_sent = 0;

    DPRINTF("xc_hvm_save: dom=%d, max_iters=%d, max_factor=%d, flags=0x%x, "
            "live=%d, debug=%d.\n",
            dom, max_iters, max_factor, flags, live, debug);

    /* If no explicit control parameters given, use defaults. */
    if(!max_iters)
        max_iters = DEF_MAX_ITERS;
    if(!max_factor)
        max_factor = DEF_MAX_FACTOR;

    initialize_mbit_rate();

    if(!get_platform_info(xc_handle, dom,
                          &max_mfn, &hvirt_start, &pt_levels)) {
        ERROR("HVM:Unable to get platform info.");
        return 1;
    }

    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
        ERROR("HVM:Could not get domain info");
        return 1;
    }
    nr_vcpus = info.nr_online_vcpus;

    if (mlock(&ctxt, sizeof(ctxt))) {
        ERROR("HVM:Unable to mlock ctxt");
        return 1;
    }

    /* Only have to worry about vcpu 0 even for SMP */
    if (xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt)) {
        ERROR("HVM:Could not get vcpu context");
        goto out;
    }
    shared_info_frame = info.shared_info_frame;

    /* cheesy sanity check */
    if ((info.max_memkb >> (PAGE_SHIFT - 10)) > max_mfn) {
        ERROR("Invalid HVM state record -- pfn count out of range: %lu",
              (info.max_memkb >> (PAGE_SHIFT - 10)));
        goto out;
    }

    /* Map the shared info frame */
    if(!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
                                            PROT_READ, shared_info_frame))) {
        ERROR("HVM:Couldn't map live_shinfo");
        goto out;
    }

    DPRINTF("saved hvm domain info: max_memkb=0x%lx, max_mfn=0x%lx, "
            "nr_pages=0x%lx\n", info.max_memkb, max_mfn, info.nr_pages);

    if (live) {
        ERROR("HVM domains do not support live migration yet.\n");
        goto out;
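
        /* Live save is not supported for HVM guests yet, so we bail out above;
           the log-dirty setup below is currently unreachable. */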
        if (xc_shadow_control(xc_handle, dom,
                              XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
                              NULL, 0, NULL, 0, NULL) < 0) {
            ERROR("Couldn't enable shadow mode");
            goto out;
        }

        last_iter = 0;
        DPRINTF("hvm domain live migration debug start: logdirty enabled.\n");
    } else {
        /* This is a non-live suspend. Issue the callback to get the
           domain suspended. */

        last_iter = 1;

        /* Suspend the HVM domain. */
        if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt)) {
            ERROR("HVM Domain appears not to have suspended");
            goto out;
        }
    }

    DPRINTF("after initial handling of the HVM domain: nr_pages=0x%lx, "
            "max_memkb=0x%lx, live=%d.\n",
            info.nr_pages, info.max_memkb, live);

    /* Calculate the highest PFN of "normal" memory:
     * HVM memory is sequential except for the VGA and MMIO holes. */
    max_pfn = info.nr_pages;
    /* Skip the VGA hole from 0xa0000 to 0xc0000 */
    max_pfn += 0x20;
    /* Skip the MMIO hole: 256MB just below 4GB */
    if ( max_pfn >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT) )
        max_pfn += (HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT);
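
    /* For example, a 512MB guest (0x20000 pages of RAM) stays below the MMIO
       hole, so max_pfn = 0x20000 + 0x20 = 0x20020; only a guest whose top PFN
       reaches HVM_BELOW_4G_MMIO_START gets the 256MB hole's pages added too. */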

    skip_this_iter = 0; /* XXX */
    /* Pretend we sent all the pages last iteration. */
    sent_last_iter = max_pfn;

    /* Calculate the power of 2 order of max_pfn, e.g.
       15->4 16->4 17->5 */
    for (i = max_pfn-1, order_nr = 0; i ; i >>= 1, order_nr++)
        continue;
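
    /* i.e. order_nr = ceil(log2(max_pfn)): the number of bits needed to index
       any PFN below max_pfn, which permute() uses as its rotation width. */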

    /* Set up the to_send and to_skip bitmaps. */
    to_send = malloc(BITMAP_SIZE);
    to_skip = malloc(BITMAP_SIZE);

    hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0);
    if ( hvm_buf_size == -1 )
    {
        ERROR("Couldn't get HVM context size from Xen");
        goto out;
    }
    hvm_buf = malloc(hvm_buf_size);

    if (!to_send || !to_skip || !hvm_buf) {
        ERROR("Couldn't allocate memory");
        goto out;
    }

    memset(to_send, 0xff, BITMAP_SIZE);

    if (lock_pages(to_send, BITMAP_SIZE)) {
        ERROR("Unable to lock to_send");
        return 1;
    }

    if (lock_pages(to_skip, BITMAP_SIZE)) {
        ERROR("Unable to lock to_skip");
        return 1;
    }

    analysis_phase(xc_handle, dom, max_pfn, to_skip, 0);

    /* We want zeroed memory so use calloc rather than malloc. */
    pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));

    if (pfn_batch == NULL) {
        ERROR("failed to alloc memory for pfn_batch array");
        errno = ENOMEM;
        goto out;
    }
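
    /* The save record written from here on consists of: max_pfn; a sequence
     * of page batches, each a count (unsigned int), the batch's PFN list and
     * the raw page contents; an optional -1 marker (debug resend mode); a
     * zero terminator; the HVM context size and blob; nr_vcpus followed by a
     * (size, vcpu_guest_context_t) pair per VCPU; and finally the shared-info
     * frame number written as a uint32_t. */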

    /* Start writing out the saved-domain record. */
    if (!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) {
        ERROR("write: max_pfn");
        goto out;
    }
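
    /* Each pass below walks the PFN space in permute() order, batching up to
       MAX_BATCH_SIZE pages that are marked in to_send and not freshly dirtied
       (to_skip).  Since live save is disabled above, last_iter is set from the
       start and there is normally a single pass; with XCFLAGS_DEBUG a second
       resend-all pass is made. */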

    while(1) {

        unsigned int prev_pc, sent_this_iter, N, batch;

        iter++;
        sent_this_iter = 0;
        skip_this_iter = 0;
        prev_pc = 0;
        N=0;

        DPRINTF("Saving HVM domain memory pages: iter %d   0%%", iter);

        while( N < max_pfn ){

            unsigned int this_pc = (N * 100) / max_pfn;

            if ((this_pc - prev_pc) >= 5) {
                DPRINTF("\b\b\b\b%3d%%", this_pc);
                prev_pc = this_pc;
            }

            /* Slightly wasteful to peek the whole array every time,
               but this is fast enough for the moment. */
            if (!last_iter && xc_shadow_control(
                    xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK,
                    to_skip, max_pfn, NULL, 0, NULL) != max_pfn) {
                ERROR("Error peeking HVM shadow bitmap");
                goto out;
            }

            /* Load pfn_batch[] with the pfns of all the pages we're doing in
               this batch. */
            for (batch = 0; batch < MAX_BATCH_SIZE && N < max_pfn ; N++) {

                int n = permute(N, max_pfn, order_nr);

                if (debug) {
                    DPRINTF("%d pfn= %08lx %d \n",
                            iter, (unsigned long)n, test_bit(n, to_send));
                }

                if (!last_iter && test_bit(n, to_send) && test_bit(n, to_skip))
                    skip_this_iter++; /* stats keeping */

                if (!((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
                      (test_bit(n, to_send) && last_iter)))
                    continue;

                /* Skip PFNs that aren't really there */
                if ((n >= 0xa0 && n < 0xc0) /* VGA hole */
                    || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT)
                        && n < (1ULL << 32) >> PAGE_SHIFT)) /* 4G MMIO hole */
                    continue;

                /*
                ** We get here if:
                **  1. page is marked to_send & hasn't already been re-dirtied
                **  2. (ignore to_skip in last iteration)
                */

                pfn_batch[batch] = n;

                batch++;
            }

            if (batch == 0)
                goto skip; /* vanishingly unlikely... */

            /* map_foreign uses pfns, not mfns, here. */
            if ((region_base = xc_map_foreign_batch(
                     xc_handle, dom, PROT_READ, pfn_batch, batch)) == 0) {
                ERROR("map batch failed");
                goto out;
            }

            /* write num of pfns */
            if(!write_exact(io_fd, &batch, sizeof(unsigned int))) {
                ERROR("Error when writing to state file (2)");
                goto out;
            }

            /* write all the pfns */
            if(!write_exact(io_fd, pfn_batch, sizeof(unsigned long)*batch)) {
                ERROR("Error when writing to state file (3)");
                goto out;
            }

            for ( j = 0; j < batch; j++ )
            {
                if ( pfn_batch[j] & XEN_DOMCTL_PFINFO_LTAB_MASK )
                    continue;
                if ( ratewrite(io_fd, region_base + j*PAGE_SIZE,
                               PAGE_SIZE) != PAGE_SIZE )
                {
                    ERROR("Error when writing to state file (4)");
                    goto out;
                }
            }

            sent_this_iter += batch;

            munmap(region_base, batch*PAGE_SIZE);

        } /* end of this while loop for this iteration */

      skip:

        total_sent += sent_this_iter;

        DPRINTF("\r %d: sent %d, skipped %d, ",
                iter, sent_this_iter, skip_this_iter );

        if (last_iter) {
            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);

            DPRINTF("Total pages sent= %lu (%.2fx)\n",
                    total_sent, ((float)total_sent)/max_pfn );
        }

        if (last_iter && debug){
            int minusone = -1;
            memset(to_send, 0xff, BITMAP_SIZE);
            debug = 0;
            DPRINTF("Entering debug resend-all mode\n");

            /* send "-1" to put receiver into debug mode */
            if(!write_exact(io_fd, &minusone, sizeof(int))) {
                ERROR("Error when writing to state file (6)");
                goto out;
            }

            continue;
        }

        if (last_iter) break;

        if (live) {
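
            /* Switch to the final pass once we hit the iteration limit, the
               dirty set has shrunk below 50 pages, or we have already sent
               max_factor times the guest's memory.  (The first clause below
               can never fire here, since RATE_IS_MAX() is hard-coded to 0.) */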

            if(
                ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
                (iter >= max_iters) ||
                (sent_this_iter+skip_this_iter < 50) ||
                (total_sent > max_pfn*max_factor) ) {

                DPRINTF("Start last iteration for HVM domain\n");
                last_iter = 1;

                if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
                                      &ctxt)) {
                    ERROR("Domain appears not to have suspended");
                    goto out;
                }

                DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
                        info.shared_info_frame,
                        (unsigned long)ctxt.user_regs.eip,
                        (unsigned long)ctxt.user_regs.edx);
            }

            if (xc_shadow_control(xc_handle, dom,
                                  XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
                                  max_pfn, NULL, 0, &stats) != max_pfn) {
                ERROR("Error flushing shadow PT");
                goto out;
            }

            sent_last_iter = sent_this_iter;

            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);

        }

    } /* end of while 1 */
634 DPRINTF("All HVM memory is saved\n");
636 /* Zero terminate */
637 i = 0;
638 if (!write_exact(io_fd, &i, sizeof(int))) {
639 ERROR("Error when writing to state file (6)");
640 goto out;
641 }
643 if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf,
644 hvm_buf_size)) == -1) {
645 ERROR("HVM:Could not get hvm buffer");
646 goto out;
647 }
649 if (!write_exact(io_fd, &rec_size, sizeof(uint32_t))) {
650 ERROR("error write hvm buffer size");
651 goto out;
652 }
654 if ( !write_exact(io_fd, hvm_buf, rec_size) ) {
655 ERROR("write HVM info failed!\n");
656 }

    /* Save vcpu contexts. */
    if (!write_exact(io_fd, &nr_vcpus, sizeof(uint32_t))) {
        ERROR("Error when writing the vcpu count");
        goto out;
    }

    /* XXX: need an online map so we can exclude offline vcpus. */
    for (i = 0; i < nr_vcpus; i++) {

        if (xc_vcpu_getcontext(xc_handle, dom, i, &ctxt)) {
            ERROR("HVM:Could not get vcpu context");
            goto out;
        }

        rec_size = sizeof(ctxt);
        DPRINTF("write vcpu context %d of %d.\n", i, nr_vcpus);
        if (!write_exact(io_fd, &rec_size, sizeof(uint32_t))) {
            ERROR("Error when writing the vcpu context size");
            goto out;
        }

        if (!write_exact(io_fd, &(ctxt), sizeof(ctxt)) ) {
            ERROR("Error when writing the vcpu context");
            goto out;
        }
    }

    /* Shared-info pfn */
    if (!write_exact(io_fd, &(shared_info_frame), sizeof(uint32_t)) ) {
        ERROR("write shared-info pfn failed!\n");
        goto out;
    }

    /* Success! */
    rc = 0;

 out:

    if (live) {
        if(xc_shadow_control(xc_handle, dom,
                             XEN_DOMCTL_SHADOW_OP_OFF,
                             NULL, 0, NULL, 0, NULL) < 0) {
            DPRINTF("Warning - couldn't disable shadow mode");
        }
    }

    free(hvm_buf);
    free(pfn_batch);
    free(to_send);
    free(to_skip);

    return !!rc;
}