direct-io.hg: view tools/libxc/xc_linux_save.c @ 12765:2dd4569e0640

[LIBXC] Add an error reporting API to the libxc library.

- An 'xc_error' struct is used to pass around error
details. It currently contains two members: 'code', an enumeration
of error types, and 'message', a free-text description of the
specific problem.

- The xc_get_last_error() method returns a const pointer to the
internal instance of this struct managed by libxc. By returning a
const pointer we can add extra members to the end of the struct at
any time without worrying about the ABI of callers. This will let us
provide more fine-grained info if needed in the future.

- The xc_error instance is statically defined inside libxc and marked
__thread. This ensures that errors are recorded per-thread, and
that when dealing with errors we never need to call malloc - all
storage needed is statically allocated.

- The xc_clear_last_error() method resets any currently recorded
error details.

- The xc_error_code_to_desc() method converts the integer error code
into a generic user-facing message, e.g. "Invalid kernel". Together
with the 'message' field from xc_error, this provides the user
visible feedback, e.g. "Invalid kernel: Non PAE-kernel on PAE host."

- A callback can be registered with xc_set_error_handler to receive
notification whenever an error is recorded, rather than querying
for error details after the fact with xc_get_last_error.

- If built with -DDEBUG, a default error handler will be registered
which calls fprintf(stderr), thus maintaining the current behaviour
of logging errors to stderr during developer builds.

- The Python binding for libxc is updated to use xc_get_last_error
to pull out error details whenever appropriate, instead of
returning info based on 'errno'.

- The xc_set_error method is private to libxc internals, and is used
for setting error details.

- The ERROR and PERROR macros have been updated to call xc_set_error
automatically, specifying XC_INTERNAL_ERROR as the error code. This
gives a generic error report for all current failure points.

- Some uses of the ERROR macro have been replaced with explicit
calls to xc_set_error to enable finer-grained error reporting. In
particular, the code dealing with invalid kernel types uses this
to report PAE/architecture/word-size mismatches. (A usage sketch
of the API follows this list.)
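
For illustration, a caller might drive the new API like this (a
hypothetical sketch only; the exact struct layout and signatures are
those declared in xenctrl.h):

    /* After a failed libxc call, fetch and report the error details. */
    const xc_error *err;

    xc_clear_last_error();
    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
    {
        err = xc_get_last_error();
        fprintf(stderr, "%s: %s\n",
                xc_error_code_to_desc(err->code), err->message);
    }

Alternatively, a handler registered with xc_set_error_handler() is
invoked at the moment the error is recorded, so no polling is needed.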

The patch has been tested by calling xm create against a variety of
config files defining invalid kernels of various kinds. It has also
been tested with libvirt talking to xend. In both cases the error
messages were propagated all the way back up the stack.

There is only one place where further work is needed: the suspend
and restore APIs in xend invoke external helper programs rather than
calling libxc directly, which means that error details are essentially
lost. Since there is already code in xend which scans stderr from
these programs, I will investigate adapting this to extract actual
error messages from these helpers.

Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
author kfraser@localhost.localdomain
date Thu Dec 07 11:36:26 2006 +0000 (2006-12-07)
parents 96f51a000ed0
children 1818b322ede9
line source
/******************************************************************************
 * xc_linux_save.c
 *
 * Save the state of a running Linux session.
 *
 * Copyright (c) 2003, K A Fraser.
 */

#include <inttypes.h>
#include <time.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>

#include "xc_private.h"
#include "xg_private.h"
#include "xg_save_restore.h"

/*
** Default values for important tuning parameters. Can override by passing
** non-zero replacement values to xc_linux_save().
**
** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
**
*/
#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop   */
#define DEF_MAX_FACTOR   3   /* never send more than 3x nr_pfns   */

/* max mfn of the whole machine */
static unsigned long max_mfn;

/* virtual starting address of the hypervisor */
static unsigned long hvirt_start;

/* #levels of page tables used by the current guest */
static unsigned int pt_levels;

/* total number of pages used by the current guest */
static unsigned long max_pfn;

/* Live mapping of the table mapping each PFN to its current MFN. */
static xen_pfn_t *live_p2m = NULL;

/* Live mapping of system MFN to PFN table. */
static xen_pfn_t *live_m2p = NULL;

/* grep fodder: machine_to_phys */

#define mfn_to_pfn(_mfn) live_m2p[(_mfn)]

/*
 * Returns TRUE if the given machine frame number has a unique mapping
 * in the guest's pseudophysical map.
 */
#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)          \
    (((_mfn) < (max_mfn)) &&                    \
     ((mfn_to_pfn(_mfn) < (max_pfn)) &&         \
      (live_p2m[mfn_to_pfn(_mfn)] == (_mfn))))

/* Returns TRUE if MFN is successfully converted to a PFN. */
#define translate_mfn_to_pfn(_pmfn)             \
({                                              \
    unsigned long mfn = *(_pmfn);               \
    int _res = 1;                               \
    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )       \
        _res = 0;                               \
    else                                        \
        *(_pmfn) = mfn_to_pfn(mfn);             \
    _res;                                       \
})

/*
** During (live) save/migrate, we maintain a number of bitmaps to track
** which pages we have to send, to fixup, and to skip.
*/

#define BITS_PER_LONG (sizeof(unsigned long) * 8)

#define BITMAP_SIZE   ((max_pfn + BITS_PER_LONG - 1) / 8)

#define BITMAP_ENTRY(_nr,_bmap) \
   ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]

#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)

static inline int test_bit (int nr, volatile void * addr)
{
    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
}

static inline void clear_bit (int nr, volatile void * addr)
{
    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
}

static inline void set_bit ( int nr, volatile void * addr)
{
    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
}

/* Returns the hamming weight (i.e. the number of bits set) in a 32-bit word */
static inline unsigned int hweight32(unsigned int w)
{
    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
}
static inline int count_bits ( int nr, volatile void *addr)
{
    int i, count = 0;
    unsigned long *p = (unsigned long *)addr;
    /* We know that the array is padded to unsigned long. */
    for( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
        count += hweight32(*p);
    return count;
}

static inline int permute( int i, int nr, int order_nr  )
{
    /* Need a simple permutation function so that we scan pages in a
       pseudo random order, enabling us to get a better estimate of
       the domain's page dirtying rate as we go (there are often
       contiguous ranges of pfns that have similar behaviour, and we
       want to mix them up). */

    /* e.g. nr->order 15->4 16->4 17->5 */
    /* 512MB domain, 128k pages, order 17 */

    /*
      QPONMLKJIHGFEDCBA
             QPONMLKJIH
      GFEDCBA
     */

    /*
      QPONMLKJIHGFEDCBA
                  EDCBA
            QPONM
      LKJIHGF
      */

    do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */

    return i;
}

static uint64_t tv_to_us(struct timeval *new)
{
    return (new->tv_sec * 1000000) + new->tv_usec;
}

static uint64_t llgettimeofday(void)
{
    struct timeval now;
    gettimeofday(&now, NULL);
    return tv_to_us(&now);
}

static uint64_t tv_delta(struct timeval *new, struct timeval *old)
{
    return ((new->tv_sec - old->tv_sec)*1000000 ) +
        (new->tv_usec - old->tv_usec);
}


#ifdef ADAPTIVE_SAVE

/*
** We control the rate at which we transmit (or save) to minimize impact
** on running domains (including the target if we're doing live migrate).
*/

#define MAX_MBIT_RATE    500      /* maximum transmit rate for migrate */
#define START_MBIT_RATE  100      /* initial transmit rate for migrate */

/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
#define RATE_TO_BTU      781250

/* Amount in bytes we allow ourselves to send in a burst */
#define BURST_BUDGET (100*1024)

/* We keep track of the current and previous transmission rate */
static int mbit_rate, ombit_rate = 0;

/* Have we reached the maximum transmission rate? */
#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)

static inline void initialize_mbit_rate()
{
    mbit_rate = START_MBIT_RATE;
}
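
/*
** ratewrite() below implements a simple token-bucket limiter on top of
** write(): each call debits n bytes from a budget; when the budget runs
** dry it is topped up by BURST_BUDGET once per burst_time_us time slot
** (recomputed from RATE_TO_BTU whenever mbit_rate changes), sleeping
** away the remainder of the current slot if necessary.
*/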
static int ratewrite(int io_fd, void *buf, int n)
{
    static int budget = 0;
    static int burst_time_us = -1;
    static struct timeval last_put = { 0 };
    struct timeval now;
    struct timespec delay;
    long long delta;

    if (START_MBIT_RATE == 0)
        return write(io_fd, buf, n);

    budget -= n;
    if (budget < 0) {
        if (mbit_rate != ombit_rate) {
            burst_time_us = RATE_TO_BTU / mbit_rate;
            ombit_rate = mbit_rate;
            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
                    mbit_rate, BURST_BUDGET, burst_time_us);
        }
        if (last_put.tv_sec == 0) {
            budget += BURST_BUDGET;
            gettimeofday(&last_put, NULL);
        } else {
            while (budget < 0) {
                gettimeofday(&now, NULL);
                delta = tv_delta(&now, &last_put);
                while (delta > burst_time_us) {
                    budget += BURST_BUDGET;
                    last_put.tv_usec += burst_time_us;
                    if (last_put.tv_usec > 1000000) {
                        last_put.tv_usec -= 1000000;
                        last_put.tv_sec++;
                    }
                    delta -= burst_time_us;
                }
                if (budget > 0)
                    break;
                delay.tv_sec = 0;
                delay.tv_nsec = 1000 * (burst_time_us - delta);
                while (delay.tv_nsec > 0)
                    if (nanosleep(&delay, &delay) == 0)
                        break;
            }
        }
    }
    return write(io_fd, buf, n);
}

#else /* ! ADAPTIVE SAVE */

#define RATE_IS_MAX() (0)
#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
#define initialize_mbit_rate()

#endif
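
/*
** Boolean-style wrapper around write(): returns 1 if exactly count bytes
** were written, 0 otherwise. NB: a short write is treated as outright
** failure; there is no retry loop.
*/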
static inline ssize_t write_exact(int fd, void *buf, size_t count)
{
    if(write(fd, buf, count) != count)
        return 0;
    return 1;
}

static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
                       xc_shadow_op_stats_t *stats, int print)
{
    static struct timeval wall_last;
    static long long      d0_cpu_last;
    static long long      d1_cpu_last;

    struct timeval        wall_now;
    long long             wall_delta;
    long long             d0_cpu_now, d0_cpu_delta;
    long long             d1_cpu_now, d1_cpu_delta;

    gettimeofday(&wall_now, NULL);

    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;

    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
        DPRINTF("ARRHHH!!\n");

    wall_delta = tv_delta(&wall_now,&wall_last)/1000;

    if (wall_delta == 0) wall_delta = 1;

    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;

    if (print)
        DPRINTF(
            "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
            "dirtied %dMb/s %" PRId32 " pages\n",
            wall_delta,
            (int)((d0_cpu_delta*100)/wall_delta),
            (int)((d1_cpu_delta*100)/wall_delta),
            (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
            (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
            stats->dirty_count);

#ifdef ADAPTIVE_SAVE
    if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
        mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
            + 50;
        if (mbit_rate > MAX_MBIT_RATE)
            mbit_rate = MAX_MBIT_RATE;
    }
#endif

    d0_cpu_last = d0_cpu_now;
    d1_cpu_last = d1_cpu_now;
    wall_last   = wall_now;

    return 0;
}

static int analysis_phase(int xc_handle, uint32_t domid, int max_pfn,
                          unsigned long *arr, int runs)
{
    long long start, now;
    xc_shadow_op_stats_t stats;
    int j;

    start = llgettimeofday();

    for (j = 0; j < runs; j++) {
        int i;

        xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
                          arr, max_pfn, NULL, 0, NULL);
        DPRINTF("#Flush\n");
        for ( i = 0; i < 40; i++ ) {
            usleep(50000);
            now = llgettimeofday();
            xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
                              NULL, 0, NULL, 0, &stats);

            DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
                    ((now-start)+500)/1000,
                    stats.fault_count, stats.dirty_count);
        }
    }

    return -1;
}

static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
                             int dom, xc_dominfo_t *info,
                             vcpu_guest_context_t *ctxt)
{
    int i = 0;

    if (!(*suspend)(dom)) {
        ERROR("Suspend request failed");
        return -1;
    }

 retry:

    if (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) {
        ERROR("Could not get domain info");
        return -1;
    }

    if ( xc_vcpu_getcontext(xc_handle, dom, 0 /* XXX */, ctxt))
        ERROR("Could not get vcpu context");


    if (info->shutdown && info->shutdown_reason == SHUTDOWN_suspend)
        return 0; // success

    if (info->paused) {
        // try unpausing domain, wait, and retest
        xc_domain_unpause( xc_handle, dom );

        ERROR("Domain was paused. Wait and re-test.");
        usleep(10000); // 10ms

        goto retry;
    }


    if( ++i < 100 ) {
        ERROR("Retry suspend domain.");
        usleep(10000); // 10ms
        goto retry;
    }

    ERROR("Unable to suspend domain.");

    return -1;
}

/*
** During transfer (or in the state file), all page-table pages must be
** converted into a 'canonical' form where references to actual mfns
** are replaced with references to the corresponding pfns.
**
** This function performs the appropriate conversion, taking into account
** which entries do not require canonicalization (in particular, those
** entries which map the virtual address reserved for the hypervisor).
*/
static int canonicalize_pagetable(unsigned long type, unsigned long pfn,
                                  const void *spage, void *dpage)
{

    int i, pte_last, xen_start, xen_end, race = 0;
    uint64_t pte;

    /*
    ** We need to determine which entries in this page table hold
    ** reserved hypervisor mappings. This depends on the current
    ** page table type as well as the number of paging levels.
    */
    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8);

    if (pt_levels == 2 && type == XEN_DOMCTL_PFINFO_L2TAB)
        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);

    if (pt_levels == 3 && type == XEN_DOMCTL_PFINFO_L3TAB)
        xen_start = L3_PAGETABLE_ENTRIES_PAE;

    /*
    ** In PAE only the L2 mapping the top 1GB contains Xen mappings.
    ** We can spot this by looking for the guest linear mapping which
    ** Xen always ensures is present in that L2. Guests must ensure
    ** that this check will fail for other L2s.
    */
    if (pt_levels == 3 && type == XEN_DOMCTL_PFINFO_L2TAB) {

        /* XXX index of the L2 entry in PAE mode which holds the guest LPT */
#define PAE_GLPT_L2ENTRY (495)
        pte = ((uint64_t*)spage)[PAE_GLPT_L2ENTRY];

        if(((pte >> PAGE_SHIFT) & 0x0fffffff) == live_p2m[pfn])
            xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
    }

    if (pt_levels == 4 && type == XEN_DOMCTL_PFINFO_L4TAB) {
        /*
        ** XXX SMH: should compute these from hvirt_start (which we have)
        ** and hvirt_end (which we don't)
        */
        xen_start = 256;
        xen_end   = 272;
    }

    /* Now iterate through the page table, canonicalizing each PTE */
    for (i = 0; i < pte_last; i++ ) {

        unsigned long pfn, mfn;

        if (pt_levels == 2)
            pte = ((uint32_t*)spage)[i];
        else
            pte = ((uint64_t*)spage)[i];

        if (i >= xen_start && i < xen_end)
            pte = 0;

        if (pte & _PAGE_PRESENT) {

            mfn = (pte >> PAGE_SHIFT) & 0xfffffff;
            if (!MFN_IS_IN_PSEUDOPHYS_MAP(mfn)) {
                /* This will happen if the type info is stale which
                   is quite feasible under live migration */
                DPRINTF("PT Race: [%08lx,%d] pte=%llx, mfn=%08lx\n",
                        type, i, (unsigned long long)pte, mfn);
                pfn  = 0;  /* zap it - we'll retransmit this page later */
                race = 1;  /* inform the caller of race; fatal if !live */
            } else
                pfn = mfn_to_pfn(mfn);

            pte &= 0xffffff0000000fffULL;
            pte |= (uint64_t)pfn << PAGE_SHIFT;
        }

        if (pt_levels == 2)
            ((uint32_t*)dpage)[i] = pte;
        else
            ((uint64_t*)dpage)[i] = pte;

    }

    return race;
}

static xen_pfn_t *xc_map_m2p(int xc_handle,
                             unsigned long max_mfn,
                             int prot)
{
    struct xen_machphys_mfn_list xmml;
    privcmd_mmap_entry_t *entries;
    unsigned long m2p_chunks, m2p_size;
    xen_pfn_t *m2p;
    xen_pfn_t *extent_start;
    int i, rc;

    m2p_size   = M2P_SIZE(max_mfn);
    m2p_chunks = M2P_CHUNKS(max_mfn);

    xmml.max_extents = m2p_chunks;
    if (!(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t)))) {
        ERROR("failed to allocate space for m2p mfns");
        return NULL;
    }
    set_xen_guest_handle(xmml.extent_start, extent_start);

    if (xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
        (xmml.nr_extents != m2p_chunks)) {
        ERROR("xc_get_m2p_mfns");
        return NULL;
    }

    if ((m2p = mmap(NULL, m2p_size, prot,
                    MAP_SHARED, xc_handle, 0)) == MAP_FAILED) {
        ERROR("failed to mmap m2p");
        return NULL;
    }

    if (!(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t)))) {
        ERROR("failed to allocate space for mmap entries");
        return NULL;
    }

    for (i=0; i < m2p_chunks; i++) {
        entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
        entries[i].mfn = extent_start[i];
        entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
    }

    if ((rc = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
                                    entries, m2p_chunks)) < 0) {
        ERROR("xc_mmap_foreign_ranges failed (rc = %d)", rc);
        return NULL;
    }

    free(extent_start);
    free(entries);

    return m2p;
}
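
/*
** Layout of the saved-domain image written to io_fd (matched by the
** reader in xc_linux_restore.c): max_pfn; an optional extended-info
** block (introduced by an all-ones marker) for PAE guests that
** understand extended CR3; the canonicalised p2m frame list; repeated
** batches of up to MAX_BATCH_SIZE pages (a count, the pfn/type words,
** then the page data, with page-table pages canonicalised); a zero
** batch count as terminator; the list of unmapped PFNs; and finally
** the canonicalised vcpu context and the shared-info page.
*/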
int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
                  uint32_t max_factor, uint32_t flags, int (*suspend)(int))
{
    xc_dominfo_t info;

    int rc = 1, i, j, last_iter, iter = 0;
    int live  = (flags & XCFLAGS_LIVE);
    int debug = (flags & XCFLAGS_DEBUG);
    int race = 0, sent_last_iter, skip_this_iter;

    /* The new domain's shared-info frame number. */
    unsigned long shared_info_frame;

    /* A copy of the CPU context of the guest. */
    vcpu_guest_context_t ctxt;

    /* A table containing the type of each PFN (/not/ MFN!). */
    unsigned long *pfn_type = NULL;
    unsigned long *pfn_batch = NULL;

    /* A temporary mapping, and a copy, of one frame of guest memory. */
    char page[PAGE_SIZE];

    /* Double and single indirect references to the live P2M table */
    xen_pfn_t *live_p2m_frame_list_list = NULL;
    xen_pfn_t *live_p2m_frame_list = NULL;

    /* A copy of the pfn-to-mfn table frame list. */
    xen_pfn_t *p2m_frame_list = NULL;

    /* Live mapping of shared info structure */
    shared_info_t *live_shinfo = NULL;

    /* base of the region in which domain memory is mapped */
    unsigned char *region_base = NULL;

    /* power of 2 order of max_pfn */
    int order_nr;

    /* bitmap of pages:
       - that should be sent this iteration (unless later marked as skip);
       - to skip this iteration because already dirty;
       - to fixup by sending at the end if not already resent; */
    unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;

    xc_shadow_op_stats_t stats;

    unsigned long needed_to_fix = 0;
    unsigned long total_sent    = 0;


    /* If no explicit control parameters given, use defaults */
    if(!max_iters)
        max_iters = DEF_MAX_ITERS;
    if(!max_factor)
        max_factor = DEF_MAX_FACTOR;

    initialize_mbit_rate();

    if(!get_platform_info(xc_handle, dom,
                          &max_mfn, &hvirt_start, &pt_levels)) {
        ERROR("Unable to get platform info.");
        return 1;
    }

    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
        ERROR("Could not get domain info");
        return 1;
    }

    if (lock_pages(&ctxt, sizeof(ctxt))) {
        ERROR("Unable to lock ctxt");
        return 1;
    }

    /* Only have to worry about vcpu 0 even for SMP */
    if (xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt)) {
        ERROR("Could not get vcpu context");
        goto out;
    }
    shared_info_frame = info.shared_info_frame;

    /* A cheesy test to see whether the domain contains valid state. */
    if (ctxt.ctrlreg[3] == 0)
    {
        ERROR("Domain is not in a valid Linux guest OS state");
        goto out;
    }

    /* cheesy sanity check */
    if ((info.max_memkb >> (PAGE_SHIFT - 10)) > max_mfn) {
        ERROR("Invalid state record -- pfn count out of range: %lu",
              (info.max_memkb >> (PAGE_SHIFT - 10)));
        goto out;
    }

    /* Map the shared info frame */
    if(!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
                                            PROT_READ, shared_info_frame))) {
        ERROR("Couldn't map live_shinfo");
        goto out;
    }

    max_pfn = live_shinfo->arch.max_pfn;

    live_p2m_frame_list_list =
        xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
                             live_shinfo->arch.pfn_to_mfn_frame_list_list);

    if (!live_p2m_frame_list_list) {
        ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);
        goto out;
    }

    live_p2m_frame_list =
        xc_map_foreign_batch(xc_handle, dom, PROT_READ,
                             live_p2m_frame_list_list,
                             P2M_FLL_ENTRIES);

    if (!live_p2m_frame_list) {
        ERROR("Couldn't map p2m_frame_list");
        goto out;
    }

    /* Map all the frames of the pfn->mfn table. For migrate to succeed,
       the guest must not change which frames are used for this purpose.
       (It's not clear why it would want to change them, and we'll be OK
       from a safety POV anyhow.) */

    live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
                                    live_p2m_frame_list,
                                    P2M_FL_ENTRIES);

    if (!live_p2m) {
        ERROR("Couldn't map p2m table");
        goto out;
    }

    /* Setup the mfn_to_pfn table mapping */
    if(!(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ))) {
        ERROR("Failed to map live M2P table");
        goto out;
    }


    /* Get a local copy of the live_P2M_frame_list */
    if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
        ERROR("Couldn't allocate p2m_frame_list array");
        goto out;
    }
    memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);

    /* Canonicalise the pfn-to-mfn table frame-number list. */
    for (i = 0; i < max_pfn; i += fpp) {
        if (!translate_mfn_to_pfn(&p2m_frame_list[i/fpp])) {
            ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
            ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64, i, i/fpp,
                  (uint64_t)p2m_frame_list[i/fpp]);
            goto out;
        }
    }

    /* Domain is still running at this point */
    if (live) {

        if (xc_shadow_control(xc_handle, dom,
                              XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
                              NULL, 0, NULL, 0, NULL) < 0) {
            ERROR("Couldn't enable shadow mode");
            goto out;
        }

        last_iter = 0;

    } else {

        /* This is a non-live suspend. Issue the call back to get the
           domain suspended */

        last_iter = 1;

        if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt)) {
            ERROR("Domain appears not to have suspended");
            goto out;
        }

    }

    /* pretend we sent all the pages last iteration */
    sent_last_iter = max_pfn;


    /* calculate the power of 2 order of max_pfn, e.g.
       15->4 16->4 17->5 */
    for (i = max_pfn-1, order_nr = 0; i ; i >>= 1, order_nr++)
        continue;

    /* Setup to_send / to_fix and to_skip bitmaps */
    to_send = malloc(BITMAP_SIZE);
    to_fix  = calloc(1, BITMAP_SIZE);
    to_skip = malloc(BITMAP_SIZE);

    if (!to_send || !to_fix || !to_skip) {
        ERROR("Couldn't allocate to_send array");
        goto out;
    }

    memset(to_send, 0xff, BITMAP_SIZE);

    if (lock_pages(to_send, BITMAP_SIZE)) {
        ERROR("Unable to lock to_send");
        return 1;
    }

    /* (to fix is local only) */
    if (lock_pages(to_skip, BITMAP_SIZE)) {
        ERROR("Unable to lock to_skip");
        return 1;
    }

    analysis_phase(xc_handle, dom, max_pfn, to_skip, 0);

    /* We want zeroed memory so use calloc rather than malloc. */
    pfn_type  = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type));
    pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));

    if ((pfn_type == NULL) || (pfn_batch == NULL)) {
        ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
        errno = ENOMEM;
        goto out;
    }

    if (lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type))) {
        ERROR("Unable to lock");
        goto out;
    }

    /*
     * Quick belt and braces sanity check.
     */
    {
        int err=0;
        unsigned long mfn;
        for (i = 0; i < max_pfn; i++) {

            mfn = live_p2m[i];
            if((mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i)) {
                DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
                        mfn, mfn_to_pfn(mfn));
                err++;
            }
        }
        DPRINTF("Had %d unexplained entries in p2m table\n", err);
    }


    /* Start writing out the saved-domain record. */

    if (!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) {
        ERROR("write: max_pfn");
        goto out;
    }

    /*
     * Write an extended-info structure to inform the restore code that
     * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
     * slow paths in the restore code.
     */
    if ((pt_levels == 3) &&
        (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3))) {
        unsigned long signature = ~0UL;
        uint32_t tot_sz   = sizeof(struct vcpu_guest_context) + 8;
        uint32_t chunk_sz = sizeof(struct vcpu_guest_context);
        char chunk_sig[]  = "vcpu";
        if (!write_exact(io_fd, &signature, sizeof(signature)) ||
            !write_exact(io_fd, &tot_sz,    sizeof(tot_sz)) ||
            !write_exact(io_fd, &chunk_sig, 4) ||
            !write_exact(io_fd, &chunk_sz,  sizeof(chunk_sz)) ||
            !write_exact(io_fd, &ctxt,      sizeof(ctxt))) {
            ERROR("write: extended info");
            goto out;
        }
    }

    if (!write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) {
        ERROR("write: p2m_frame_list");
        goto out;
    }

    print_stats(xc_handle, dom, 0, &stats, 0);

    /* Now write out each data page, canonicalising page tables as we go... */

    while(1) {

        unsigned int prev_pc, sent_this_iter, N, batch;

        iter++;
        sent_this_iter = 0;
        skip_this_iter = 0;
        prev_pc = 0;
        N=0;

        DPRINTF("Saving memory pages: iter %d   0%%", iter);

        while( N < max_pfn ){

            unsigned int this_pc = (N * 100) / max_pfn;

            if ((this_pc - prev_pc) >= 5) {
                DPRINTF("\b\b\b\b%3d%%", this_pc);
                prev_pc = this_pc;
            }

            /* slightly wasteful to peek the whole array every time,
               but this is fast enough for the moment. */
            if (!last_iter && xc_shadow_control(
                    xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK,
                    to_skip, max_pfn, NULL, 0, NULL) != max_pfn) {
                ERROR("Error peeking shadow bitmap");
                goto out;
            }


            /* load pfn_type[] with the mfn of all the pages we're doing in
               this batch. */
            for (batch = 0; batch < MAX_BATCH_SIZE && N < max_pfn ; N++) {

                int n = permute(N, max_pfn, order_nr);

                if (debug) {
                    DPRINTF("%d pfn= %08lx mfn= %08lx %d  [mfn]= %08lx\n",
                            iter, (unsigned long)n, live_p2m[n],
                            test_bit(n, to_send),
                            mfn_to_pfn(live_p2m[n]&0xFFFFF));
                }

                if (!last_iter && test_bit(n, to_send) && test_bit(n, to_skip))
                    skip_this_iter++; /* stats keeping */

                if (!((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
                      (test_bit(n, to_send) && last_iter) ||
                      (test_bit(n, to_fix)  && last_iter)))
                    continue;

                /*
                ** we get here if:
                **  1. page is marked to_send & hasn't already been re-dirtied
                **  2. (ignore to_skip in last iteration)
                **  3. add in pages that still need fixup (net bufs)
                */

                pfn_batch[batch] = n;
                pfn_type[batch]  = live_p2m[n];

                if(!is_mapped(pfn_type[batch])) {

                    /* not currently in pseudo-physical map -- set bit
                       in to_fix that we must send this page in last_iter
                       unless it's sent sooner anyhow */

                    set_bit(n, to_fix);
                    if( (iter > 1) && IS_REAL_PFN(n) )
                        DPRINTF("netbuf race: iter %d, pfn %x. mfn %lx\n",
                                iter, n, pfn_type[batch]);
                    continue;
                }

                if(last_iter && test_bit(n, to_fix) && !test_bit(n, to_send)) {
                    needed_to_fix++;
                    DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
                            iter, n, pfn_type[batch]);
                }

                clear_bit(n, to_fix);

                batch++;
            }

            if (batch == 0)
                goto skip; /* vanishingly unlikely... */

            if ((region_base = xc_map_foreign_batch(
                     xc_handle, dom, PROT_READ, pfn_type, batch)) == 0) {
                ERROR("map batch failed");
                goto out;
            }

            if (xc_get_pfn_type_batch(xc_handle, dom, batch, pfn_type)) {
                ERROR("get_pfn_type_batch failed");
                goto out;
            }

            for ( j = 0; j < batch; j++ )
            {

                if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
                     XEN_DOMCTL_PFINFO_XTAB )
                {
                    DPRINTF("type fail: page %i mfn %08lx\n", j, pfn_type[j]);
                    continue;
                }

                if (debug)
                    DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
                            " sum= %08lx\n",
                            iter,
                            (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
                            pfn_batch[j],
                            pfn_type[j],
                            mfn_to_pfn(pfn_type[j] &
                                       ~XEN_DOMCTL_PFINFO_LTAB_MASK),
                            csum_page(region_base + (PAGE_SIZE*j)));

                /* canonicalise mfn->pfn */
                pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
                    pfn_batch[j];
            }

            if(!write_exact(io_fd, &batch, sizeof(unsigned int))) {
                ERROR("Error when writing to state file (2) (errno %d)",
                      errno);
                goto out;
            }

            if(!write_exact(io_fd, pfn_type, sizeof(unsigned long)*j)) {
                ERROR("Error when writing to state file (3) (errno %d)",
                      errno);
                goto out;
            }

            /* entering this loop, pfn_type is now in pfns (Not mfns) */
            for ( j = 0; j < batch; j++ )
            {
                unsigned long pfn, pagetype;
                void *spage = (char *)region_base + (PAGE_SIZE*j);

                pfn      = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
                pagetype = pfn_type[j] &  XEN_DOMCTL_PFINFO_LTAB_MASK;

                /* write out pages in batch */
                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
                    continue;

                pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;

                if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
                     (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
                {
                    /* We have a pagetable page: need to rewrite it. */
                    race =
                        canonicalize_pagetable(pagetype, pfn, spage, page);

                    if(race && !live)
                        goto out;

                    if (ratewrite(io_fd, page, PAGE_SIZE) != PAGE_SIZE) {
                        ERROR("Error when writing to state file (4)"
                              " (errno %d)", errno);
                        goto out;
                    }

                } else {

                    /* We have a normal page: just write it directly. */
                    if (ratewrite(io_fd, spage, PAGE_SIZE) != PAGE_SIZE) {
                        ERROR("Error when writing to state file (5)"
                              " (errno %d)", errno);
                        goto out;
                    }
                }
            } /* end of the write out for this batch */

            sent_this_iter += batch;

            munmap(region_base, batch*PAGE_SIZE);

        } /* end of this while loop for this iteration */

      skip:

        total_sent += sent_this_iter;

        DPRINTF("\r %d: sent %d, skipped %d, ",
                iter, sent_this_iter, skip_this_iter );

        if (last_iter) {
            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);

            DPRINTF("Total pages sent= %ld (%.2fx)\n",
                    total_sent, ((float)total_sent)/max_pfn );
            DPRINTF("(of which %ld were fixups)\n", needed_to_fix  );
        }

        if (last_iter && debug){
            int minusone = -1;
            memset(to_send, 0xff, BITMAP_SIZE);
            debug = 0;
            DPRINTF("Entering debug resend-all mode\n");

            /* send "-1" to put receiver into debug mode */
            if(!write_exact(io_fd, &minusone, sizeof(int))) {
                ERROR("Error when writing to state file (6) (errno %d)",
                      errno);
                goto out;
            }

            continue;
        }

        if (last_iter) break;

        if (live) {


            if(
                ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
                (iter >= max_iters) ||
                (sent_this_iter+skip_this_iter < 50) ||
                (total_sent > max_pfn*max_factor) ) {

                DPRINTF("Start last iteration\n");
                last_iter = 1;

                if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
                                      &ctxt)) {
                    ERROR("Domain appears not to have suspended");
                    goto out;
                }

                DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
                        info.shared_info_frame,
                        (unsigned long)ctxt.user_regs.eip,
                        (unsigned long)ctxt.user_regs.edx);
            }

            if (xc_shadow_control(xc_handle, dom,
                                  XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
                                  max_pfn, NULL, 0, &stats) != max_pfn) {
                ERROR("Error flushing shadow PT");
                goto out;
            }

            sent_last_iter = sent_this_iter;

            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);

        }

    } /* end of while 1 */

    DPRINTF("All memory is saved\n");

    /* Zero terminate */
    i = 0;
    if (!write_exact(io_fd, &i, sizeof(int))) {
        ERROR("Error when writing to state file (6') (errno %d)", errno);
        goto out;
    }

    /* Send through a list of all the PFNs that were not in map at the close */
    {
        unsigned int i,j;
        unsigned long pfntab[1024];

        for (i = 0, j = 0; i < max_pfn; i++) {
            if (!is_mapped(live_p2m[i]))
                j++;
        }

        if(!write_exact(io_fd, &j, sizeof(unsigned int))) {
            ERROR("Error when writing to state file (6a) (errno %d)", errno);
            goto out;
        }

        for (i = 0, j = 0; i < max_pfn; ) {

            if (!is_mapped(live_p2m[i]))
                pfntab[j++] = i;

            i++;
            if (j == 1024 || i == max_pfn) {
                if(!write_exact(io_fd, &pfntab, sizeof(unsigned long)*j)) {
                    ERROR("Error when writing to state file (6b) (errno %d)",
                          errno);
                    goto out;
                }
                j = 0;
            }
        }
    }

    /* Canonicalise the suspend-record frame number. */
    if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) ){
        ERROR("Suspend record is not in range of pseudophys map");
        goto out;
    }

    /* Canonicalise each GDT frame number. */
    for ( i = 0; i < ctxt.gdt_ents; i += 512 ) {
        if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
            ERROR("GDT frame is not in range of pseudophys map");
            goto out;
        }
    }

    /* Canonicalise the page table base pointer. */
    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[3])) ) {
        ERROR("PT base is not in range of pseudophys map");
        goto out;
    }
    ctxt.ctrlreg[3] =
        xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[3])));

    if (!write_exact(io_fd, &ctxt, sizeof(ctxt)) ||
        !write_exact(io_fd, live_shinfo, PAGE_SIZE)) {
        ERROR("Error when writing to state file (1) (errno %d)", errno);
        goto out;
    }

    /* Success! */
    rc = 0;

 out:

    if (live) {
        if(xc_shadow_control(xc_handle, dom,
                             XEN_DOMCTL_SHADOW_OP_OFF,
                             NULL, 0, NULL, 0, NULL) < 0) {
            DPRINTF("Warning - couldn't disable shadow mode");
        }
    }

    if (live_shinfo)
        munmap(live_shinfo, PAGE_SIZE);

    if (live_p2m_frame_list_list)
        munmap(live_p2m_frame_list_list, PAGE_SIZE);

    if (live_p2m_frame_list)
        munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);

    if(live_p2m)
        munmap(live_p2m, P2M_SIZE);

    if(live_m2p)
        munmap(live_m2p, M2P_SIZE(max_mfn));

    free(pfn_type);
    free(pfn_batch);
    free(to_send);
    free(to_fix);
    free(to_skip);

    DPRINTF("Save exit rc=%d\n",rc);

    return !!rc;
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */