ia64/xen-unstable

view tools/libxc/xc_linux_save.c @ 9488:0a6f5527ca4b

[IA64] set itv handoff as masked and enable reading irr[0-3]

Set initial vcpu itv handoff state to mask the timer vector.
This seems to match hardware and makes logical sense from a
spurious interrupt perspective. Enable vcpu_get_irr[0-3]
functions as they seem to work and have the proper backing.
This enables the check_sal_cache_flush() in arch/ia64/kernel/sal.c
to work unmodified, allowing us to remove the Xen changes from
the file (and thus the file from the sparse tree).

Signed-off-by: Alex Williamson <alex.williamson@hp.com>
author awilliam@xenbuild.aw
date Tue Apr 04 09:39:45 2006 -0600 (2006-04-04)
parents a51fcb5de470
children 74ee53209cca
line source
/******************************************************************************
 * xc_linux_save.c
 *
 * Save the state of a running Linux session.
 *
 * Copyright (c) 2003, K A Fraser.
 */
#include <inttypes.h>
#include <time.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>

#include "xg_private.h"
#include "xg_save_restore.h"
/*
** Default values for important tuning parameters. Can override by passing
** non-zero replacement values to xc_linux_save().
**
** XXX SMH: should consider if we want to be able to override MAX_MBIT_RATE too.
**
*/
#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop */
#define DEF_MAX_FACTOR   3   /* never send more than 3x nr_pfns */
/* max mfn of the whole machine */
static unsigned long max_mfn;

/* virtual starting address of the hypervisor */
static unsigned long hvirt_start;

/* #levels of page tables used by the current guest */
static unsigned int pt_levels;

/* total number of pages used by the current guest */
static unsigned long max_pfn;

/* Live mapping of the table mapping each PFN to its current MFN. */
static unsigned long *live_p2m = NULL;

/* Live mapping of system MFN to PFN table. */
static unsigned long *live_m2p = NULL;

/* grep fodder: machine_to_phys */
#define mfn_to_pfn(_mfn) live_m2p[(_mfn)]
/*
 * Returns TRUE if the given machine frame number has a unique mapping
 * in the guest's pseudophysical map.
 */
#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)          \
    (((_mfn) < (max_mfn)) &&                    \
     ((mfn_to_pfn(_mfn) < (max_pfn)) &&         \
      (live_p2m[mfn_to_pfn(_mfn)] == (_mfn))))

/* Returns TRUE if MFN is successfully converted to a PFN. */
#define translate_mfn_to_pfn(_pmfn)             \
({                                              \
    unsigned long mfn = *(_pmfn);               \
    int _res = 1;                               \
    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )       \
        _res = 0;                               \
    else                                        \
        *(_pmfn) = mfn_to_pfn(mfn);             \
    _res;                                       \
})
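
/*
 * A typical use, mirroring the calls later in this file (a sketch, not
 * additional API): the macro rewrites the MFN in place and reports success:
 *
 *     unsigned long frame = p2m_frame_list[k];
 *     if ( !translate_mfn_to_pfn(&frame) )
 *         ... frame was not in the pseudophys map ...
 *     else
 *         ... frame now holds the corresponding PFN ...
 */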
/*
** During (live) save/migrate, we maintain a number of bitmaps to track
** which pages we have to send, to fixup, and to skip.
*/

#define BITS_PER_LONG (sizeof(unsigned long) * 8)

#define BITMAP_SIZE   ((max_pfn + BITS_PER_LONG - 1) / 8)

#define BITMAP_ENTRY(_nr,_bmap) \
    ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]

#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
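
/*
 * Note that BITMAP_SIZE is in bytes, one bit per PFN, padded out to whole
 * unsigned longs; e.g. a 512MB guest (max_pfn = 131072 with 4kB pages)
 * needs roughly 16kB per bitmap.
 */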
static inline int test_bit (int nr, volatile void * addr)
{
    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
}

static inline void clear_bit (int nr, volatile void * addr)
{
    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
}

static inline void set_bit ( int nr, volatile void * addr)
{
    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
}
/* Returns the Hamming weight (i.e. the number of bits set) in a 32-bit word */
static inline unsigned int hweight32(unsigned int w)
{
    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
}
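
/*
 * This is the classic parallel popcount: each step sums adjacent fields of
 * twice the previous width (pairs, then nibbles, bytes, halfwords); for
 * example hweight32(0xF0F0) evaluates to 8.
 */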
static inline int count_bits ( int nr, volatile void *addr)
{
    int i, count = 0;
    unsigned long *p = (unsigned long *)addr;
    /* We know that the array is padded to unsigned long. */
    for ( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
        count += hweight32(*p);
    return count;
}
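
/*
 * Caveat (an observation, not from the original comments): hweight32() only
 * sees the low 32 bits of *p, so on an LP64 build count_bits() undercounts;
 * it is exact when unsigned long is 32 bits wide.
 */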
static inline int permute( int i, int nr, int order_nr )
{
    /* Need a simple permutation function so that we scan pages in a
       pseudo random order, enabling us to get a better estimate of
       the domain's page dirtying rate as we go (there are often
       contiguous ranges of pfns that have similar behaviour, and we
       want to mix them up). */

    /* e.g. nr->order 15->4 16->4 17->5 */
    /* 512MB domain, 128k pages, order 17 */

    /*
      QPONMLKJIHGFEDCBA
             QPONMLKJIH
      GFEDCBA
     */

    /*
      QPONMLKJIHGFEDCBA
                  EDCBA
             QPONM
      LKJIHGF
     */

    do { i = ((i >> (order_nr - 10)) | (i << 10)) & ((1 << order_nr) - 1); }
    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */

    return i;
}
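
/*
 * In effect this is a rotate-left by 10 bits within an order_nr-bit index
 * (for order_nr >= 10): any result that falls outside [0, nr) is simply
 * rotated again, which must terminate because the rotation is a permutation
 * and the starting index is already in range.
 */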
static uint64_t tv_to_us(struct timeval *new)
{
    return (new->tv_sec * 1000000) + new->tv_usec;
}

static uint64_t llgettimeofday(void)
{
    struct timeval now;
    gettimeofday(&now, NULL);
    return tv_to_us(&now);
}

static uint64_t tv_delta(struct timeval *new, struct timeval *old)
{
    return ((new->tv_sec - old->tv_sec)*1000000 ) +
        (new->tv_usec - old->tv_usec);
}
#ifdef ADAPTIVE_SAVE

/*
** We control the rate at which we transmit (or save) to minimize impact
** on running domains (including the target if we're doing live migrate).
*/

#define MAX_MBIT_RATE    500      /* maximum transmit rate for migrate */
#define START_MBIT_RATE  100      /* initial transmit rate for migrate */

/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
#define RATE_TO_BTU      781250
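
/*
** A reading of the constant (not from the original comments): treating 1Mb
** as 2^20 bits, BURST_BUDGET bytes at mbit_rate Mb/s take
** (100*1024) / (mbit_rate * 2^20/8) seconds = 781250/mbit_rate microseconds,
** which is exactly the burst_time_us computed in ratewrite() below.
*/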
/* Amount in bytes we allow ourselves to send in a burst */
#define BURST_BUDGET (100*1024)

/* We keep track of the current and previous transmission rate */
static int mbit_rate, ombit_rate = 0;

/* Have we reached the maximum transmission rate? */
#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)

static inline void initialize_mbit_rate()
{
    mbit_rate = START_MBIT_RATE;
}
static int ratewrite(int io_fd, void *buf, int n)
{
    static int budget = 0;
    static int burst_time_us = -1;
    static struct timeval last_put = { 0 };
    struct timeval now;
    struct timespec delay;
    long long delta;

    if (START_MBIT_RATE == 0)
        return write(io_fd, buf, n);

    budget -= n;
    if (budget < 0) {
        if (mbit_rate != ombit_rate) {
            burst_time_us = RATE_TO_BTU / mbit_rate;
            ombit_rate = mbit_rate;
            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
                    mbit_rate, BURST_BUDGET, burst_time_us);
        }
        if (last_put.tv_sec == 0) {
            budget += BURST_BUDGET;
            gettimeofday(&last_put, NULL);
        } else {
            while (budget < 0) {
                gettimeofday(&now, NULL);
                delta = tv_delta(&now, &last_put);
                while (delta > burst_time_us) {
                    budget += BURST_BUDGET;
                    last_put.tv_usec += burst_time_us;
                    if (last_put.tv_usec > 1000000) {
                        last_put.tv_usec -= 1000000;
                        last_put.tv_sec++;
                    }
                    delta -= burst_time_us;
                }
                if (budget > 0)
                    break;
                delay.tv_sec = 0;
                delay.tv_nsec = 1000 * (burst_time_us - delta);
                while (delay.tv_nsec > 0)
                    if (nanosleep(&delay, &delay) == 0)
                        break;
            }
        }
    }
    return write(io_fd, buf, n);
}
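
/*
** In effect ratewrite() is a token bucket: each call spends n bytes of
** 'budget', one BURST_BUDGET of credit accrues per burst_time_us slot, and
** the writer nanosleep()s for the remainder of a slot when it runs dry.
*/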
#else /* ! ADAPTIVE SAVE */

#define RATE_IS_MAX() (0)
#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
#define initialize_mbit_rate()

#endif

static inline ssize_t write_exact(int fd, void *buf, size_t count)
{
    if (write(fd, buf, count) != count)
        return 0;
    return 1;
}
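
/*
** Note (an observation, not from the original comments): despite its name,
** write_exact() does not retry short writes; any partial write counts as
** failure. It returns 1 on success and 0 on failure, inverting the usual
** write() convention.
*/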
static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
                       xc_shadow_control_stats_t *stats, int print)
{
    static struct timeval wall_last;
    static long long      d0_cpu_last;
    static long long      d1_cpu_last;

    struct timeval        wall_now;
    long long             wall_delta;
    long long             d0_cpu_now, d0_cpu_delta;
    long long             d1_cpu_now, d1_cpu_delta;

    gettimeofday(&wall_now, NULL);

    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;

    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
        fprintf(stderr, "ARRHHH!!\n");

    wall_delta = tv_delta(&wall_now, &wall_last)/1000;

    if (wall_delta == 0) wall_delta = 1;

    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;

    if (print)
        fprintf(stderr,
                "delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
                "dirtied %dMb/s %" PRId32 " pages\n",
                wall_delta,
                (int)((d0_cpu_delta*100)/wall_delta),
                (int)((d1_cpu_delta*100)/wall_delta),
                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
                stats->dirty_count);

#ifdef ADAPTIVE_SAVE
    if (((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate) {
        mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
            + 50;
        if (mbit_rate > MAX_MBIT_RATE)
            mbit_rate = MAX_MBIT_RATE;
    }
#endif

    d0_cpu_last = d0_cpu_now;
    d1_cpu_last = d1_cpu_now;
    wall_last   = wall_now;

    return 0;
}
static int analysis_phase(int xc_handle, uint32_t domid, int max_pfn,
                          unsigned long *arr, int runs)
{
    long long start, now;
    xc_shadow_control_stats_t stats;
    int j;

    start = llgettimeofday();

    for (j = 0; j < runs; j++) {
        int i;

        xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_CLEAN,
                          arr, max_pfn, NULL);
        fprintf(stderr, "#Flush\n");
        for ( i = 0; i < 40; i++ ) {
            usleep(50000);
            now = llgettimeofday();
            xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_PEEK,
                              NULL, 0, &stats);

            fprintf(stderr, "now= %lld faults= %" PRId32 " dirty= %" PRId32
                    " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n",
                    ((now-start)+500)/1000,
                    stats.fault_count, stats.dirty_count,
                    stats.dirty_net_count, stats.dirty_block_count);
        }
    }

    return -1;
}
static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
                             int dom, xc_dominfo_t *info,
                             vcpu_guest_context_t *ctxt)
{
    int i = 0;

    if (!(*suspend)(dom)) {
        ERR("Suspend request failed");
        return -1;
    }

 retry:

    if (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) {
        ERR("Could not get domain info");
        return -1;
    }

    if ( xc_vcpu_getcontext(xc_handle, dom, 0 /* XXX */, ctxt))
        ERR("Could not get vcpu context");

    if (info->shutdown && info->shutdown_reason == SHUTDOWN_suspend)
        return 0; // success

    if (info->paused) {
        // try unpausing domain, wait, and retest
        xc_domain_unpause( xc_handle, dom );

        ERR("Domain was paused. Wait and re-test.");
        usleep(10000); // 10ms

        goto retry;
    }

    if( ++i < 100 ) {
        ERR("Retry suspend domain.");
        usleep(10000); // 10ms
        goto retry;
    }

    ERR("Unable to suspend domain.");

    return -1;
}
/*
** During transfer (or in the state file), all page-table pages must be
** converted into a 'canonical' form where references to actual mfns
** are replaced with references to the corresponding pfns.
**
** This function performs the appropriate conversion, taking into account
** which entries do not require canonicalization (in particular, those
** entries which map the virtual address reserved for the hypervisor).
*/
void canonicalize_pagetable(unsigned long type, unsigned long pfn,
                            const void *spage, void *dpage)
{

    int i, pte_last, xen_start, xen_end;
    uint64_t pte;

    /*
    ** We need to determine which entries in this page table hold
    ** reserved hypervisor mappings. This depends on the current
    ** page table type as well as the number of paging levels.
    */
    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);

    if (pt_levels == 2 && type == L2TAB)
        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);

    if (pt_levels == 3 && type == L3TAB)
        xen_start = L3_PAGETABLE_ENTRIES_PAE;

    /*
    ** In PAE only the L2 mapping the top 1GB contains Xen mappings.
    ** We can spot this by looking for the guest linear mapping which
    ** Xen always ensures is present in that L2. Guests must ensure
    ** that this check will fail for other L2s.
    */
    if (pt_levels == 3 && type == L2TAB) {

        /* XXX index of the L2 entry in PAE mode which holds the guest LPT */
#define PAE_GLPT_L2ENTRY (495)
        pte = ((uint64_t*)spage)[PAE_GLPT_L2ENTRY];

        if (((pte >> PAGE_SHIFT) & 0x0fffffff) == live_p2m[pfn])
            xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
    }

    if (pt_levels == 4 && type == L4TAB) {
        /*
        ** XXX SMH: should compute these from hvirt_start (which we have)
        ** and hvirt_end (which we don't)
        */
        xen_start = 256;
        xen_end   = 272;
    }

    /* Now iterate through the page table, canonicalizing each PTE */
    for (i = 0; i < pte_last; i++ ) {

        unsigned long pfn, mfn;

        if (pt_levels == 2)
            pte = ((uint32_t*)spage)[i];
        else
            pte = ((uint64_t*)spage)[i];

        if (i >= xen_start && i < xen_end)
            pte = 0;

        if (pte & _PAGE_PRESENT) {

            mfn = (pte >> PAGE_SHIFT) & 0xfffffff;
            if (!MFN_IS_IN_PSEUDOPHYS_MAP(mfn)) {
                /* This will happen if the type info is stale, which
                   is quite feasible under live migration. */
                DPRINTF("PT Race: [%08lx,%d] pte=%llx, mfn=%08lx\n",
                        type, i, (unsigned long long)pte, mfn);
                pfn = 0; /* zap it - we'll retransmit this page later */
            } else
                pfn = mfn_to_pfn(mfn);

            pte &= 0xffffff0000000fffULL;
            pte |= (uint64_t)pfn << PAGE_SHIFT;
        }

        if (pt_levels == 2)
            ((uint32_t*)dpage)[i] = pte;
        else
            ((uint64_t*)dpage)[i] = pte;

    }

    return;
}
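
/*
** A worked reading of the PTE rewrite above (assuming the x86 layout this
** file targets): the mask 0xffffff0000000fffULL keeps the low 12 flag bits
** and bits 40-63, clearing the 28-bit frame-number field in bits 12-39 so
** that the PFN can be substituted for the MFN.
*/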
static unsigned long *xc_map_m2p(int xc_handle,
                                 unsigned long max_mfn,
                                 int prot)
{
    struct xen_machphys_mfn_list xmml;
    privcmd_mmap_t ioctlx;
    privcmd_mmap_entry_t *entries;
    unsigned long m2p_chunks, m2p_size;
    unsigned long *m2p;
    int i, rc;

    m2p_size   = M2P_SIZE(max_mfn);
    m2p_chunks = M2P_CHUNKS(max_mfn);

    xmml.max_extents = m2p_chunks;
    if (!(xmml.extent_start = malloc(m2p_chunks * sizeof(unsigned long)))) {
        ERR("failed to allocate space for m2p mfns");
        return NULL;
    }

    if (xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
        (xmml.nr_extents != m2p_chunks)) {
        ERR("xc_get_m2p_mfns");
        return NULL;
    }

    if ((m2p = mmap(NULL, m2p_size, prot,
                    MAP_SHARED, xc_handle, 0)) == MAP_FAILED) {
        ERR("failed to mmap m2p");
        return NULL;
    }

    if (!(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t)))) {
        ERR("failed to allocate space for mmap entries");
        return NULL;
    }

    ioctlx.num   = m2p_chunks;
    ioctlx.dom   = DOMID_XEN;
    ioctlx.entry = entries;

    for (i = 0; i < m2p_chunks; i++) {
        entries[i].va     = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
        entries[i].mfn    = xmml.extent_start[i];
        entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
    }

    if ((rc = ioctl(xc_handle, IOCTL_PRIVCMD_MMAP, &ioctlx)) < 0) {
        ERR("ioctl_mmap failed (rc = %d)", rc);
        return NULL;
    }

    free(xmml.extent_start);
    free(entries);

    return m2p;
}
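
/*
** Note in passing (an observation, not from the original comments): the
** error paths above leak xmml.extent_start and entries, and may leave m2p
** mapped; this is tolerated because the caller treats a NULL return as
** fatal to the whole save.
*/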
int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
                  uint32_t max_factor, uint32_t flags, int (*suspend)(int))
{
    xc_dominfo_t info;

    int rc = 1, i, j, last_iter, iter = 0;
    int live  = (flags & XCFLAGS_LIVE);
    int debug = (flags & XCFLAGS_DEBUG);
    int sent_last_iter, skip_this_iter;

    /* The new domain's shared-info frame number. */
    unsigned long shared_info_frame;

    /* A copy of the CPU context of the guest. */
    vcpu_guest_context_t ctxt;

    /* A table containing the type of each PFN (/not/ MFN!). */
    unsigned long *pfn_type = NULL;
    unsigned long *pfn_batch = NULL;

    /* A temporary mapping, and a copy, of one frame of guest memory. */
    char page[PAGE_SIZE];

    /* Double and single indirect references to the live P2M table */
    unsigned long *live_p2m_frame_list_list = NULL;
    unsigned long *live_p2m_frame_list = NULL;

    /* A copy of the pfn-to-mfn table frame list. */
    unsigned long *p2m_frame_list = NULL;

    /* Live mapping of shared info structure */
    shared_info_t *live_shinfo = NULL;

    /* base of the region in which domain memory is mapped */
    unsigned char *region_base = NULL;

    /* power of 2 order of max_pfn */
    int order_nr;

    /* bitmap of pages:
       - that should be sent this iteration (unless later marked as skip);
       - to skip this iteration because already dirty;
       - to fixup by sending at the end if not already resent; */
    unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;

    xc_shadow_control_stats_t stats;

    unsigned long needed_to_fix = 0;
    unsigned long total_sent    = 0;
    /* If no explicit control parameters given, use defaults */
    if (!max_iters)
        max_iters = DEF_MAX_ITERS;
    if (!max_factor)
        max_factor = DEF_MAX_FACTOR;

    initialize_mbit_rate();

    if (!get_platform_info(xc_handle, dom,
                           &max_mfn, &hvirt_start, &pt_levels)) {
        ERR("Unable to get platform info.");
        return 1;
    }

    if (xc_domain_getinfo(xc_handle, dom, 1, &info) != 1) {
        ERR("Could not get domain info");
        return 1;
    }

    if (mlock(&ctxt, sizeof(ctxt))) {
        ERR("Unable to mlock ctxt");
        return 1;
    }

    /* Only have to worry about vcpu 0 even for SMP */
    if (xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt)) {
        ERR("Could not get vcpu context");
        goto out;
    }
    shared_info_frame = info.shared_info_frame;
    /* A cheesy test to see whether the domain contains valid state. */
    if (ctxt.ctrlreg[3] == 0)
    {
        ERR("Domain is not in a valid Linux guest OS state");
        goto out;
    }

    /* cheesy sanity check */
    if ((info.max_memkb >> (PAGE_SHIFT - 10)) > max_mfn) {
        ERR("Invalid state record -- pfn count out of range: %lu",
            (info.max_memkb >> (PAGE_SHIFT - 10)));
        goto out;
    }

    /* Map the shared info frame */
    if (!(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
                                             PROT_READ, shared_info_frame))) {
        ERR("Couldn't map live_shinfo");
        goto out;
    }
    max_pfn = live_shinfo->arch.max_pfn;

    live_p2m_frame_list_list =
        xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
                             live_shinfo->arch.pfn_to_mfn_frame_list_list);

    if (!live_p2m_frame_list_list) {
        ERR("Couldn't map p2m_frame_list_list (errno %d)", errno);
        goto out;
    }

    live_p2m_frame_list =
        xc_map_foreign_batch(xc_handle, dom, PROT_READ,
                             live_p2m_frame_list_list,
                             P2M_FLL_ENTRIES);

    if (!live_p2m_frame_list) {
        ERR("Couldn't map p2m_frame_list");
        goto out;
    }

    /* Map all the frames of the pfn->mfn table. For migrate to succeed,
       the guest must not change which frames are used for this purpose.
       (It's not clear why it would want to change them, and we'll be OK
       from a safety POV anyhow.) */

    live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
                                    live_p2m_frame_list,
                                    P2M_FL_ENTRIES);

    if (!live_p2m) {
        ERR("Couldn't map p2m table");
        goto out;
    }

    /* Setup the mfn_to_pfn table mapping */
    if (!(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ))) {
        ERR("Failed to map live M2P table");
        goto out;
    }
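
    /*
     * Recap of the chain just mapped: the shared info page names the
     * frame-list-list page, whose entries name frame-list pages, whose
     * entries in turn name the actual frames of the p2m table.
     */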
    /* Get a local copy of the live_p2m_frame_list */
    if (!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
        ERR("Couldn't allocate p2m_frame_list array");
        goto out;
    }
    memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);

    /* Canonicalise the pfn-to-mfn table frame-number list. */
    for (i = 0; i < max_pfn; i += ulpp) {
        if (!translate_mfn_to_pfn(&p2m_frame_list[i/ulpp])) {
            ERR("Frame# in pfn-to-mfn frame list is not in pseudophys");
            ERR("entry %d: p2m_frame_list[%ld] is 0x%lx", i, i/ulpp,
                p2m_frame_list[i/ulpp]);
            goto out;
        }
    }
    /* Domain is still running at this point */
    if (live) {

        if (xc_shadow_control(xc_handle, dom,
                              DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
                              NULL, 0, NULL) < 0) {
            ERR("Couldn't enable shadow mode");
            goto out;
        }

        last_iter = 0;

    } else {

        /* This is a non-live suspend. Issue the call back to get the
           domain suspended */

        last_iter = 1;

        if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt)) {
            ERR("Domain appears not to have suspended");
            goto out;
        }

    }

    /* pretend we sent all the pages last iteration */
    sent_last_iter = max_pfn;

    /* calculate the power of 2 order of max_pfn, e.g.
       15->4 16->4 17->5 */
    for (i = max_pfn-1, order_nr = 0; i ; i >>= 1, order_nr++)
        continue;
    /* Setup to_send / to_fix and to_skip bitmaps */
    to_send = malloc(BITMAP_SIZE);
    to_fix  = calloc(1, BITMAP_SIZE);
    to_skip = malloc(BITMAP_SIZE);

    if (!to_send || !to_fix || !to_skip) {
        ERR("Couldn't allocate to_send array");
        goto out;
    }

    memset(to_send, 0xff, BITMAP_SIZE);

    if (mlock(to_send, BITMAP_SIZE)) {
        ERR("Unable to mlock to_send");
        return 1;
    }

    /* (to_fix is local only) */
    if (mlock(to_skip, BITMAP_SIZE)) {
        ERR("Unable to mlock to_skip");
        return 1;
    }

    analysis_phase(xc_handle, dom, max_pfn, to_skip, 0);

    /* We want zeroed memory so use calloc rather than malloc. */
    pfn_type  = calloc(MAX_BATCH_SIZE, sizeof(unsigned long));
    pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(unsigned long));

    if ((pfn_type == NULL) || (pfn_batch == NULL)) {
        ERR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
        errno = ENOMEM;
        goto out;
    }

    if (mlock(pfn_type, MAX_BATCH_SIZE * sizeof(unsigned long))) {
        ERR("Unable to mlock");
        goto out;
    }
    /*
     * Quick belt and braces sanity check.
     */
    {
        int err = 0;
        unsigned long mfn;
        for (i = 0; i < max_pfn; i++) {

            mfn = live_p2m[i];
            if ((mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i)) {
                DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
                        mfn, mfn_to_pfn(mfn));
                err++;
            }
        }
        DPRINTF("Had %d unexplained entries in p2m table\n", err);
    }
    /* Start writing out the saved-domain record. */

    if (!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) {
        ERR("write: max_pfn");
        goto out;
    }

    if (!write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) {
        ERR("write: p2m_frame_list");
        goto out;
    }

    print_stats(xc_handle, dom, 0, &stats, 0);
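
    /*
     * From here on, the stream layout (as emitted below) is: repeated
     * batches of { count, pfn_type[count], page data }, a zero terminator,
     * the table of unmapped PFNs, the canonicalised vcpu context, and
     * finally the shared info page.
     */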
    /* Now write out each data page, canonicalising page tables as we go... */

    while (1) {

        unsigned int prev_pc, sent_this_iter, N, batch;

        iter++;
        sent_this_iter = 0;
        skip_this_iter = 0;
        prev_pc = 0;
        N = 0;

        DPRINTF("Saving memory pages: iter %d   0%%", iter);

        while ( N < max_pfn ) {

            unsigned int this_pc = (N * 100) / max_pfn;

            if ((this_pc - prev_pc) >= 5) {
                DPRINTF("\b\b\b\b%3d%%", this_pc);
                prev_pc = this_pc;
            }

            /* slightly wasteful to peek the whole array every time,
               but this is fast enough for the moment. */
            if (!last_iter && xc_shadow_control(
                    xc_handle, dom, DOM0_SHADOW_CONTROL_OP_PEEK,
                    to_skip, max_pfn, NULL) != max_pfn) {
                ERR("Error peeking shadow bitmap");
                goto out;
            }
            /* load pfn_type[] with the mfn of all the pages we're doing in
               this batch. */
            for (batch = 0; batch < MAX_BATCH_SIZE && N < max_pfn; N++) {

                int n = permute(N, max_pfn, order_nr);

                if (debug) {
                    DPRINTF("%d pfn= %08lx mfn= %08lx %d  [mfn]= %08lx\n",
                            iter, (unsigned long)n, live_p2m[n],
                            test_bit(n, to_send),
                            mfn_to_pfn(live_p2m[n]&0xFFFFF));
                }

                if (!last_iter && test_bit(n, to_send) && test_bit(n, to_skip))
                    skip_this_iter++; /* stats keeping */

                if (!((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
                      (test_bit(n, to_send) && last_iter) ||
                      (test_bit(n, to_fix)  && last_iter)))
                    continue;

                /*
                ** we get here if:
                **  1. page is marked to_send & hasn't already been re-dirtied
                **  2. (ignore to_skip in last iteration)
                **  3. add in pages that still need fixup (net bufs)
                */

                pfn_batch[batch] = n;
                pfn_type[batch]  = live_p2m[n];

                if (!is_mapped(pfn_type[batch])) {

                    /* not currently in the pseudo-physical map -- set bit
                       in to_fix so that we must send this page in last_iter
                       unless it's sent sooner anyhow */

                    set_bit(n, to_fix);
                    if ( (iter > 1) && IS_REAL_PFN(n) )
                        DPRINTF("netbuf race: iter %d, pfn %x. mfn %lx\n",
                                iter, n, pfn_type[batch]);
                    continue;
                }

                if (last_iter && test_bit(n, to_fix) && !test_bit(n, to_send)) {
                    needed_to_fix++;
                    DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
                            iter, n, pfn_type[batch]);
                }

                clear_bit(n, to_fix);

                batch++;
            }
            if (batch == 0)
                goto skip; /* vanishingly unlikely... */

            if ((region_base = xc_map_foreign_batch(
                     xc_handle, dom, PROT_READ, pfn_type, batch)) == 0) {
                ERR("map batch failed");
                goto out;
            }

            if (xc_get_pfn_type_batch(xc_handle, dom, batch, pfn_type)) {
                ERR("get_pfn_type_batch failed");
                goto out;
            }

            for (j = 0; j < batch; j++) {

                if ((pfn_type[j] & LTAB_MASK) == XTAB) {
                    DPRINTF("type fail: page %i mfn %08lx\n", j, pfn_type[j]);
                    continue;
                }

                if (debug)
                    fprintf(stderr, "%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
                            " sum= %08lx\n",
                            iter,
                            (pfn_type[j] & LTAB_MASK) | pfn_batch[j],
                            pfn_type[j],
                            mfn_to_pfn(pfn_type[j]&(~LTAB_MASK)),
                            csum_page(region_base + (PAGE_SIZE*j)));

                /* canonicalise mfn->pfn */
                pfn_type[j] = (pfn_type[j] & LTAB_MASK) | pfn_batch[j];
            }
            if (!write_exact(io_fd, &batch, sizeof(unsigned int))) {
                ERR("Error when writing to state file (2)");
                goto out;
            }

            if (!write_exact(io_fd, pfn_type, sizeof(unsigned long)*j)) {
                ERR("Error when writing to state file (3)");
                goto out;
            }

            /* entering this loop, pfn_type is now in pfns (not mfns) */
            for (j = 0; j < batch; j++) {

                unsigned long pfn      = pfn_type[j] & ~LTAB_MASK;
                unsigned long pagetype = pfn_type[j] &  LTAB_MASK;
                void *spage = (void *) region_base + (PAGE_SIZE*j);

                /* write out pages in batch */
                if (pagetype == XTAB)
                    continue;

                pagetype &= LTABTYPE_MASK;

                if (pagetype >= L1TAB && pagetype <= L4TAB) {

                    /* We have a pagetable page: need to rewrite it. */
                    canonicalize_pagetable(pagetype, pfn, spage, page);

                    if (ratewrite(io_fd, page, PAGE_SIZE) != PAGE_SIZE) {
                        ERR("Error when writing to state file (4)");
                        goto out;
                    }

                } else {

                    /* We have a normal page: just write it directly. */
                    if (ratewrite(io_fd, spage, PAGE_SIZE) != PAGE_SIZE) {
                        ERR("Error when writing to state file (5)");
                        goto out;
                    }
                }
            } /* end of the write out for this batch */

            sent_this_iter += batch;

            munmap(region_base, batch*PAGE_SIZE);

        } /* end of this while loop for this iteration */
  skip:

        total_sent += sent_this_iter;

        DPRINTF("\r %d: sent %d, skipped %d, ",
                iter, sent_this_iter, skip_this_iter );

        if (last_iter) {
            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);

            DPRINTF("Total pages sent= %ld (%.2fx)\n",
                    total_sent, ((float)total_sent)/max_pfn );
            DPRINTF("(of which %ld were fixups)\n", needed_to_fix );
        }

        if (last_iter && debug) {
            int minusone = -1;
            memset(to_send, 0xff, BITMAP_SIZE);
            debug = 0;
            fprintf(stderr, "Entering debug resend-all mode\n");

            /* send "-1" to put receiver into debug mode */
            if (!write_exact(io_fd, &minusone, sizeof(int))) {
                ERR("Error when writing to state file (6)");
                goto out;
            }

            continue;
        }

        if (last_iter) break;

        if (live) {

            if (
                ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
                (iter >= max_iters) ||
                (sent_this_iter+skip_this_iter < 50) ||
                (total_sent > max_pfn*max_factor) ) {

                DPRINTF("Start last iteration\n");
                last_iter = 1;

                if (suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
                                      &ctxt)) {
                    ERR("Domain appears not to have suspended");
                    goto out;
                }

                DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
                        info.shared_info_frame,
                        (unsigned long)ctxt.user_regs.eip,
                        (unsigned long)ctxt.user_regs.edx);
            }

            if (xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_CLEAN,
                                  to_send, max_pfn, &stats) != max_pfn) {
                ERR("Error flushing shadow PT");
                goto out;
            }

            sent_last_iter = sent_this_iter;

            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
        }

    } /* end of while 1 */
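
    /*
     * To recap the policy above: a live migration drops into the final
     * stop-and-copy pass once progress stalls at the rate cap, the
     * iteration or volume budgets (max_iters, max_factor) are exhausted,
     * or fewer than 50 pages were sent plus skipped this round.
     */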
1077 DPRINTF("All memory is saved\n");
1079 /* Zero terminate */
1080 i = 0;
1081 if (!write_exact(io_fd, &i, sizeof(int))) {
1082 ERR("Error when writing to state file (6)");
1083 goto out;
1086 /* Send through a list of all the PFNs that were not in map at the close */
1088 unsigned int i,j;
1089 unsigned long pfntab[1024];
1091 for (i = 0, j = 0; i < max_pfn; i++) {
1092 if (!is_mapped(live_p2m[i]))
1093 j++;
1096 if(!write_exact(io_fd, &j, sizeof(unsigned int))) {
1097 ERR("Error when writing to state file (6a)");
1098 goto out;
1101 for (i = 0, j = 0; i < max_pfn; ) {
1103 if (!is_mapped(live_p2m[i]))
1104 pfntab[j++] = i;
1106 i++;
1107 if (j == 1024 || i == max_pfn) {
1108 if(!write_exact(io_fd, &pfntab, sizeof(unsigned long)*j)) {
1109 ERR("Error when writing to state file (6b)");
1110 goto out;
1112 j = 0;
    /* Canonicalise the suspend-record frame number. */
    if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) ) {
        ERR("Suspend record is not in range of pseudophys map");
        goto out;
    }

    /* Canonicalise each GDT frame number. */
    for ( i = 0; i < ctxt.gdt_ents; i += 512 ) {
        if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) ) {
            ERR("GDT frame is not in range of pseudophys map");
            goto out;
        }
    }

    /* Canonicalise the page table base pointer. */
    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.ctrlreg[3] >> PAGE_SHIFT) ) {
        ERR("PT base is not in range of pseudophys map");
        goto out;
    }
    ctxt.ctrlreg[3] = mfn_to_pfn(ctxt.ctrlreg[3] >> PAGE_SHIFT) <<
        PAGE_SHIFT;

    if (!write_exact(io_fd, &ctxt, sizeof(ctxt)) ||
        !write_exact(io_fd, live_shinfo, PAGE_SIZE)) {
        ERR("Error when writing to state file (1)");
        goto out;
    }

    /* Success! */
    rc = 0;
 out:

    if (live) {
        if (xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_OFF,
                              NULL, 0, NULL) < 0) {
            DPRINTF("Warning - couldn't disable shadow mode");
        }
    }

    if (live_shinfo)
        munmap(live_shinfo, PAGE_SIZE);

    if (live_p2m_frame_list_list)
        munmap(live_p2m_frame_list_list, PAGE_SIZE);

    if (live_p2m_frame_list)
        munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);

    if (live_p2m)
        munmap(live_p2m, P2M_SIZE);

    if (live_m2p)
        munmap(live_m2p, M2P_SIZE(max_mfn));

    free(pfn_type);
    free(pfn_batch);
    free(to_send);
    free(to_fix);
    free(to_skip);

    DPRINTF("Save exit rc=%d\n", rc);

    return !!rc;
}
/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */