ia64/xen-unstable

view tools/libxc/xc_domain_save.c @ 16371:dfca1120813f

libxc: Consistently print errno on write() error in domain_save code.
Signed-off-by: Keir Fraser <keir@xensource.com>
author Keir Fraser <keir@xensource.com>
date Sun Nov 11 18:28:57 2007 +0000 (2007-11-11)
parents f669bf5c6720
children 0164d924ceba
1 /******************************************************************************
2 * xc_linux_save.c
3 *
4 * Save the state of a running Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <inttypes.h>
10 #include <time.h>
11 #include <stdlib.h>
12 #include <unistd.h>
13 #include <sys/time.h>
15 #include "xc_private.h"
16 #include "xc_dom.h"
17 #include "xg_private.h"
18 #include "xg_save_restore.h"
20 #include <xen/hvm/params.h>
21 #include "xc_e820.h"
23 /*
24 ** Default values for important tuning parameters. Can override by passing
25 ** non-zero replacement values to xc_domain_save().
26 **
27 ** XXX SMH: should consider whether we want to be able to override MAX_MBIT_RATE too.
28 **
29 */
30 #define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */
31 #define DEF_MAX_FACTOR 3 /* never send more than 3x p2m_size */
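/*
** For example (illustrative call only, with a caller-supplied suspend_cb):
**     xc_domain_save(xc_handle, io_fd, dom, 0, 0, XCFLAGS_LIVE,
**                    suspend_cb, 0, NULL, NULL);
** Passing zero for max_iters/max_factor selects DEF_MAX_ITERS and
** DEF_MAX_FACTOR, via the `max_iters = max_iters ? : DEF_MAX_ITERS'
** defaulting at the top of xc_domain_save() below.
*/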
33 /* max mfn of the whole machine */
34 static unsigned long max_mfn;
36 /* virtual starting address of the hypervisor */
37 static unsigned long hvirt_start;
39 /* #levels of page tables used by the current guest */
40 static unsigned int pt_levels;
42 /* HVM: shared-memory bitmaps for getting log-dirty bits from qemu-dm */
43 static unsigned long *qemu_bitmaps[2];
44 static int qemu_active;
45 static int qemu_non_active;
47 /* number of pfns this guest has (i.e. number of entries in the P2M) */
48 static unsigned long p2m_size;
50 /* Live mapping of the table mapping each PFN to its current MFN. */
51 static xen_pfn_t *live_p2m = NULL;
53 /* Live mapping of system MFN to PFN table. */
54 static xen_pfn_t *live_m2p = NULL;
55 static unsigned long m2p_mfn0;
57 /* Address size of the guest */
58 unsigned int guest_width;
60 /* grep fodder: machine_to_phys */
62 #define mfn_to_pfn(_mfn) (live_m2p[(_mfn)])
64 #define pfn_to_mfn(_pfn) \
65 ((xen_pfn_t) ((guest_width==8) \
66 ? (((uint64_t *)live_p2m)[(_pfn)]) \
67 : (((uint32_t *)live_p2m)[(_pfn)])))
69 /*
70 * Returns TRUE if the given machine frame number has a unique mapping
71 * in the guest's pseudophysical map.
72 */
73 #define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \
74 (((_mfn) < (max_mfn)) && \
75 ((mfn_to_pfn(_mfn) < (p2m_size)) && \
76 (pfn_to_mfn(mfn_to_pfn(_mfn)) == (_mfn))))
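/*
** Worked example (illustrative): for a machine frame the guest really owns,
** live_m2p[mfn] yields a pfn below p2m_size and live_p2m[pfn] maps back to
** the same mfn, so the macro is true. A frame belonging to another domain,
** or one the guest has ballooned out, fails the round trip (its M2P entry
** is out of range or does not map back), so the macro is false.
*/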
78 /*
79 ** During (live) save/migrate, we maintain a number of bitmaps to track
80 ** which pages we have to send, to fixup, and to skip.
81 */
83 #define BITS_PER_LONG (sizeof(unsigned long) * 8)
84 #define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
85 #define BITMAP_SIZE (BITS_TO_LONGS(p2m_size) * sizeof(unsigned long))
87 #define BITMAP_ENTRY(_nr,_bmap) \
88 ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
90 #define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
92 static inline int test_bit (int nr, volatile void * addr)
93 {
94 return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
95 }
97 static inline void clear_bit (int nr, volatile void * addr)
98 {
99 BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
100 }
102 static inline void set_bit ( int nr, volatile void * addr)
103 {
104 BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
105 }
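/*
** Example (illustrative): on an LP64 build BITS_PER_LONG is 64, so for
** pfn 67 BITMAP_ENTRY() picks word 67/64 = 1 and BITMAP_SHIFT() picks
** bit 67%64 = 3; set_bit(67, to_send) therefore ORs (1UL << 3) into the
** second word of the bitmap.
*/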
107 /* Returns the hamming weight (i.e. the number of bits set) in an N-bit word */
108 static inline unsigned int hweight32(unsigned int w)
109 {
110 unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
111 res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
112 res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
113 res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
114 return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
115 }
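/*
** The masks above sum adjacent 1-, 2-, 4-, 8- and then 16-bit fields in
** turn (a parallel popcount). Worked example (illustrative):
** hweight32(0xF0F00001) == 9 -- eight bits from the two 0xF nibbles plus
** the low-order one.
*/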
117 static inline int count_bits ( int nr, volatile void *addr)
118 {
119 int i, count = 0;
120 volatile unsigned long *p = (volatile unsigned long *)addr;
121 /* We know that the array is padded to unsigned long. */
122 for ( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
123 count += hweight32(*p);
124 return count;
125 }
127 static inline int permute( int i, int nr, int order_nr )
128 {
129 /* Need a simple permutation function so that we scan pages in a
130 pseudo random order, enabling us to get a better estimate of
131 the domain's page dirtying rate as we go (there are often
132 contiguous ranges of pfns that have similar behaviour, and we
133 want to mix them up). */
135 /* e.g. nr->order 15->4 16->4 17->5 */
136 /* 512MB domain, 128k pages, order 17 */
138 /*
139 QPONMLKJIHGFEDCBA
140 QPONMLKJIH
141 GFEDCBA
142 */
144 /*
145 QPONMLKJIHGFEDCBA
146 EDCBA
147 QPONM
148 LKJIHGF
149 */
151 do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
152 while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
154 return i;
155 }
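/*
** Illustrative example: for the 512MB/order-17 case above the expression
** is a rotate-left by 10 within a 17-bit index, so i=1 -> 1024, i=2 -> 2048,
** and indices with high bits set wrap round into the low bits. The do/while
** re-rotates any result that falls outside [0, nr), which cannot happen
** when nr is a power of two.
*/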
157 static uint64_t tv_to_us(struct timeval *new)
158 {
159 return (new->tv_sec * 1000000) + new->tv_usec;
160 }
162 static uint64_t llgettimeofday(void)
163 {
164 struct timeval now;
165 gettimeofday(&now, NULL);
166 return tv_to_us(&now);
167 }
169 static uint64_t tv_delta(struct timeval *new, struct timeval *old)
170 {
171 return (((new->tv_sec - old->tv_sec)*1000000) +
172 (new->tv_usec - old->tv_usec));
173 }
175 static int noncached_write(int fd, int live, void *buffer, int len)
176 {
177 static int write_count = 0;
178 int rc = (write_exact(fd, buffer, len) == 0) ? len : -1;
180 write_count += len;
181 if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
182 {
183 /* Time to discard cache - don't care if this fails */
184 discard_file_cache(fd, 0 /* no flush */);
185 write_count = 0;
186 }
188 return rc;
189 }
191 #ifdef ADAPTIVE_SAVE
193 /*
194 ** We control the rate at which we transmit (or save) to minimize impact
195 ** on running domains (including the target if we're doing live migrate).
196 */
198 #define MAX_MBIT_RATE 500 /* maximum transmit rate for migrate */
199 #define START_MBIT_RATE 100 /* initial transmit rate for migrate */
201 /* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
202 #define RATE_TO_BTU 781250
204 /* Amount in bytes we allow ourselves to send in a burst */
205 #define BURST_BUDGET (100*1024)
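/*
** Derivation (for reference): BURST_BUDGET is 100KiB = 819200 bits, i.e.
** 0.78125 Mbit if a megabit is taken as 2^20 bits. Sending one burst at
** mbit_rate Mb/s therefore takes 781250/mbit_rate microseconds, which is
** exactly how RATE_TO_BTU is used to compute burst_time_us below.
*/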
207 /* We keep track of the current and previous transmission rate */
208 static int mbit_rate, ombit_rate = 0;
210 /* Have we reached the maximum transmission rate? */
211 #define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
213 static inline void initialize_mbit_rate()
214 {
215 mbit_rate = START_MBIT_RATE;
216 }
218 static int ratewrite(int io_fd, int live, void *buf, int n)
219 {
220 static int budget = 0;
221 static int burst_time_us = -1;
222 static struct timeval last_put = { 0 };
223 struct timeval now;
224 struct timespec delay;
225 long long delta;
227 if ( START_MBIT_RATE == 0 )
228 return noncached_write(io_fd, live, buf, n);
230 budget -= n;
231 if ( budget < 0 )
232 {
233 if ( mbit_rate != ombit_rate )
234 {
235 burst_time_us = RATE_TO_BTU / mbit_rate;
236 ombit_rate = mbit_rate;
237 DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
238 mbit_rate, BURST_BUDGET, burst_time_us);
239 }
240 if ( last_put.tv_sec == 0 )
241 {
242 budget += BURST_BUDGET;
243 gettimeofday(&last_put, NULL);
244 }
245 else
246 {
247 while ( budget < 0 )
248 {
249 gettimeofday(&now, NULL);
250 delta = tv_delta(&now, &last_put);
251 while ( delta > burst_time_us )
252 {
253 budget += BURST_BUDGET;
254 last_put.tv_usec += burst_time_us;
255 if ( last_put.tv_usec > 1000000 )
256 {
257 last_put.tv_usec -= 1000000;
258 last_put.tv_sec++;
259 }
260 delta -= burst_time_us;
261 }
262 if ( budget > 0 )
263 break;
264 delay.tv_sec = 0;
265 delay.tv_nsec = 1000 * (burst_time_us - delta);
266 while ( delay.tv_nsec > 0 )
267 if ( nanosleep(&delay, &delay) == 0 )
268 break;
269 }
270 }
271 }
272 return noncached_write(io_fd, live, buf, n);
273 }
275 #else /* ! ADAPTIVE SAVE */
277 #define RATE_IS_MAX() (0)
278 #define ratewrite(_io_fd, _live, _buf, _n) noncached_write((_io_fd), (_live), (_buf), (_n))
279 #define initialize_mbit_rate()
281 #endif
283 static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
284 xc_shadow_op_stats_t *stats, int print)
285 {
286 static struct timeval wall_last;
287 static long long d0_cpu_last;
288 static long long d1_cpu_last;
290 struct timeval wall_now;
291 long long wall_delta;
292 long long d0_cpu_now, d0_cpu_delta;
293 long long d1_cpu_now, d1_cpu_delta;
295 gettimeofday(&wall_now, NULL);
297 d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
298 d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
300 if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
301 DPRINTF("ARRHHH!!\n");
303 wall_delta = tv_delta(&wall_now,&wall_last)/1000;
304 if ( wall_delta == 0 )
305 wall_delta = 1;
307 d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
308 d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
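    /* wall_delta is in milliseconds, so dividing a byte count by
       wall_delta*(1000/8) (i.e. wall_delta*125) yields megabits per
       second; the DPRINTF below relies on this. */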
310 if ( print )
311 DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
312 "dirtied %dMb/s %" PRId32 " pages\n",
313 wall_delta,
314 (int)((d0_cpu_delta*100)/wall_delta),
315 (int)((d1_cpu_delta*100)/wall_delta),
316 (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
317 (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
318 stats->dirty_count);
320 #ifdef ADAPTIVE_SAVE
321 if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
322 {
323 mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
324 + 50;
325 if ( mbit_rate > MAX_MBIT_RATE )
326 mbit_rate = MAX_MBIT_RATE;
327 }
328 #endif
330 d0_cpu_last = d0_cpu_now;
331 d1_cpu_last = d1_cpu_now;
332 wall_last = wall_now;
334 return 0;
335 }
338 static int analysis_phase(int xc_handle, uint32_t domid, int p2m_size,
339 unsigned long *arr, int runs)
340 {
341 long long start, now;
342 xc_shadow_op_stats_t stats;
343 int j;
345 start = llgettimeofday();
347 for ( j = 0; j < runs; j++ )
348 {
349 int i;
351 xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
352 arr, p2m_size, NULL, 0, NULL);
353 DPRINTF("#Flush\n");
354 for ( i = 0; i < 40; i++ )
355 {
356 usleep(50000);
357 now = llgettimeofday();
358 xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
359 NULL, 0, NULL, 0, &stats);
360 DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
361 ((now-start)+500)/1000,
362 stats.fault_count, stats.dirty_count);
363 }
364 }
366 return -1;
367 }
370 static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
371 int dom, xc_dominfo_t *info)
372 {
373 int i = 0;
375 if ( !(*suspend)(dom) )
376 {
377 ERROR("Suspend request failed");
378 return -1;
379 }
381 retry:
383 if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
384 {
385 ERROR("Could not get domain info");
386 return -1;
387 }
389 if ( info->dying )
390 {
391 ERROR("domain is dying");
392 return -1;
393 }
395 if ( info->crashed )
396 {
397 ERROR("domain has crashed");
398 return -1;
399 }
401 if ( info->shutdown )
402 {
403 switch ( info->shutdown_reason )
404 {
405 case SHUTDOWN_poweroff:
406 case SHUTDOWN_reboot:
407 ERROR("domain has shut down");
408 return -1;
409 case SHUTDOWN_suspend:
410 return 0;
411 case SHUTDOWN_crash:
412 ERROR("domain has crashed");
413 return -1;
414 }
415 }
417 if ( info->paused )
418 {
419 /* Try unpausing domain, wait, and retest. */
420 xc_domain_unpause( xc_handle, dom );
421 ERROR("Domain was paused. Wait and re-test.");
422 usleep(10000); /* 10ms */
423 goto retry;
424 }
426 if ( ++i < 100 )
427 {
428 ERROR("Retry suspend domain");
429 usleep(10000); /* 10ms */
430 goto retry;
431 }
433 ERROR("Unable to suspend domain.");
435 return -1;
436 }
438 /*
439 ** Map the top-level page of MFNs from the guest. The guest might not have
440 ** finished resuming from a previous restore operation, so we wait a while for
441 ** it to update the MFN to a reasonable value.
442 */
443 static void *map_frame_list_list(int xc_handle, uint32_t dom,
444 shared_info_either_t *shinfo)
445 {
446 int count = 100;
447 void *p;
448 uint64_t fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list);
450 while ( count-- && (fll == 0) )
451 {
452 usleep(10000);
453 fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list);
454 }
456 if ( fll == 0 )
457 {
458 ERROR("Timed out waiting for frame list to be updated.");
459 return NULL;
460 }
462 p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ, fll);
463 if ( p == NULL )
464 ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);
466 return p;
467 }
469 /*
470 ** During transfer (or in the state file), all page-table pages must be
471 ** converted into a 'canonical' form where references to actual mfns
472 ** are replaced with references to the corresponding pfns.
473 **
474 ** This function performs the appropriate conversion, taking into account
475 ** which entries do not require canonicalization (in particular, those
476 ** entries which map the virtual address reserved for the hypervisor).
477 */
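/*
** Worked example (illustrative): a present, writable 64-bit PTE referring
** to mfn 0x1234 looks like (0x1234 << PAGE_SHIFT) | 0x067. If live_m2p
** says that mfn is the guest's pfn 0x42, the canonical form written to
** dpage is (0x42 << PAGE_SHIFT) | 0x067 -- the flag bits are preserved,
** only the frame number is rewritten.
*/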
478 static int canonicalize_pagetable(unsigned long type, unsigned long pfn,
479 const void *spage, void *dpage)
480 {
482 int i, pte_last, xen_start, xen_end, race = 0;
483 uint64_t pte;
485 /*
486 ** We need to determine which entries in this page table hold
487 ** reserved hypervisor mappings. This depends on the current
488 ** page table type as well as the number of paging levels.
489 */
490 xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);
492 if ( (pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
493 xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);
495 if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
496 xen_start = L3_PAGETABLE_ENTRIES_PAE;
498 /*
499 ** in PAE only the L2 mapping the top 1GB contains Xen mappings.
500 ** We can spot this by looking for the guest linear mapping which
501 ** Xen always ensures is present in that L2. Guests must ensure
502 ** that this check will fail for other L2s.
503 */
504 if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
505 {
506 int hstart;
507 uint64_t he;
509 hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
510 he = ((const uint64_t *) spage)[hstart];
512 if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
513 {
514 /* hvirt starts with xen stuff... */
515 xen_start = hstart;
516 }
517 else if ( hvirt_start != 0xf5800000 )
518 {
519 /* old L2s from before hole was shrunk... */
520 hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
521 he = ((const uint64_t *) spage)[hstart];
522 if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
523 xen_start = hstart;
524 }
525 }
527 if ( (pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
528 {
529 /*
530 ** XXX SMH: should compute these from hvirt_start (which we have)
531 ** and hvirt_end (which we don't)
532 */
533 xen_start = 256;
534 xen_end = 272;
535 }
537 /* Now iterate through the page table, canonicalizing each PTE */
538 for (i = 0; i < pte_last; i++ )
539 {
540 unsigned long pfn, mfn;
542 if ( pt_levels == 2 )
543 pte = ((const uint32_t*)spage)[i];
544 else
545 pte = ((const uint64_t*)spage)[i];
547 if ( (i >= xen_start) && (i < xen_end) )
548 pte = 0;
550 if ( pte & _PAGE_PRESENT )
551 {
552 mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
553 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
554 {
555 /* This will happen if the type info is stale which
556 is quite feasible under live migration */
557 pfn = 0; /* zap it - we'll retransmit this page later */
558 race = 1; /* inform the caller of race; fatal if !live */
559 }
560 else
561 pfn = mfn_to_pfn(mfn);
563 pte &= ~MADDR_MASK_X86;
564 pte |= (uint64_t)pfn << PAGE_SHIFT;
566 /*
567 * PAE guest L3Es can contain these flags when running on
568 * a 64bit hypervisor. We zap these here to avoid any
569 * surprise at restore time...
570 */
571 if ( (pt_levels == 3) &&
572 (type == XEN_DOMCTL_PFINFO_L3TAB) &&
573 (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) )
574 pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
575 }
577 if ( pt_levels == 2 )
578 ((uint32_t*)dpage)[i] = pte;
579 else
580 ((uint64_t*)dpage)[i] = pte;
581 }
583 return race;
584 }
586 static xen_pfn_t *xc_map_m2p(int xc_handle,
587 unsigned long max_mfn,
588 int prot)
589 {
590 struct xen_machphys_mfn_list xmml;
591 privcmd_mmap_entry_t *entries;
592 unsigned long m2p_chunks, m2p_size;
593 xen_pfn_t *m2p;
594 xen_pfn_t *extent_start;
595 int i, rc;
597 m2p_size = M2P_SIZE(max_mfn);
598 m2p_chunks = M2P_CHUNKS(max_mfn);
600 xmml.max_extents = m2p_chunks;
601 if ( !(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t))) )
602 {
603 ERROR("failed to allocate space for m2p mfns");
604 return NULL;
605 }
606 set_xen_guest_handle(xmml.extent_start, extent_start);
608 if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
609 (xmml.nr_extents != m2p_chunks) )
610 {
611 ERROR("xc_get_m2p_mfns");
612 return NULL;
613 }
615 if ( (m2p = mmap(NULL, m2p_size, prot,
616 MAP_SHARED, xc_handle, 0)) == MAP_FAILED )
617 {
618 ERROR("failed to mmap m2p");
619 return NULL;
620 }
622 if ( !(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t))) )
623 {
624 ERROR("failed to allocate space for mmap entries");
625 return NULL;
626 }
628 for ( i = 0; i < m2p_chunks; i++ )
629 {
630 entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
631 entries[i].mfn = extent_start[i];
632 entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
633 }
635 if ( (rc = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
636 entries, m2p_chunks)) < 0 )
637 {
638 ERROR("xc_mmap_foreign_ranges failed (rc = %d)", rc);
639 return NULL;
640 }
642 m2p_mfn0 = entries[0].mfn;
644 free(extent_start);
645 free(entries);
647 return m2p;
648 }
651 static xen_pfn_t *map_and_save_p2m_table(int xc_handle,
652 int io_fd,
653 uint32_t dom,
654 unsigned long p2m_size,
655 shared_info_either_t *live_shinfo)
656 {
657 vcpu_guest_context_either_t ctxt;
659 /* Double and single indirect references to the live P2M table */
660 void *live_p2m_frame_list_list = NULL;
661 void *live_p2m_frame_list = NULL;
663 /* Copies of the above. */
664 xen_pfn_t *p2m_frame_list_list = NULL;
665 xen_pfn_t *p2m_frame_list = NULL;
667 /* The mapping of the live p2m table itself */
668 xen_pfn_t *p2m = NULL;
670 int i, success = 0;
672 live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom,
673 live_shinfo);
674 if ( !live_p2m_frame_list_list )
675 goto out;
677 /* Get a local copy of the live_P2M_frame_list_list */
678 if ( !(p2m_frame_list_list = malloc(PAGE_SIZE)) )
679 {
680 ERROR("Couldn't allocate p2m_frame_list_list array");
681 goto out;
682 }
683 memcpy(p2m_frame_list_list, live_p2m_frame_list_list, PAGE_SIZE);
685 /* Canonicalize guest's unsigned long vs ours */
686 if ( guest_width > sizeof(unsigned long) )
687 for ( i = 0; i < PAGE_SIZE/sizeof(unsigned long); i++ )
688 if ( i < PAGE_SIZE/guest_width )
689 p2m_frame_list_list[i] = ((uint64_t *)p2m_frame_list_list)[i];
690 else
691 p2m_frame_list_list[i] = 0;
692 else if ( guest_width < sizeof(unsigned long) )
693 for ( i = PAGE_SIZE/sizeof(unsigned long) - 1; i >= 0; i-- )
694 p2m_frame_list_list[i] = ((uint32_t *)p2m_frame_list_list)[i];
696 live_p2m_frame_list =
697 xc_map_foreign_batch(xc_handle, dom, PROT_READ,
698 p2m_frame_list_list,
699 P2M_FLL_ENTRIES);
700 if ( !live_p2m_frame_list )
701 {
702 ERROR("Couldn't map p2m_frame_list");
703 goto out;
704 }
706 /* Get a local copy of the live_P2M_frame_list */
707 if ( !(p2m_frame_list = malloc(P2M_FL_SIZE)) )
708 {
709 ERROR("Couldn't allocate p2m_frame_list array");
710 goto out;
711 }
712 memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);
714 /* Canonicalize guest's unsigned long vs ours */
715 if ( guest_width > sizeof(unsigned long) )
716 for ( i = 0; i < P2M_FL_ENTRIES; i++ )
717 p2m_frame_list[i] = ((uint64_t *)p2m_frame_list)[i];
718 else if ( guest_width < sizeof(unsigned long) )
719 for ( i = P2M_FL_ENTRIES - 1; i >= 0; i-- )
720 p2m_frame_list[i] = ((uint32_t *)p2m_frame_list)[i];
723 /* Map all the frames of the pfn->mfn table. For migrate to succeed,
724 the guest must not change which frames are used for this purpose.
725 (it's not clear why it would want to change them, and we'll be OK
726 from a safety POV anyhow.) */
728 p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
729 p2m_frame_list,
730 P2M_FL_ENTRIES);
731 if ( !p2m )
732 {
733 ERROR("Couldn't map p2m table");
734 goto out;
735 }
736 live_p2m = p2m; /* So that translation macros will work */
738 /* Canonicalise the pfn-to-mfn table frame-number list. */
739 for ( i = 0; i < p2m_size; i += FPP )
740 {
741 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(p2m_frame_list[i/FPP]) )
742 {
743 ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
744 ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64", max 0x%lx",
745 i, i/FPP, (uint64_t)p2m_frame_list[i/FPP], max_mfn);
746 if ( p2m_frame_list[i/FPP] < max_mfn )
747 {
748 ERROR("m2p[0x%"PRIx64"] = 0x%"PRIx64,
749 (uint64_t)p2m_frame_list[i/FPP],
750 (uint64_t)live_m2p[p2m_frame_list[i/FPP]]);
751 ERROR("p2m[0x%"PRIx64"] = 0x%"PRIx64,
752 (uint64_t)live_m2p[p2m_frame_list[i/FPP]],
753 (uint64_t)p2m[live_m2p[p2m_frame_list[i/FPP]]]);
755 }
756 goto out;
757 }
758 p2m_frame_list[i/FPP] = mfn_to_pfn(p2m_frame_list[i/FPP]);
759 }
761 if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt.c) )
762 {
763 ERROR("Could not get vcpu context");
764 goto out;
765 }
767 /*
768 * Write an extended-info structure to inform the restore code that
769 * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
770 * slow paths in the restore code.
771 */
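/*
** On-wire layout produced by the block below, for reference: an all-ones
** unsigned long acting as a signature, a 32-bit total size, a "vcpu"
** chunk (4-byte tag, 32-bit length, then the vcpu context itself), and
** finally an empty "extv" chunk (4-byte tag, zero length).
*/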
772 {
773 unsigned long signature = ~0UL;
774 uint32_t chunk1_sz = ((guest_width==8)
775 ? sizeof(ctxt.x64)
776 : sizeof(ctxt.x32));
777 uint32_t chunk2_sz = 0;
778 uint32_t tot_sz = (chunk1_sz + 8) + (chunk2_sz + 8);
779 if ( write_exact(io_fd, &signature, sizeof(signature)) ||
780 write_exact(io_fd, &tot_sz, sizeof(tot_sz)) ||
781 write_exact(io_fd, "vcpu", 4) ||
782 write_exact(io_fd, &chunk1_sz, sizeof(chunk1_sz)) ||
783 write_exact(io_fd, &ctxt, chunk1_sz) ||
784 write_exact(io_fd, "extv", 4) ||
785 write_exact(io_fd, &chunk2_sz, sizeof(chunk2_sz)) )
786 {
787 PERROR("write: extended info");
788 goto out;
789 }
790 }
792 if ( write_exact(io_fd, p2m_frame_list,
793 P2M_FL_ENTRIES * sizeof(xen_pfn_t)) )
794 {
795 PERROR("write: p2m_frame_list");
796 goto out;
797 }
799 success = 1;
801 out:
803 if ( !success && p2m )
804 munmap(p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
806 if ( live_p2m_frame_list_list )
807 munmap(live_p2m_frame_list_list, PAGE_SIZE);
809 if ( live_p2m_frame_list )
810 munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
812 if ( p2m_frame_list_list )
813 free(p2m_frame_list_list);
815 if ( p2m_frame_list )
816 free(p2m_frame_list);
818 return success ? p2m : NULL;
819 }
823 int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
824 uint32_t max_factor, uint32_t flags, int (*suspend)(int),
825 int hvm, void *(*init_qemu_maps)(int, unsigned),
826 void (*qemu_flip_buffer)(int, int))
827 {
828 xc_dominfo_t info;
829 DECLARE_DOMCTL;
831 int rc = 1, frc, i, j, last_iter, iter = 0;
832 int live = (flags & XCFLAGS_LIVE);
833 int debug = (flags & XCFLAGS_DEBUG);
834 int race = 0, sent_last_iter, skip_this_iter;
836 /* The new domain's shared-info frame number. */
837 unsigned long shared_info_frame;
839 /* A copy of the CPU context of the guest. */
840 vcpu_guest_context_either_t ctxt;
842 /* A table containing the type of each PFN (/not/ MFN!). */
843 unsigned long *pfn_type = NULL;
844 unsigned long *pfn_batch = NULL;
846 /* A copy of one frame of guest memory. */
847 char page[PAGE_SIZE];
849 /* Live mapping of shared info structure */
850 shared_info_either_t *live_shinfo = NULL;
852 /* base of the region in which domain memory is mapped */
853 unsigned char *region_base = NULL;
855 /* power of 2 order of p2m_size */
856 int order_nr;
858 /* bitmap of pages:
859 - that should be sent this iteration (unless later marked as skip);
860 - to skip this iteration because already dirty;
861 - to fixup by sending at the end if not already resent; */
862 unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
864 xc_shadow_op_stats_t stats;
866 unsigned long needed_to_fix = 0;
867 unsigned long total_sent = 0;
869 uint64_t vcpumap = 1ULL;
871 /* HVM: a buffer for holding HVM context */
872 uint32_t hvm_buf_size = 0;
873 uint8_t *hvm_buf = NULL;
875 /* HVM: magic frames for ioreqs and xenstore comms. */
876 uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
878 unsigned long mfn;
880 /* If no explicit control parameters given, use defaults */
881 max_iters = max_iters ? : DEF_MAX_ITERS;
882 max_factor = max_factor ? : DEF_MAX_FACTOR;
884 initialize_mbit_rate();
886 if ( !get_platform_info(xc_handle, dom,
887 &max_mfn, &hvirt_start, &pt_levels, &guest_width) )
888 {
889 ERROR("Unable to get platform info.");
890 return 1;
891 }
893 if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
894 {
895 ERROR("Could not get domain info");
896 return 1;
897 }
899 shared_info_frame = info.shared_info_frame;
901 /* Map the shared info frame */
902 if ( !hvm )
903 {
904 live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
905 PROT_READ, shared_info_frame);
906 if ( !live_shinfo )
907 {
908 ERROR("Couldn't map live_shinfo");
909 goto out;
910 }
911 }
913 /* Get the size of the P2M table */
914 p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom) + 1;
916 /* Domain is still running at this point */
917 if ( live )
918 {
919 /* Live suspend. Enable log-dirty mode. */
920 if ( xc_shadow_control(xc_handle, dom,
921 XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
922 NULL, 0, NULL, 0, NULL) < 0 )
923 {
924 /* log-dirty already enabled? There's no test op,
925 so attempt to disable then reenable it */
926 frc = xc_shadow_control(xc_handle, dom, XEN_DOMCTL_SHADOW_OP_OFF,
927 NULL, 0, NULL, 0, NULL);
928 if ( frc >= 0 )
929 {
930 frc = xc_shadow_control(xc_handle, dom,
931 XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
932 NULL, 0, NULL, 0, NULL);
933 }
935 if ( frc < 0 )
936 {
937 ERROR("Couldn't enable shadow mode (rc %d) (errno %d)", frc, errno );
938 goto out;
939 }
940 }
942 if ( hvm )
943 {
944 /* Get qemu-dm logging dirty pages too */
945 void *seg = init_qemu_maps(dom, BITMAP_SIZE);
946 qemu_bitmaps[0] = seg;
947 qemu_bitmaps[1] = seg + BITMAP_SIZE;
948 qemu_active = 0;
949 qemu_non_active = 1;
950 }
951 }
952 else
953 {
954 /* This is a non-live suspend. Suspend the domain .*/
955 if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info) )
956 {
957 ERROR("Domain appears not to have suspended");
958 goto out;
959 }
960 }
962 last_iter = !live;
964 /* pretend we sent all the pages last iteration */
965 sent_last_iter = p2m_size;
967 /* calculate the power of 2 order of p2m_size, e.g.
968 15->4 16->4 17->5 */
969 for ( i = p2m_size-1, order_nr = 0; i ; i >>= 1, order_nr++ )
970 continue;
972 /* Setup to_send / to_fix and to_skip bitmaps */
973 to_send = malloc(BITMAP_SIZE);
974 to_fix = calloc(1, BITMAP_SIZE);
975 to_skip = malloc(BITMAP_SIZE);
977 if ( !to_send || !to_fix || !to_skip )
978 {
979 ERROR("Couldn't allocate to_send array");
980 goto out;
981 }
983 memset(to_send, 0xff, BITMAP_SIZE);
985 if ( lock_pages(to_send, BITMAP_SIZE) )
986 {
987 ERROR("Unable to lock to_send");
988 return 1;
989 }
991 /* (to fix is local only) */
992 if ( lock_pages(to_skip, BITMAP_SIZE) )
993 {
994 ERROR("Unable to lock to_skip");
995 return 1;
996 }
998 if ( hvm )
999 {
1000 /* Need another buffer for HVM context */
1001 hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0);
1002 if ( hvm_buf_size == -1 )
1004 ERROR("Couldn't get HVM context size from Xen");
1005 goto out;
1007 hvm_buf = malloc(hvm_buf_size);
1008 if ( !hvm_buf )
1010 ERROR("Couldn't allocate memory");
1011 goto out;
1015 analysis_phase(xc_handle, dom, p2m_size, to_skip, 0);
1017 /* We want zeroed memory so use calloc rather than malloc. */
1018 pfn_type = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type));
1019 pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
1020 if ( (pfn_type == NULL) || (pfn_batch == NULL) )
1022 ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
1023 errno = ENOMEM;
1024 goto out;
1027 if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
1029 ERROR("Unable to lock");
1030 goto out;
1033 /* Setup the mfn_to_pfn table mapping */
1034 if ( !(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ)) )
1036 ERROR("Failed to map live M2P table");
1037 goto out;
1040 /* Start writing out the saved-domain record. */
1041 if ( write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
1043 PERROR("write: p2m_size");
1044 goto out;
1047 if ( !hvm )
1049 int err = 0;
1051 /* Map the P2M table, and write the list of P2M frames */
1052 live_p2m = map_and_save_p2m_table(xc_handle, io_fd, dom,
1053 p2m_size, live_shinfo);
1054 if ( live_p2m == NULL )
1056 ERROR("Failed to map/save the p2m frame list");
1057 goto out;
1060 /*
1061 * Quick belt and braces sanity check.
1062 */
1064 for ( i = 0; i < p2m_size; i++ )
1066 mfn = pfn_to_mfn(i);
1067 if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) )
1069 DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
1070 mfn, mfn_to_pfn(mfn));
1071 err++;
1074 DPRINTF("Had %d unexplained entries in p2m table\n", err);
1077 print_stats(xc_handle, dom, 0, &stats, 0);
1079 /* Now write out each data page, canonicalising page tables as we go... */
1080 for ( ; ; )
1082 unsigned int prev_pc, sent_this_iter, N, batch;
1084 iter++;
1085 sent_this_iter = 0;
1086 skip_this_iter = 0;
1087 prev_pc = 0;
1088 N = 0;
1090 DPRINTF("Saving memory pages: iter %d 0%%", iter);
1092 while ( N < p2m_size )
1094 unsigned int this_pc = (N * 100) / p2m_size;
1096 if ( (this_pc - prev_pc) >= 5 )
1098 DPRINTF("\b\b\b\b%3d%%", this_pc);
1099 prev_pc = this_pc;
1102 if ( !last_iter )
1104 /* Slightly wasteful to peek the whole array every time,
1105 but this is fast enough for the moment. */
1106 frc = xc_shadow_control(
1107 xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip,
1108 p2m_size, NULL, 0, NULL);
1109 if ( frc != p2m_size )
1111 ERROR("Error peeking shadow bitmap");
1112 goto out;
1116 /* load pfn_type[] with the mfn of all the pages we're doing in
1117 this batch. */
1118 for ( batch = 0;
1119 (batch < MAX_BATCH_SIZE) && (N < p2m_size);
1120 N++ )
1122 int n = permute(N, p2m_size, order_nr);
1124 if ( debug )
1126 DPRINTF("%d pfn= %08lx mfn= %08lx %d",
1127 iter, (unsigned long)n,
1128 hvm ? 0 : pfn_to_mfn(n),
1129 test_bit(n, to_send));
1130 if ( !hvm && is_mapped(pfn_to_mfn(n)) )
1131 DPRINTF(" [mfn]= %08lx",
1132 mfn_to_pfn(pfn_to_mfn(n)&0xFFFFF));
1133 DPRINTF("\n");
1135 if ( !last_iter &&
1136 test_bit(n, to_send) &&
1137 test_bit(n, to_skip) )
1138 skip_this_iter++; /* stats keeping */
1140 if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
1141 (test_bit(n, to_send) && last_iter) ||
1142 (test_bit(n, to_fix) && last_iter)) )
1143 continue;
1145 /* Skip PFNs that aren't really there */
1146 if ( hvm && ((n >= 0xa0 && n < 0xc0) /* VGA hole */
1147 || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT)
1148 && n < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */ )
1149 continue;
1151 /*
1152 ** we get here if:
1153 ** 1. page is marked to_send & hasn't already been re-dirtied
1154 ** 2. (ignore to_skip in last iteration)
1155 ** 3. add in pages that still need fixup (net bufs)
1156 */
1158 pfn_batch[batch] = n;
1160 /* Hypercall interfaces operate in PFNs for HVM guests
1161 * and MFNs for PV guests */
1162 if ( hvm )
1163 pfn_type[batch] = n;
1164 else
1165 pfn_type[batch] = pfn_to_mfn(n);
1167 if ( !is_mapped(pfn_type[batch]) )
1169 /*
1170 ** not currently in pseudo-physical map -- set bit
1171 ** in to_fix since we must send this page in last_iter
1172 ** unless it's sent sooner anyhow, or it never enters
1173 ** pseudo-physical map (e.g. for ballooned down doms)
1174 */
1175 set_bit(n, to_fix);
1176 continue;
1179 if ( last_iter &&
1180 test_bit(n, to_fix) &&
1181 !test_bit(n, to_send) )
1183 needed_to_fix++;
1184 DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
1185 iter, n, pfn_type[batch]);
1188 clear_bit(n, to_fix);
1190 batch++;
1193 if ( batch == 0 )
1194 goto skip; /* vanishingly unlikely... */
1196 region_base = xc_map_foreign_batch(
1197 xc_handle, dom, PROT_READ, pfn_type, batch);
1198 if ( region_base == NULL )
1200 ERROR("map batch failed");
1201 goto out;
1204 if ( !hvm )
1206 /* Get page types */
1207 for ( j = 0; j < batch; j++ )
1208 ((uint32_t *)pfn_type)[j] = pfn_type[j];
1209 if ( xc_get_pfn_type_batch(xc_handle, dom, batch,
1210 (uint32_t *)pfn_type) )
1212 ERROR("get_pfn_type_batch failed");
1213 goto out;
1215 for ( j = batch-1; j >= 0; j-- )
1216 pfn_type[j] = ((uint32_t *)pfn_type)[j];
1218 for ( j = 0; j < batch; j++ )
1221 if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
1222 XEN_DOMCTL_PFINFO_XTAB )
1224 DPRINTF("type fail: page %i mfn %08lx\n",
1225 j, pfn_type[j]);
1226 continue;
1229 if ( debug )
1230 DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
1231 " sum= %08lx\n",
1232 iter,
1233 (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
1234 pfn_batch[j],
1235 pfn_type[j],
1236 mfn_to_pfn(pfn_type[j] &
1237 ~XEN_DOMCTL_PFINFO_LTAB_MASK),
1238 csum_page(region_base + (PAGE_SIZE*j)));
1240 /* canonicalise mfn->pfn */
1241 pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
1242 pfn_batch[j];
1246 if ( write_exact(io_fd, &batch, sizeof(unsigned int)) )
1248 PERROR("Error when writing to state file (2)");
1249 goto out;
1252 if ( write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) )
1254 PERROR("Error when writing to state file (3)");
1255 goto out;
1258 /* entering this loop, pfn_type is now in pfns (Not mfns) */
1259 for ( j = 0; j < batch; j++ )
1261 unsigned long pfn, pagetype;
1262 void *spage = (char *)region_base + (PAGE_SIZE*j);
1264 pfn = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
1265 pagetype = pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK;
1267 /* write out pages in batch */
1268 if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
1269 continue;
1271 pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
1273 if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
1274 (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
1276 /* We have a pagetable page: need to rewrite it. */
1277 race =
1278 canonicalize_pagetable(pagetype, pfn, spage, page);
1280 if ( race && !live )
1282 ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn,
1283 pagetype);
1284 goto out;
1287 if ( ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE )
1289 ERROR("Error when writing to state file (4)"
1290 " (errno %d)", errno);
1291 goto out;
1294 else
1296 /* We have a normal page: just write it directly. */
1297 if ( ratewrite(io_fd, live, spage, PAGE_SIZE) !=
1298 PAGE_SIZE )
1300 ERROR("Error when writing to state file (5)"
1301 " (errno %d)", errno);
1302 goto out;
1305 } /* end of the write out for this batch */
1307 sent_this_iter += batch;
1309 munmap(region_base, batch*PAGE_SIZE);
1311 } /* end of this while loop for this iteration */
1313 skip:
1315 total_sent += sent_this_iter;
1317 DPRINTF("\r %d: sent %d, skipped %d, ",
1318 iter, sent_this_iter, skip_this_iter );
1320 if ( last_iter )
1322 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
1324 DPRINTF("Total pages sent= %ld (%.2fx)\n",
1325 total_sent, ((float)total_sent)/p2m_size );
1326 DPRINTF("(of which %ld were fixups)\n", needed_to_fix );
1329 if ( last_iter && debug )
1331 int minusone = -1;
1332 memset(to_send, 0xff, BITMAP_SIZE);
1333 debug = 0;
1334 DPRINTF("Entering debug resend-all mode\n");
1336 /* send "-1" to put receiver into debug mode */
1337 if ( write_exact(io_fd, &minusone, sizeof(int)) )
1339 PERROR("Error when writing to state file (6)");
1340 goto out;
1343 continue;
1346 if ( last_iter )
1347 break;
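/*
** The live loop ends (and a final stop-and-copy pass begins) when any of
** the following holds: we are already at the maximum transmit rate yet
** sent more pages than in the previous iteration; the iteration limit
** max_iters has been reached; fewer than 50 pages were sent or skipped
** this round; or the total sent exceeds max_factor times the p2m size.
*/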
1349 if ( live )
1351 if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
1352 (iter >= max_iters) ||
1353 (sent_this_iter+skip_this_iter < 50) ||
1354 (total_sent > p2m_size*max_factor) )
1356 DPRINTF("Start last iteration\n");
1357 last_iter = 1;
1359 if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info) )
1361 ERROR("Domain appears not to have suspended");
1362 goto out;
1365 DPRINTF("SUSPEND shinfo %08lx\n", info.shared_info_frame);
1368 if ( xc_shadow_control(xc_handle, dom,
1369 XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
1370 p2m_size, NULL, 0, &stats) != p2m_size )
1372 ERROR("Error flushing shadow PT");
1373 goto out;
1376 if ( hvm )
1378 /* Pull in the dirty bits from qemu-dm too */
1379 if ( !last_iter )
1381 qemu_active = qemu_non_active;
1382 qemu_non_active = qemu_active ? 0 : 1;
1383 qemu_flip_buffer(dom, qemu_active);
1384 for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
1386 to_send[j] |= qemu_bitmaps[qemu_non_active][j];
1387 qemu_bitmaps[qemu_non_active][j] = 0;
1390 else
1392 for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
1393 to_send[j] |= qemu_bitmaps[qemu_active][j];
1397 sent_last_iter = sent_this_iter;
1399 print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
1402 } /* end of infinite for loop */
1404 DPRINTF("All memory is saved\n");
1407 struct {
1408 int minustwo;
1409 int max_vcpu_id;
1410 uint64_t vcpumap;
1411 } chunk = { -2, info.max_vcpu_id };
1413 if ( info.max_vcpu_id >= 64 )
1415 ERROR("Too many VCPUS in guest!");
1416 goto out;
1419 for ( i = 1; i <= info.max_vcpu_id; i++ )
1421 xc_vcpuinfo_t vinfo;
1422 if ( (xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
1423 vinfo.online )
1424 vcpumap |= 1ULL << i;
1427 chunk.vcpumap = vcpumap;
1428 if ( write_exact(io_fd, &chunk, sizeof(chunk)) )
1430 PERROR("Error when writing to state file");
1431 goto out;
1435 /* Zero terminate */
1436 i = 0;
1437 if ( write_exact(io_fd, &i, sizeof(int)) )
1439 PERROR("Error when writing to state file (6')");
1440 goto out;
1443 if ( hvm )
1445 uint32_t rec_size;
1447 /* Save magic-page locations. */
1448 memset(magic_pfns, 0, sizeof(magic_pfns));
1449 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
1450 (unsigned long *)&magic_pfns[0]);
1451 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
1452 (unsigned long *)&magic_pfns[1]);
1453 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
1454 (unsigned long *)&magic_pfns[2]);
1455 if ( write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
1457 PERROR("Error when writing to state file (7)");
1458 goto out;
1461 /* Get HVM context from Xen and save it too */
1462 if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf,
1463 hvm_buf_size)) == -1 )
1465 ERROR("HVM:Could not get hvm buffer");
1466 goto out;
1469 if ( write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
1471 PERROR("error write hvm buffer size");
1472 goto out;
1475 if ( write_exact(io_fd, hvm_buf, rec_size) )
1477 PERROR("write HVM info failed!\n");
1478 goto out;
1481 /* HVM guests are done now */
1482 rc = 0;
1483 goto out;
1486 /* PV guests only from now on */
1488 /* Send through a list of all the PFNs that were not in map at the close */
1490 unsigned int i,j;
1491 unsigned long pfntab[1024];
1493 for ( i = 0, j = 0; i < p2m_size; i++ )
1495 if ( !is_mapped(pfn_to_mfn(i)) )
1496 j++;
1499 if ( write_exact(io_fd, &j, sizeof(unsigned int)) )
1501 PERROR("Error when writing to state file (6a)");
1502 goto out;
1505 for ( i = 0, j = 0; i < p2m_size; )
1507 if ( !is_mapped(pfn_to_mfn(i)) )
1508 pfntab[j++] = i;
1510 i++;
1511 if ( (j == 1024) || (i == p2m_size) )
1513 if ( write_exact(io_fd, &pfntab, sizeof(unsigned long)*j) )
1515 PERROR("Error when writing to state file (6b)");
1516 goto out;
1518 j = 0;
1523 if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt.c) )
1525 ERROR("Could not get vcpu context");
1526 goto out;
1529 /* Canonicalise the suspend-record frame number. */
1530 mfn = GET_FIELD(&ctxt, user_regs.edx);
1531 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
1533 ERROR("Suspend record is not in range of pseudophys map");
1534 goto out;
1536 SET_FIELD(&ctxt, user_regs.edx, mfn_to_pfn(mfn));
1538 for ( i = 0; i <= info.max_vcpu_id; i++ )
1540 if ( !(vcpumap & (1ULL << i)) )
1541 continue;
1543 if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt.c) )
1545 ERROR("No context for VCPU%d", i);
1546 goto out;
1549 /* Canonicalise each GDT frame number. */
1550 for ( j = 0; (512*j) < GET_FIELD(&ctxt, gdt_ents); j++ )
1552 mfn = GET_FIELD(&ctxt, gdt_frames[j]);
1553 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
1555 ERROR("GDT frame is not in range of pseudophys map");
1556 goto out;
1558 SET_FIELD(&ctxt, gdt_frames[j], mfn_to_pfn(mfn));
1561 /* Canonicalise the page table base pointer. */
1562 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(
1563 GET_FIELD(&ctxt, ctrlreg[3]))) )
1565 ERROR("PT base is not in range of pseudophys map");
1566 goto out;
1568 SET_FIELD(&ctxt, ctrlreg[3],
1569 xen_pfn_to_cr3(
1570 mfn_to_pfn(
1571 xen_cr3_to_pfn(
1572 GET_FIELD(&ctxt, ctrlreg[3])))));
1574 /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
1575 if ( (pt_levels == 4) && ctxt.x64.ctrlreg[1] )
1577 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(
1578 xen_cr3_to_pfn(ctxt.x64.ctrlreg[1])) )
1580 ERROR("PT base is not in range of pseudophys map");
1581 goto out;
1583 /* Least-significant bit means 'valid PFN'. */
1584 ctxt.x64.ctrlreg[1] = 1 |
1585 xen_pfn_to_cr3(
1586 mfn_to_pfn(xen_cr3_to_pfn(ctxt.x64.ctrlreg[1])));
1589 if ( write_exact(io_fd, &ctxt, ((guest_width==8)
1590 ? sizeof(ctxt.x64)
1591 : sizeof(ctxt.x32))) )
1593 PERROR("Error when writing to state file (1)");
1594 goto out;
1597 domctl.cmd = XEN_DOMCTL_get_ext_vcpucontext;
1598 domctl.domain = dom;
1599 domctl.u.ext_vcpucontext.vcpu = i;
1600 if ( xc_domctl(xc_handle, &domctl) < 0 )
1602 ERROR("No extended context for VCPU%d", i);
1603 goto out;
1605 if ( write_exact(io_fd, &domctl.u.ext_vcpucontext, 128) )
1607 PERROR("Error when writing to state file (2)");
1608 goto out;
1612 /*
1613 * Reset the MFN to be a known-invalid value. See map_frame_list_list().
1614 */
1615 memcpy(page, live_shinfo, PAGE_SIZE);
1616 SET_FIELD(((shared_info_either_t *)page),
1617 arch.pfn_to_mfn_frame_list_list, 0);
1618 if ( write_exact(io_fd, page, PAGE_SIZE) )
1620 PERROR("Error when writing to state file (1)");
1621 goto out;
1624 /* Success! */
1625 rc = 0;
1627 out:
1629 if ( live )
1631 if ( xc_shadow_control(xc_handle, dom,
1632 XEN_DOMCTL_SHADOW_OP_OFF,
1633 NULL, 0, NULL, 0, NULL) < 0 )
1634 DPRINTF("Warning - couldn't disable shadow mode");
1637 /* Flush last write and discard cache for file. */
1638 discard_file_cache(io_fd, 1 /* flush */);
1640 if ( live_shinfo )
1641 munmap(live_shinfo, PAGE_SIZE);
1643 if ( live_p2m )
1644 munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
1646 if ( live_m2p )
1647 munmap(live_m2p, M2P_SIZE(max_mfn));
1649 free(pfn_type);
1650 free(pfn_batch);
1651 free(to_send);
1652 free(to_fix);
1653 free(to_skip);
1655 DPRINTF("Save exit rc=%d\n",rc);
1657 return !!rc;
1660 /*
1661 * Local variables:
1662 * mode: C
1663 * c-set-style: "BSD"
1664 * c-basic-offset: 4
1665 * tab-width: 4
1666 * indent-tabs-mode: nil
1667 * End:
1668 */