
tools/libxc/xc_domain_save.c @ 18394:dade7f0bdc8d

hvm: Use main memory for video memory.

When creating an HVM domain, if, for example, another domain is created
before qemu allocates video memory, the extra 8MB of memory for
ballooning is no longer available, because it has been consumed by the
other domain.

This patch fixes the problem by taking video memory from main memory:

- make hvmloader use e820_malloc to reserve some of the main memory
and notify ioemu of its address through the Xen platform PCI card.
- add XENMAPSPACE_mfn to the xen_add_to_physmap memory op, to allow
ioemu to move the MFNs between the original position and the PCI
mapping when LFB acceleration is disabled/enabled (see the sketch
after this list)
- add a remove_from_physmap memory op, to allow ioemu to unmap it
completely for the case of old guests with acceleration disabled.
- add xc_domain_memory_translate_gpfn_list to libxc to allow ioemu to
get the MFNs of the video memory.
- have xend save the PCI memory space instead of ioemu: if a memory
page is there, the guest can access it like normal memory, so xend
can safely be responsible for saving it. The extra benefit is that
live migration will apply the logdirty optimization there too.
- handle old saved images, populating the video memory from ioemu if
really needed.

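As a rough editorial illustration of the XENMAPSPACE_mfn item above, the sketch
below shows how ioemu might relocate a single video-memory page to a new
guest-physical address. It is a hedged sketch, not part of this changeset: the
helper name and the field usage (idx holding the MFN to move, gpfn the
destination frame) are assumptions; only xc_memory_op(), XENMEM_add_to_physmap
and struct xen_add_to_physmap are taken from the existing interfaces used in
this file.

#include <string.h>
#include <xen/memory.h>
#include "xc_private.h"      /* xc_memory_op(), as used elsewhere in this file */

/* Hypothetical helper: move one video-memory page to a new guest frame.
 * Assumes XENMAPSPACE_mfn interprets idx as the MFN to relocate and gpfn
 * as the guest-physical frame where it should now appear. */
static int move_vram_page(int xc_handle, uint32_t dom,
                          unsigned long mfn, unsigned long new_gpfn)
{
    struct xen_add_to_physmap xatp;

    memset(&xatp, 0, sizeof(xatp));
    xatp.domid = dom;
    xatp.space = XENMAPSPACE_mfn;   /* mapping space added by this change */
    xatp.idx   = mfn;               /* machine frame to relocate */
    xatp.gpfn  = new_gpfn;          /* where the guest should now see it */

    return xc_memory_op(xc_handle, XENMEM_add_to_physmap, &xatp);
}
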
Signed-off-by: Samuel Thibault <samuel.thibault@eu.citrix.com>
Author:   Keir Fraser <keir.fraser@citrix.com>
Date:     Wed Aug 27 14:53:39 2008 +0100
Parents:  7299346111fb
Children: a7586ec158d0
1 /******************************************************************************
2 * xc_domain_save.c
3 *
4 * Save the state of a running domain.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <inttypes.h>
10 #include <time.h>
11 #include <stdlib.h>
12 #include <unistd.h>
13 #include <sys/time.h>
15 #include "xc_private.h"
16 #include "xc_dom.h"
17 #include "xg_private.h"
18 #include "xg_save_restore.h"
20 #include <xen/hvm/params.h>
21 #include "xc_e820.h"
23 /*
24 ** Default values for important tuning parameters. These can be overridden
25 ** by passing non-zero replacement values to xc_domain_save().
26 **
27 ** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
28 **
29 */
30 #define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */
31 #define DEF_MAX_FACTOR 3 /* never send more than 3x p2m_size */
33 /* max mfn of the whole machine */
34 static unsigned long max_mfn;
36 /* virtual starting address of the hypervisor */
37 static unsigned long hvirt_start;
39 /* #levels of page tables used by the current guest */
40 static unsigned int pt_levels;
42 /* HVM: shared-memory bitmaps for getting log-dirty bits from qemu-dm */
43 static unsigned long *qemu_bitmaps[2];
44 static int qemu_active;
45 static int qemu_non_active;
47 /* number of pfns this guest has (i.e. number of entries in the P2M) */
48 static unsigned long p2m_size;
50 /* Live mapping of the table mapping each PFN to its current MFN. */
51 static xen_pfn_t *live_p2m = NULL;
53 /* Live mapping of system MFN to PFN table. */
54 static xen_pfn_t *live_m2p = NULL;
55 static unsigned long m2p_mfn0;
57 /* Address size of the guest */
58 unsigned int guest_width;
60 /* grep fodder: machine_to_phys */
62 #define mfn_to_pfn(_mfn) (live_m2p[(_mfn)])
64 #define pfn_to_mfn(_pfn) \
65 ((xen_pfn_t) ((guest_width==8) \
66 ? (((uint64_t *)live_p2m)[(_pfn)]) \
67 : ((((uint32_t *)live_p2m)[(_pfn)]) == 0xffffffffU \
68 ? (-1UL) : (((uint32_t *)live_p2m)[(_pfn)]))))
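A note on the 32-bit branch of pfn_to_mfn() above, derived from the macro
itself: a 32-bit guest stores an invalid p2m entry as 0xffffffffU, so when the
tools are 64-bit the macro widens that value to -1UL explicitly. A plain
zero-extension would instead give 0x00000000ffffffffUL, and later comparisons
such as (mfn != INVALID_P2M_ENTRY) would mistake the entry for a valid MFN.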
70 /*
71 * Returns TRUE if the given machine frame number has a unique mapping
72 * in the guest's pseudophysical map.
73 */
74 #define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \
75 (((_mfn) < (max_mfn)) && \
76 ((mfn_to_pfn(_mfn) < (p2m_size)) && \
77 (pfn_to_mfn(mfn_to_pfn(_mfn)) == (_mfn))))
79 /*
80 ** During (live) save/migrate, we maintain a number of bitmaps to track
81 ** which pages we have to send, to fixup, and to skip.
82 */
84 #define BITS_PER_LONG (sizeof(unsigned long) * 8)
85 #define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
86 #define BITMAP_SIZE (BITS_TO_LONGS(p2m_size) * sizeof(unsigned long))
88 #define BITMAP_ENTRY(_nr,_bmap) \
89 ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
91 #define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
93 static inline int test_bit (int nr, volatile void * addr)
94 {
95 return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
96 }
98 static inline void clear_bit (int nr, volatile void * addr)
99 {
100 BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
101 }
103 static inline void set_bit ( int nr, volatile void * addr)
104 {
105 BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
106 }
108 /* Returns the hamming weight (i.e. the number of bits set) in a 32-bit word */
109 static inline unsigned int hweight32(unsigned int w)
110 {
111 unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
112 res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
113 res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
114 res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
115 return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
116 }
118 static inline int count_bits ( int nr, volatile void *addr)
119 {
120 int i, count = 0;
121 volatile unsigned long *p = (volatile unsigned long *)addr;
122 /* We know that the array is padded to unsigned long. */
123 for ( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
124 count += hweight32(*p);
125 return count;
126 }
128 static uint64_t tv_to_us(struct timeval *new)
129 {
130 return (new->tv_sec * 1000000) + new->tv_usec;
131 }
133 static uint64_t llgettimeofday(void)
134 {
135 struct timeval now;
136 gettimeofday(&now, NULL);
137 return tv_to_us(&now);
138 }
140 static uint64_t tv_delta(struct timeval *new, struct timeval *old)
141 {
142 return (((new->tv_sec - old->tv_sec)*1000000) +
143 (new->tv_usec - old->tv_usec));
144 }
146 static int noncached_write(int fd, int live, void *buffer, int len)
147 {
148 static int write_count = 0;
149 int rc = (write_exact(fd, buffer, len) == 0) ? len : -1;
151 write_count += len;
152 if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
153 {
154 /* Time to discard cache - don't care if this fails */
155 discard_file_cache(fd, 0 /* no flush */);
156 write_count = 0;
157 }
159 return rc;
160 }
162 #ifdef ADAPTIVE_SAVE
164 /*
165 ** We control the rate at which we transmit (or save) to minimize impact
166 ** on running domains (including the target if we're doing live migrate).
167 */
169 #define MAX_MBIT_RATE 500 /* maximum transmit rate for migrate */
170 #define START_MBIT_RATE 100 /* initial transmit rate for migrate */
172 /* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
173 #define RATE_TO_BTU 781250
175 /* Amount in bytes we allow ourselves to send in a burst */
176 #define BURST_BUDGET (100*1024)
178 /* We keep track of the current and previous transmission rate */
179 static int mbit_rate, ombit_rate = 0;
181 /* Have we reached the maximum transmission rate? */
182 #define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
184 static inline void initialize_mbit_rate()
185 {
186 mbit_rate = START_MBIT_RATE;
187 }
189 static int ratewrite(int io_fd, int live, void *buf, int n)
190 {
191 static int budget = 0;
192 static int burst_time_us = -1;
193 static struct timeval last_put = { 0 };
194 struct timeval now;
195 struct timespec delay;
196 long long delta;
198 if ( START_MBIT_RATE == 0 )
199 return noncached_write(io_fd, live, buf, n);
201 budget -= n;
202 if ( budget < 0 )
203 {
204 if ( mbit_rate != ombit_rate )
205 {
206 burst_time_us = RATE_TO_BTU / mbit_rate;
207 ombit_rate = mbit_rate;
208 DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
209 mbit_rate, BURST_BUDGET, burst_time_us);
210 }
211 if ( last_put.tv_sec == 0 )
212 {
213 budget += BURST_BUDGET;
214 gettimeofday(&last_put, NULL);
215 }
216 else
217 {
218 while ( budget < 0 )
219 {
220 gettimeofday(&now, NULL);
221 delta = tv_delta(&now, &last_put);
222 while ( delta > burst_time_us )
223 {
224 budget += BURST_BUDGET;
225 last_put.tv_usec += burst_time_us;
226 if ( last_put.tv_usec > 1000000 )
227 {
228 last_put.tv_usec -= 1000000;
229 last_put.tv_sec++;
230 }
231 delta -= burst_time_us;
232 }
233 if ( budget > 0 )
234 break;
235 delay.tv_sec = 0;
236 delay.tv_nsec = 1000 * (burst_time_us - delta);
237 while ( delay.tv_nsec > 0 )
238 if ( nanosleep(&delay, &delay) == 0 )
239 break;
240 }
241 }
242 }
243 return noncached_write(io_fd, live, buf, n);
244 }
246 #else /* ! ADAPTIVE SAVE */
248 #define RATE_IS_MAX() (0)
249 #define ratewrite(_io_fd, _live, _buf, _n) noncached_write((_io_fd), (_live), (_buf), (_n))
250 #define initialize_mbit_rate()
252 #endif
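For reference, the RATE_TO_BTU scaling factor above follows directly from the
burst budget: sending BURST_BUDGET = 100*1024 bytes = 819200 bits at mbit_rate
Mb/s (taking 1 Mb as 2^20 bits) takes

    819200 / (mbit_rate * 1048576) s = 0.78125 / mbit_rate s
                                     = 781250 / mbit_rate microseconds,

which is exactly burst_time_us = RATE_TO_BTU / mbit_rate.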
254 static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
255 xc_shadow_op_stats_t *stats, int print)
256 {
257 static struct timeval wall_last;
258 static long long d0_cpu_last;
259 static long long d1_cpu_last;
261 struct timeval wall_now;
262 long long wall_delta;
263 long long d0_cpu_now, d0_cpu_delta;
264 long long d1_cpu_now, d1_cpu_delta;
266 gettimeofday(&wall_now, NULL);
268 d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
269 d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
271 if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
272 DPRINTF("ARRHHH!!\n");
274 wall_delta = tv_delta(&wall_now,&wall_last)/1000;
275 if ( wall_delta == 0 )
276 wall_delta = 1;
278 d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
279 d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
281 if ( print )
282 DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
283 "dirtied %dMb/s %" PRId32 " pages\n",
284 wall_delta,
285 (int)((d0_cpu_delta*100)/wall_delta),
286 (int)((d1_cpu_delta*100)/wall_delta),
287 (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
288 (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
289 stats->dirty_count);
291 #ifdef ADAPTIVE_SAVE
292 if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
293 {
294 mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
295 + 50;
296 if ( mbit_rate > MAX_MBIT_RATE )
297 mbit_rate = MAX_MBIT_RATE;
298 }
299 #endif
301 d0_cpu_last = d0_cpu_now;
302 d1_cpu_last = d1_cpu_now;
303 wall_last = wall_now;
305 return 0;
306 }
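The (1000/8) divisor in the bandwidth figures above is just unit conversion:
wall_delta is in milliseconds, so bytes / (wall_delta * 125) equals
bytes * 8 / (wall_delta / 1000) / 10^6, i.e. megabits (10^6 bits) per second.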
309 static int analysis_phase(int xc_handle, uint32_t domid, int p2m_size,
310 unsigned long *arr, int runs)
311 {
312 long long start, now;
313 xc_shadow_op_stats_t stats;
314 int j;
316 start = llgettimeofday();
318 for ( j = 0; j < runs; j++ )
319 {
320 int i;
322 xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
323 arr, p2m_size, NULL, 0, NULL);
324 DPRINTF("#Flush\n");
325 for ( i = 0; i < 40; i++ )
326 {
327 usleep(50000);
328 now = llgettimeofday();
329 xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
330 NULL, 0, NULL, 0, &stats);
331 DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
332 ((now-start)+500)/1000,
333 stats.fault_count, stats.dirty_count);
334 }
335 }
337 return -1;
338 }
341 static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
342 int dom, xc_dominfo_t *info)
343 {
344 int i = 0;
346 if ( !(*suspend)(dom) )
347 {
348 ERROR("Suspend request failed");
349 return -1;
350 }
352 retry:
354 if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
355 {
356 ERROR("Could not get domain info");
357 return -1;
358 }
360 if ( info->dying )
361 {
362 ERROR("domain is dying");
363 return -1;
364 }
366 if ( info->crashed )
367 {
368 ERROR("domain has crashed");
369 return -1;
370 }
372 if ( info->shutdown )
373 {
374 switch ( info->shutdown_reason )
375 {
376 case SHUTDOWN_poweroff:
377 case SHUTDOWN_reboot:
378 ERROR("domain has shut down");
379 return -1;
380 case SHUTDOWN_suspend:
381 return 0;
382 case SHUTDOWN_crash:
383 ERROR("domain has crashed");
384 return -1;
385 }
386 }
388 if ( info->paused )
389 {
390 /* Try unpausing domain, wait, and retest. */
391 xc_domain_unpause( xc_handle, dom );
392 ERROR("Domain was paused. Wait and re-test.");
393 usleep(10000); /* 10ms */
394 goto retry;
395 }
397 if ( ++i < 100 )
398 {
399 ERROR("Retry suspend domain");
400 usleep(10000); /* 10ms */
401 goto retry;
402 }
404 ERROR("Unable to suspend domain.");
406 return -1;
407 }
409 /*
410 ** Map the top-level page of MFNs from the guest. The guest might not have
411 ** finished resuming from a previous restore operation, so we wait a while for
412 ** it to update the MFN to a reasonable value.
413 */
414 static void *map_frame_list_list(int xc_handle, uint32_t dom,
415 shared_info_any_t *shinfo)
416 {
417 int count = 100;
418 void *p;
419 uint64_t fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list);
421 while ( count-- && (fll == 0) )
422 {
423 usleep(10000);
424 fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list);
425 }
427 if ( fll == 0 )
428 {
429 ERROR("Timed out waiting for frame list updated.");
430 return NULL;
431 }
433 p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ, fll);
434 if ( p == NULL )
435 ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);
437 return p;
438 }
440 /*
441 ** During transfer (or in the state file), all page-table pages must be
442 ** converted into a 'canonical' form where references to actual mfns
443 ** are replaced with references to the corresponding pfns.
444 **
445 ** This function performs the appropriate conversion, taking into account
446 ** which entries do not require canonicalization (in particular, those
447 ** entries which map the virtual address reserved for the hypervisor).
448 */
449 static int canonicalize_pagetable(unsigned long type, unsigned long pfn,
450 const void *spage, void *dpage)
451 {
453 int i, pte_last, xen_start, xen_end, race = 0;
454 uint64_t pte;
456 /*
457 ** We need to determine which entries in this page table hold
458 ** reserved hypervisor mappings. This depends on the current
459 ** page table type as well as the number of paging levels.
460 */
461 xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);
463 if ( (pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
464 xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);
466 if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
467 xen_start = L3_PAGETABLE_ENTRIES_PAE;
469 /*
470 ** In PAE only the L2 mapping the top 1GB contains Xen mappings.
471 ** We can spot this by looking for the guest's mapping of the m2p.
472 ** Guests must ensure that this check will fail for other L2s.
473 */
474 if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
475 {
476 int hstart;
477 uint64_t he;
479 hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
480 he = ((const uint64_t *) spage)[hstart];
482 if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
483 {
484 /* hvirt starts with xen stuff... */
485 xen_start = hstart;
486 }
487 else if ( hvirt_start != 0xf5800000 )
488 {
489 /* old L2s from before hole was shrunk... */
490 hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
491 he = ((const uint64_t *) spage)[hstart];
492 if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
493 xen_start = hstart;
494 }
495 }
497 if ( (pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
498 {
499 /*
500 ** XXX SMH: should compute these from hvirt_start (which we have)
501 ** and hvirt_end (which we don't)
502 */
503 xen_start = 256;
504 xen_end = 272;
505 }
507 /* Now iterate through the page table, canonicalizing each PTE */
508 for (i = 0; i < pte_last; i++ )
509 {
510 unsigned long pfn, mfn;
512 if ( pt_levels == 2 )
513 pte = ((const uint32_t*)spage)[i];
514 else
515 pte = ((const uint64_t*)spage)[i];
517 if ( (i >= xen_start) && (i < xen_end) )
518 pte = 0;
520 if ( pte & _PAGE_PRESENT )
521 {
522 mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
523 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
524 {
525 /* This will happen if the type info is stale which
526 is quite feasible under live migration */
527 pfn = 0; /* zap it - we'll retransmit this page later */
528 /* XXX: We can't spot Xen mappings in compat-mode L2es
529 * from 64-bit tools, but the only thing in them is the
530 * compat m2p, so we quietly zap them. This doesn't
531 * count as a race, so don't report it. */
532 if ( !(type == XEN_DOMCTL_PFINFO_L2TAB
533 && sizeof (unsigned long) > guest_width) )
534 race = 1; /* inform the caller; fatal if !live */
535 }
536 else
537 pfn = mfn_to_pfn(mfn);
539 pte &= ~MADDR_MASK_X86;
540 pte |= (uint64_t)pfn << PAGE_SHIFT;
542 /*
543 * PAE guest L3Es can contain these flags when running on
544 * a 64bit hypervisor. We zap these here to avoid any
545 * surprise at restore time...
546 */
547 if ( (pt_levels == 3) &&
548 (type == XEN_DOMCTL_PFINFO_L3TAB) &&
549 (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) )
550 pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
551 }
553 if ( pt_levels == 2 )
554 ((uint32_t*)dpage)[i] = pte;
555 else
556 ((uint64_t*)dpage)[i] = pte;
557 }
559 return race;
560 }
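A worked example of the PTE rewrite performed above (hypothetical numbers,
PAGE_SHIFT = 12):

    pte  = 0x00000000abcde067      -> mfn = 0xabcde, low flag bits = 0x067
    pfn  = mfn_to_pfn(0xabcde)     -> say 0x01234
    pte' = (0x01234 << 12) | 0x067 = 0x0000000001234067

i.e. the machine frame number in the entry is replaced by the corresponding
pseudo-physical frame number while the flag bits are preserved (and, for PAE
L3 entries, the USER/RW/ACCESSED bits are additionally cleared as shown).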
562 static xen_pfn_t *xc_map_m2p(int xc_handle,
563 unsigned long max_mfn,
564 int prot)
565 {
566 struct xen_machphys_mfn_list xmml;
567 privcmd_mmap_entry_t *entries;
568 unsigned long m2p_chunks, m2p_size;
569 xen_pfn_t *m2p;
570 xen_pfn_t *extent_start;
571 int i;
573 m2p = NULL;
574 m2p_size = M2P_SIZE(max_mfn);
575 m2p_chunks = M2P_CHUNKS(max_mfn);
577 xmml.max_extents = m2p_chunks;
579 extent_start = calloc(m2p_chunks, sizeof(xen_pfn_t));
580 if ( !extent_start )
581 {
582 ERROR("failed to allocate space for m2p mfns");
583 goto err0;
584 }
585 set_xen_guest_handle(xmml.extent_start, extent_start);
587 if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
588 (xmml.nr_extents != m2p_chunks) )
589 {
590 ERROR("xc_get_m2p_mfns");
591 goto err1;
592 }
594 entries = calloc(m2p_chunks, sizeof(privcmd_mmap_entry_t));
595 if (entries == NULL)
596 {
597 ERROR("failed to allocate space for mmap entries");
598 goto err1;
599 }
601 for ( i = 0; i < m2p_chunks; i++ )
602 entries[i].mfn = extent_start[i];
604 m2p = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
605 m2p_size, prot, M2P_CHUNK_SIZE,
606 entries, m2p_chunks);
607 if (m2p == NULL)
608 {
609 ERROR("xc_mmap_foreign_ranges failed");
610 goto err2;
611 }
613 m2p_mfn0 = entries[0].mfn;
615 err2:
616 free(entries);
617 err1:
618 free(extent_start);
620 err0:
621 return m2p;
622 }
625 static xen_pfn_t *map_and_save_p2m_table(int xc_handle,
626 int io_fd,
627 uint32_t dom,
628 unsigned long p2m_size,
629 shared_info_any_t *live_shinfo)
630 {
631 vcpu_guest_context_any_t ctxt;
633 /* Double and single indirect references to the live P2M table */
634 void *live_p2m_frame_list_list = NULL;
635 void *live_p2m_frame_list = NULL;
637 /* Copies of the above. */
638 xen_pfn_t *p2m_frame_list_list = NULL;
639 xen_pfn_t *p2m_frame_list = NULL;
641 /* The mapping of the live p2m table itself */
642 xen_pfn_t *p2m = NULL;
644 int i, success = 0;
646 live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom,
647 live_shinfo);
648 if ( !live_p2m_frame_list_list )
649 goto out;
651 /* Get a local copy of the live_P2M_frame_list_list */
652 if ( !(p2m_frame_list_list = malloc(PAGE_SIZE)) )
653 {
654 ERROR("Couldn't allocate p2m_frame_list_list array");
655 goto out;
656 }
657 memcpy(p2m_frame_list_list, live_p2m_frame_list_list, PAGE_SIZE);
659 /* Canonicalize guest's unsigned long vs ours */
660 if ( guest_width > sizeof(unsigned long) )
661 for ( i = 0; i < PAGE_SIZE/sizeof(unsigned long); i++ )
662 if ( i < PAGE_SIZE/guest_width )
663 p2m_frame_list_list[i] = ((uint64_t *)p2m_frame_list_list)[i];
664 else
665 p2m_frame_list_list[i] = 0;
666 else if ( guest_width < sizeof(unsigned long) )
667 for ( i = PAGE_SIZE/sizeof(unsigned long) - 1; i >= 0; i-- )
668 p2m_frame_list_list[i] = ((uint32_t *)p2m_frame_list_list)[i];
670 live_p2m_frame_list =
671 xc_map_foreign_batch(xc_handle, dom, PROT_READ,
672 p2m_frame_list_list,
673 P2M_FLL_ENTRIES);
674 if ( !live_p2m_frame_list )
675 {
676 ERROR("Couldn't map p2m_frame_list");
677 goto out;
678 }
680 /* Get a local copy of the live_P2M_frame_list */
681 if ( !(p2m_frame_list = malloc(P2M_TOOLS_FL_SIZE)) )
682 {
683 ERROR("Couldn't allocate p2m_frame_list array");
684 goto out;
685 }
686 memset(p2m_frame_list, 0, P2M_TOOLS_FL_SIZE);
687 memcpy(p2m_frame_list, live_p2m_frame_list, P2M_GUEST_FL_SIZE);
689 /* Canonicalize guest's unsigned long vs ours */
690 if ( guest_width > sizeof(unsigned long) )
691 for ( i = 0; i < P2M_FL_ENTRIES; i++ )
692 p2m_frame_list[i] = ((uint64_t *)p2m_frame_list)[i];
693 else if ( guest_width < sizeof(unsigned long) )
694 for ( i = P2M_FL_ENTRIES - 1; i >= 0; i-- )
695 p2m_frame_list[i] = ((uint32_t *)p2m_frame_list)[i];
698 /* Map all the frames of the pfn->mfn table. For migrate to succeed,
699 the guest must not change which frames are used for this purpose.
700 (It's not clear why it would want to change them, and we'll be OK
701 from a safety POV anyhow.) */
703 p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
704 p2m_frame_list,
705 P2M_FL_ENTRIES);
706 if ( !p2m )
707 {
708 ERROR("Couldn't map p2m table");
709 goto out;
710 }
711 live_p2m = p2m; /* So that translation macros will work */
713 /* Canonicalise the pfn-to-mfn table frame-number list. */
714 for ( i = 0; i < p2m_size; i += FPP )
715 {
716 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(p2m_frame_list[i/FPP]) )
717 {
718 ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
719 ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64", max 0x%lx",
720 i, i/FPP, (uint64_t)p2m_frame_list[i/FPP], max_mfn);
721 if ( p2m_frame_list[i/FPP] < max_mfn )
722 {
723 ERROR("m2p[0x%"PRIx64"] = 0x%"PRIx64,
724 (uint64_t)p2m_frame_list[i/FPP],
725 (uint64_t)live_m2p[p2m_frame_list[i/FPP]]);
726 ERROR("p2m[0x%"PRIx64"] = 0x%"PRIx64,
727 (uint64_t)live_m2p[p2m_frame_list[i/FPP]],
728 (uint64_t)p2m[live_m2p[p2m_frame_list[i/FPP]]]);
730 }
731 goto out;
732 }
733 p2m_frame_list[i/FPP] = mfn_to_pfn(p2m_frame_list[i/FPP]);
734 }
736 if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
737 {
738 ERROR("Could not get vcpu context");
739 goto out;
740 }
742 /*
743 * Write an extended-info structure to inform the restore code that
744 * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
745 * slow paths in the restore code.
746 */
747 {
748 unsigned long signature = ~0UL;
749 uint32_t chunk1_sz = ((guest_width==8)
750 ? sizeof(ctxt.x64)
751 : sizeof(ctxt.x32));
752 uint32_t chunk2_sz = 0;
753 uint32_t tot_sz = (chunk1_sz + 8) + (chunk2_sz + 8);
754 if ( write_exact(io_fd, &signature, sizeof(signature)) ||
755 write_exact(io_fd, &tot_sz, sizeof(tot_sz)) ||
756 write_exact(io_fd, "vcpu", 4) ||
757 write_exact(io_fd, &chunk1_sz, sizeof(chunk1_sz)) ||
758 write_exact(io_fd, &ctxt, chunk1_sz) ||
759 write_exact(io_fd, "extv", 4) ||
760 write_exact(io_fd, &chunk2_sz, sizeof(chunk2_sz)) )
761 {
762 PERROR("write: extended info");
763 goto out;
764 }
765 }
767 if ( write_exact(io_fd, p2m_frame_list,
768 P2M_FL_ENTRIES * sizeof(xen_pfn_t)) )
769 {
770 PERROR("write: p2m_frame_list");
771 goto out;
772 }
774 success = 1;
776 out:
778 if ( !success && p2m )
779 munmap(p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
781 if ( live_p2m_frame_list_list )
782 munmap(live_p2m_frame_list_list, PAGE_SIZE);
784 if ( live_p2m_frame_list )
785 munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
787 if ( p2m_frame_list_list )
788 free(p2m_frame_list_list);
790 if ( p2m_frame_list )
791 free(p2m_frame_list);
793 return success ? p2m : NULL;
794 }
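For reference, the extended-info record written inside map_and_save_p2m_table()
above has the following on-the-wire layout, taken directly from the
write_exact() calls:

    unsigned long  signature = ~0UL  (cannot be a valid frame, so it marks
                                      the record for the restore code)
    uint32_t       tot_sz    = (chunk1_sz + 8) + (chunk2_sz + 8)
    char[4]        "vcpu"
    uint32_t       chunk1_sz = sizeof(ctxt.x64) or sizeof(ctxt.x32), per guest_width
    vcpu context   chunk1_sz bytes
    char[4]        "extv"
    uint32_t       chunk2_sz = 0

followed by the P2M_FL_ENTRIES-entry p2m frame list.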
798 int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
799 uint32_t max_factor, uint32_t flags, int (*suspend)(int),
800 int hvm, void *(*init_qemu_maps)(int, unsigned),
801 void (*qemu_flip_buffer)(int, int))
802 {
803 xc_dominfo_t info;
804 DECLARE_DOMCTL;
806 int rc = 1, frc, i, j, last_iter, iter = 0;
807 int live = (flags & XCFLAGS_LIVE);
808 int debug = (flags & XCFLAGS_DEBUG);
809 int race = 0, sent_last_iter, skip_this_iter;
811 /* The new domain's shared-info frame number. */
812 unsigned long shared_info_frame;
814 /* A copy of the CPU context of the guest. */
815 vcpu_guest_context_any_t ctxt;
817 /* A table containing the type of each PFN (/not/ MFN!). */
818 unsigned long *pfn_type = NULL;
819 unsigned long *pfn_batch = NULL;
821 /* A copy of one frame of guest memory. */
822 char page[PAGE_SIZE];
824 /* Live mapping of shared info structure */
825 shared_info_any_t *live_shinfo = NULL;
827 /* base of the region in which domain memory is mapped */
828 unsigned char *region_base = NULL;
830 /* bitmap of pages:
831 - that should be sent this iteration (unless later marked as skip);
832 - to skip this iteration because already dirty;
833 - to fixup by sending at the end if not already resent; */
834 unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
836 xc_shadow_op_stats_t stats;
838 unsigned long needed_to_fix = 0;
839 unsigned long total_sent = 0;
841 uint64_t vcpumap = 1ULL;
843 /* HVM: a buffer for holding HVM context */
844 uint32_t hvm_buf_size = 0;
845 uint8_t *hvm_buf = NULL;
847 /* HVM: magic frames for ioreqs and xenstore comms. */
848 uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
850 unsigned long mfn;
852 /* If no explicit control parameters given, use defaults */
853 max_iters = max_iters ? : DEF_MAX_ITERS;
854 max_factor = max_factor ? : DEF_MAX_FACTOR;
856 initialize_mbit_rate();
858 if ( !get_platform_info(xc_handle, dom,
859 &max_mfn, &hvirt_start, &pt_levels, &guest_width) )
860 {
861 ERROR("Unable to get platform info.");
862 return 1;
863 }
865 if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
866 {
867 ERROR("Could not get domain info");
868 return 1;
869 }
871 shared_info_frame = info.shared_info_frame;
873 /* Map the shared info frame */
874 if ( !hvm )
875 {
876 live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
877 PROT_READ, shared_info_frame);
878 if ( !live_shinfo )
879 {
880 ERROR("Couldn't map live_shinfo");
881 goto out;
882 }
883 }
885 /* Get the size of the P2M table */
886 p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom) + 1;
888 /* Domain is still running at this point */
889 if ( live )
890 {
891 /* Live suspend. Enable log-dirty mode. */
892 if ( xc_shadow_control(xc_handle, dom,
893 XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
894 NULL, 0, NULL, 0, NULL) < 0 )
895 {
896 /* log-dirty already enabled? There's no test op,
897 so attempt to disable then reenable it */
898 frc = xc_shadow_control(xc_handle, dom, XEN_DOMCTL_SHADOW_OP_OFF,
899 NULL, 0, NULL, 0, NULL);
900 if ( frc >= 0 )
901 {
902 frc = xc_shadow_control(xc_handle, dom,
903 XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
904 NULL, 0, NULL, 0, NULL);
905 }
907 if ( frc < 0 )
908 {
909 ERROR("Couldn't enable shadow mode (rc %d) (errno %d)", frc, errno );
910 goto out;
911 }
912 }
914 if ( hvm )
915 {
916 /* Get qemu-dm logging dirty pages too */
917 void *seg = init_qemu_maps(dom, BITMAP_SIZE);
918 qemu_bitmaps[0] = seg;
919 qemu_bitmaps[1] = seg + BITMAP_SIZE;
920 qemu_active = 0;
921 qemu_non_active = 1;
922 }
923 }
924 else
925 {
926 /* This is a non-live suspend. Suspend the domain. */
927 if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info) )
928 {
929 ERROR("Domain appears not to have suspended");
930 goto out;
931 }
932 }
934 last_iter = !live;
936 /* pretend we sent all the pages last iteration */
937 sent_last_iter = p2m_size;
939 /* Setup to_send / to_fix and to_skip bitmaps */
940 to_send = xg_memalign(PAGE_SIZE, ROUNDUP(BITMAP_SIZE, PAGE_SHIFT));
941 to_fix = calloc(1, BITMAP_SIZE);
942 to_skip = xg_memalign(PAGE_SIZE, ROUNDUP(BITMAP_SIZE, PAGE_SHIFT));
944 if ( !to_send || !to_fix || !to_skip )
945 {
946 ERROR("Couldn't allocate to_send array");
947 goto out;
948 }
950 memset(to_send, 0xff, BITMAP_SIZE);
952 if ( lock_pages(to_send, BITMAP_SIZE) )
953 {
954 ERROR("Unable to lock to_send");
955 return 1;
956 }
958 /* (to_fix is local only) */
959 if ( lock_pages(to_skip, BITMAP_SIZE) )
960 {
961 ERROR("Unable to lock to_skip");
962 return 1;
963 }
965 if ( hvm )
966 {
967 /* Need another buffer for HVM context */
968 hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0);
969 if ( hvm_buf_size == -1 )
970 {
971 ERROR("Couldn't get HVM context size from Xen");
972 goto out;
973 }
974 hvm_buf = malloc(hvm_buf_size);
975 if ( !hvm_buf )
976 {
977 ERROR("Couldn't allocate memory");
978 goto out;
979 }
980 }
982 analysis_phase(xc_handle, dom, p2m_size, to_skip, 0);
984 pfn_type = xg_memalign(PAGE_SIZE, ROUNDUP(
985 MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
986 pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
987 if ( (pfn_type == NULL) || (pfn_batch == NULL) )
988 {
989 ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
990 errno = ENOMEM;
991 goto out;
992 }
993 memset(pfn_type, 0,
994 ROUNDUP(MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
996 if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
997 {
998 ERROR("Unable to lock pfn_type array");
999 goto out;
1000 }
1002 /* Setup the mfn_to_pfn table mapping */
1003 if ( !(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ)) )
1004 {
1005 ERROR("Failed to map live M2P table");
1006 goto out;
1007 }
1009 /* Start writing out the saved-domain record. */
1010 if ( write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
1011 {
1012 PERROR("write: p2m_size");
1013 goto out;
1014 }
1016 if ( !hvm )
1017 {
1018 int err = 0;
1020 /* Map the P2M table, and write the list of P2M frames */
1021 live_p2m = map_and_save_p2m_table(xc_handle, io_fd, dom,
1022 p2m_size, live_shinfo);
1023 if ( live_p2m == NULL )
1024 {
1025 ERROR("Failed to map/save the p2m frame list");
1026 goto out;
1027 }
1029 /*
1030 * Quick belt and braces sanity check.
1031 */
1033 for ( i = 0; i < p2m_size; i++ )
1034 {
1035 mfn = pfn_to_mfn(i);
1036 if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) )
1037 {
1038 DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
1039 mfn, mfn_to_pfn(mfn));
1040 err++;
1041 }
1042 }
1043 DPRINTF("Had %d unexplained entries in p2m table\n", err);
1044 }
1046 print_stats(xc_handle, dom, 0, &stats, 0);
1048 /* Now write out each data page, canonicalising page tables as we go... */
1049 for ( ; ; )
1051 unsigned int prev_pc, sent_this_iter, N, batch, run;
1053 iter++;
1054 sent_this_iter = 0;
1055 skip_this_iter = 0;
1056 prev_pc = 0;
1057 N = 0;
1059 DPRINTF("Saving memory pages: iter %d 0%%", iter);
1061 while ( N < p2m_size )
1063 unsigned int this_pc = (N * 100) / p2m_size;
1065 if ( (this_pc - prev_pc) >= 5 )
1067 DPRINTF("\b\b\b\b%3d%%", this_pc);
1068 prev_pc = this_pc;
1071 if ( !last_iter )
1073 /* Slightly wasteful to peek the whole array every time,
1074 but this is fast enough for the moment. */
1075 frc = xc_shadow_control(
1076 xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip,
1077 p2m_size, NULL, 0, NULL);
1078 if ( frc != p2m_size )
1080 ERROR("Error peeking shadow bitmap");
1081 goto out;
1085 /* load pfn_type[] with the mfn of all the pages we're doing in
1086 this batch. */
1087 for ( batch = 0;
1088 (batch < MAX_BATCH_SIZE) && (N < p2m_size);
1089 N++ )
1091 int n = N;
1093 if ( debug )
1095 DPRINTF("%d pfn= %08lx mfn= %08lx %d",
1096 iter, (unsigned long)n,
1097 hvm ? 0 : pfn_to_mfn(n),
1098 test_bit(n, to_send));
1099 if ( !hvm && is_mapped(pfn_to_mfn(n)) )
1100 DPRINTF(" [mfn]= %08lx",
1101 mfn_to_pfn(pfn_to_mfn(n)&0xFFFFF));
1102 DPRINTF("\n");
1104 if ( !last_iter &&
1105 test_bit(n, to_send) &&
1106 test_bit(n, to_skip) )
1107 skip_this_iter++; /* stats keeping */
1109 if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
1110 (test_bit(n, to_send) && last_iter) ||
1111 (test_bit(n, to_fix) && last_iter)) )
1112 continue;
1114 /*
1115 ** we get here if:
1116 ** 1. page is marked to_send & hasn't already been re-dirtied
1117 ** 2. (ignore to_skip in last iteration)
1118 ** 3. add in pages that still need fixup (net bufs)
1119 */
1121 pfn_batch[batch] = n;
1123 /* Hypercall interfaces operate in PFNs for HVM guests
1124 * and MFNs for PV guests */
1125 if ( hvm )
1126 pfn_type[batch] = n;
1127 else
1128 pfn_type[batch] = pfn_to_mfn(n);
1130 if ( !is_mapped(pfn_type[batch]) )
1132 /*
1133 ** not currently in pseudo-physical map -- set bit
1134 ** in to_fix since we must send this page in last_iter
1135 ** unless it's sent sooner anyhow, or it never enters
1136 ** pseudo-physical map (e.g. for ballooned down doms)
1137 */
1138 set_bit(n, to_fix);
1139 continue;
1142 if ( last_iter &&
1143 test_bit(n, to_fix) &&
1144 !test_bit(n, to_send) )
1146 needed_to_fix++;
1147 DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
1148 iter, n, pfn_type[batch]);
1151 clear_bit(n, to_fix);
1153 batch++;
1156 if ( batch == 0 )
1157 goto skip; /* vanishingly unlikely... */
1159 region_base = xc_map_foreign_batch(
1160 xc_handle, dom, PROT_READ, pfn_type, batch);
1161 if ( region_base == NULL )
1163 ERROR("map batch failed");
1164 goto out;
1167 if ( !hvm )
1169 /* Get page types */
1170 for ( j = 0; j < batch; j++ )
1171 ((uint32_t *)pfn_type)[j] = pfn_type[j];
1172 if ( xc_get_pfn_type_batch(xc_handle, dom, batch,
1173 (uint32_t *)pfn_type) )
1175 ERROR("get_pfn_type_batch failed");
1176 goto out;
1178 for ( j = batch-1; j >= 0; j-- )
1179 pfn_type[j] = ((uint32_t *)pfn_type)[j];
1181 for ( j = 0; j < batch; j++ )
1184 if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
1185 XEN_DOMCTL_PFINFO_XTAB )
1187 DPRINTF("type fail: page %i mfn %08lx\n",
1188 j, pfn_type[j]);
1189 continue;
1192 if ( debug )
1193 DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
1194 " sum= %08lx\n",
1195 iter,
1196 (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
1197 pfn_batch[j],
1198 pfn_type[j],
1199 mfn_to_pfn(pfn_type[j] &
1200 ~XEN_DOMCTL_PFINFO_LTAB_MASK),
1201 csum_page(region_base + (PAGE_SIZE*j)));
1203 /* canonicalise mfn->pfn */
1204 pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
1205 pfn_batch[j];
1209 if ( write_exact(io_fd, &batch, sizeof(unsigned int)) )
1211 PERROR("Error when writing to state file (2)");
1212 goto out;
1215 if ( write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) )
1217 PERROR("Error when writing to state file (3)");
1218 goto out;
1221 /* entering this loop, pfn_type is now in pfns (Not mfns) */
1222 run = 0;
1223 for ( j = 0; j < batch; j++ )
1225 unsigned long pfn, pagetype;
1226 void *spage = (char *)region_base + (PAGE_SIZE*j);
1228 pfn = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
1229 pagetype = pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK;
1231 if ( pagetype != 0 )
1233 /* If the page is not a normal data page, write out any
1234 run of pages we may have previously accumulated */
1235 if ( run )
1237 if ( ratewrite(io_fd, live,
1238 (char*)region_base+(PAGE_SIZE*(j-run)),
1239 PAGE_SIZE*run) != PAGE_SIZE*run )
1241 ERROR("Error when writing to state file (4a)"
1242 " (errno %d)", errno);
1243 goto out;
1245 run = 0;
1249 /* skip pages that aren't present */
1250 if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
1251 continue;
1253 pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
1255 if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
1256 (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
1258 /* We have a pagetable page: need to rewrite it. */
1259 race =
1260 canonicalize_pagetable(pagetype, pfn, spage, page);
1262 if ( race && !live )
1264 ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn,
1265 pagetype);
1266 goto out;
1269 if ( ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE )
1271 ERROR("Error when writing to state file (4b)"
1272 " (errno %d)", errno);
1273 goto out;
1276 else
1278 /* We have a normal page: accumulate it for writing. */
1279 run++;
1281 } /* end of the write out for this batch */
1283 if ( run )
1285 /* write out the last accumulated run of pages */
1286 if ( ratewrite(io_fd, live,
1287 (char*)region_base+(PAGE_SIZE*(j-run)),
1288 PAGE_SIZE*run) != PAGE_SIZE*run )
1290 ERROR("Error when writing to state file (4c)"
1291 " (errno %d)", errno);
1292 goto out;
1296 sent_this_iter += batch;
1298 munmap(region_base, batch*PAGE_SIZE);
1300 } /* end of this while loop for this iteration */
1302 skip:
1304 total_sent += sent_this_iter;
1306 DPRINTF("\r %d: sent %d, skipped %d, ",
1307 iter, sent_this_iter, skip_this_iter );
1309 if ( last_iter )
1311 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
1313 DPRINTF("Total pages sent= %ld (%.2fx)\n",
1314 total_sent, ((float)total_sent)/p2m_size );
1315 DPRINTF("(of which %ld were fixups)\n", needed_to_fix );
1318 if ( last_iter && debug )
1320 int minusone = -1;
1321 memset(to_send, 0xff, BITMAP_SIZE);
1322 debug = 0;
1323 DPRINTF("Entering debug resend-all mode\n");
1325 /* send "-1" to put receiver into debug mode */
1326 if ( write_exact(io_fd, &minusone, sizeof(int)) )
1328 PERROR("Error when writing to state file (6)");
1329 goto out;
1332 continue;
1335 if ( last_iter )
1336 break;
1338 if ( live )
1340 if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
1341 (iter >= max_iters) ||
1342 (sent_this_iter+skip_this_iter < 50) ||
1343 (total_sent > p2m_size*max_factor) )
1345 DPRINTF("Start last iteration\n");
1346 last_iter = 1;
1348 if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info) )
1350 ERROR("Domain appears not to have suspended");
1351 goto out;
1354 DPRINTF("SUSPEND shinfo %08lx\n", info.shared_info_frame);
1357 if ( xc_shadow_control(xc_handle, dom,
1358 XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
1359 p2m_size, NULL, 0, &stats) != p2m_size )
1361 ERROR("Error flushing shadow PT");
1362 goto out;
1365 if ( hvm )
1367 /* Pull in the dirty bits from qemu-dm too */
1368 if ( !last_iter )
1370 qemu_active = qemu_non_active;
1371 qemu_non_active = qemu_active ? 0 : 1;
1372 qemu_flip_buffer(dom, qemu_active);
1373 for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
1375 to_send[j] |= qemu_bitmaps[qemu_non_active][j];
1376 qemu_bitmaps[qemu_non_active][j] = 0;
1379 else
1381 for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
1382 to_send[j] |= qemu_bitmaps[qemu_active][j];
1386 sent_last_iter = sent_this_iter;
1388 print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
1391 } /* end of infinite for loop */
1393 DPRINTF("All memory is saved\n");
1396 struct {
1397 int minustwo;
1398 int max_vcpu_id;
1399 uint64_t vcpumap;
1400 } chunk = { -2, info.max_vcpu_id };
1402 if ( info.max_vcpu_id >= 64 )
1404 ERROR("Too many VCPUS in guest!");
1405 goto out;
1408 for ( i = 1; i <= info.max_vcpu_id; i++ )
1410 xc_vcpuinfo_t vinfo;
1411 if ( (xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
1412 vinfo.online )
1413 vcpumap |= 1ULL << i;
1416 chunk.vcpumap = vcpumap;
1417 if ( write_exact(io_fd, &chunk, sizeof(chunk)) )
1419 PERROR("Error when writing to state file");
1420 goto out;
1424 if ( hvm )
1426 struct {
1427 int minusthree;
1428 uint32_t pad;
1429 uint64_t ident_pt;
1430 } chunk = { -3, 0 };
1432 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
1433 (unsigned long *)&chunk.ident_pt);
1435 if ( (chunk.ident_pt != 0) &&
1436 write_exact(io_fd, &chunk, sizeof(chunk)) )
1438 PERROR("Error when writing the ident_pt for EPT guest");
1439 goto out;
1443 /* Zero terminate */
1444 i = 0;
1445 if ( write_exact(io_fd, &i, sizeof(int)) )
1447 PERROR("Error when writing to state file (6')");
1448 goto out;
1451 if ( hvm )
1453 uint32_t rec_size;
1455 /* Save magic-page locations. */
1456 memset(magic_pfns, 0, sizeof(magic_pfns));
1457 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
1458 (unsigned long *)&magic_pfns[0]);
1459 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
1460 (unsigned long *)&magic_pfns[1]);
1461 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
1462 (unsigned long *)&magic_pfns[2]);
1463 if ( write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
1465 PERROR("Error when writing to state file (7)");
1466 goto out;
1469 /* Get HVM context from Xen and save it too */
1470 if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf,
1471 hvm_buf_size)) == -1 )
1473 ERROR("HVM:Could not get hvm buffer");
1474 goto out;
1477 if ( write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
1479 PERROR("error write hvm buffer size");
1480 goto out;
1483 if ( write_exact(io_fd, hvm_buf, rec_size) )
1485 PERROR("write HVM info failed!\n");
1486 goto out;
1489 /* HVM guests are done now */
1490 rc = 0;
1491 goto out;
1494 /* PV guests only from now on */
1496 /* Send through a list of all the PFNs that were not in the map at the close */
1498 unsigned int i,j;
1499 unsigned long pfntab[1024];
1501 for ( i = 0, j = 0; i < p2m_size; i++ )
1503 if ( !is_mapped(pfn_to_mfn(i)) )
1504 j++;
1507 if ( write_exact(io_fd, &j, sizeof(unsigned int)) )
1509 PERROR("Error when writing to state file (6a)");
1510 goto out;
1513 for ( i = 0, j = 0; i < p2m_size; )
1515 if ( !is_mapped(pfn_to_mfn(i)) )
1516 pfntab[j++] = i;
1518 i++;
1519 if ( (j == 1024) || (i == p2m_size) )
1521 if ( write_exact(io_fd, &pfntab, sizeof(unsigned long)*j) )
1523 PERROR("Error when writing to state file (6b)");
1524 goto out;
1526 j = 0;
1531 if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
1533 ERROR("Could not get vcpu context");
1534 goto out;
1537 /* Canonicalise the suspend-record frame number. */
1538 mfn = GET_FIELD(&ctxt, user_regs.edx);
1539 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
1541 ERROR("Suspend record is not in range of pseudophys map");
1542 goto out;
1544 SET_FIELD(&ctxt, user_regs.edx, mfn_to_pfn(mfn));
1546 for ( i = 0; i <= info.max_vcpu_id; i++ )
1548 if ( !(vcpumap & (1ULL << i)) )
1549 continue;
1551 if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
1553 ERROR("No context for VCPU%d", i);
1554 goto out;
1557 /* Canonicalise each GDT frame number. */
1558 for ( j = 0; (512*j) < GET_FIELD(&ctxt, gdt_ents); j++ )
1560 mfn = GET_FIELD(&ctxt, gdt_frames[j]);
1561 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
1563 ERROR("GDT frame is not in range of pseudophys map");
1564 goto out;
1566 SET_FIELD(&ctxt, gdt_frames[j], mfn_to_pfn(mfn));
1569 /* Canonicalise the page table base pointer. */
1570 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(UNFOLD_CR3(
1571 GET_FIELD(&ctxt, ctrlreg[3]))) )
1573 ERROR("PT base is not in range of pseudophys map");
1574 goto out;
1576 SET_FIELD(&ctxt, ctrlreg[3],
1577 FOLD_CR3(mfn_to_pfn(UNFOLD_CR3(GET_FIELD(&ctxt, ctrlreg[3])))));
1579 /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
1580 if ( (pt_levels == 4) && ctxt.x64.ctrlreg[1] )
1582 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(UNFOLD_CR3(ctxt.x64.ctrlreg[1])) )
1584 ERROR("PT base is not in range of pseudophys map");
1585 goto out;
1587 /* Least-significant bit means 'valid PFN'. */
1588 ctxt.x64.ctrlreg[1] = 1 |
1589 FOLD_CR3(mfn_to_pfn(UNFOLD_CR3(ctxt.x64.ctrlreg[1])));
1592 if ( write_exact(io_fd, &ctxt, ((guest_width==8)
1593 ? sizeof(ctxt.x64)
1594 : sizeof(ctxt.x32))) )
1596 PERROR("Error when writing to state file (1)");
1597 goto out;
1600 domctl.cmd = XEN_DOMCTL_get_ext_vcpucontext;
1601 domctl.domain = dom;
1602 domctl.u.ext_vcpucontext.vcpu = i;
1603 if ( xc_domctl(xc_handle, &domctl) < 0 )
1605 ERROR("No extended context for VCPU%d", i);
1606 goto out;
1608 if ( write_exact(io_fd, &domctl.u.ext_vcpucontext, 128) )
1610 PERROR("Error when writing to state file (2)");
1611 goto out;
1615 /*
1616 * Reset the MFN to be a known-invalid value. See map_frame_list_list().
1617 */
1618 memcpy(page, live_shinfo, PAGE_SIZE);
1619 SET_FIELD(((shared_info_any_t *)page),
1620 arch.pfn_to_mfn_frame_list_list, 0);
1621 if ( write_exact(io_fd, page, PAGE_SIZE) )
1623 PERROR("Error when writing to state file (1)");
1624 goto out;
1627 /* Success! */
1628 rc = 0;
1630 out:
1632 if ( live )
1634 if ( xc_shadow_control(xc_handle, dom,
1635 XEN_DOMCTL_SHADOW_OP_OFF,
1636 NULL, 0, NULL, 0, NULL) < 0 )
1637 DPRINTF("Warning - couldn't disable shadow mode");
1640 /* Flush last write and discard cache for file. */
1641 discard_file_cache(io_fd, 1 /* flush */);
1643 if ( live_shinfo )
1644 munmap(live_shinfo, PAGE_SIZE);
1646 if ( live_p2m )
1647 munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
1649 if ( live_m2p )
1650 munmap(live_m2p, M2P_SIZE(max_mfn));
1652 free(pfn_type);
1653 free(pfn_batch);
1654 free(to_send);
1655 free(to_fix);
1656 free(to_skip);
1658 DPRINTF("Save exit rc=%d\n",rc);
1660 return !!rc;
1663 /*
1664 * Local variables:
1665 * mode: C
1666 * c-set-style: "BSD"
1667 * c-basic-offset: 4
1668 * tab-width: 4
1669 * indent-tabs-mode: nil
1670 * End:
1671 */