ia64/xen-unstable

tools/libxc/xc_domain_save.c @ 19688:bd39df93a29e

libxc: export xc_map_m2p() so that it can be called outside.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@intel.com>
author Keir Fraser <keir.fraser@citrix.com>
date Mon Jun 01 14:12:53 2009 +0100
parents 0b13d9787622
line source
1 /******************************************************************************
2 * xc_linux_save.c
3 *
4 * Save the state of a running Linux session.
5 *
6 * Copyright (c) 2003, K A Fraser.
7 */
9 #include <inttypes.h>
10 #include <time.h>
11 #include <stdlib.h>
12 #include <unistd.h>
13 #include <sys/time.h>
15 #include "xc_private.h"
16 #include "xc_dom.h"
17 #include "xg_private.h"
18 #include "xg_save_restore.h"
20 #include <xen/hvm/params.h>
21 #include "xc_e820.h"
23 /*
24 ** Default values for important tuning parameters. These can be overridden
25 ** by passing non-zero replacement values to xc_domain_save().
26 **
27 ** XXX SMH: should consider whether we want to be able to override MAX_MBIT_RATE too.
28 **
29 */
30 #define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */
31 #define DEF_MAX_FACTOR 3 /* never send more than 3x p2m_size */
33 /* max mfn of the whole machine */
34 static unsigned long max_mfn;
36 /* virtual starting address of the hypervisor */
37 static unsigned long hvirt_start;
39 /* #levels of page tables used by the current guest */
40 static unsigned int pt_levels;
42 /* HVM: shared-memory bitmaps for getting log-dirty bits from qemu-dm */
43 static unsigned long *qemu_bitmaps[2];
44 static int qemu_active;
45 static int qemu_non_active;
47 /* number of pfns this guest has (i.e. number of entries in the P2M) */
48 static unsigned long p2m_size;
50 /* Live mapping of the table mapping each PFN to its current MFN. */
51 static xen_pfn_t *live_p2m = NULL;
53 /* Live mapping of system MFN to PFN table. */
54 static xen_pfn_t *live_m2p = NULL;
55 static unsigned long m2p_mfn0;
57 /* Address size of the guest */
58 unsigned int guest_width;
60 /* grep fodder: machine_to_phys */
62 #define mfn_to_pfn(_mfn) (live_m2p[(_mfn)])
64 #define pfn_to_mfn(_pfn) \
65 ((xen_pfn_t) ((guest_width==8) \
66 ? (((uint64_t *)live_p2m)[(_pfn)]) \
67 : ((((uint32_t *)live_p2m)[(_pfn)]) == 0xffffffffU \
68 ? (-1UL) : (((uint32_t *)live_p2m)[(_pfn)]))))
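/*
** Note: pfn_to_mfn() reads the live P2M using the guest's word size: 8-byte
** entries are returned directly, while for 4-byte guests the all-ones invalid
** marker 0xffffffffU is widened to -1UL, so comparisons against the all-ones
** INVALID_P2M_ENTRY used later in this file behave the same in 64-bit tools.
*/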
70 /*
71 * Returns TRUE if the given machine frame number has a unique mapping
72 * in the guest's pseudophysical map.
73 */
74 #define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \
75 (((_mfn) < (max_mfn)) && \
76 ((mfn_to_pfn(_mfn) < (p2m_size)) && \
77 (pfn_to_mfn(mfn_to_pfn(_mfn)) == (_mfn))))
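/*
** Note: the check above is a round trip -- the MFN must be below max_mfn,
** its M2P entry must name a PFN inside this guest's P2M, and mapping that
** PFN back through the P2M must yield the original MFN again.
*/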
79 /*
80 ** During (live) save/migrate, we maintain a number of bitmaps to track
81 ** which pages we have to send, to fixup, and to skip.
82 */
84 #define BITS_PER_LONG (sizeof(unsigned long) * 8)
85 #define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
86 #define BITMAP_SIZE (BITS_TO_LONGS(p2m_size) * sizeof(unsigned long))
88 #define BITMAP_ENTRY(_nr,_bmap) \
89 ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
91 #define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
93 static inline int test_bit (int nr, volatile void * addr)
94 {
95 return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
96 }
98 static inline void clear_bit (int nr, volatile void * addr)
99 {
100 BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
101 }
103 static inline void set_bit ( int nr, volatile void * addr)
104 {
105 BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
106 }
108 /* Returns the Hamming weight (i.e. the number of bits set) in an N-bit word */
109 static inline unsigned int hweight32(unsigned int w)
110 {
111 unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
112 res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
113 res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
114 res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
115 return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
116 }
118 static inline int count_bits ( int nr, volatile void *addr)
119 {
120 int i, count = 0;
121 volatile unsigned long *p = (volatile unsigned long *)addr;
122 /* We know that the array is padded to unsigned long. */
123 for ( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
124 count += hweight32(*p);
125 return count;
126 }
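/*
** Note: hweight32() takes an unsigned int, so each unsigned long passed in
** from count_bits() is truncated to its low 32 bits; on LP64 builds the
** upper half of every word is therefore not counted.
*/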
128 static uint64_t tv_to_us(struct timeval *new)
129 {
130 return (new->tv_sec * 1000000) + new->tv_usec;
131 }
133 static uint64_t llgettimeofday(void)
134 {
135 struct timeval now;
136 gettimeofday(&now, NULL);
137 return tv_to_us(&now);
138 }
140 static uint64_t tv_delta(struct timeval *new, struct timeval *old)
141 {
142 return (((new->tv_sec - old->tv_sec)*1000000) +
143 (new->tv_usec - old->tv_usec));
144 }
146 static int noncached_write(int fd, int live, void *buffer, int len)
147 {
148 static int write_count = 0;
149 int rc = (write_exact(fd, buffer, len) == 0) ? len : -1;
151 write_count += len;
152 if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
153 {
154 /* Time to discard cache - don't care if this fails */
155 discard_file_cache(fd, 0 /* no flush */);
156 write_count = 0;
157 }
159 return rc;
160 }
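/*
** Note: noncached_write() batches up to MAX_PAGECACHE_USAGE pages' worth of
** output and then asks the kernel to drop those pages from the page cache,
** so a large save/migration does not evict more useful dom0 cache.
*/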
162 #ifdef ADAPTIVE_SAVE
164 /*
165 ** We control the rate at which we transmit (or save) to minimize impact
166 ** on running domains (including the target if we're doing live migrate).
167 */
169 #define MAX_MBIT_RATE 500 /* maximum transmit rate for migrate */
170 #define START_MBIT_RATE 100 /* initial transmit rate for migrate */
172 /* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
173 #define RATE_TO_BTU 781250
175 /* Amount in bytes we allow ourselves to send in a burst */
176 #define BURST_BUDGET (100*1024)
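/*
** Worked example of the scaling: one burst is BURST_BUDGET = 100*1024 bytes,
** i.e. 819200 bits = 0.78125 Mbit (taking 1 Mbit as 2^20 bits). Sending one
** burst at mbit_rate Mbit/s therefore takes 0.78125/mbit_rate seconds, i.e.
** 781250/mbit_rate microseconds -- which is why RATE_TO_BTU is 781250 and
** burst_time_us is computed as RATE_TO_BTU / mbit_rate below.
*/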
178 /* We keep track of the current and previous transmission rate */
179 static int mbit_rate, ombit_rate = 0;
181 /* Have we reached the maximum transmission rate? */
182 #define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
184 static inline void initialize_mbit_rate()
185 {
186 mbit_rate = START_MBIT_RATE;
187 }
189 static int ratewrite(int io_fd, int live, void *buf, int n)
190 {
191 static int budget = 0;
192 static int burst_time_us = -1;
193 static struct timeval last_put = { 0 };
194 struct timeval now;
195 struct timespec delay;
196 long long delta;
198 if ( START_MBIT_RATE == 0 )
199 return noncached_write(io_fd, live, buf, n);
201 budget -= n;
202 if ( budget < 0 )
203 {
204 if ( mbit_rate != ombit_rate )
205 {
206 burst_time_us = RATE_TO_BTU / mbit_rate;
207 ombit_rate = mbit_rate;
208 DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
209 mbit_rate, BURST_BUDGET, burst_time_us);
210 }
211 if ( last_put.tv_sec == 0 )
212 {
213 budget += BURST_BUDGET;
214 gettimeofday(&last_put, NULL);
215 }
216 else
217 {
218 while ( budget < 0 )
219 {
220 gettimeofday(&now, NULL);
221 delta = tv_delta(&now, &last_put);
222 while ( delta > burst_time_us )
223 {
224 budget += BURST_BUDGET;
225 last_put.tv_usec += burst_time_us;
226 if ( last_put.tv_usec > 1000000 )
227 {
228 last_put.tv_usec -= 1000000;
229 last_put.tv_sec++;
230 }
231 delta -= burst_time_us;
232 }
233 if ( budget > 0 )
234 break;
235 delay.tv_sec = 0;
236 delay.tv_nsec = 1000 * (burst_time_us - delta);
237 while ( delay.tv_nsec > 0 )
238 if ( nanosleep(&delay, &delay) == 0 )
239 break;
240 }
241 }
242 }
243 return noncached_write(io_fd, live, buf, n);
244 }
246 #else /* ! ADAPTIVE SAVE */
248 #define RATE_IS_MAX() (0)
249 #define ratewrite(_io_fd, _live, _buf, _n) noncached_write((_io_fd), (_live), (_buf), (_n))
250 #define initialize_mbit_rate()
252 #endif
254 static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
255 xc_shadow_op_stats_t *stats, int print)
256 {
257 static struct timeval wall_last;
258 static long long d0_cpu_last;
259 static long long d1_cpu_last;
261 struct timeval wall_now;
262 long long wall_delta;
263 long long d0_cpu_now, d0_cpu_delta;
264 long long d1_cpu_now, d1_cpu_delta;
266 gettimeofday(&wall_now, NULL);
268 d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
269 d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
271 if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
272 DPRINTF("ARRHHH!!\n");
274 wall_delta = tv_delta(&wall_now,&wall_last)/1000;
275 if ( wall_delta == 0 )
276 wall_delta = 1;
278 d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
279 d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
281 if ( print )
282 DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
283 "dirtied %dMb/s %" PRId32 " pages\n",
284 wall_delta,
285 (int)((d0_cpu_delta*100)/wall_delta),
286 (int)((d1_cpu_delta*100)/wall_delta),
287 (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
288 (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
289 stats->dirty_count);
291 #ifdef ADAPTIVE_SAVE
292 if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
293 {
294 mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
295 + 50;
296 if ( mbit_rate > MAX_MBIT_RATE )
297 mbit_rate = MAX_MBIT_RATE;
298 }
299 #endif
301 d0_cpu_last = d0_cpu_now;
302 d1_cpu_last = d1_cpu_now;
303 wall_last = wall_now;
305 return 0;
306 }
309 static int analysis_phase(int xc_handle, uint32_t domid, int p2m_size,
310 unsigned long *arr, int runs)
311 {
312 long long start, now;
313 xc_shadow_op_stats_t stats;
314 int j;
316 start = llgettimeofday();
318 for ( j = 0; j < runs; j++ )
319 {
320 int i;
322 xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
323 arr, p2m_size, NULL, 0, NULL);
324 DPRINTF("#Flush\n");
325 for ( i = 0; i < 40; i++ )
326 {
327 usleep(50000);
328 now = llgettimeofday();
329 xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
330 NULL, 0, NULL, 0, &stats);
331 DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
332 ((now-start)+500)/1000,
333 stats.fault_count, stats.dirty_count);
334 }
335 }
337 return -1;
338 }
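/*
** Note: analysis_phase() is a debugging aid. Each run clears the log-dirty
** bitmap and then samples the fault/dirty counters every 50ms for about two
** seconds; with runs == 0 (as it is invoked below) it does nothing but return.
*/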
341 static int suspend_and_state(int (*suspend)(void), int xc_handle, int io_fd,
342 int dom, xc_dominfo_t *info)
343 {
344 if ( !(*suspend)() )
345 {
346 ERROR("Suspend request failed");
347 return -1;
348 }
350 if ( (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) ||
351 !info->shutdown || (info->shutdown_reason != SHUTDOWN_suspend) )
352 {
353 ERROR("Domain not in suspended state");
354 return -1;
355 }
357 return 0;
358 }
360 /*
361 ** Map the top-level page of MFNs from the guest. The guest might not have
362 ** finished resuming from a previous restore operation, so we wait a while for
363 ** it to update the MFN to a reasonable value.
364 */
365 static void *map_frame_list_list(int xc_handle, uint32_t dom,
366 shared_info_any_t *shinfo)
367 {
368 int count = 100;
369 void *p;
370 uint64_t fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list);
372 while ( count-- && (fll == 0) )
373 {
374 usleep(10000);
375 fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list);
376 }
378 if ( fll == 0 )
379 {
380 ERROR("Timed out waiting for frame list updated.");
381 return NULL;
382 }
384 p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ, fll);
385 if ( p == NULL )
386 ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);
388 return p;
389 }
391 /*
392 ** During transfer (or in the state file), all page-table pages must be
393 ** converted into a 'canonical' form where references to actual mfns
394 ** are replaced with references to the corresponding pfns.
395 **
396 ** This function performs the appropriate conversion, taking into account
397 ** which entries do not require canonicalization (in particular, those
398 ** entries which map the virtual address reserved for the hypervisor).
399 */
400 static int canonicalize_pagetable(unsigned long type, unsigned long pfn,
401 const void *spage, void *dpage)
402 {
404 int i, pte_last, xen_start, xen_end, race = 0;
405 uint64_t pte;
407 /*
408 ** We need to determine which entries in this page table hold
409 ** reserved hypervisor mappings. This depends on the current
410 ** page table type as well as the number of paging levels.
411 */
412 xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);
414 if ( (pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
415 xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);
417 if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
418 xen_start = L3_PAGETABLE_ENTRIES_PAE;
420 /*
421 ** In PAE only the L2 mapping the top 1GB contains Xen mappings.
422 ** We can spot this by looking for the guest's mapping of the m2p.
423 ** Guests must ensure that this check will fail for other L2s.
424 */
425 if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
426 {
427 int hstart;
428 uint64_t he;
430 hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
431 he = ((const uint64_t *) spage)[hstart];
433 if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
434 {
435 /* hvirt starts with xen stuff... */
436 xen_start = hstart;
437 }
438 else if ( hvirt_start != 0xf5800000 )
439 {
440 /* old L2s from before hole was shrunk... */
441 hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
442 he = ((const uint64_t *) spage)[hstart];
443 if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
444 xen_start = hstart;
445 }
446 }
448 if ( (pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
449 {
450 /*
451 ** XXX SMH: should compute these from hvirt_start (which we have)
452 ** and hvirt_end (which we don't)
453 */
454 xen_start = 256;
455 xen_end = 272;
456 }
458 /* Now iterate through the page table, canonicalizing each PTE */
459 for (i = 0; i < pte_last; i++ )
460 {
461 unsigned long pfn, mfn;
463 if ( pt_levels == 2 )
464 pte = ((const uint32_t*)spage)[i];
465 else
466 pte = ((const uint64_t*)spage)[i];
468 if ( (i >= xen_start) && (i < xen_end) )
469 pte = 0;
471 if ( pte & _PAGE_PRESENT )
472 {
473 mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
474 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
475 {
476 /* This will happen if the type info is stale, which
477 is quite feasible under live migration */
478 pfn = 0; /* zap it - we'll retransmit this page later */
479 /* XXX: We can't spot Xen mappings in compat-mode L2es
480 * from 64-bit tools, but the only thing in them is the
481 * compat m2p, so we quietly zap them. This doesn't
482 * count as a race, so don't report it. */
483 if ( !(type == XEN_DOMCTL_PFINFO_L2TAB
484 && sizeof (unsigned long) > guest_width) )
485 race = 1; /* inform the caller; fatal if !live */
486 }
487 else
488 pfn = mfn_to_pfn(mfn);
490 pte &= ~MADDR_MASK_X86;
491 pte |= (uint64_t)pfn << PAGE_SHIFT;
493 /*
494 * PAE guest L3Es can contain these flags when running on
495 * a 64bit hypervisor. We zap these here to avoid any
496 * surprise at restore time...
497 */
498 if ( (pt_levels == 3) &&
499 (type == XEN_DOMCTL_PFINFO_L3TAB) &&
500 (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) )
501 pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
502 }
504 if ( pt_levels == 2 )
505 ((uint32_t*)dpage)[i] = pte;
506 else
507 ((uint64_t*)dpage)[i] = pte;
508 }
510 return race;
511 }
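/*
** Note: the 'race' return value above is nonzero when a present PTE named an
** MFN outside the pseudophysical map (stale type information, typically seen
** while the guest keeps running during live migration). The caller only
** treats this as fatal when the save is not live.
*/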
513 xen_pfn_t *xc_map_m2p(int xc_handle,
514 unsigned long max_mfn,
515 int prot,
516 unsigned long *mfn0)
517 {
518 struct xen_machphys_mfn_list xmml;
519 privcmd_mmap_entry_t *entries;
520 unsigned long m2p_chunks, m2p_size;
521 xen_pfn_t *m2p;
522 xen_pfn_t *extent_start;
523 int i;
525 m2p = NULL;
526 m2p_size = M2P_SIZE(max_mfn);
527 m2p_chunks = M2P_CHUNKS(max_mfn);
529 xmml.max_extents = m2p_chunks;
531 extent_start = calloc(m2p_chunks, sizeof(xen_pfn_t));
532 if ( !extent_start )
533 {
534 ERROR("failed to allocate space for m2p mfns");
535 goto err0;
536 }
537 set_xen_guest_handle(xmml.extent_start, extent_start);
539 if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
540 (xmml.nr_extents != m2p_chunks) )
541 {
542 ERROR("xc_get_m2p_mfns");
543 goto err1;
544 }
546 entries = calloc(m2p_chunks, sizeof(privcmd_mmap_entry_t));
547 if (entries == NULL)
548 {
549 ERROR("failed to allocate space for mmap entries");
550 goto err1;
551 }
553 for ( i = 0; i < m2p_chunks; i++ )
554 entries[i].mfn = extent_start[i];
556 m2p = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
557 m2p_size, prot, M2P_CHUNK_SIZE,
558 entries, m2p_chunks);
559 if (m2p == NULL)
560 {
561 ERROR("xc_mmap_foreign_ranges failed");
562 goto err2;
563 }
565 if (mfn0)
566 *mfn0 = entries[0].mfn;
568 err2:
569 free(entries);
570 err1:
571 free(extent_start);
573 err0:
574 return m2p;
575 }
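/*
** Illustrative use of the newly-exported xc_map_m2p() (editorial sketch, not
** part of the original file):
**
**     unsigned long mfn0;
**     xen_pfn_t *m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ, &mfn0);
**     if ( m2p != NULL )
**     {
**         ... index m2p[mfn] to translate machine frames to pseudophys ...
**         munmap(m2p, M2P_SIZE(max_mfn));
**     }
**
** Callers need max_mfn first, e.g. from get_platform_info() as done in
** xc_domain_save() below.
*/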
578 static xen_pfn_t *map_and_save_p2m_table(int xc_handle,
579 int io_fd,
580 uint32_t dom,
581 unsigned long p2m_size,
582 shared_info_any_t *live_shinfo)
583 {
584 vcpu_guest_context_any_t ctxt;
586 /* Double and single indirect references to the live P2M table */
587 void *live_p2m_frame_list_list = NULL;
588 void *live_p2m_frame_list = NULL;
590 /* Copies of the above. */
591 xen_pfn_t *p2m_frame_list_list = NULL;
592 xen_pfn_t *p2m_frame_list = NULL;
594 /* The mapping of the live p2m table itself */
595 xen_pfn_t *p2m = NULL;
597 int i, success = 0;
599 live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom,
600 live_shinfo);
601 if ( !live_p2m_frame_list_list )
602 goto out;
604 /* Get a local copy of the live_P2M_frame_list_list */
605 if ( !(p2m_frame_list_list = malloc(PAGE_SIZE)) )
606 {
607 ERROR("Couldn't allocate p2m_frame_list_list array");
608 goto out;
609 }
610 memcpy(p2m_frame_list_list, live_p2m_frame_list_list, PAGE_SIZE);
612 /* Canonicalize guest's unsigned long vs ours */
613 if ( guest_width > sizeof(unsigned long) )
614 for ( i = 0; i < PAGE_SIZE/sizeof(unsigned long); i++ )
615 if ( i < PAGE_SIZE/guest_width )
616 p2m_frame_list_list[i] = ((uint64_t *)p2m_frame_list_list)[i];
617 else
618 p2m_frame_list_list[i] = 0;
619 else if ( guest_width < sizeof(unsigned long) )
620 for ( i = PAGE_SIZE/sizeof(unsigned long) - 1; i >= 0; i-- )
621 p2m_frame_list_list[i] = ((uint32_t *)p2m_frame_list_list)[i];
623 live_p2m_frame_list =
624 xc_map_foreign_batch(xc_handle, dom, PROT_READ,
625 p2m_frame_list_list,
626 P2M_FLL_ENTRIES);
627 if ( !live_p2m_frame_list )
628 {
629 ERROR("Couldn't map p2m_frame_list");
630 goto out;
631 }
633 /* Get a local copy of the live_P2M_frame_list */
634 if ( !(p2m_frame_list = malloc(P2M_TOOLS_FL_SIZE)) )
635 {
636 ERROR("Couldn't allocate p2m_frame_list array");
637 goto out;
638 }
639 memset(p2m_frame_list, 0, P2M_TOOLS_FL_SIZE);
640 memcpy(p2m_frame_list, live_p2m_frame_list, P2M_GUEST_FL_SIZE);
642 /* Canonicalize guest's unsigned long vs ours */
643 if ( guest_width > sizeof(unsigned long) )
644 for ( i = 0; i < P2M_FL_ENTRIES; i++ )
645 p2m_frame_list[i] = ((uint64_t *)p2m_frame_list)[i];
646 else if ( guest_width < sizeof(unsigned long) )
647 for ( i = P2M_FL_ENTRIES - 1; i >= 0; i-- )
648 p2m_frame_list[i] = ((uint32_t *)p2m_frame_list)[i];
651 /* Map all the frames of the pfn->mfn table. For migrate to succeed,
652 the guest must not change which frames are used for this purpose.
653 (it's not clear why it would want to change them, and we'll be OK
654 from a safety POV anyhow.) */
656 p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
657 p2m_frame_list,
658 P2M_FL_ENTRIES);
659 if ( !p2m )
660 {
661 ERROR("Couldn't map p2m table");
662 goto out;
663 }
664 live_p2m = p2m; /* So that translation macros will work */
666 /* Canonicalise the pfn-to-mfn table frame-number list. */
667 for ( i = 0; i < p2m_size; i += FPP )
668 {
669 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(p2m_frame_list[i/FPP]) )
670 {
671 ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
672 ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64", max 0x%lx",
673 i, i/FPP, (uint64_t)p2m_frame_list[i/FPP], max_mfn);
674 if ( p2m_frame_list[i/FPP] < max_mfn )
675 {
676 ERROR("m2p[0x%"PRIx64"] = 0x%"PRIx64,
677 (uint64_t)p2m_frame_list[i/FPP],
678 (uint64_t)live_m2p[p2m_frame_list[i/FPP]]);
679 ERROR("p2m[0x%"PRIx64"] = 0x%"PRIx64,
680 (uint64_t)live_m2p[p2m_frame_list[i/FPP]],
681 (uint64_t)p2m[live_m2p[p2m_frame_list[i/FPP]]]);
683 }
684 goto out;
685 }
686 p2m_frame_list[i/FPP] = mfn_to_pfn(p2m_frame_list[i/FPP]);
687 }
689 if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
690 {
691 ERROR("Could not get vcpu context");
692 goto out;
693 }
695 /*
696 * Write an extended-info structure to inform the restore code that
697 * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
698 * slow paths in the restore code.
699 */
700 {
701 unsigned long signature = ~0UL;
702 uint32_t chunk1_sz = ((guest_width==8)
703 ? sizeof(ctxt.x64)
704 : sizeof(ctxt.x32));
705 uint32_t chunk2_sz = 0;
706 uint32_t tot_sz = (chunk1_sz + 8) + (chunk2_sz + 8);
707 if ( write_exact(io_fd, &signature, sizeof(signature)) ||
708 write_exact(io_fd, &tot_sz, sizeof(tot_sz)) ||
709 write_exact(io_fd, "vcpu", 4) ||
710 write_exact(io_fd, &chunk1_sz, sizeof(chunk1_sz)) ||
711 write_exact(io_fd, &ctxt, chunk1_sz) ||
712 write_exact(io_fd, "extv", 4) ||
713 write_exact(io_fd, &chunk2_sz, sizeof(chunk2_sz)) )
714 {
715 PERROR("write: extended info");
716 goto out;
717 }
718 }
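/*
** Note: the block above emits the extended-info record in the stream as an
** all-ones "signature" value, a 4-byte total size, then tagged chunks of the
** form (4-byte tag, 4-byte length, payload): a "vcpu" chunk carrying the
** VCPU0 context (sized by the guest's word width) and an empty "extv" chunk
** advertising extended-CR3 support.
*/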
720 if ( write_exact(io_fd, p2m_frame_list,
721 P2M_FL_ENTRIES * sizeof(xen_pfn_t)) )
722 {
723 PERROR("write: p2m_frame_list");
724 goto out;
725 }
727 success = 1;
729 out:
731 if ( !success && p2m )
732 munmap(p2m, P2M_FLL_ENTRIES * PAGE_SIZE);
734 if ( live_p2m_frame_list_list )
735 munmap(live_p2m_frame_list_list, PAGE_SIZE);
737 if ( live_p2m_frame_list )
738 munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
740 if ( p2m_frame_list_list )
741 free(p2m_frame_list_list);
743 if ( p2m_frame_list )
744 free(p2m_frame_list);
746 return success ? p2m : NULL;
747 }
749 int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
750 uint32_t max_factor, uint32_t flags, int (*suspend)(void),
751 int hvm, void *(*init_qemu_maps)(int, unsigned),
752 void (*qemu_flip_buffer)(int, int))
753 {
754 xc_dominfo_t info;
755 DECLARE_DOMCTL;
757 int rc = 1, frc, i, j, last_iter, iter = 0;
758 int live = (flags & XCFLAGS_LIVE);
759 int debug = (flags & XCFLAGS_DEBUG);
760 int race = 0, sent_last_iter, skip_this_iter;
762 /* The new domain's shared-info frame number. */
763 unsigned long shared_info_frame;
765 /* A copy of the CPU context of the guest. */
766 vcpu_guest_context_any_t ctxt;
768 /* A table containing the type of each PFN (/not/ MFN!). */
769 unsigned long *pfn_type = NULL;
770 unsigned long *pfn_batch = NULL;
772 /* A copy of one frame of guest memory. */
773 char page[PAGE_SIZE];
775 /* Live mapping of shared info structure */
776 shared_info_any_t *live_shinfo = NULL;
778 /* base of the region in which domain memory is mapped */
779 unsigned char *region_base = NULL;
781 /* bitmap of pages:
782 - that should be sent this iteration (unless later marked as skip);
783 - to skip this iteration because already dirty;
784 - to fixup by sending at the end if not already resent; */
785 unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
787 xc_shadow_op_stats_t stats;
789 unsigned long needed_to_fix = 0;
790 unsigned long total_sent = 0;
792 uint64_t vcpumap = 1ULL;
794 /* HVM: a buffer for holding HVM context */
795 uint32_t hvm_buf_size = 0;
796 uint8_t *hvm_buf = NULL;
798 /* HVM: magic frames for ioreqs and xenstore comms. */
799 uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
801 unsigned long mfn;
803 /* If no explicit control parameters given, use defaults */
804 max_iters = max_iters ? : DEF_MAX_ITERS;
805 max_factor = max_factor ? : DEF_MAX_FACTOR;
807 initialize_mbit_rate();
809 if ( !get_platform_info(xc_handle, dom,
810 &max_mfn, &hvirt_start, &pt_levels, &guest_width) )
811 {
812 ERROR("Unable to get platform info.");
813 return 1;
814 }
816 if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
817 {
818 ERROR("Could not get domain info");
819 return 1;
820 }
822 shared_info_frame = info.shared_info_frame;
824 /* Map the shared info frame */
825 if ( !hvm )
826 {
827 live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
828 PROT_READ, shared_info_frame);
829 if ( !live_shinfo )
830 {
831 ERROR("Couldn't map live_shinfo");
832 goto out;
833 }
834 }
836 /* Get the size of the P2M table */
837 p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom) + 1;
839 /* Domain is still running at this point */
840 if ( live )
841 {
842 /* Live suspend. Enable log-dirty mode. */
843 if ( xc_shadow_control(xc_handle, dom,
844 XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
845 NULL, 0, NULL, 0, NULL) < 0 )
846 {
847 /* log-dirty already enabled? There's no test op,
848 so attempt to disable then reenable it */
849 frc = xc_shadow_control(xc_handle, dom, XEN_DOMCTL_SHADOW_OP_OFF,
850 NULL, 0, NULL, 0, NULL);
851 if ( frc >= 0 )
852 {
853 frc = xc_shadow_control(xc_handle, dom,
854 XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
855 NULL, 0, NULL, 0, NULL);
856 }
858 if ( frc < 0 )
859 {
860 ERROR("Couldn't enable shadow mode (rc %d) (errno %d)", frc, errno );
861 goto out;
862 }
863 }
865 if ( hvm )
866 {
867 /* Get qemu-dm logging dirty pages too */
868 void *seg = init_qemu_maps(dom, BITMAP_SIZE);
869 qemu_bitmaps[0] = seg;
870 qemu_bitmaps[1] = seg + BITMAP_SIZE;
871 qemu_active = 0;
872 qemu_non_active = 1;
873 }
874 }
875 else
876 {
877 /* This is a non-live suspend. Suspend the domain. */
878 if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info) )
879 {
880 ERROR("Domain appears not to have suspended");
881 goto out;
882 }
883 }
885 last_iter = !live;
887 /* pretend we sent all the pages last iteration */
888 sent_last_iter = p2m_size;
890 /* Setup to_send / to_fix and to_skip bitmaps */
891 to_send = xg_memalign(PAGE_SIZE, ROUNDUP(BITMAP_SIZE, PAGE_SHIFT));
892 to_fix = calloc(1, BITMAP_SIZE);
893 to_skip = xg_memalign(PAGE_SIZE, ROUNDUP(BITMAP_SIZE, PAGE_SHIFT));
895 if ( !to_send || !to_fix || !to_skip )
896 {
897 ERROR("Couldn't allocate to_send array");
898 goto out;
899 }
901 memset(to_send, 0xff, BITMAP_SIZE);
903 if ( lock_pages(to_send, BITMAP_SIZE) )
904 {
905 ERROR("Unable to lock to_send");
906 return 1;
907 }
909 /* (to_fix is local only) */
910 if ( lock_pages(to_skip, BITMAP_SIZE) )
911 {
912 ERROR("Unable to lock to_skip");
913 return 1;
914 }
916 if ( hvm )
917 {
918 /* Need another buffer for HVM context */
919 hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0);
920 if ( hvm_buf_size == -1 )
921 {
922 ERROR("Couldn't get HVM context size from Xen");
923 goto out;
924 }
925 hvm_buf = malloc(hvm_buf_size);
926 if ( !hvm_buf )
927 {
928 ERROR("Couldn't allocate memory");
929 goto out;
930 }
931 }
933 analysis_phase(xc_handle, dom, p2m_size, to_skip, 0);
935 pfn_type = xg_memalign(PAGE_SIZE, ROUNDUP(
936 MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
937 pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
938 if ( (pfn_type == NULL) || (pfn_batch == NULL) )
939 {
940 ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
941 errno = ENOMEM;
942 goto out;
943 }
944 memset(pfn_type, 0,
945 ROUNDUP(MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
947 if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
948 {
949 ERROR("Unable to lock pfn_type array");
950 goto out;
951 }
953 /* Setup the mfn_to_pfn table mapping */
954 if ( !(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ, &m2p_mfn0)) )
955 {
956 ERROR("Failed to map live M2P table");
957 goto out;
958 }
960 /* Start writing out the saved-domain record. */
961 if ( write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
962 {
963 PERROR("write: p2m_size");
964 goto out;
965 }
967 if ( !hvm )
968 {
969 int err = 0;
971 /* Map the P2M table, and write the list of P2M frames */
972 live_p2m = map_and_save_p2m_table(xc_handle, io_fd, dom,
973 p2m_size, live_shinfo);
974 if ( live_p2m == NULL )
975 {
976 ERROR("Failed to map/save the p2m frame list");
977 goto out;
978 }
980 /*
981 * Quick belt and braces sanity check.
982 */
984 for ( i = 0; i < p2m_size; i++ )
985 {
986 mfn = pfn_to_mfn(i);
987 if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) )
988 {
989 DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
990 mfn, mfn_to_pfn(mfn));
991 err++;
992 }
993 }
994 DPRINTF("Had %d unexplained entries in p2m table\n", err);
995 }
997 print_stats(xc_handle, dom, 0, &stats, 0);
999 /* Now write out each data page, canonicalising page tables as we go... */
1000 for ( ; ; )
1002 unsigned int prev_pc, sent_this_iter, N, batch, run;
1004 iter++;
1005 sent_this_iter = 0;
1006 skip_this_iter = 0;
1007 prev_pc = 0;
1008 N = 0;
1010 DPRINTF("Saving memory pages: iter %d 0%%", iter);
1012 while ( N < p2m_size )
1014 unsigned int this_pc = (N * 100) / p2m_size;
1016 if ( (this_pc - prev_pc) >= 5 )
1018 DPRINTF("\b\b\b\b%3d%%", this_pc);
1019 prev_pc = this_pc;
1022 if ( !last_iter )
1024 /* Slightly wasteful to peek the whole array every time,
1025 but this is fast enough for the moment. */
1026 frc = xc_shadow_control(
1027 xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip,
1028 p2m_size, NULL, 0, NULL);
1029 if ( frc != p2m_size )
1031 ERROR("Error peeking shadow bitmap");
1032 goto out;
1036 /* load pfn_type[] with the mfn of all the pages we're doing in
1037 this batch. */
1038 for ( batch = 0;
1039 (batch < MAX_BATCH_SIZE) && (N < p2m_size);
1040 N++ )
1042 int n = N;
1044 if ( debug )
1046 DPRINTF("%d pfn= %08lx mfn= %08lx %d",
1047 iter, (unsigned long)n,
1048 hvm ? 0 : pfn_to_mfn(n),
1049 test_bit(n, to_send));
1050 if ( !hvm && is_mapped(pfn_to_mfn(n)) )
1051 DPRINTF(" [mfn]= %08lx",
1052 mfn_to_pfn(pfn_to_mfn(n)&0xFFFFF));
1053 DPRINTF("\n");
1055 if ( !last_iter &&
1056 test_bit(n, to_send) &&
1057 test_bit(n, to_skip) )
1058 skip_this_iter++; /* stats keeping */
1060 if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
1061 (test_bit(n, to_send) && last_iter) ||
1062 (test_bit(n, to_fix) && last_iter)) )
1063 continue;
1065 /*
1066 ** we get here if:
1067 ** 1. page is marked to_send & hasn't already been re-dirtied
1068 ** 2. (ignore to_skip in last iteration)
1069 ** 3. add in pages that still need fixup (net bufs)
1070 */
1072 pfn_batch[batch] = n;
1074 /* Hypercall interfaces operate in PFNs for HVM guests
1075 * and MFNs for PV guests */
1076 if ( hvm )
1077 pfn_type[batch] = n;
1078 else
1079 pfn_type[batch] = pfn_to_mfn(n);
1081 if ( !is_mapped(pfn_type[batch]) )
1083 /*
1084 ** not currently in pseudo-physical map -- set bit
1085 ** in to_fix since we must send this page in last_iter
1086 ** unless it's sent sooner anyhow, or it never enters
1087 ** pseudo-physical map (e.g. for ballooned down doms)
1088 */
1089 set_bit(n, to_fix);
1090 continue;
1093 if ( last_iter &&
1094 test_bit(n, to_fix) &&
1095 !test_bit(n, to_send) )
1097 needed_to_fix++;
1098 DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
1099 iter, n, pfn_type[batch]);
1102 clear_bit(n, to_fix);
1104 batch++;
1107 if ( batch == 0 )
1108 goto skip; /* vanishingly unlikely... */
1110 region_base = xc_map_foreign_batch(
1111 xc_handle, dom, PROT_READ, pfn_type, batch);
1112 if ( region_base == NULL )
1114 ERROR("map batch failed");
1115 goto out;
1118 if ( hvm )
1120 /* Look for and skip completely empty batches. */
1121 for ( j = 0; j < batch; j++ )
1122 if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) !=
1123 XEN_DOMCTL_PFINFO_XTAB )
1124 break;
1125 if ( j == batch )
1127 munmap(region_base, batch*PAGE_SIZE);
1128 continue; /* bail on this batch: no valid pages */
1131 else
1133 /* Get page types */
1134 for ( j = 0; j < batch; j++ )
1135 ((uint32_t *)pfn_type)[j] = pfn_type[j];
1136 if ( xc_get_pfn_type_batch(xc_handle, dom, batch,
1137 (uint32_t *)pfn_type) )
1139 ERROR("get_pfn_type_batch failed");
1140 goto out;
1142 for ( j = batch-1; j >= 0; j-- )
1143 pfn_type[j] = ((uint32_t *)pfn_type)[j];
1145 for ( j = 0; j < batch; j++ )
1148 if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
1149 XEN_DOMCTL_PFINFO_XTAB )
1151 DPRINTF("type fail: page %i mfn %08lx\n",
1152 j, pfn_type[j]);
1153 continue;
1156 if ( debug )
1157 DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
1158 " sum= %08lx\n",
1159 iter,
1160 (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
1161 pfn_batch[j],
1162 pfn_type[j],
1163 mfn_to_pfn(pfn_type[j] &
1164 ~XEN_DOMCTL_PFINFO_LTAB_MASK),
1165 csum_page(region_base + (PAGE_SIZE*j)));
1167 /* canonicalise mfn->pfn */
1168 pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
1169 pfn_batch[j];
1173 if ( write_exact(io_fd, &batch, sizeof(unsigned int)) )
1175 PERROR("Error when writing to state file (2)");
1176 goto out;
1179 if ( write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) )
1181 PERROR("Error when writing to state file (3)");
1182 goto out;
1185 /* Entering this loop, pfn_type is now in pfns (not mfns). */
1186 run = 0;
1187 for ( j = 0; j < batch; j++ )
1189 unsigned long pfn, pagetype;
1190 void *spage = (char *)region_base + (PAGE_SIZE*j);
1192 pfn = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
1193 pagetype = pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK;
1195 if ( pagetype != 0 )
1197 /* If the page is not a normal data page, write out any
1198 run of pages we may have previously accumulated */
1199 if ( run )
1201 if ( ratewrite(io_fd, live,
1202 (char*)region_base+(PAGE_SIZE*(j-run)),
1203 PAGE_SIZE*run) != PAGE_SIZE*run )
1205 ERROR("Error when writing to state file (4a)"
1206 " (errno %d)", errno);
1207 goto out;
1209 run = 0;
1213 /* skip pages that aren't present */
1214 if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
1215 continue;
1217 pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
1219 if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
1220 (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
1222 /* We have a pagetable page: need to rewrite it. */
1223 race =
1224 canonicalize_pagetable(pagetype, pfn, spage, page);
1226 if ( race && !live )
1228 ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn,
1229 pagetype);
1230 goto out;
1233 if ( ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE )
1235 ERROR("Error when writing to state file (4b)"
1236 " (errno %d)", errno);
1237 goto out;
1240 else
1242 /* We have a normal page: accumulate it for writing. */
1243 run++;
1245 } /* end of the write out for this batch */
1247 if ( run )
1249 /* write out the last accumulated run of pages */
1250 if ( ratewrite(io_fd, live,
1251 (char*)region_base+(PAGE_SIZE*(j-run)),
1252 PAGE_SIZE*run) != PAGE_SIZE*run )
1254 ERROR("Error when writing to state file (4c)"
1255 " (errno %d)", errno);
1256 goto out;
1260 sent_this_iter += batch;
1262 munmap(region_base, batch*PAGE_SIZE);
1264 } /* end of this while loop for this iteration */
1266 skip:
1268 total_sent += sent_this_iter;
1270 DPRINTF("\r %d: sent %d, skipped %d, ",
1271 iter, sent_this_iter, skip_this_iter );
1273 if ( last_iter )
1275 print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
1277 DPRINTF("Total pages sent= %ld (%.2fx)\n",
1278 total_sent, ((float)total_sent)/p2m_size );
1279 DPRINTF("(of which %ld were fixups)\n", needed_to_fix );
1282 if ( last_iter && debug )
1284 int minusone = -1;
1285 memset(to_send, 0xff, BITMAP_SIZE);
1286 debug = 0;
1287 DPRINTF("Entering debug resend-all mode\n");
1289 /* send "-1" to put receiver into debug mode */
1290 if ( write_exact(io_fd, &minusone, sizeof(int)) )
1292 PERROR("Error when writing to state file (6)");
1293 goto out;
1296 continue;
1299 if ( last_iter )
1300 break;
1302 if ( live )
1304 if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
1305 (iter >= max_iters) ||
1306 (sent_this_iter+skip_this_iter < 50) ||
1307 (total_sent > p2m_size*max_factor) )
1309 DPRINTF("Start last iteration\n");
1310 last_iter = 1;
1312 if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info) )
1314 ERROR("Domain appears not to have suspended");
1315 goto out;
1318 DPRINTF("SUSPEND shinfo %08lx\n", info.shared_info_frame);
1321 if ( xc_shadow_control(xc_handle, dom,
1322 XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
1323 p2m_size, NULL, 0, &stats) != p2m_size )
1325 ERROR("Error flushing shadow PT");
1326 goto out;
1329 if ( hvm )
1331 /* Pull in the dirty bits from qemu-dm too */
1332 if ( !last_iter )
1334 qemu_active = qemu_non_active;
1335 qemu_non_active = qemu_active ? 0 : 1;
1336 qemu_flip_buffer(dom, qemu_active);
1337 for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
1339 to_send[j] |= qemu_bitmaps[qemu_non_active][j];
1340 qemu_bitmaps[qemu_non_active][j] = 0;
1343 else
1345 for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
1346 to_send[j] |= qemu_bitmaps[qemu_active][j];
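/*
** Note: qemu_bitmaps[] is a double buffer shared with qemu-dm. On each live
** iteration the active buffer is flipped so qemu-dm logs new dirty bits into
** the other half, the retired half is OR-ed into to_send and cleared; on the
** final iteration the active buffer is merged without flipping.
*/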
1350 sent_last_iter = sent_this_iter;
1352 print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
1355 } /* end of infinite for loop */
1357 DPRINTF("All memory is saved\n");
1360 struct {
1361 int minustwo;
1362 int max_vcpu_id;
1363 uint64_t vcpumap;
1364 } chunk = { -2, info.max_vcpu_id };
1366 if ( info.max_vcpu_id >= 64 )
1368 ERROR("Too many VCPUS in guest!");
1369 goto out;
1372 for ( i = 1; i <= info.max_vcpu_id; i++ )
1374 xc_vcpuinfo_t vinfo;
1375 if ( (xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
1376 vinfo.online )
1377 vcpumap |= 1ULL << i;
1380 chunk.vcpumap = vcpumap;
1381 if ( write_exact(io_fd, &chunk, sizeof(chunk)) )
1383 PERROR("Error when writing to state file");
1384 goto out;
1388 if ( hvm )
1390 struct {
1391 int id;
1392 uint32_t pad;
1393 uint64_t data;
1394 } chunk = { 0, };
1396 chunk.id = -3;
1397 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
1398 (unsigned long *)&chunk.data);
1400 if ( (chunk.data != 0) &&
1401 write_exact(io_fd, &chunk, sizeof(chunk)) )
1403 PERROR("Error when writing the ident_pt for EPT guest");
1404 goto out;
1407 chunk.id = -4;
1408 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS,
1409 (unsigned long *)&chunk.data);
1411 if ( (chunk.data != 0) &&
1412 write_exact(io_fd, &chunk, sizeof(chunk)) )
1414 PERROR("Error when writing the vm86 TSS for guest");
1415 goto out;
1419 /* Zero terminate */
1420 i = 0;
1421 if ( write_exact(io_fd, &i, sizeof(int)) )
1423 PERROR("Error when writing to state file (6')");
1424 goto out;
1427 if ( hvm )
1429 uint32_t rec_size;
1431 /* Save magic-page locations. */
1432 memset(magic_pfns, 0, sizeof(magic_pfns));
1433 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
1434 (unsigned long *)&magic_pfns[0]);
1435 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
1436 (unsigned long *)&magic_pfns[1]);
1437 xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
1438 (unsigned long *)&magic_pfns[2]);
1439 if ( write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
1441 PERROR("Error when writing to state file (7)");
1442 goto out;
1445 /* Get HVM context from Xen and save it too */
1446 if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf,
1447 hvm_buf_size)) == -1 )
1449 ERROR("HVM:Could not get hvm buffer");
1450 goto out;
1453 if ( write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
1455 PERROR("error write hvm buffer size");
1456 goto out;
1459 if ( write_exact(io_fd, hvm_buf, rec_size) )
1461 PERROR("write HVM info failed!\n");
1462 goto out;
1465 /* HVM guests are done now */
1466 rc = 0;
1467 goto out;
1470 /* PV guests only from now on */
1472 /* Send through a list of all the PFNs that were not in map at the close */
1474 unsigned int i,j;
1475 unsigned long pfntab[1024];
1477 for ( i = 0, j = 0; i < p2m_size; i++ )
1479 if ( !is_mapped(pfn_to_mfn(i)) )
1480 j++;
1483 if ( write_exact(io_fd, &j, sizeof(unsigned int)) )
1485 PERROR("Error when writing to state file (6a)");
1486 goto out;
1489 for ( i = 0, j = 0; i < p2m_size; )
1491 if ( !is_mapped(pfn_to_mfn(i)) )
1492 pfntab[j++] = i;
1494 i++;
1495 if ( (j == 1024) || (i == p2m_size) )
1497 if ( write_exact(io_fd, &pfntab, sizeof(unsigned long)*j) )
1499 PERROR("Error when writing to state file (6b)");
1500 goto out;
1502 j = 0;
1507 if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
1509 ERROR("Could not get vcpu context");
1510 goto out;
1513 /* Canonicalise the suspend-record frame number. */
1514 mfn = GET_FIELD(&ctxt, user_regs.edx);
1515 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
1517 ERROR("Suspend record is not in range of pseudophys map");
1518 goto out;
1520 SET_FIELD(&ctxt, user_regs.edx, mfn_to_pfn(mfn));
1522 for ( i = 0; i <= info.max_vcpu_id; i++ )
1524 if ( !(vcpumap & (1ULL << i)) )
1525 continue;
1527 if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
1529 ERROR("No context for VCPU%d", i);
1530 goto out;
1533 /* Canonicalise each GDT frame number. */
1534 for ( j = 0; (512*j) < GET_FIELD(&ctxt, gdt_ents); j++ )
1536 mfn = GET_FIELD(&ctxt, gdt_frames[j]);
1537 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
1539 ERROR("GDT frame is not in range of pseudophys map");
1540 goto out;
1542 SET_FIELD(&ctxt, gdt_frames[j], mfn_to_pfn(mfn));
1545 /* Canonicalise the page table base pointer. */
1546 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(UNFOLD_CR3(
1547 GET_FIELD(&ctxt, ctrlreg[3]))) )
1549 ERROR("PT base is not in range of pseudophys map");
1550 goto out;
1552 SET_FIELD(&ctxt, ctrlreg[3],
1553 FOLD_CR3(mfn_to_pfn(UNFOLD_CR3(GET_FIELD(&ctxt, ctrlreg[3])))));
1555 /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
1556 if ( (pt_levels == 4) && ctxt.x64.ctrlreg[1] )
1558 if ( !MFN_IS_IN_PSEUDOPHYS_MAP(UNFOLD_CR3(ctxt.x64.ctrlreg[1])) )
1560 ERROR("PT base is not in range of pseudophys map");
1561 goto out;
1563 /* Least-significant bit means 'valid PFN'. */
1564 ctxt.x64.ctrlreg[1] = 1 |
1565 FOLD_CR3(mfn_to_pfn(UNFOLD_CR3(ctxt.x64.ctrlreg[1])));
1568 if ( write_exact(io_fd, &ctxt, ((guest_width==8)
1569 ? sizeof(ctxt.x64)
1570 : sizeof(ctxt.x32))) )
1572 PERROR("Error when writing to state file (1)");
1573 goto out;
1576 domctl.cmd = XEN_DOMCTL_get_ext_vcpucontext;
1577 domctl.domain = dom;
1578 domctl.u.ext_vcpucontext.vcpu = i;
1579 if ( xc_domctl(xc_handle, &domctl) < 0 )
1581 ERROR("No extended context for VCPU%d", i);
1582 goto out;
1584 if ( write_exact(io_fd, &domctl.u.ext_vcpucontext, 128) )
1586 PERROR("Error when writing to state file (2)");
1587 goto out;
1591 /*
1592 * Reset the MFN to be a known-invalid value. See map_frame_list_list().
1593 */
1594 memcpy(page, live_shinfo, PAGE_SIZE);
1595 SET_FIELD(((shared_info_any_t *)page),
1596 arch.pfn_to_mfn_frame_list_list, 0);
1597 if ( write_exact(io_fd, page, PAGE_SIZE) )
1599 PERROR("Error when writing to state file (1)");
1600 goto out;
1603 /* Success! */
1604 rc = 0;
1606 out:
1608 if ( live )
1610 if ( xc_shadow_control(xc_handle, dom,
1611 XEN_DOMCTL_SHADOW_OP_OFF,
1612 NULL, 0, NULL, 0, NULL) < 0 )
1613 DPRINTF("Warning - couldn't disable shadow mode");
1616 /* Flush last write and discard cache for file. */
1617 discard_file_cache(io_fd, 1 /* flush */);
1619 if ( live_shinfo )
1620 munmap(live_shinfo, PAGE_SIZE);
1622 if ( live_p2m )
1623 munmap(live_p2m, P2M_FLL_ENTRIES * PAGE_SIZE);
1625 if ( live_m2p )
1626 munmap(live_m2p, M2P_SIZE(max_mfn));
1628 free(pfn_type);
1629 free(pfn_batch);
1630 free(to_send);
1631 free(to_fix);
1632 free(to_skip);
1634 DPRINTF("Save exit rc=%d\n",rc);
1636 return !!rc;
1639 /*
1640 * Local variables:
1641 * mode: C
1642 * c-set-style: "BSD"
1643 * c-basic-offset: 4
1644 * tab-width: 4
1645 * indent-tabs-mode: nil
1646 * End:
1647 */