ia64/xen-unstable
changeset 14811:db4fcb609383
Merge with xen-ia64-unstable.hg
author | kfraser@localhost.localdomain
date | Wed Apr 11 15:45:29 2007 +0100 (2007-04-11)
parents | 3d356a2b1c75 0d92cd901f80
children | 38204c93428e
files | tools/libxc/xc_hvm_save.c tools/libxc/xc_linux_save.c
line diff
--- a/linux-2.6-xen-sparse/drivers/xen/blkfront/block.h	Wed Apr 11 07:30:02 2007 -0600
+++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/block.h	Wed Apr 11 15:45:29 2007 +0100
@@ -56,20 +56,6 @@
 #include <asm/atomic.h>
 #include <asm/uaccess.h>
 
-#if 1
-#define IPRINTK(fmt, args...) \
-	printk(KERN_INFO "xen_blk: " fmt, ##args)
-#else
-#define IPRINTK(fmt, args...) ((void)0)
-#endif
-
-#if 1
-#define WPRINTK(fmt, args...) \
-	printk(KERN_WARNING "xen_blk: " fmt, ##args)
-#else
-#define WPRINTK(fmt, args...) ((void)0)
-#endif
-
 #define DPRINTK(_f, _a...) pr_debug(_f, ## _a)
 
 #if 0
--- a/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c	Wed Apr 11 07:30:02 2007 -0600
+++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c	Wed Apr 11 15:45:29 2007 +0100
@@ -128,14 +128,12 @@ xlbd_alloc_major_info(int major, int min
 		break;
 	}
 
-	printk("Registering block device major %i\n", ptr->major);
 	if (register_blkdev(ptr->major, ptr->type->devname)) {
-		WPRINTK("can't get major %d with name %s\n",
-			ptr->major, ptr->type->devname);
 		kfree(ptr);
 		return NULL;
 	}
 
+	printk("xen-vbd: registered block device major %i\n", ptr->major);
 	major_info[index] = ptr;
 	return ptr;
 }
--- a/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c	Wed Apr 11 07:30:02 2007 -0600
+++ b/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c	Wed Apr 11 15:45:29 2007 +0100
@@ -60,9 +60,6 @@ static grant_ref_t gnttab_free_head;
 static DEFINE_SPINLOCK(gnttab_list_lock);
 
 static struct grant_entry *shared;
-#ifndef CONFIG_XEN
-static unsigned long resume_frames;
-#endif
 
 static struct gnttab_free_callback *gnttab_free_callback_list;
 
@@ -514,6 +511,8 @@ int gnttab_suspend(void)
 
 #include <platform-pci.h>
 
+static unsigned long resume_frames;
+
 static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
 {
 	struct xen_add_to_physmap xatp;
@@ -543,23 +542,17 @@ int gnttab_resume(void)
 	if (max_nr_gframes < nr_gframes)
 		return -ENOSYS;
 
-	resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
+	if (!resume_frames) {
+		resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
+		shared = ioremap(resume_frames, PAGE_SIZE * max_nr_gframes);
+		if (shared == NULL) {
+			printk("error to ioremap gnttab share frames\n");
+			return -1;
+		}
+	}
 
 	gnttab_map(0, nr_gframes - 1);
 
-	shared = ioremap(resume_frames, PAGE_SIZE * max_nr_gframes);
-	if (shared == NULL) {
-		printk("error to ioremap gnttab share frames\n");
-		return -1;
-	}
-
-	return 0;
-}
-
-int gnttab_suspend(void)
-{
-	iounmap(shared);
-	resume_frames = 0;
 	return 0;
 }
 
@@ -624,7 +617,6 @@ int __devinit gnttab_init(void)
 	gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
 	gnttab_free_head  = NR_RESERVED_ENTRIES;
 
-	printk("Grant table initialized\n");
 	return 0;
 
  ini_nomem:
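The gnttab_resume() rework above makes resume idempotent for the PV-on-HVM case: the MMIO region is allocated and ioremap()ed once, on the first resume, and later resumes only re-populate the grant frames via gnttab_map(). A minimal sketch of that pattern follows; alloc_region(), map_region() and populate_frames() are hypothetical stand-ins for alloc_xen_mmio(), ioremap() and gnttab_map(), not real kernel APIs.

```c
/* Sketch of the allocate-once, repopulate-always resume pattern above.
 * alloc_region(), map_region() and populate_frames() are hypothetical
 * stand-ins for alloc_xen_mmio(), ioremap() and gnttab_map(). */
#define PAGE_SIZE 4096UL

unsigned long alloc_region(unsigned long size);            /* assumed */
void *map_region(unsigned long phys, unsigned long size);  /* assumed */
int populate_frames(unsigned int start, unsigned int end); /* assumed */

static unsigned long region_base;  /* 0 until the first resume */
static void *region;

int resume(unsigned int nr_frames, unsigned int max_frames)
{
	if (!region_base) {
		/* First resume only: reserve MMIO space and map it. */
		region_base = alloc_region(PAGE_SIZE * max_frames);
		region = map_region(region_base, PAGE_SIZE * max_frames);
		if (region == NULL)
			return -1;
	}
	/* Every resume: ask the hypervisor to back the frames again. */
	return populate_frames(0, nr_frames - 1);
}
```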
--- a/linux-2.6-xen-sparse/drivers/xen/core/machine_reboot.c	Wed Apr 11 07:30:02 2007 -0600
+++ b/linux-2.6-xen-sparse/drivers/xen/core/machine_reboot.c	Wed Apr 11 15:45:29 2007 +0100
@@ -209,6 +209,8 @@ int __xen_suspend(int fast_suspend)
 	if (fast_suspend) {
 		xenbus_suspend();
 		err = stop_machine_run(take_machine_down, &fast_suspend, 0);
+		if (err < 0)
+			xenbus_suspend_cancel();
 	} else {
 		err = take_machine_down(&fast_suspend);
 	}
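The two added lines pair xenbus_suspend() with its cancellation path: if stop_machine_run() fails, xenbus is returned to its running state instead of being left suspended. The same prepare/commit/cancel shape, sketched with hypothetical names:

```c
/* Sketch of the prepare/commit/cancel pairing added above; prepare(),
 * commit() and cancel() are hypothetical stand-ins for xenbus_suspend(),
 * stop_machine_run(...) and xenbus_suspend_cancel(). */
void prepare(void);  /* assumed: quiesce the subsystem */
int commit(void);    /* assumed: returns <0 on failure */
void cancel(void);   /* assumed: undo prepare() */

int do_fast_suspend(void)
{
	int err;

	prepare();          /* quiesce xenbus traffic */
	err = commit();     /* the step that can fail */
	if (err < 0)
		cancel();   /* unfreeze so the system keeps running */
	return err;
}
```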
--- a/tools/blktap/drivers/block-qcow.c	Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/blktap/drivers/block-qcow.c	Wed Apr 11 15:45:29 2007 +0100
@@ -949,8 +949,14 @@ int tdqcow_open (struct disk_driver *dd,
 		goto fail;
 	}
 	init_fds(dd);
-	s->fd_end = (final_cluster == 0 ? (s->l1_table_offset + l1_table_size) :
-		(final_cluster + s->cluster_size));
+
+	if (!final_cluster)
+		s->fd_end = s->l1_table_offset + l1_table_size;
+	else {
+		s->fd_end = lseek64(fd, 0, SEEK_END);
+		if (s->fd_end == (off64_t)-1)
+			goto fail;
+	}
 
 	return 0;
 
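The replacement derives fd_end from the actual file size rather than from final_cluster + cluster_size, and checks lseek64()'s documented error value. A small stand-alone illustration of that call pattern; file_end() is a hypothetical helper, and _LARGEFILE64_SOURCE is needed on 32-bit Linux for off64_t/lseek64() to be visible:

```c
/* Illustration: find a file's end offset the way the patch does, with
 * the (off64_t)-1 error check.  file_end() is a hypothetical helper. */
#define _LARGEFILE64_SOURCE
#include <sys/types.h>
#include <unistd.h>

off64_t file_end(int fd)
{
	off64_t end = lseek64(fd, 0, SEEK_END);
	if (end == (off64_t)-1)
		return -1;  /* errno holds the reason */
	return end;         /* offset is now at EOF, as in the patch */
}
```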
--- a/tools/ioemu/hw/pc.c	Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/ioemu/hw/pc.c	Wed Apr 11 15:45:29 2007 +0100
@@ -902,7 +902,6 @@ static void pc_init1(uint64_t ram_size,
     if (pci_enabled && acpi_enabled) {
         piix4_pm_init(pci_bus, piix3_devfn + 3);
     }
-#endif /* !CONFIG_DM */
 
 #if 0
     /* ??? Need to figure out some way for the user to
@@ -921,6 +920,17 @@ static void pc_init1(uint64_t ram_size,
         lsi_scsi_attach(scsi, bdrv, -1);
     }
 #endif
+#else
+    if (pci_enabled) {
+        void *scsi;
+
+        scsi = lsi_scsi_init(pci_bus, -1);
+        for (i = 0; i < MAX_SCSI_DISKS ; i++) {
+            if (bs_table[i + MAX_DISKS])
+                lsi_scsi_attach(scsi, bs_table[i + MAX_DISKS], -1);
+        }
+    }
+#endif /* !CONFIG_DM */
     /* must be done after all PCI devices are instanciated */
     /* XXX: should be done in the Bochs BIOS */
     if (pci_enabled) {
--- a/tools/ioemu/vl.c	Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/ioemu/vl.c	Wed Apr 11 15:45:29 2007 +0100
@@ -116,7 +116,7 @@ char phys_ram_file[1024];
 void *ioport_opaque[MAX_IOPORTS];
 IOPortReadFunc *ioport_read_table[3][MAX_IOPORTS];
 IOPortWriteFunc *ioport_write_table[3][MAX_IOPORTS];
-BlockDriverState *bs_table[MAX_DISKS], *fd_table[MAX_FD];
+BlockDriverState *bs_table[MAX_DISKS+MAX_SCSI_DISKS], *fd_table[MAX_FD];
 int vga_ram_size;
 int bios_size;
 static DisplayState display_state;
--- a/tools/ioemu/vl.h	Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/ioemu/vl.h	Wed Apr 11 15:45:29 2007 +0100
@@ -818,8 +818,9 @@ int vnc_start_viewer(int port);
 
 /* ide.c */
 #define MAX_DISKS 4
+#define MAX_SCSI_DISKS 7
 
-extern BlockDriverState *bs_table[MAX_DISKS];
+extern BlockDriverState *bs_table[MAX_DISKS+MAX_SCSI_DISKS];
 
 void isa_ide_init(int iobase, int iobase2, int irq,
                   BlockDriverState *hd0, BlockDriverState *hd1);
--- a/tools/ioemu/xenstore.c	Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/ioemu/xenstore.c	Wed Apr 11 15:45:29 2007 +0100
@@ -30,11 +30,11 @@ static int pasprintf(char **buf, const c
     int ret = 0;
 
     if (*buf)
-	free(*buf);
+        free(*buf);
     va_start(ap, fmt);
     if (vasprintf(buf, fmt, ap) == -1) {
-	buf = NULL;
-	ret = -1;
+        buf = NULL;
+        ret = -1;
     }
     va_end(ap);
     return ret;
@@ -45,11 +45,11 @@ static void insert_media(void *opaque)
     int i;
 
     for (i = 0; i < MAX_DISKS; i++) {
-	if (media_filename[i] && bs_table[i]) {
-	    do_change(bs_table[i]->device_name, media_filename[i]);
-	    free(media_filename[i]);
-	    media_filename[i] = NULL;
-	}
+        if (media_filename[i] && bs_table[i]) {
+            do_change(bs_table[i]->device_name, media_filename[i]);
+            free(media_filename[i]);
+            media_filename[i] = NULL;
+        }
     }
 }
 
@@ -57,7 +57,7 @@ void xenstore_check_new_media_present(in
 {
 
     if (insert_timer == NULL)
-	insert_timer = qemu_new_timer(rt_clock, insert_media, NULL);
+        insert_timer = qemu_new_timer(rt_clock, insert_media, NULL);
     qemu_mod_timer(insert_timer, qemu_get_clock(rt_clock) + timeout);
 }
 
@@ -82,8 +82,8 @@ void xenstore_parse_domain_config(int do
     char **e = NULL;
     char *buf = NULL, *path;
     char *fpath = NULL, *bpath = NULL,
-	*dev = NULL, *params = NULL, *type = NULL;
-    int i;
+        *dev = NULL, *params = NULL, *type = NULL;
+    int i, is_scsi;
     unsigned int len, num, hd_index;
 
     for(i = 0; i < MAX_DISKS; i++)
@@ -91,8 +91,8 @@ void xenstore_parse_domain_config(int do
 
     xsh = xs_daemon_open();
     if (xsh == NULL) {
-	fprintf(logfile, "Could not contact xenstore for domain config\n");
-	return;
+        fprintf(logfile, "Could not contact xenstore for domain config\n");
+        return;
     }
 
     path = xs_get_domain_path(xsh, domid);
@@ -102,59 +102,60 @@ void xenstore_parse_domain_config(int do
     }
 
     if (pasprintf(&buf, "%s/device/vbd", path) == -1)
-	goto out;
+        goto out;
 
     e = xs_directory(xsh, XBT_NULL, buf, &num);
     if (e == NULL)
-	goto out;
+        goto out;
 
     for (i = 0; i < num; i++) {
-	/* read the backend path */
-	if (pasprintf(&buf, "%s/device/vbd/%s/backend", path, e[i]) == -1)
-	    continue;
-	free(bpath);
+        /* read the backend path */
+        if (pasprintf(&buf, "%s/device/vbd/%s/backend", path, e[i]) == -1)
+            continue;
+        free(bpath);
         bpath = xs_read(xsh, XBT_NULL, buf, &len);
-	if (bpath == NULL)
-	    continue;
-	/* read the name of the device */
-	if (pasprintf(&buf, "%s/dev", bpath) == -1)
-	    continue;
-	free(dev);
-	dev = xs_read(xsh, XBT_NULL, buf, &len);
-	if (dev == NULL)
-	    continue;
-	if (strncmp(dev, "hd", 2) || strlen(dev) != 3)
-	    continue;
-	hd_index = dev[2] - 'a';
-	if (hd_index >= MAX_DISKS)
-	    continue;
-	/* read the type of the device */
-	if (pasprintf(&buf, "%s/device/vbd/%s/device-type", path, e[i]) == -1)
-	    continue;
-	free(type);
-	type = xs_read(xsh, XBT_NULL, buf, &len);
-	if (pasprintf(&buf, "%s/params", bpath) == -1)
-	    continue;
-	free(params);
-	params = xs_read(xsh, XBT_NULL, buf, &len);
-	if (params == NULL)
-	    continue;
+        if (bpath == NULL)
+            continue;
+        /* read the name of the device */
+        if (pasprintf(&buf, "%s/dev", bpath) == -1)
+            continue;
+        free(dev);
+        dev = xs_read(xsh, XBT_NULL, buf, &len);
+        if (dev == NULL)
+            continue;
+        is_scsi = !strncmp(dev, "sd", 2);
+        if ((strncmp(dev, "hd", 2) && !is_scsi) || strlen(dev) != 3 )
+            continue;
+        hd_index = dev[2] - 'a';
+        if (hd_index >= (is_scsi ? MAX_SCSI_DISKS : MAX_DISKS))
+            continue;
+        /* read the type of the device */
+        if (pasprintf(&buf, "%s/device/vbd/%s/device-type", path, e[i]) == -1)
+            continue;
+        free(type);
+        type = xs_read(xsh, XBT_NULL, buf, &len);
+        if (pasprintf(&buf, "%s/params", bpath) == -1)
+            continue;
+        free(params);
+        params = xs_read(xsh, XBT_NULL, buf, &len);
+        if (params == NULL)
+            continue;
        /*
         * check if device has a phantom vbd; the phantom is hooked
         * to the frontend device (for ease of cleanup), so lookup
         * the frontend device, and see if there is a phantom_vbd
         * if there is, we will use resolution as the filename
         */
-	if (pasprintf(&buf, "%s/device/vbd/%s/phantom_vbd", path, e[i]) == -1)
-	    continue;
-	free(fpath);
+        if (pasprintf(&buf, "%s/device/vbd/%s/phantom_vbd", path, e[i]) == -1)
+            continue;
+        free(fpath);
         fpath = xs_read(xsh, XBT_NULL, buf, &len);
-	if (fpath) {
-	    if (pasprintf(&buf, "%s/dev", fpath) == -1)
-		continue;
-	    free(params);
+        if (fpath) {
+            if (pasprintf(&buf, "%s/dev", fpath) == -1)
+                continue;
+            free(params);
             params = xs_read(xsh, XBT_NULL, buf , &len);
-	    if (params) {
+            if (params) {
                /*
                 * wait for device, on timeout silently fail because we will
                 * fail to open below
@@ -163,19 +164,20 @@ void xenstore_parse_domain_config(int do
             }
         }
 
-	bs_table[hd_index] = bdrv_new(dev);
-	/* check if it is a cdrom */
-	if (type && !strcmp(type, "cdrom")) {
-	    bdrv_set_type_hint(bs_table[hd_index], BDRV_TYPE_CDROM);
-	    if (pasprintf(&buf, "%s/params", bpath) != -1)
-		xs_watch(xsh, buf, dev);
-	}
-	/* open device now if media present */
-	if (params[0]) {
-	    if (bdrv_open(bs_table[hd_index], params, 0 /* snapshot */) < 0)
+        bs_table[hd_index + (is_scsi ? MAX_DISKS : 0)] = bdrv_new(dev);
+        /* check if it is a cdrom */
+        if (type && !strcmp(type, "cdrom")) {
+            bdrv_set_type_hint(bs_table[hd_index], BDRV_TYPE_CDROM);
+            if (pasprintf(&buf, "%s/params", bpath) != -1)
+                xs_watch(xsh, buf, dev);
+        }
+        /* open device now if media present */
+        if (params[0]) {
+            if (bdrv_open(bs_table[hd_index + (is_scsi ? MAX_DISKS : 0)],
+                          params, 0 /* snapshot */) < 0)
                 fprintf(stderr, "qemu: could not open hard disk image '%s'\n",
                         params);
-	}
+        }
     }
 
     /* Set a watch for log-dirty requests from the migration tools */
@@ -199,7 +201,7 @@ void xenstore_parse_domain_config(int do
 int xenstore_fd(void)
 {
     if (xsh)
-	return xs_fileno(xsh);
+        return xs_fileno(xsh);
     return -1;
 }
 
@@ -316,7 +318,7 @@ void xenstore_process_event(void *opaque
 
     vec = xs_read_watch(xsh, &num);
     if (!vec)
-	return;
+        return;
 
     if (!strcmp(vec[XS_WATCH_TOKEN], "logdirty")) {
         xenstore_process_logdirty_event();
@@ -324,23 +326,23 @@ void xenstore_process_event(void *opaque
     }
 
     if (strncmp(vec[XS_WATCH_TOKEN], "hd", 2) ||
-	strlen(vec[XS_WATCH_TOKEN]) != 3)
-	goto out;
+        strlen(vec[XS_WATCH_TOKEN]) != 3)
+        goto out;
     hd_index = vec[XS_WATCH_TOKEN][2] - 'a';
     image = xs_read(xsh, XBT_NULL, vec[XS_WATCH_PATH], &len);
     if (image == NULL || !strcmp(image, bs_table[hd_index]->filename))
-	goto out;  /* gone or identical */
+        goto out;  /* gone or identical */
 
     do_eject(0, vec[XS_WATCH_TOKEN]);
     bs_table[hd_index]->filename[0] = 0;
     if (media_filename[hd_index]) {
-	free(media_filename[hd_index]);
-	media_filename[hd_index] = NULL;
+        free(media_filename[hd_index]);
+        media_filename[hd_index] = NULL;
     }
 
     if (image[0]) {
-	media_filename[hd_index] = strdup(image);
-	xenstore_check_new_media_present(5000);
+        media_filename[hd_index] = strdup(image);
+        xenstore_check_new_media_present(5000);
     }
 
  out:
@@ -354,7 +356,7 @@ void xenstore_write_vncport(int display)
     char *portstr = NULL;
 
     if (xsh == NULL)
-	return;
+        return;
 
     path = xs_get_domain_path(xsh, domid);
     if (path == NULL) {
@@ -363,10 +365,10 @@ void xenstore_write_vncport(int display)
     }
 
     if (pasprintf(&buf, "%s/console/vnc-port", path) == -1)
-	goto out;
+        goto out;
 
     if (pasprintf(&portstr, "%d", 5900 + display) == -1)
-	goto out;
+        goto out;
 
     if (xs_write(xsh, XBT_NULL, buf, portstr, strlen(portstr)) == 0)
         fprintf(logfile, "xs_write() vncport failed\n");
@@ -383,41 +385,41 @@ int xenstore_read_vncpasswd(int domid)
     unsigned int i, len, rc = 0;
 
     if (xsh == NULL) {
-	return -1;
+        return -1;
     }
 
     path = xs_get_domain_path(xsh, domid);
     if (path == NULL) {
-	fprintf(logfile, "xs_get_domain_path() error. domid %d.\n", domid);
-	return -1;
+        fprintf(logfile, "xs_get_domain_path() error. domid %d.\n", domid);
+        return -1;
     }
 
     pasprintf(&buf, "%s/vm", path);
     uuid = xs_read(xsh, XBT_NULL, buf, &len);
     if (uuid == NULL) {
-	fprintf(logfile, "xs_read(): uuid get error. %s.\n", buf);
-	free(path);
-	return -1;
+        fprintf(logfile, "xs_read(): uuid get error. %s.\n", buf);
+        free(path);
+        return -1;
     }
 
     pasprintf(&buf, "%s/vncpasswd", uuid);
     passwd = xs_read(xsh, XBT_NULL, buf, &len);
     if (passwd == NULL) {
-	fprintf(logfile, "xs_read(): vncpasswd get error. %s.\n", buf);
-	free(uuid);
-	free(path);
-	return rc;
+        fprintf(logfile, "xs_read(): vncpasswd get error. %s.\n", buf);
+        free(uuid);
+        free(path);
+        return rc;
     }
 
     for (i=0; i<len && i<63; i++) {
-	vncpasswd[i] = passwd[i];
-	passwd[i] = '\0';
+        vncpasswd[i] = passwd[i];
+        passwd[i] = '\0';
     }
     vncpasswd[len] = '\0';
     pasprintf(&buf, "%s/vncpasswd", uuid);
     if (xs_write(xsh, XBT_NULL, buf, passwd, len) == 0) {
-	fprintf(logfile, "xs_write() vncpasswd failed.\n");
-	rc = -1;
+        fprintf(logfile, "xs_write() vncpasswd failed.\n");
+        rc = -1;
    }
 
     free(passwd);
@@ -443,7 +445,7 @@ char **xenstore_domain_get_devices(struc
         goto out;
 
     if (pasprintf(&buf, "%s/device/%s", path,devtype) == -1)
-	goto out;
+        goto out;
 
     e = xs_directory(handle, XBT_NULL, buf, num);
 
@@ -496,13 +498,13 @@ char *xenstore_backend_read_variable(str
 
     buf = get_device_variable_path(devtype, inst, var);
     if (NULL == buf)
-	goto out;
+        goto out;
 
     value = xs_read(handle, XBT_NULL, buf, &len);
 
     free(buf);
 
-out:
+ out:
     return value;
 }
 
@@ -569,27 +571,27 @@ char *xenstore_vm_read(int domid, char *
     char *buf = NULL, *path = NULL, *value = NULL;
 
     if (xsh == NULL)
-	goto out;
+        goto out;
 
     path = xs_get_domain_path(xsh, domid);
     if (path == NULL) {
-	fprintf(logfile, "xs_get_domain_path(%d): error\n", domid);
-	goto out;
+        fprintf(logfile, "xs_get_domain_path(%d): error\n", domid);
+        goto out;
    }
 
     pasprintf(&buf, "%s/vm", path);
     free(path);
     path = xs_read(xsh, XBT_NULL, buf, NULL);
     if (path == NULL) {
-	fprintf(logfile, "xs_read(%s): read error\n", buf);
-	goto out;
+        fprintf(logfile, "xs_read(%s): read error\n", buf);
+        goto out;
     }
 
     pasprintf(&buf, "%s/%s", path, key);
     value = xs_read(xsh, XBT_NULL, buf, len);
     if (value == NULL) {
-	fprintf(logfile, "xs_read(%s): read error\n", buf);
-	goto out;
+        fprintf(logfile, "xs_read(%s): read error\n", buf);
+        goto out;
     }
 
  out:
@@ -604,27 +606,27 @@ int xenstore_vm_write(int domid, char *k
     int rc = -1;
 
    if (xsh == NULL)
-	goto out;
+        goto out;
 
     path = xs_get_domain_path(xsh, domid);
     if (path == NULL) {
-	fprintf(logfile, "xs_get_domain_path: error\n");
-	goto out;
+        fprintf(logfile, "xs_get_domain_path: error\n");
+        goto out;
     }
 
     pasprintf(&buf, "%s/vm", path);
     free(path);
     path = xs_read(xsh, XBT_NULL, buf, NULL);
     if (path == NULL) {
-	fprintf(logfile, "xs_read(%s): read error\n", buf);
-	goto out;
+        fprintf(logfile, "xs_read(%s): read error\n", buf);
+        goto out;
    }
 
     pasprintf(&buf, "%s/%s", path, key);
     rc = xs_write(xsh, XBT_NULL, buf, value, strlen(value));
     if (rc) {
-	fprintf(logfile, "xs_write(%s, %s): write error\n", buf, key);
-	goto out;
+        fprintf(logfile, "xs_write(%s, %s): write error\n", buf, key);
+        goto out;
     }
 
  out:
--- a/tools/libfsimage/fat/fat.h	Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/libfsimage/fat/fat.h	Wed Apr 11 15:45:29 2007 +0100
@@ -84,17 +84,17 @@ struct fat_bpb {
 #define FAT_DIRENTRY_LENGTH	32
 
 #define FAT_DIRENTRY_ATTRIB(entry) \
-	(*((unsigned char *) (entry+11)))
+	(*((__u8 *) (entry+11)))
 #define FAT_DIRENTRY_VALID(entry) \
-	( ((*((unsigned char *) entry)) != 0) \
-	  && ((*((unsigned char *) entry)) != 0xE5) \
+	( ((*((__u8 *) entry)) != 0) \
+	  && ((*((__u8 *) entry)) != 0xE5) \
	  && !(FAT_DIRENTRY_ATTRIB(entry) & FAT_ATTRIB_NOT_OK_MASK) )
 #define FAT_DIRENTRY_FIRST_CLUSTER(entry) \
-	((*((unsigned short *) (entry+26)))+(*((unsigned short *) (entry+20)) << 16))
+	((*((__u16 *) (entry+26)))+(*((__u16 *) (entry+20)) << 16))
 #define FAT_DIRENTRY_FILELENGTH(entry) \
-	(*((unsigned long *) (entry+28)))
+	(*((__u32 *) (entry+28)))
 
 #define FAT_LONGDIR_ID(entry) \
-	(*((unsigned char *) (entry)))
+	(*((__u8 *) (entry)))
 #define FAT_LONGDIR_ALIASCHECKSUM(entry) \
-	(*((unsigned char *) (entry+13)))
+	(*((__u8 *) (entry+13)))
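The move from unsigned char/short/long to the fixed-width __u8/__u16/__u32 matters because on-disk FAT fields have fixed sizes, while unsigned long is 8 bytes on LP64 platforms such as an x86_64 dom0, so the old FAT_DIRENTRY_FILELENGTH read 4 bytes past the 32-bit length field. A small illustration, using the portable <stdint.h> names in place of the __uNN typedefs:

```c
/* Illustration: a fixed-width type is required to read the 32-bit
 * file-length field at offset 28 of a 32-byte FAT directory entry. */
#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    unsigned char entry[32] = {0};
    /* little-endian 0x12345678 stored in the 4-byte field at +28 */
    entry[28] = 0x78; entry[29] = 0x56; entry[30] = 0x34; entry[31] = 0x12;

    /* Fixed-width read: exactly the four on-disk bytes. */
    printf("len = 0x%" PRIx32 "\n", *(uint32_t *)(entry + 28));

    /* On LP64 an unsigned long read would span 8 bytes, running off the
     * end of the entry -- the bug the patch above fixes. */
    printf("sizeof(unsigned long) = %zu\n", sizeof(unsigned long));
    return 0;
}
```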
--- a/tools/libxc/Makefile	Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/libxc/Makefile	Wed Apr 11 15:45:29 2007 +0100
@@ -26,8 +26,8 @@ CTRL_SRCS-$(CONFIG_X86_Linux) += xc_ptra
 
 GUEST_SRCS-y :=
 GUEST_SRCS-y += xg_private.c
-GUEST_SRCS-$(CONFIG_MIGRATE) += xc_domain_restore.c xc_linux_save.c
-GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c xc_hvm_save.c
+GUEST_SRCS-$(CONFIG_MIGRATE) += xc_domain_restore.c xc_domain_save.c
+GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c
 
 # symlink libelf from xen/common/libelf/
 LIBELF_SRCS := libelf-tools.c libelf-loader.c
--- a/tools/libxc/ia64/xc_ia64_linux_save.c	Wed Apr 11 07:30:02 2007 -0600
+++ b/tools/libxc/ia64/xc_ia64_linux_save.c	Wed Apr 11 15:45:29 2007 +0100
@@ -134,8 +134,10 @@ retry:
 }
 
 int
-xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
-              uint32_t max_factor, uint32_t flags, int (*suspend)(int))
+xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
+               uint32_t max_factor, uint32_t flags, int (*suspend)(int),
+               int hvm, void *(*init_qemu_maps)(int, unsigned),
+               void (*qemu_flip_buffer)(int, int))
 {
     DECLARE_DOMCTL;
     xc_dominfo_t info;
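This rename folds the old PV (xc_linux_save) and HVM (xc_hvm_save) entry points into a single xc_domain_save() that takes an hvm flag plus two qemu-dm callbacks, which PV callers can leave NULL. A hedged sketch of how a caller might select between the two modes; my_suspend(), my_init_qemu_maps() and my_qemu_flip_buffer() are hypothetical callbacks, and the real caller lives in the xc_save/xend tool chain:

```c
/* Sketch: invoking the unified save entry point.  The three my_*
 * callbacks are hypothetical; only HVM saves need the qemu-dm hooks. */
int save_domain(int xc_handle, int io_fd, uint32_t dom, int is_hvm)
{
    if (is_hvm)
        return xc_domain_save(xc_handle, io_fd, dom,
                              0 /* default max_iters */,
                              0 /* default max_factor */,
                              XCFLAGS_LIVE, my_suspend,
                              1, my_init_qemu_maps, my_qemu_flip_buffer);

    /* PV guests never touch the qemu callbacks, so NULL is fine. */
    return xc_domain_save(xc_handle, io_fd, dom, 0, 0,
                          XCFLAGS_LIVE, my_suspend, 0, NULL, NULL);
}
```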
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_domain_save.c	Wed Apr 11 15:45:29 2007 +0100
@@ -0,0 +1,1609 @@
+/******************************************************************************
+ * xc_linux_save.c
+ *
+ * Save the state of a running Linux session.
+ *
+ * Copyright (c) 2003, K A Fraser.
+ */
+
+#include <inttypes.h>
+#include <time.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+#include "xc_private.h"
+#include "xc_dom.h"
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+#include <xen/hvm/params.h>
+#include <xen/hvm/e820.h>
+
+/*
+** Default values for important tuning parameters. Can override by passing
+** non-zero replacement values to xc_domain_save().
+**
+** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
+**
+*/
+#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop   */
+#define DEF_MAX_FACTOR   3   /* never send more than 3x p2m_size  */
+
+/* max mfn of the whole machine */
+static unsigned long max_mfn;
+
+/* virtual starting address of the hypervisor */
+static unsigned long hvirt_start;
+
+/* #levels of page tables used by the current guest */
+static unsigned int pt_levels;
+
+/* HVM: shared-memory bitmaps for getting log-dirty bits from qemu-dm */
+static unsigned long *qemu_bitmaps[2];
+static int qemu_active;
+static int qemu_non_active;
+
+/* number of pfns this guest has (i.e. number of entries in the P2M) */
+static unsigned long p2m_size;
+
+/* Live mapping of the table mapping each PFN to its current MFN. */
+static xen_pfn_t *live_p2m = NULL;
+
+/* Live mapping of system MFN to PFN table. */
+static xen_pfn_t *live_m2p = NULL;
+static unsigned long m2p_mfn0;
+
+/* grep fodder: machine_to_phys */
+
+#define mfn_to_pfn(_mfn) live_m2p[(_mfn)]
+
+/*
+ * Returns TRUE if the given machine frame number has a unique mapping
+ * in the guest's pseudophysical map.
+ */
+#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)          \
+    (((_mfn) < (max_mfn)) &&                    \
+     ((mfn_to_pfn(_mfn) < (p2m_size)) &&        \
+      (live_p2m[mfn_to_pfn(_mfn)] == (_mfn))))
+
+/* Returns TRUE if MFN is successfully converted to a PFN. */
+#define translate_mfn_to_pfn(_pmfn)                 \
+({                                                  \
+    unsigned long mfn = *(_pmfn);                   \
+    int _res = 1;                                   \
+    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )           \
+        _res = 0;                                   \
+    else                                            \
+        *(_pmfn) = mfn_to_pfn(mfn);                 \
+    _res;                                           \
+})
+
+/*
+** During (live) save/migrate, we maintain a number of bitmaps to track
+** which pages we have to send, to fixup, and to skip.
+*/
+
+#define BITS_PER_LONG (sizeof(unsigned long) * 8)
+#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
+#define BITMAP_SIZE   (BITS_TO_LONGS(p2m_size) * sizeof(unsigned long))
+
+#define BITMAP_ENTRY(_nr,_bmap) \
+   ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
+
+#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
+
+static inline int test_bit (int nr, volatile void * addr)
+{
+    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
+}
+
+static inline void clear_bit (int nr, volatile void * addr)
+{
+    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
+}
+
+static inline void set_bit ( int nr, volatile void * addr)
+{
+    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
+}
+
+/* Returns the hamming weight (i.e. the number of bits set) in a N-bit word */
+static inline unsigned int hweight32(unsigned int w)
+{
+    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
+    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
+    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
+    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
+    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
+}
+
+static inline int count_bits ( int nr, volatile void *addr)
+{
+    int i, count = 0;
+    volatile unsigned long *p = (volatile unsigned long *)addr;
+    /* We know that the array is padded to unsigned long. */
+    for ( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
+        count += hweight32(*p);
+    return count;
+}
+
+static inline int permute( int i, int nr, int order_nr  )
+{
+    /* Need a simple permutation function so that we scan pages in a
+       pseudo random order, enabling us to get a better estimate of
+       the domain's page dirtying rate as we go (there are often
+       contiguous ranges of pfns that have similar behaviour, and we
+       want to mix them up. */
+
+    /* e.g. nr->order 15->4 16->4 17->5 */
+    /* 512MB domain, 128k pages, order 17 */
+
+    /*
+      QPONMLKJIHGFEDCBA
+             QPONMLKJIH
+      GFEDCBA
+     */
+
+    /*
+      QPONMLKJIHGFEDCBA
+                  EDCBA
+             QPONM
+      LKJIHGF
+     */
+
+    do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
+    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
+
+    return i;
+}
+
+static uint64_t tv_to_us(struct timeval *new)
+{
+    return (new->tv_sec * 1000000) + new->tv_usec;
+}
+
+static uint64_t llgettimeofday(void)
+{
+    struct timeval now;
+    gettimeofday(&now, NULL);
+    return tv_to_us(&now);
+}
+
+static uint64_t tv_delta(struct timeval *new, struct timeval *old)
+{
+    return (((new->tv_sec - old->tv_sec)*1000000) +
+            (new->tv_usec - old->tv_usec));
+}
+
+static int noncached_write(int fd, int live, void *buffer, int len)
+{
+    static int write_count = 0;
+
+    int rc = write(fd,buffer,len);
+
+    write_count += len;
+    if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
+    {
+        /* Time to discard cache - don't care if this fails */
+        discard_file_cache(fd, 0 /* no flush */);
+        write_count = 0;
+    }
+
+    return rc;
+}
+
+#ifdef ADAPTIVE_SAVE
+
+/*
+** We control the rate at which we transmit (or save) to minimize impact
+** on running domains (including the target if we're doing live migrate).
+*/
+
+#define MAX_MBIT_RATE    500      /* maximum transmit rate for migrate */
+#define START_MBIT_RATE  100      /* initial transmit rate for migrate */
+
+/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
+#define RATE_TO_BTU      781250
+
+/* Amount in bytes we allow ourselves to send in a burst */
+#define BURST_BUDGET (100*1024)
+
+/* We keep track of the current and previous transmission rate */
+static int mbit_rate, ombit_rate = 0;
+
+/* Have we reached the maximum transmission rate? */
+#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
+
+static inline void initialize_mbit_rate()
+{
+    mbit_rate = START_MBIT_RATE;
+}
+
+static int ratewrite(int io_fd, int live, void *buf, int n)
+{
+    static int budget = 0;
+    static int burst_time_us = -1;
+    static struct timeval last_put = { 0 };
+    struct timeval now;
+    struct timespec delay;
+    long long delta;
+
+    if ( START_MBIT_RATE == 0 )
+        return noncached_write(io_fd, live, buf, n);
+
+    budget -= n;
+    if ( budget < 0 )
+    {
+        if ( mbit_rate != ombit_rate )
+        {
+            burst_time_us = RATE_TO_BTU / mbit_rate;
+            ombit_rate = mbit_rate;
+            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
+                    mbit_rate, BURST_BUDGET, burst_time_us);
+        }
+        if ( last_put.tv_sec == 0 )
+        {
+            budget += BURST_BUDGET;
+            gettimeofday(&last_put, NULL);
+        }
+        else
+        {
+            while ( budget < 0 )
+            {
+                gettimeofday(&now, NULL);
+                delta = tv_delta(&now, &last_put);
+                while ( delta > burst_time_us )
+                {
+                    budget += BURST_BUDGET;
+                    last_put.tv_usec += burst_time_us;
+                    if ( last_put.tv_usec > 1000000 )
+                    {
+                        last_put.tv_usec -= 1000000;
+                        last_put.tv_sec++;
+                    }
+                    delta -= burst_time_us;
+                }
+                if ( budget > 0 )
+                    break;
+                delay.tv_sec = 0;
+                delay.tv_nsec = 1000 * (burst_time_us - delta);
+                while ( delay.tv_nsec > 0 )
+                    if ( nanosleep(&delay, &delay) == 0 )
+                        break;
+            }
+        }
+    }
+    return noncached_write(io_fd, live, buf, n);
+}
+
+#else /* ! ADAPTIVE SAVE */
+
+#define RATE_IS_MAX() (0)
+#define ratewrite(_io_fd, _live, _buf, _n) noncached_write((_io_fd), (_live), (_buf), (_n))
+#define initialize_mbit_rate()
+
+#endif
+
+static inline ssize_t write_exact(int fd, void *buf, size_t count)
+{
+    return (write(fd, buf, count) == count);
+}
+
+static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
+                       xc_shadow_op_stats_t *stats, int print)
+{
+    static struct timeval wall_last;
+    static long long      d0_cpu_last;
+    static long long      d1_cpu_last;
+
+    struct timeval        wall_now;
+    long long             wall_delta;
+    long long             d0_cpu_now, d0_cpu_delta;
+    long long             d1_cpu_now, d1_cpu_delta;
+
+    gettimeofday(&wall_now, NULL);
+
+    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
+    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
+
+    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
+        DPRINTF("ARRHHH!!\n");
+
+    wall_delta = tv_delta(&wall_now,&wall_last)/1000;
+    if ( wall_delta == 0 )
+        wall_delta = 1;
+
+    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
+    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
+
+    if ( print )
+        DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
+                "dirtied %dMb/s %" PRId32 " pages\n",
+                wall_delta,
+                (int)((d0_cpu_delta*100)/wall_delta),
+                (int)((d1_cpu_delta*100)/wall_delta),
+                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
+                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
+                stats->dirty_count);
+
+#ifdef ADAPTIVE_SAVE
+    if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
+    {
+        mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
+            + 50;
+        if ( mbit_rate > MAX_MBIT_RATE )
+            mbit_rate = MAX_MBIT_RATE;
+    }
+#endif
+
+    d0_cpu_last = d0_cpu_now;
+    d1_cpu_last = d1_cpu_now;
+    wall_last   = wall_now;
+
+    return 0;
+}
+
+
+static int analysis_phase(int xc_handle, uint32_t domid, int p2m_size,
+                          unsigned long *arr, int runs)
+{
+    long long start, now;
+    xc_shadow_op_stats_t stats;
+    int j;
+
+    start = llgettimeofday();
+
+    for ( j = 0; j < runs; j++ )
+    {
+        int i;
+
+        xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
+                          arr, p2m_size, NULL, 0, NULL);
+        DPRINTF("#Flush\n");
+        for ( i = 0; i < 40; i++ )
+        {
+            usleep(50000);
+            now = llgettimeofday();
+            xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
+                              NULL, 0, NULL, 0, &stats);
+            DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
+                    ((now-start)+500)/1000,
+                    stats.fault_count, stats.dirty_count);
+        }
+    }
+
+    return -1;
+}
+
+
+static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
+                             int dom, xc_dominfo_t *info,
+                             vcpu_guest_context_t *ctxt)
+{
+    int i = 0;
+
+    if ( !(*suspend)(dom) )
+    {
+        ERROR("Suspend request failed");
+        return -1;
+    }
+
+ retry:
+
+    if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
+    {
+        ERROR("Could not get domain info");
+        return -1;
+    }
+
+    if ( xc_vcpu_getcontext(xc_handle, dom, 0, ctxt) )
+        ERROR("Could not get vcpu context");
+
+
+    if ( info->dying )
+    {
+        ERROR("domain is dying");
+        return -1;
+    }
+
+    if ( info->crashed )
+    {
+        ERROR("domain has crashed");
+        return -1;
+    }
+
+    if ( info->shutdown )
+    {
+        switch ( info->shutdown_reason )
+        {
+        case SHUTDOWN_poweroff:
+        case SHUTDOWN_reboot:
+            ERROR("domain has shut down");
+            return -1;
+        case SHUTDOWN_suspend:
+            return 0;
+        case SHUTDOWN_crash:
+            ERROR("domain has crashed");
+            return -1;
+        }
+    }
+
+    if ( info->paused )
+    {
+        /* Try unpausing domain, wait, and retest. */
+        xc_domain_unpause( xc_handle, dom );
+        ERROR("Domain was paused. Wait and re-test.");
+        usleep(10000); /* 10ms */
+        goto retry;
+    }
+
+    if ( ++i < 100 )
+    {
+        ERROR("Retry suspend domain");
+        usleep(10000); /* 10ms */
+        goto retry;
+    }
+
+    ERROR("Unable to suspend domain.");
+
+    return -1;
+}
+
+/*
+** Map the top-level page of MFNs from the guest. The guest might not have
+** finished resuming from a previous restore operation, so we wait a while for
+** it to update the MFN to a reasonable value.
+*/
+static void *map_frame_list_list(int xc_handle, uint32_t dom,
+                                 shared_info_t *shinfo)
+{
+    int count = 100;
+    void *p;
+
+    while ( count-- && (shinfo->arch.pfn_to_mfn_frame_list_list == 0) )
+        usleep(10000);
+
+    if ( shinfo->arch.pfn_to_mfn_frame_list_list == 0 )
+    {
+        ERROR("Timed out waiting for frame list updated.");
+        return NULL;
+    }
+
+    p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
+                             shinfo->arch.pfn_to_mfn_frame_list_list);
+    if ( p == NULL )
+        ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);
+
+    return p;
+}
+
+/*
+** During transfer (or in the state file), all page-table pages must be
+** converted into a 'canonical' form where references to actual mfns
+** are replaced with references to the corresponding pfns.
+**
+** This function performs the appropriate conversion, taking into account
+** which entries do not require canonicalization (in particular, those
+** entries which map the virtual address reserved for the hypervisor).
+*/
+static int canonicalize_pagetable(unsigned long type, unsigned long pfn,
+                                  const void *spage, void *dpage)
+{
+
+    int i, pte_last, xen_start, xen_end, race = 0;
+    uint64_t pte;
+
+    /*
+    ** We need to determine which entries in this page table hold
+    ** reserved hypervisor mappings. This depends on the current
+    ** page table type as well as the number of paging levels.
+    */
+    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);
+
+    if ( (pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
+        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);
+
+    if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
+        xen_start = L3_PAGETABLE_ENTRIES_PAE;
+
+    /*
+    ** in PAE only the L2 mapping the top 1GB contains Xen mappings.
+    ** We can spot this by looking for the guest linear mapping which
+    ** Xen always ensures is present in that L2. Guests must ensure
+    ** that this check will fail for other L2s.
+    */
+    if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
+    {
+        int hstart;
+        uint64_t he;
+
+        hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
+        he = ((const uint64_t *) spage)[hstart];
+
+        if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
+        {
+            /* hvirt starts with xen stuff... */
+            xen_start = hstart;
+        }
+        else if ( hvirt_start != 0xf5800000 )
+        {
+            /* old L2s from before hole was shrunk... */
+            hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
+            he = ((const uint64_t *) spage)[hstart];
+            if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
+                xen_start = hstart;
+        }
+    }
+
+    if ( (pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
+    {
+        /*
+        ** XXX SMH: should compute these from hvirt_start (which we have)
+        ** and hvirt_end (which we don't)
+        */
+        xen_start = 256;
+        xen_end   = 272;
+    }
+
+    /* Now iterate through the page table, canonicalizing each PTE */
+    for (i = 0; i < pte_last; i++ )
+    {
+        unsigned long pfn, mfn;
+
+        if ( pt_levels == 2 )
+            pte = ((const uint32_t*)spage)[i];
+        else
+            pte = ((const uint64_t*)spage)[i];
+
+        if ( (i >= xen_start) && (i < xen_end) )
+            pte = 0;
+
+        if ( pte & _PAGE_PRESENT )
+        {
+            mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
+            if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
+            {
+                /* This will happen if the type info is stale which
+                   is quite feasible under live migration */
+                pfn  = 0;  /* zap it - we'll retransmit this page later */
+                race = 1;  /* inform the caller of race; fatal if !live */
+            }
+            else
+                pfn = mfn_to_pfn(mfn);
+
+            pte &= ~MADDR_MASK_X86;
+            pte |= (uint64_t)pfn << PAGE_SHIFT;
+
+            /*
+             * PAE guest L3Es can contain these flags when running on
+             * a 64bit hypervisor. We zap these here to avoid any
+             * surprise at restore time...
+             */
+            if ( (pt_levels == 3) &&
+                 (type == XEN_DOMCTL_PFINFO_L3TAB) &&
+                 (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) )
+                pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
+        }
+
+        if ( pt_levels == 2 )
+            ((uint32_t*)dpage)[i] = pte;
+        else
+            ((uint64_t*)dpage)[i] = pte;
+    }
+
+    return race;
+}
+
+static xen_pfn_t *xc_map_m2p(int xc_handle,
+                             unsigned long max_mfn,
+                             int prot)
+{
+    struct xen_machphys_mfn_list xmml;
+    privcmd_mmap_entry_t *entries;
+    unsigned long m2p_chunks, m2p_size;
+    xen_pfn_t *m2p;
+    xen_pfn_t *extent_start;
+    int i, rc;
+
+    m2p_size   = M2P_SIZE(max_mfn);
+    m2p_chunks = M2P_CHUNKS(max_mfn);
+
+    xmml.max_extents = m2p_chunks;
+    if ( !(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t))) )
+    {
+        ERROR("failed to allocate space for m2p mfns");
+        return NULL;
+    }
+    set_xen_guest_handle(xmml.extent_start, extent_start);
+
+    if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
+         (xmml.nr_extents != m2p_chunks) )
+    {
+        ERROR("xc_get_m2p_mfns");
+        return NULL;
+    }
+
+    if ( (m2p = mmap(NULL, m2p_size, prot,
+                     MAP_SHARED, xc_handle, 0)) == MAP_FAILED )
+    {
+        ERROR("failed to mmap m2p");
+        return NULL;
+    }
+
+    if ( !(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t))) )
+    {
+        ERROR("failed to allocate space for mmap entries");
+        return NULL;
+    }
+
+    for ( i = 0; i < m2p_chunks; i++ )
+    {
+        entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
+        entries[i].mfn = extent_start[i];
+        entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
+    }
+
+    if ( (rc = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
+                                     entries, m2p_chunks)) < 0 )
+    {
+        ERROR("xc_mmap_foreign_ranges failed (rc = %d)", rc);
+        return NULL;
+    }
+
+    m2p_mfn0 = entries[0].mfn;
+
+    free(extent_start);
+    free(entries);
+
+    return m2p;
+}
+
+
+static xen_pfn_t *map_and_save_p2m_table(int xc_handle,
+                                         int io_fd,
+                                         uint32_t dom,
+                                         vcpu_guest_context_t *ctxt,
+                                         unsigned long p2m_size,
+                                         shared_info_t *live_shinfo)
+{
+    /* Double and single indirect references to the live P2M table */
+    xen_pfn_t *live_p2m_frame_list_list = NULL;
+    xen_pfn_t *live_p2m_frame_list = NULL;
+
+    /* A copy of the pfn-to-mfn table frame list. */
+    xen_pfn_t *p2m_frame_list = NULL;
+
+    /* The mapping of the live p2m table itself */
+    xen_pfn_t *p2m = NULL;
+
+    int i, success = 0;
+
+    live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom,
+                                                   live_shinfo);
+    if ( !live_p2m_frame_list_list )
+        goto out;
+
+    live_p2m_frame_list =
+        xc_map_foreign_batch(xc_handle, dom, PROT_READ,
+                             live_p2m_frame_list_list,
+                             P2M_FLL_ENTRIES);
+    if ( !live_p2m_frame_list )
+    {
+        ERROR("Couldn't map p2m_frame_list");
+        goto out;
+    }
+
+
+    /* Map all the frames of the pfn->mfn table. For migrate to succeed,
+       the guest must not change which frames are used for this purpose.
+       (its not clear why it would want to change them, and we'll be OK
+       from a safety POV anyhow. */
+
+    p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
+                               live_p2m_frame_list,
+                               P2M_FL_ENTRIES);
+    if ( !p2m )
+    {
+        ERROR("Couldn't map p2m table");
+        goto out;
+    }
+    live_p2m = p2m; /* So that translation macros will work */
+
+    /* Get a local copy of the live_P2M_frame_list */
+    if ( !(p2m_frame_list = malloc(P2M_FL_SIZE)) )
+    {
+        ERROR("Couldn't allocate p2m_frame_list array");
+        goto out;
+    }
+    memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);
+
+    /* Canonicalise the pfn-to-mfn table frame-number list. */
+    for ( i = 0; i < p2m_size; i += fpp )
+    {
+        if ( !translate_mfn_to_pfn(&p2m_frame_list[i/fpp]) )
+        {
+            ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
+            ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64, i, i/fpp,
+                  (uint64_t)p2m_frame_list[i/fpp]);
+            goto out;
+        }
+    }
+
+    /*
+     * Write an extended-info structure to inform the restore code that
+     * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
+     * slow paths in the restore code.
+     */
+    if ( (pt_levels == 3) &&
+         (ctxt->vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3)) )
+    {
+        unsigned long signature = ~0UL;
+        uint32_t tot_sz   = sizeof(struct vcpu_guest_context) + 8;
+        uint32_t chunk_sz = sizeof(struct vcpu_guest_context);
+        char chunk_sig[]  = "vcpu";
+        if ( !write_exact(io_fd, &signature, sizeof(signature)) ||
+             !write_exact(io_fd, &tot_sz,    sizeof(tot_sz)) ||
+             !write_exact(io_fd, &chunk_sig, 4) ||
+             !write_exact(io_fd, &chunk_sz,  sizeof(chunk_sz)) ||
+             !write_exact(io_fd, ctxt,       sizeof(*ctxt)) )
+        {
+            ERROR("write: extended info");
+            goto out;
+        }
+    }
+
+    if ( !write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE) )
+    {
+        ERROR("write: p2m_frame_list");
+        goto out;
+    }
+
+    success = 1;
+
+ out:
+
+    if ( !success && p2m )
+        munmap(p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
+
+    if ( live_p2m_frame_list_list )
+        munmap(live_p2m_frame_list_list, PAGE_SIZE);
+
+    if ( live_p2m_frame_list )
+        munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
+
+    if ( p2m_frame_list )
+        free(p2m_frame_list);
+
+    return success ? p2m : NULL;
+}
+
+
+
+int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
+                   uint32_t max_factor, uint32_t flags, int (*suspend)(int),
+                   int hvm, void *(*init_qemu_maps)(int, unsigned),
+                   void (*qemu_flip_buffer)(int, int))
+{
+    xc_dominfo_t info;
+
+    int rc = 1, i, j, last_iter, iter = 0;
+    int live  = (flags & XCFLAGS_LIVE);
+    int debug = (flags & XCFLAGS_DEBUG);
+    int race = 0, sent_last_iter, skip_this_iter;
+
+    /* The new domain's shared-info frame number. */
+    unsigned long shared_info_frame;
+
+    /* A copy of the CPU context of the guest. */
+    vcpu_guest_context_t ctxt;
+
+    /* A table containing the type of each PFN (/not/ MFN!). */
+    unsigned long *pfn_type = NULL;
+    unsigned long *pfn_batch = NULL;
+
+    /* A copy of one frame of guest memory. */
+    char page[PAGE_SIZE];
+
+    /* Live mapping of shared info structure */
+    shared_info_t *live_shinfo = NULL;
+
+    /* base of the region in which domain memory is mapped */
+    unsigned char *region_base = NULL;
+
+    /* power of 2 order of p2m_size */
+    int order_nr;
+
+    /* bitmap of pages:
+       - that should be sent this iteration (unless later marked as skip);
+       - to skip this iteration because already dirty;
+       - to fixup by sending at the end if not already resent; */
+    unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
+
+    xc_shadow_op_stats_t stats;
+
+    unsigned long needed_to_fix = 0;
+    unsigned long total_sent    = 0;
+
+    uint64_t vcpumap = 1ULL;
+
+    /* HVM: a buffer for holding HVM context */
+    uint32_t hvm_buf_size = 0;
+    uint8_t *hvm_buf = NULL;
+
+    /* HVM: magic frames for ioreqs and xenstore comms. */
+    uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
+
+    /* If no explicit control parameters given, use defaults */
+    max_iters  = max_iters  ? : DEF_MAX_ITERS;
+    max_factor = max_factor ? : DEF_MAX_FACTOR;
+
+    initialize_mbit_rate();
+
+    if ( !get_platform_info(xc_handle, dom,
+                            &max_mfn, &hvirt_start, &pt_levels) )
+    {
+        ERROR("Unable to get platform info.");
+        return 1;
+    }
+
+    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
+    {
+        ERROR("Could not get domain info");
+        return 1;
+    }
+
+    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
+    {
+        ERROR("Could not get vcpu context");
+        goto out;
+    }
+    shared_info_frame = info.shared_info_frame;
+
+    /* Map the shared info frame */
+    if ( !hvm )
+    {
+        live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+                                           PROT_READ, shared_info_frame);
+        if ( !live_shinfo )
+        {
+            ERROR("Couldn't map live_shinfo");
+            goto out;
+        }
+    }
+
+    /* Get the size of the P2M table */
+    p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom);
+
+    /* Domain is still running at this point */
+    if ( live )
+    {
+        /* Live suspend. Enable log-dirty mode. */
+        if ( xc_shadow_control(xc_handle, dom,
+                               XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                               NULL, 0, NULL, 0, NULL) < 0 )
+        {
+            ERROR("Couldn't enable shadow mode");
+            goto out;
+        }
+
+        if ( hvm )
+        {
+            /* Get qemu-dm logging dirty pages too */
+            void *seg = init_qemu_maps(dom, BITMAP_SIZE);
+            qemu_bitmaps[0] = seg;
+            qemu_bitmaps[1] = seg + BITMAP_SIZE;
+            qemu_active = 0;
+            qemu_non_active = 1;
+        }
+    }
+    else
+    {
+        /* This is a non-live suspend. Suspend the domain. */
+        if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt) )
+        {
+            ERROR("Domain appears not to have suspended");
+            goto out;
+        }
+    }
+
+    last_iter = !live;
+
+    /* pretend we sent all the pages last iteration */
+    sent_last_iter = p2m_size;
+
+    /* calculate the power of 2 order of p2m_size, e.g.
+       15->4 16->4 17->5 */
+    for ( i = p2m_size-1, order_nr = 0; i ; i >>= 1, order_nr++ )
+        continue;
+
+    /* Setup to_send / to_fix and to_skip bitmaps */
+    to_send = malloc(BITMAP_SIZE);
+    to_fix  = calloc(1, BITMAP_SIZE);
+    to_skip = malloc(BITMAP_SIZE);
+
+    if ( !to_send || !to_fix || !to_skip )
+    {
+        ERROR("Couldn't allocate to_send array");
+        goto out;
+    }
+
+    memset(to_send, 0xff, BITMAP_SIZE);
+
+    if ( lock_pages(to_send, BITMAP_SIZE) )
+    {
+        ERROR("Unable to lock to_send");
+        return 1;
+    }
+
+    /* (to fix is local only) */
+    if ( lock_pages(to_skip, BITMAP_SIZE) )
+    {
+        ERROR("Unable to lock to_skip");
+        return 1;
+    }
+
+    if ( hvm )
+    {
+        /* Need another buffer for HVM context */
+        hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0);
+        if ( hvm_buf_size == -1 )
+        {
+            ERROR("Couldn't get HVM context size from Xen");
+            goto out;
+        }
+        hvm_buf = malloc(hvm_buf_size);
+        if ( !hvm_buf )
+        {
+            ERROR("Couldn't allocate memory");
+            goto out;
+        }
+    }
+
+    analysis_phase(xc_handle, dom, p2m_size, to_skip, 0);
+
+    /* We want zeroed memory so use calloc rather than malloc. */
+    pfn_type  = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type));
+    pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
+    if ( (pfn_type == NULL) || (pfn_batch == NULL) )
+    {
+        ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
+        errno = ENOMEM;
+        goto out;
+    }
+
+    if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
+    {
+        ERROR("Unable to lock");
+        goto out;
+    }
+
+    /* Setup the mfn_to_pfn table mapping */
+    if ( !(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ)) )
+    {
+        ERROR("Failed to map live M2P table");
+        goto out;
+    }
+
+    /* Start writing out the saved-domain record. */
+    if ( !write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
+    {
+        ERROR("write: p2m_size");
+        goto out;
+    }
+
+    if ( !hvm )
+    {
+        int err = 0;
+        unsigned long mfn;
+
+        /* Map the P2M table, and write the list of P2M frames */
+        live_p2m = map_and_save_p2m_table(xc_handle, io_fd, dom,
+                                          &ctxt, p2m_size, live_shinfo);
+        if ( live_p2m == NULL )
+        {
+            ERROR("Failed to map/save the p2m frame list");
+            goto out;
+        }
+
+        /*
+         * Quick belt and braces sanity check.
+         */
+        for ( i = 0; i < p2m_size; i++ )
+        {
+            mfn = live_p2m[i];
+            if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) )
+            {
+                DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
+                        mfn, mfn_to_pfn(mfn));
+                err++;
+            }
+        }
+        DPRINTF("Had %d unexplained entries in p2m table\n", err);
+    }
+
+    print_stats(xc_handle, dom, 0, &stats, 0);
+
+    /* Now write out each data page, canonicalising page tables as we go... */
+    for ( ; ; )
+    {
+        unsigned int prev_pc, sent_this_iter, N, batch;
+
+        iter++;
+        sent_this_iter = 0;
+        skip_this_iter = 0;
+        prev_pc = 0;
+        N = 0;
+
+        DPRINTF("Saving memory pages: iter %d   0%%", iter);
+
+        while ( N < p2m_size )
+        {
+            unsigned int this_pc = (N * 100) / p2m_size;
+            int rc;
+
+            if ( (this_pc - prev_pc) >= 5 )
+            {
+                DPRINTF("\b\b\b\b%3d%%", this_pc);
+                prev_pc = this_pc;
+            }
+
+            if ( !last_iter )
+            {
+                /* Slightly wasteful to peek the whole array every time,
+                   but this is fast enough for the moment. */
+                rc = xc_shadow_control(
+                    xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip,
+                    p2m_size, NULL, 0, NULL);
+                if ( rc != p2m_size )
+                {
+                    ERROR("Error peeking shadow bitmap");
+                    goto out;
+                }
+            }
+
+            /* load pfn_type[] with the mfn of all the pages we're doing in
+               this batch. */
+            for  ( batch = 0;
+                   (batch < MAX_BATCH_SIZE) && (N < p2m_size);
+                   N++ )
+            {
+                int n = permute(N, p2m_size, order_nr);
+
+                if ( debug )
+                    DPRINTF("%d pfn= %08lx mfn= %08lx %d  [mfn]= %08lx\n",
+                            iter, (unsigned long)n, hvm ? 0 : live_p2m[n],
+                            test_bit(n, to_send),
+                            hvm ? 0 : mfn_to_pfn(live_p2m[n]&0xFFFFF));
+
+                if ( !last_iter &&
+                     test_bit(n, to_send) &&
+                     test_bit(n, to_skip) )
+                    skip_this_iter++; /* stats keeping */
+
+                if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
+                       (test_bit(n, to_send) && last_iter) ||
+                       (test_bit(n, to_fix)  && last_iter)) )
+                    continue;
+
+                /* Skip PFNs that aren't really there */
+                if ( hvm && ((n >= 0xa0 && n < 0xc0) /* VGA hole */
+                             || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT)
+                                 && n < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */ )
+                    continue;
+
+                /*
+                ** we get here if:
+                **  1. page is marked to_send & hasn't already been re-dirtied
+                **  2. (ignore to_skip in last iteration)
+                **  3. add in pages that still need fixup (net bufs)
+                */
+
+                pfn_batch[batch] = n;
+
+                /* Hypercall interfaces operate in PFNs for HVM guests
+                 * and MFNs for PV guests */
+                if ( hvm )
+                    pfn_type[batch] = n;
+                else
+                    pfn_type[batch] = live_p2m[n];
+
+                if ( !is_mapped(pfn_type[batch]) )
+                {
+                    /*
+                    ** not currently in pseudo-physical map -- set bit
+                    ** in to_fix since we must send this page in last_iter
+                    ** unless its sent sooner anyhow, or it never enters
for ballooned down doms) 13.1122 + */ 13.1123 + set_bit(n, to_fix); 13.1124 + continue; 13.1125 + } 13.1126 + 13.1127 + if ( last_iter && 13.1128 + test_bit(n, to_fix) && 13.1129 + !test_bit(n, to_send) ) 13.1130 + { 13.1131 + needed_to_fix++; 13.1132 + DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n", 13.1133 + iter, n, pfn_type[batch]); 13.1134 + } 13.1135 + 13.1136 + clear_bit(n, to_fix); 13.1137 + 13.1138 + batch++; 13.1139 + } 13.1140 + 13.1141 + if ( batch == 0 ) 13.1142 + goto skip; /* vanishingly unlikely... */ 13.1143 + 13.1144 + region_base = xc_map_foreign_batch( 13.1145 + xc_handle, dom, PROT_READ, pfn_type, batch); 13.1146 + if ( region_base == NULL ) 13.1147 + { 13.1148 + ERROR("map batch failed"); 13.1149 + goto out; 13.1150 + } 13.1151 + 13.1152 + if ( !hvm ) 13.1153 + { 13.1154 + /* Get page types */ 13.1155 + for ( j = 0; j < batch; j++ ) 13.1156 + ((uint32_t *)pfn_type)[j] = pfn_type[j]; 13.1157 + if ( xc_get_pfn_type_batch(xc_handle, dom, batch, 13.1158 + (uint32_t *)pfn_type) ) 13.1159 + { 13.1160 + ERROR("get_pfn_type_batch failed"); 13.1161 + goto out; 13.1162 + } 13.1163 + for ( j = batch-1; j >= 0; j-- ) 13.1164 + pfn_type[j] = ((uint32_t *)pfn_type)[j]; 13.1165 + 13.1166 + for ( j = 0; j < batch; j++ ) 13.1167 + { 13.1168 + 13.1169 + if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) == 13.1170 + XEN_DOMCTL_PFINFO_XTAB ) 13.1171 + { 13.1172 + DPRINTF("type fail: page %i mfn %08lx\n", 13.1173 + j, pfn_type[j]); 13.1174 + continue; 13.1175 + } 13.1176 + 13.1177 + if ( debug ) 13.1178 + DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx" 13.1179 + " sum= %08lx\n", 13.1180 + iter, 13.1181 + (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) | 13.1182 + pfn_batch[j], 13.1183 + pfn_type[j], 13.1184 + mfn_to_pfn(pfn_type[j] & 13.1185 + ~XEN_DOMCTL_PFINFO_LTAB_MASK), 13.1186 + csum_page(region_base + (PAGE_SIZE*j))); 13.1187 + 13.1188 + /* canonicalise mfn->pfn */ 13.1189 + pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) | 13.1190 + pfn_batch[j]; 13.1191 + } 13.1192 + } 13.1193 + 13.1194 + if ( !write_exact(io_fd, &batch, sizeof(unsigned int)) ) 13.1195 + { 13.1196 + ERROR("Error when writing to state file (2) (errno %d)", 13.1197 + errno); 13.1198 + goto out; 13.1199 + } 13.1200 + 13.1201 + if ( !write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) ) 13.1202 + { 13.1203 + ERROR("Error when writing to state file (3) (errno %d)", 13.1204 + errno); 13.1205 + goto out; 13.1206 + } 13.1207 + 13.1208 + /* entering this loop, pfn_type is now in pfns (Not mfns) */ 13.1209 + for ( j = 0; j < batch; j++ ) 13.1210 + { 13.1211 + unsigned long pfn, pagetype; 13.1212 + void *spage = (char *)region_base + (PAGE_SIZE*j); 13.1213 + 13.1214 + pfn = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK; 13.1215 + pagetype = pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK; 13.1216 + 13.1217 + /* write out pages in batch */ 13.1218 + if ( pagetype == XEN_DOMCTL_PFINFO_XTAB ) 13.1219 + continue; 13.1220 + 13.1221 + pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK; 13.1222 + 13.1223 + if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) && 13.1224 + (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) ) 13.1225 + { 13.1226 + /* We have a pagetable page: need to rewrite it. 
*/ 13.1227 + race = 13.1228 + canonicalize_pagetable(pagetype, pfn, spage, page); 13.1229 + 13.1230 + if ( race && !live ) 13.1231 + { 13.1232 + ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn, 13.1233 + pagetype); 13.1234 + goto out; 13.1235 + } 13.1236 + 13.1237 + if ( ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE ) 13.1238 + { 13.1239 + ERROR("Error when writing to state file (4)" 13.1240 + " (errno %d)", errno); 13.1241 + goto out; 13.1242 + } 13.1243 + } 13.1244 + else 13.1245 + { 13.1246 + /* We have a normal page: just write it directly. */ 13.1247 + if ( ratewrite(io_fd, live, spage, PAGE_SIZE) != 13.1248 + PAGE_SIZE ) 13.1249 + { 13.1250 + ERROR("Error when writing to state file (5)" 13.1251 + " (errno %d)", errno); 13.1252 + goto out; 13.1253 + } 13.1254 + } 13.1255 + } /* end of the write out for this batch */ 13.1256 + 13.1257 + sent_this_iter += batch; 13.1258 + 13.1259 + munmap(region_base, batch*PAGE_SIZE); 13.1260 + 13.1261 + } /* end of this while loop for this iteration */ 13.1262 + 13.1263 + skip: 13.1264 + 13.1265 + total_sent += sent_this_iter; 13.1266 + 13.1267 + DPRINTF("\r %d: sent %d, skipped %d, ", 13.1268 + iter, sent_this_iter, skip_this_iter ); 13.1269 + 13.1270 + if ( last_iter ) 13.1271 + { 13.1272 + print_stats( xc_handle, dom, sent_this_iter, &stats, 1); 13.1273 + 13.1274 + DPRINTF("Total pages sent= %ld (%.2fx)\n", 13.1275 + total_sent, ((float)total_sent)/p2m_size ); 13.1276 + DPRINTF("(of which %ld were fixups)\n", needed_to_fix ); 13.1277 + } 13.1278 + 13.1279 + if ( last_iter && debug ) 13.1280 + { 13.1281 + int minusone = -1; 13.1282 + memset(to_send, 0xff, BITMAP_SIZE); 13.1283 + debug = 0; 13.1284 + DPRINTF("Entering debug resend-all mode\n"); 13.1285 + 13.1286 + /* send "-1" to put receiver into debug mode */ 13.1287 + if ( !write_exact(io_fd, &minusone, sizeof(int)) ) 13.1288 + { 13.1289 + ERROR("Error when writing to state file (6) (errno %d)", 13.1290 + errno); 13.1291 + goto out; 13.1292 + } 13.1293 + 13.1294 + continue; 13.1295 + } 13.1296 + 13.1297 + if ( last_iter ) 13.1298 + break; 13.1299 + 13.1300 + if ( live ) 13.1301 + { 13.1302 + if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) || 13.1303 + (iter >= max_iters) || 13.1304 + (sent_this_iter+skip_this_iter < 50) || 13.1305 + (total_sent > p2m_size*max_factor) ) 13.1306 + { 13.1307 + DPRINTF("Start last iteration\n"); 13.1308 + last_iter = 1; 13.1309 + 13.1310 + if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info, 13.1311 + &ctxt) ) 13.1312 + { 13.1313 + ERROR("Domain appears not to have suspended"); 13.1314 + goto out; 13.1315 + } 13.1316 + 13.1317 + DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n", 13.1318 + info.shared_info_frame, 13.1319 + (unsigned long)ctxt.user_regs.eip, 13.1320 + (unsigned long)ctxt.user_regs.edx); 13.1321 + } 13.1322 + 13.1323 + if ( xc_shadow_control(xc_handle, dom, 13.1324 + XEN_DOMCTL_SHADOW_OP_CLEAN, to_send, 13.1325 + p2m_size, NULL, 0, &stats) != p2m_size ) 13.1326 + { 13.1327 + ERROR("Error flushing shadow PT"); 13.1328 + goto out; 13.1329 + } 13.1330 + 13.1331 + if ( hvm ) 13.1332 + { 13.1333 + /* Pull in the dirty bits from qemu-dm too */ 13.1334 + if ( !last_iter ) 13.1335 + { 13.1336 + qemu_active = qemu_non_active; 13.1337 + qemu_non_active = qemu_active ? 
0 : 1; 13.1338 + qemu_flip_buffer(dom, qemu_active); 13.1339 + for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ ) 13.1340 + { 13.1341 + to_send[j] |= qemu_bitmaps[qemu_non_active][j]; 13.1342 + qemu_bitmaps[qemu_non_active][j] = 0; 13.1343 + } 13.1344 + } 13.1345 + else 13.1346 + { 13.1347 + for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ ) 13.1348 + to_send[j] |= qemu_bitmaps[qemu_active][j]; 13.1349 + } 13.1350 + } 13.1351 + 13.1352 + sent_last_iter = sent_this_iter; 13.1353 + 13.1354 + print_stats(xc_handle, dom, sent_this_iter, &stats, 1); 13.1355 + 13.1356 + } 13.1357 + } /* end of infinite for loop */ 13.1358 + 13.1359 + DPRINTF("All memory is saved\n"); 13.1360 + 13.1361 + { 13.1362 + struct { 13.1363 + int minustwo; 13.1364 + int max_vcpu_id; 13.1365 + uint64_t vcpumap; 13.1366 + } chunk = { -2, info.max_vcpu_id }; 13.1367 + 13.1368 + if ( info.max_vcpu_id >= 64 ) 13.1369 + { 13.1370 + ERROR("Too many VCPUS in guest!"); 13.1371 + goto out; 13.1372 + } 13.1373 + 13.1374 + for ( i = 1; i <= info.max_vcpu_id; i++ ) 13.1375 + { 13.1376 + xc_vcpuinfo_t vinfo; 13.1377 + if ( (xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) && 13.1378 + vinfo.online ) 13.1379 + vcpumap |= 1ULL << i; 13.1380 + } 13.1381 + 13.1382 + chunk.vcpumap = vcpumap; 13.1383 + if ( !write_exact(io_fd, &chunk, sizeof(chunk)) ) 13.1384 + { 13.1385 + ERROR("Error when writing to state file (errno %d)", errno); 13.1386 + goto out; 13.1387 + } 13.1388 + } 13.1389 + 13.1390 + /* Zero terminate */ 13.1391 + i = 0; 13.1392 + if ( !write_exact(io_fd, &i, sizeof(int)) ) 13.1393 + { 13.1394 + ERROR("Error when writing to state file (6') (errno %d)", errno); 13.1395 + goto out; 13.1396 + } 13.1397 + 13.1398 + if ( hvm ) 13.1399 + { 13.1400 + uint32_t rec_size; 13.1401 + 13.1402 + /* Save magic-page locations. 
*/ 13.1403 + memset(magic_pfns, 0, sizeof(magic_pfns)); 13.1404 + xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN, 13.1405 + (unsigned long *)&magic_pfns[0]); 13.1406 + xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN, 13.1407 + (unsigned long *)&magic_pfns[1]); 13.1408 + xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN, 13.1409 + (unsigned long *)&magic_pfns[2]); 13.1410 + if ( !write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) ) 13.1411 + { 13.1412 + ERROR("Error when writing to state file (7)"); 13.1413 + goto out; 13.1414 + } 13.1415 + 13.1416 + /* Save vcpu contexts */ 13.1417 + 13.1418 + for ( i = 0; i <= info.max_vcpu_id; i++ ) 13.1419 + { 13.1420 + if ( !(vcpumap & (1ULL << i)) ) 13.1421 + continue; 13.1422 + 13.1423 + if ( xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) ) 13.1424 + { 13.1425 + ERROR("HVM:Could not get vcpu context"); 13.1426 + goto out; 13.1427 + } 13.1428 + 13.1429 + DPRINTF("write vcpu %d context.\n", i); 13.1430 + if ( !write_exact(io_fd, &(ctxt), sizeof(ctxt)) ) 13.1431 + { 13.1432 + ERROR("write vcpu context failed!\n"); 13.1433 + goto out; 13.1434 + } 13.1435 + } 13.1436 + 13.1437 + /* Get HVM context from Xen and save it too */ 13.1438 + if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf, 13.1439 + hvm_buf_size)) == -1 ) 13.1440 + { 13.1441 + ERROR("HVM:Could not get hvm buffer"); 13.1442 + goto out; 13.1443 + } 13.1444 + 13.1445 + if ( !write_exact(io_fd, &rec_size, sizeof(uint32_t)) ) 13.1446 + { 13.1447 + ERROR("error write hvm buffer size"); 13.1448 + goto out; 13.1449 + } 13.1450 + 13.1451 + if ( !write_exact(io_fd, hvm_buf, rec_size) ) 13.1452 + { 13.1453 + ERROR("write HVM info failed!\n"); 13.1454 + goto out; 13.1455 + } 13.1456 + 13.1457 + /* HVM guests are done now */ 13.1458 + rc = 0; 13.1459 + goto out; 13.1460 + } 13.1461 + 13.1462 + /* PV guests only from now on */ 13.1463 + 13.1464 + /* Send through a list of all the PFNs that were not in map at the close */ 13.1465 + { 13.1466 + unsigned int i,j; 13.1467 + unsigned long pfntab[1024]; 13.1468 + 13.1469 + for ( i = 0, j = 0; i < p2m_size; i++ ) 13.1470 + { 13.1471 + if ( !is_mapped(live_p2m[i]) ) 13.1472 + j++; 13.1473 + } 13.1474 + 13.1475 + if ( !write_exact(io_fd, &j, sizeof(unsigned int)) ) 13.1476 + { 13.1477 + ERROR("Error when writing to state file (6a) (errno %d)", errno); 13.1478 + goto out; 13.1479 + } 13.1480 + 13.1481 + for ( i = 0, j = 0; i < p2m_size; ) 13.1482 + { 13.1483 + if ( !is_mapped(live_p2m[i]) ) 13.1484 + pfntab[j++] = i; 13.1485 + 13.1486 + i++; 13.1487 + if ( (j == 1024) || (i == p2m_size) ) 13.1488 + { 13.1489 + if ( !write_exact(io_fd, &pfntab, sizeof(unsigned long)*j) ) 13.1490 + { 13.1491 + ERROR("Error when writing to state file (6b) (errno %d)", 13.1492 + errno); 13.1493 + goto out; 13.1494 + } 13.1495 + j = 0; 13.1496 + } 13.1497 + } 13.1498 + } 13.1499 + 13.1500 + /* Canonicalise the suspend-record frame number. */ 13.1501 + if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) ) 13.1502 + { 13.1503 + ERROR("Suspend record is not in range of pseudophys map"); 13.1504 + goto out; 13.1505 + } 13.1506 + 13.1507 + for ( i = 0; i <= info.max_vcpu_id; i++ ) 13.1508 + { 13.1509 + if ( !(vcpumap & (1ULL << i)) ) 13.1510 + continue; 13.1511 + 13.1512 + if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) ) 13.1513 + { 13.1514 + ERROR("No context for VCPU%d", i); 13.1515 + goto out; 13.1516 + } 13.1517 + 13.1518 + /* Canonicalise each GDT frame number. 
*/ 13.1519 + for ( j = 0; (512*j) < ctxt.gdt_ents; j++ ) 13.1520 + { 13.1521 + if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[j]) ) 13.1522 + { 13.1523 + ERROR("GDT frame is not in range of pseudophys map"); 13.1524 + goto out; 13.1525 + } 13.1526 + } 13.1527 + 13.1528 + /* Canonicalise the page table base pointer. */ 13.1529 + if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[3])) ) 13.1530 + { 13.1531 + ERROR("PT base is not in range of pseudophys map"); 13.1532 + goto out; 13.1533 + } 13.1534 + ctxt.ctrlreg[3] = 13.1535 + xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[3]))); 13.1536 + 13.1537 + /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */ 13.1538 + if ( (pt_levels == 4) && ctxt.ctrlreg[1] ) 13.1539 + { 13.1540 + if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[1])) ) 13.1541 + { 13.1542 + ERROR("PT base is not in range of pseudophys map"); 13.1543 + goto out; 13.1544 + } 13.1545 + /* Least-significant bit means 'valid PFN'. */ 13.1546 + ctxt.ctrlreg[1] = 1 | 13.1547 + xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[1]))); 13.1548 + } 13.1549 + 13.1550 + if ( !write_exact(io_fd, &ctxt, sizeof(ctxt)) ) 13.1551 + { 13.1552 + ERROR("Error when writing to state file (1) (errno %d)", errno); 13.1553 + goto out; 13.1554 + } 13.1555 + } 13.1556 + 13.1557 + /* 13.1558 + * Reset the MFN to be a known-invalid value. See map_frame_list_list(). 13.1559 + */ 13.1560 + memcpy(page, live_shinfo, PAGE_SIZE); 13.1561 + ((shared_info_t *)page)->arch.pfn_to_mfn_frame_list_list = 0; 13.1562 + if ( !write_exact(io_fd, page, PAGE_SIZE) ) 13.1563 + { 13.1564 + ERROR("Error when writing to state file (1) (errno %d)", errno); 13.1565 + goto out; 13.1566 + } 13.1567 + 13.1568 + /* Success! */ 13.1569 + rc = 0; 13.1570 + 13.1571 + out: 13.1572 + 13.1573 + if ( live ) 13.1574 + { 13.1575 + if ( xc_shadow_control(xc_handle, dom, 13.1576 + XEN_DOMCTL_SHADOW_OP_OFF, 13.1577 + NULL, 0, NULL, 0, NULL) < 0 ) 13.1578 + DPRINTF("Warning - couldn't disable shadow mode"); 13.1579 + } 13.1580 + 13.1581 + /* Flush last write and discard cache for file. */ 13.1582 + discard_file_cache(io_fd, 1 /* flush */); 13.1583 + 13.1584 + if ( live_shinfo ) 13.1585 + munmap(live_shinfo, PAGE_SIZE); 13.1586 + 13.1587 + if ( live_p2m ) 13.1588 + munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT)); 13.1589 + 13.1590 + if ( live_m2p ) 13.1591 + munmap(live_m2p, M2P_SIZE(max_mfn)); 13.1592 + 13.1593 + free(pfn_type); 13.1594 + free(pfn_batch); 13.1595 + free(to_send); 13.1596 + free(to_fix); 13.1597 + free(to_skip); 13.1598 + 13.1599 + DPRINTF("Save exit rc=%d\n",rc); 13.1600 + 13.1601 + return !!rc; 13.1602 +} 13.1603 + 13.1604 +/* 13.1605 + * Local variables: 13.1606 + * mode: C 13.1607 + * c-set-style: "BSD" 13.1608 + * c-basic-offset: 4 13.1609 + * tab-width: 4 13.1610 + * indent-tabs-mode: nil 13.1611 + * End: 13.1612 + */
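The save loop added above scans guest pages in a pseudo-random order rather than sequentially, so that the dirty-rate estimate is not skewed by contiguous pfn ranges with similar behaviour. Below is a minimal standalone sketch of that scan order: the order_nr computation and the permute() rotation are lifted from this changeset, while main() and the sample p2m_size are illustrative only (and, as in the original, permute() assumes order_nr >= 10).

    #include <stdio.h>

    /* permute() as it appears in this changeset: rotate the low 10 bits of
     * an order_nr-bit index to the top, retrying until the result is < nr.
     * Never loops forever: if nr is a power of 2 it succeeds first time. */
    static int permute(int i, int nr, int order_nr)
    {
        do { i = ((i >> (order_nr - 10)) | (i << 10)) & ((1 << order_nr) - 1); }
        while ( i >= nr );
        return i;
    }

    int main(void)
    {
        unsigned long p2m_size = 0x20000; /* illustrative: 512MB guest, 128k pages */
        unsigned long i;
        int order_nr, N;

        /* Same loop as the save code above: power-of-2 order of p2m_size,
         * e.g. 15->4, 16->4, 17->5. */
        for ( i = p2m_size - 1, order_nr = 0; i; i >>= 1, order_nr++ )
            continue;

        printf("p2m_size=%#lx order_nr=%d\n", p2m_size, order_nr);
        for ( N = 0; N < 8; N++ )
            printf("N=%d scans pfn %d\n", N, permute(N, (int)p2m_size, order_nr));
        return 0;
    }

For the 128k-page example this visits pfns 0, 1024, 2048, ... before returning to low pfns, mixing up runs of neighbouring pages within each batch-sized window.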
14.1 --- a/tools/libxc/xc_hvm_save.c Wed Apr 11 07:30:02 2007 -0600 14.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 14.3 @@ -1,755 +0,0 @@ 14.4 -/****************************************************************************** 14.5 - * xc_hvm_save.c 14.6 - * 14.7 - * Save the state of a running HVM guest. 14.8 - * 14.9 - * Copyright (c) 2003, K A Fraser. 14.10 - * Copyright (c) 2006 Intel Corporation 14.11 - * rewritten for hvm guest by Zhai Edwin <edwin.zhai@intel.com> 14.12 - * 14.13 - * This program is free software; you can redistribute it and/or modify it 14.14 - * under the terms and conditions of the GNU General Public License, 14.15 - * version 2, as published by the Free Software Foundation. 14.16 - * 14.17 - * This program is distributed in the hope it will be useful, but WITHOUT 14.18 - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14.19 - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 14.20 - * more details. 14.21 - * 14.22 - * You should have received a copy of the GNU General Public License along with 14.23 - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple 14.24 - * Place - Suite 330, Boston, MA 02111-1307 USA. 14.25 - * 14.26 - */ 14.27 - 14.28 -#include <inttypes.h> 14.29 -#include <time.h> 14.30 -#include <stdlib.h> 14.31 -#include <unistd.h> 14.32 -#include <sys/time.h> 14.33 - 14.34 -#include "xc_private.h" 14.35 -#include "xg_private.h" 14.36 -#include "xg_save_restore.h" 14.37 - 14.38 -#include <xen/hvm/e820.h> 14.39 -#include <xen/hvm/params.h> 14.40 - 14.41 -/* 14.42 -** Default values for important tuning parameters. Can override by passing 14.43 -** non-zero replacement values to xc_hvm_save(). 14.44 -** 14.45 -** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too. 14.46 -** 14.47 -*/ 14.48 -#define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */ 14.49 -#define DEF_MAX_FACTOR 3 /* never send more than 3x nr_pfns */ 14.50 - 14.51 -/* Shared-memory bitmaps for getting log-dirty bits from qemu */ 14.52 -static unsigned long *qemu_bitmaps[2]; 14.53 -static int qemu_active; 14.54 -static int qemu_non_active; 14.55 - 14.56 -/* 14.57 -** During (live) save/migrate, we maintain a number of bitmaps to track 14.58 -** which pages we have to send, to fixup, and to skip. 14.59 -*/ 14.60 - 14.61 -#define BITS_PER_LONG (sizeof(unsigned long) * 8) 14.62 -#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG) 14.63 -#define BITMAP_SIZE (BITS_TO_LONGS(pfn_array_size) * sizeof(unsigned long)) 14.64 - 14.65 -#define BITMAP_ENTRY(_nr,_bmap) \ 14.66 - ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG] 14.67 - 14.68 -#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG) 14.69 - 14.70 -static inline int test_bit (int nr, volatile void * addr) 14.71 -{ 14.72 - return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1; 14.73 -} 14.74 - 14.75 -static inline void clear_bit (int nr, volatile void * addr) 14.76 -{ 14.77 - BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr)); 14.78 -} 14.79 - 14.80 -static inline int permute( int i, int nr, int order_nr ) 14.81 -{ 14.82 - /* Need a simple permutation function so that we scan pages in a 14.83 - pseudo random order, enabling us to get a better estimate of 14.84 - the domain's page dirtying rate as we go (there are often 14.85 - contiguous ranges of pfns that have similar behaviour, and we 14.86 - want to mix them up). */ 14.87 - 14.88 - /* e.g. 
nr->order 15->4 16->4 17->5 */ 14.89 - /* 512MB domain, 128k pages, order 17 */ 14.90 - 14.91 - /* 14.92 - QPONMLKJIHGFEDCBA 14.93 - QPONMLKJIH 14.94 - GFEDCBA 14.95 - */ 14.96 - 14.97 - /* 14.98 - QPONMLKJIHGFEDCBA 14.99 - EDCBA 14.100 - QPONM 14.101 - LKJIHGF 14.102 - */ 14.103 - 14.104 - do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); } 14.105 - while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */ 14.106 - 14.107 - return i; 14.108 -} 14.109 - 14.110 - 14.111 -static uint64_t tv_to_us(struct timeval *new) 14.112 -{ 14.113 - return (new->tv_sec * 1000000) + new->tv_usec; 14.114 -} 14.115 - 14.116 -static uint64_t llgettimeofday(void) 14.117 -{ 14.118 - struct timeval now; 14.119 - gettimeofday(&now, NULL); 14.120 - return tv_to_us(&now); 14.121 -} 14.122 - 14.123 -static uint64_t tv_delta(struct timeval *new, struct timeval *old) 14.124 -{ 14.125 - return (((new->tv_sec - old->tv_sec)*1000000) + 14.126 - (new->tv_usec - old->tv_usec)); 14.127 -} 14.128 - 14.129 - 14.130 -#define RATE_IS_MAX() (0) 14.131 -#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n)) 14.132 -#define initialize_mbit_rate() 14.133 - 14.134 -static inline ssize_t write_exact(int fd, void *buf, size_t count) 14.135 -{ 14.136 - return (write(fd, buf, count) == count); 14.137 -} 14.138 - 14.139 -static int print_stats(int xc_handle, uint32_t domid, int pages_sent, 14.140 - xc_shadow_op_stats_t *stats, int print) 14.141 -{ 14.142 - static struct timeval wall_last; 14.143 - static long long d0_cpu_last; 14.144 - static long long d1_cpu_last; 14.145 - 14.146 - struct timeval wall_now; 14.147 - long long wall_delta; 14.148 - long long d0_cpu_now, d0_cpu_delta; 14.149 - long long d1_cpu_now, d1_cpu_delta; 14.150 - 14.151 - gettimeofday(&wall_now, NULL); 14.152 - 14.153 - d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000; 14.154 - d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000; 14.155 - 14.156 - if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) ) 14.157 - DPRINTF("ARRHHH!!\n"); 14.158 - 14.159 - wall_delta = tv_delta(&wall_now,&wall_last)/1000; 14.160 - if ( wall_delta == 0 ) 14.161 - wall_delta = 1; 14.162 - 14.163 - d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000; 14.164 - d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000; 14.165 - 14.166 - if ( print ) 14.167 - DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, " 14.168 - "dirtied %dMb/s %" PRId32 " pages\n", 14.169 - wall_delta, 14.170 - (int)((d0_cpu_delta*100)/wall_delta), 14.171 - (int)((d1_cpu_delta*100)/wall_delta), 14.172 - (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))), 14.173 - (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))), 14.174 - stats->dirty_count); 14.175 - 14.176 - d0_cpu_last = d0_cpu_now; 14.177 - d1_cpu_last = d1_cpu_now; 14.178 - wall_last = wall_now; 14.179 - 14.180 - return 0; 14.181 -} 14.182 - 14.183 -static int analysis_phase(int xc_handle, uint32_t domid, int pfn_array_size, 14.184 - unsigned long *arr, int runs) 14.185 -{ 14.186 - long long start, now; 14.187 - xc_shadow_op_stats_t stats; 14.188 - int j; 14.189 - 14.190 - start = llgettimeofday(); 14.191 - 14.192 - for ( j = 0; j < runs; j++ ) 14.193 - { 14.194 - int i; 14.195 - 14.196 - xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN, 14.197 - arr, pfn_array_size, NULL, 0, NULL); 14.198 - DPRINTF("#Flush\n"); 14.199 - for ( i = 0; i < 40; i++ ) 14.200 - { 14.201 - usleep(50000); 14.202 - now = llgettimeofday(); 14.203 - xc_shadow_control(xc_handle, domid, 
XEN_DOMCTL_SHADOW_OP_PEEK, 14.204 - NULL, 0, NULL, 0, &stats); 14.205 - DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n", 14.206 - ((now-start)+500)/1000, 14.207 - stats.fault_count, stats.dirty_count); 14.208 - } 14.209 - } 14.210 - 14.211 - return -1; 14.212 -} 14.213 - 14.214 -static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd, 14.215 - int dom, xc_dominfo_t *info, 14.216 - vcpu_guest_context_t *ctxt) 14.217 -{ 14.218 - int i = 0; 14.219 - 14.220 - if ( !(*suspend)(dom) ) 14.221 - { 14.222 - ERROR("Suspend request failed"); 14.223 - return -1; 14.224 - } 14.225 - 14.226 - retry: 14.227 - 14.228 - if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 ) 14.229 - { 14.230 - ERROR("Could not get domain info"); 14.231 - return -1; 14.232 - } 14.233 - 14.234 - if ( xc_vcpu_getcontext(xc_handle, dom, 0, ctxt) ) 14.235 - ERROR("Could not get vcpu context"); 14.236 - 14.237 - if ( info->shutdown && (info->shutdown_reason == SHUTDOWN_suspend) ) 14.238 - return 0; /* success */ 14.239 - 14.240 - if ( info->paused ) 14.241 - { 14.242 - /* Try unpausing domain, wait, and retest. */ 14.243 - xc_domain_unpause( xc_handle, dom ); 14.244 - ERROR("Domain was paused. Wait and re-test."); 14.245 - usleep(10000); /* 10ms */ 14.246 - goto retry; 14.247 - } 14.248 - 14.249 - if ( ++i < 100 ) 14.250 - { 14.251 - ERROR("Retry suspend domain."); 14.252 - usleep(10000); /* 10ms */ 14.253 - goto retry; 14.254 - } 14.255 - 14.256 - ERROR("Unable to suspend domain."); 14.257 - 14.258 - return -1; 14.259 -} 14.260 - 14.261 -int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, 14.262 - uint32_t max_factor, uint32_t flags, int (*suspend)(int), 14.263 - void *(*init_qemu_maps)(int, unsigned), 14.264 - void (*qemu_flip_buffer)(int, int)) 14.265 -{ 14.266 - xc_dominfo_t info; 14.267 - 14.268 - int rc = 1, i, j, last_iter, iter = 0; 14.269 - int live = !!(flags & XCFLAGS_LIVE); 14.270 - int debug = !!(flags & XCFLAGS_DEBUG); 14.271 - int sent_last_iter, skip_this_iter; 14.272 - 14.273 - /* The highest guest-physical frame number used by the current guest */ 14.274 - unsigned long max_pfn; 14.275 - 14.276 - /* The size of an array big enough to contain all guest pfns */ 14.277 - unsigned long pfn_array_size; 14.278 - 14.279 - /* Magic frames: ioreqs and xenstore comms. */ 14.280 - uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */ 14.281 - 14.282 - /* A copy of the CPU context of the guest. */ 14.283 - vcpu_guest_context_t ctxt; 14.284 - 14.285 - /* A table containing the PFNs (/not/ MFN!) to map. 
*/ 14.286 - xen_pfn_t *pfn_batch = NULL; 14.287 - 14.288 - /* A copy of hvm domain context buffer */ 14.289 - uint32_t hvm_buf_size; 14.290 - uint8_t *hvm_buf = NULL; 14.291 - 14.292 - /* base of the region in which domain memory is mapped */ 14.293 - unsigned char *region_base = NULL; 14.294 - 14.295 - uint32_t rec_size, nr_vcpus; 14.296 - 14.297 - /* power of 2 order of pfn_array_size */ 14.298 - int order_nr; 14.299 - 14.300 - /* bitmap of pages: 14.301 - - that should be sent this iteration (unless later marked as skip); 14.302 - - to skip this iteration because already dirty; */ 14.303 - unsigned long *to_send = NULL, *to_skip = NULL; 14.304 - 14.305 - xc_shadow_op_stats_t stats; 14.306 - 14.307 - unsigned long total_sent = 0; 14.308 - 14.309 - uint64_t vcpumap = 1ULL; 14.310 - 14.311 - DPRINTF("xc_hvm_save: dom=%d, max_iters=%d, max_factor=%d, flags=0x%x, " 14.312 - "live=%d, debug=%d.\n", dom, max_iters, max_factor, flags, 14.313 - live, debug); 14.314 - 14.315 - /* If no explicit control parameters given, use defaults */ 14.316 - max_iters = max_iters ? : DEF_MAX_ITERS; 14.317 - max_factor = max_factor ? : DEF_MAX_FACTOR; 14.318 - 14.319 - initialize_mbit_rate(); 14.320 - 14.321 - if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 ) 14.322 - { 14.323 - ERROR("HVM: Could not get domain info"); 14.324 - return 1; 14.325 - } 14.326 - nr_vcpus = info.nr_online_vcpus; 14.327 - 14.328 - if ( mlock(&ctxt, sizeof(ctxt)) ) 14.329 - { 14.330 - ERROR("HVM: Unable to mlock ctxt"); 14.331 - return 1; 14.332 - } 14.333 - 14.334 - /* Only have to worry about vcpu 0 even for SMP */ 14.335 - if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) ) 14.336 - { 14.337 - ERROR("HVM: Could not get vcpu context"); 14.338 - goto out; 14.339 - } 14.340 - 14.341 - DPRINTF("saved hvm domain info: max_memkb=0x%lx, nr_pages=0x%lx\n", 14.342 - info.max_memkb, info.nr_pages); 14.343 - 14.344 - if ( live ) 14.345 - { 14.346 - /* Live suspend. Enable log-dirty mode. */ 14.347 - if ( xc_shadow_control(xc_handle, dom, 14.348 - XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY, 14.349 - NULL, 0, NULL, 0, NULL) < 0 ) 14.350 - { 14.351 - ERROR("Couldn't enable shadow mode"); 14.352 - goto out; 14.353 - } 14.354 - } 14.355 - else 14.356 - { 14.357 - /* This is a non-live suspend. Suspend the domain. */ 14.358 - if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt) ) 14.359 - { 14.360 - ERROR("HVM Domain appears not to have suspended"); 14.361 - goto out; 14.362 - } 14.363 - } 14.364 - 14.365 - last_iter = !live; 14.366 - 14.367 - max_pfn = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom); 14.368 - 14.369 - DPRINTF("after 1st handle hvm domain max_pfn=0x%lx, " 14.370 - "max_memkb=0x%lx, live=%d.\n", 14.371 - max_pfn, info.max_memkb, live); 14.372 - 14.373 - /* Size of any array that covers 0 ... max_pfn */ 14.374 - pfn_array_size = max_pfn + 1; 14.375 - if ( !write_exact(io_fd, &pfn_array_size, sizeof(unsigned long)) ) 14.376 - { 14.377 - ERROR("Error when writing to state file (1)"); 14.378 - goto out; 14.379 - } 14.380 - 14.381 - /* pretend we sent all the pages last iteration */ 14.382 - sent_last_iter = pfn_array_size; 14.383 - 14.384 - /* calculate the power of 2 order of pfn_array_size, e.g. 
14.385 - 15->4 16->4 17->5 */ 14.386 - for ( i = pfn_array_size-1, order_nr = 0; i ; i >>= 1, order_nr++ ) 14.387 - continue; 14.388 - 14.389 - /* Setup to_send / to_fix and to_skip bitmaps */ 14.390 - to_send = malloc(BITMAP_SIZE); 14.391 - to_skip = malloc(BITMAP_SIZE); 14.392 - 14.393 - if ( live ) 14.394 - { 14.395 - /* Get qemu-dm logging dirty pages too */ 14.396 - void *seg = init_qemu_maps(dom, BITMAP_SIZE); 14.397 - qemu_bitmaps[0] = seg; 14.398 - qemu_bitmaps[1] = seg + BITMAP_SIZE; 14.399 - qemu_active = 0; 14.400 - qemu_non_active = 1; 14.401 - } 14.402 - 14.403 - hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0); 14.404 - if ( hvm_buf_size == -1 ) 14.405 - { 14.406 - ERROR("Couldn't get HVM context size from Xen"); 14.407 - goto out; 14.408 - } 14.409 - hvm_buf = malloc(hvm_buf_size); 14.410 - 14.411 - if ( !to_send || !to_skip || !hvm_buf ) 14.412 - { 14.413 - ERROR("Couldn't allocate memory"); 14.414 - goto out; 14.415 - } 14.416 - 14.417 - memset(to_send, 0xff, BITMAP_SIZE); 14.418 - 14.419 - if ( lock_pages(to_send, BITMAP_SIZE) ) 14.420 - { 14.421 - ERROR("Unable to lock to_send"); 14.422 - return 1; 14.423 - } 14.424 - 14.425 - /* (to fix is local only) */ 14.426 - if ( lock_pages(to_skip, BITMAP_SIZE) ) 14.427 - { 14.428 - ERROR("Unable to lock to_skip"); 14.429 - return 1; 14.430 - } 14.431 - 14.432 - analysis_phase(xc_handle, dom, pfn_array_size, to_skip, 0); 14.433 - 14.434 - /* We want zeroed memory so use calloc rather than malloc. */ 14.435 - pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch)); 14.436 - if ( pfn_batch == NULL ) 14.437 - { 14.438 - ERROR("failed to alloc memory for pfn_batch array"); 14.439 - errno = ENOMEM; 14.440 - goto out; 14.441 - } 14.442 - 14.443 - for ( ; ; ) 14.444 - { 14.445 - unsigned int prev_pc, sent_this_iter, N, batch; 14.446 - 14.447 - iter++; 14.448 - sent_this_iter = 0; 14.449 - skip_this_iter = 0; 14.450 - prev_pc = 0; 14.451 - N=0; 14.452 - 14.453 - DPRINTF("Saving memory pages: iter %d 0%%", iter); 14.454 - 14.455 - while ( N < pfn_array_size ) 14.456 - { 14.457 - unsigned int this_pc = (N * 100) / pfn_array_size; 14.458 - int rc; 14.459 - 14.460 - if ( (this_pc - prev_pc) >= 5 ) 14.461 - { 14.462 - DPRINTF("\b\b\b\b%3d%%", this_pc); 14.463 - prev_pc = this_pc; 14.464 - } 14.465 - 14.466 - if ( !last_iter ) 14.467 - { 14.468 - /* Slightly wasteful to peek the whole array every time, 14.469 - but this is fast enough for the moment. */ 14.470 - rc = xc_shadow_control( 14.471 - xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip, 14.472 - pfn_array_size, NULL, 0, NULL); 14.473 - if ( rc != pfn_array_size ) 14.474 - { 14.475 - ERROR("Error peeking shadow bitmap"); 14.476 - goto out; 14.477 - } 14.478 - } 14.479 - 14.480 - /* load pfn_batch[] with the mfn of all the pages we're doing in 14.481 - this batch. 
*/ 14.482 - for ( batch = 0; 14.483 - (batch < MAX_BATCH_SIZE) && (N < pfn_array_size); 14.484 - N++ ) 14.485 - { 14.486 - int n = permute(N, pfn_array_size, order_nr); 14.487 - 14.488 - if ( 0 && debug ) 14.489 - DPRINTF("%d pfn= %08lx %d \n", 14.490 - iter, (unsigned long)n, test_bit(n, to_send)); 14.491 - 14.492 - if ( !last_iter && 14.493 - test_bit(n, to_send) && 14.494 - test_bit(n, to_skip) ) 14.495 - skip_this_iter++; /* stats keeping */ 14.496 - 14.497 - if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) || 14.498 - (test_bit(n, to_send) && last_iter)) ) 14.499 - continue; 14.500 - 14.501 - /* Skip PFNs that aren't really there */ 14.502 - if ( (n >= 0xa0 && n < 0xc0) /* VGA hole */ 14.503 - || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT) && 14.504 - n < (1ULL << 32) >> PAGE_SHIFT) /* 4G MMIO hole */ ) 14.505 - continue; 14.506 - 14.507 - /* 14.508 - ** we get here if: 14.509 - ** 1. page is marked to_send & hasn't already been re-dirtied 14.510 - ** 2. (ignore to_skip in last iteration) 14.511 - */ 14.512 - 14.513 - pfn_batch[batch] = n; 14.514 - 14.515 - batch++; 14.516 - } 14.517 - 14.518 - if ( batch == 0 ) 14.519 - goto skip; /* vanishingly unlikely... */ 14.520 - 14.521 - region_base = xc_map_foreign_batch( 14.522 - xc_handle, dom, PROT_READ, pfn_batch, batch); 14.523 - if ( region_base == 0 ) 14.524 - { 14.525 - ERROR("map batch failed"); 14.526 - goto out; 14.527 - } 14.528 - 14.529 - /* write num of pfns */ 14.530 - if ( !write_exact(io_fd, &batch, sizeof(unsigned int)) ) 14.531 - { 14.532 - ERROR("Error when writing to state file (2)"); 14.533 - goto out; 14.534 - } 14.535 - 14.536 - /* write all the pfns */ 14.537 - if ( !write_exact(io_fd, pfn_batch, sizeof(unsigned long)*batch) ) 14.538 - { 14.539 - ERROR("Error when writing to state file (3)"); 14.540 - goto out; 14.541 - } 14.542 - 14.543 - for ( j = 0; j < batch; j++ ) 14.544 - { 14.545 - if ( pfn_batch[j] & XEN_DOMCTL_PFINFO_LTAB_MASK ) 14.546 - continue; 14.547 - if ( ratewrite(io_fd, region_base + j*PAGE_SIZE, 14.548 - PAGE_SIZE) != PAGE_SIZE ) 14.549 - { 14.550 - ERROR("ERROR when writing to state file (4)"); 14.551 - goto out; 14.552 - } 14.553 - } 14.554 - 14.555 - sent_this_iter += batch; 14.556 - 14.557 - munmap(region_base, batch*PAGE_SIZE); 14.558 - 14.559 - } /* end of this while loop for this iteration */ 14.560 - 14.561 - skip: 14.562 - 14.563 - total_sent += sent_this_iter; 14.564 - 14.565 - DPRINTF("\r %d: sent %d, skipped %d, ", 14.566 - iter, sent_this_iter, skip_this_iter ); 14.567 - 14.568 - if ( last_iter ) 14.569 - { 14.570 - print_stats( xc_handle, dom, sent_this_iter, &stats, 1); 14.571 - DPRINTF("Total pages sent= %ld (%.2fx)\n", 14.572 - total_sent, ((float)total_sent)/pfn_array_size ); 14.573 - } 14.574 - 14.575 - if ( last_iter && debug ) 14.576 - { 14.577 - int minusone = -1; 14.578 - memset(to_send, 0xff, BITMAP_SIZE); 14.579 - debug = 0; 14.580 - DPRINTF("Entering debug resend-all mode\n"); 14.581 - 14.582 - /* send "-1" to put receiver into debug mode */ 14.583 - if ( !write_exact(io_fd, &minusone, sizeof(int)) ) 14.584 - { 14.585 - ERROR("Error when writing to state file (6)"); 14.586 - goto out; 14.587 - } 14.588 - 14.589 - continue; 14.590 - } 14.591 - 14.592 - if ( last_iter ) 14.593 - break; 14.594 - 14.595 - if ( live ) 14.596 - { 14.597 - if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) || 14.598 - (iter >= max_iters) || 14.599 - (sent_this_iter+skip_this_iter < 50) || 14.600 - (total_sent > pfn_array_size*max_factor) ) 14.601 - { 14.602 - DPRINTF("Start last 
iteration for HVM domain\n"); 14.603 - last_iter = 1; 14.604 - 14.605 - if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info, 14.606 - &ctxt)) 14.607 - { 14.608 - ERROR("Domain appears not to have suspended"); 14.609 - goto out; 14.610 - } 14.611 - 14.612 - DPRINTF("SUSPEND eip %08lx edx %08lx\n", 14.613 - (unsigned long)ctxt.user_regs.eip, 14.614 - (unsigned long)ctxt.user_regs.edx); 14.615 - } 14.616 - 14.617 - if ( xc_shadow_control(xc_handle, dom, 14.618 - XEN_DOMCTL_SHADOW_OP_CLEAN, to_send, 14.619 - pfn_array_size, NULL, 14.620 - 0, &stats) != pfn_array_size ) 14.621 - { 14.622 - ERROR("Error flushing shadow PT"); 14.623 - goto out; 14.624 - } 14.625 - 14.626 - /* Pull in the dirty bits from qemu too */ 14.627 - if ( !last_iter ) 14.628 - { 14.629 - qemu_active = qemu_non_active; 14.630 - qemu_non_active = qemu_active ? 0 : 1; 14.631 - qemu_flip_buffer(dom, qemu_active); 14.632 - for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ ) 14.633 - { 14.634 - to_send[j] |= qemu_bitmaps[qemu_non_active][j]; 14.635 - qemu_bitmaps[qemu_non_active][j] = 0; 14.636 - } 14.637 - } 14.638 - else 14.639 - { 14.640 - for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ ) 14.641 - to_send[j] |= qemu_bitmaps[qemu_active][j]; 14.642 - } 14.643 - 14.644 - sent_last_iter = sent_this_iter; 14.645 - 14.646 - print_stats(xc_handle, dom, sent_this_iter, &stats, 1); 14.647 - } 14.648 - } /* end of while 1 */ 14.649 - 14.650 - 14.651 - DPRINTF("All HVM memory is saved\n"); 14.652 - 14.653 - { 14.654 - struct { 14.655 - int minustwo; 14.656 - int max_vcpu_id; 14.657 - uint64_t vcpumap; 14.658 - } chunk = { -2, info.max_vcpu_id }; 14.659 - 14.660 - if (info.max_vcpu_id >= 64) { 14.661 - ERROR("Too many VCPUS in guest!"); 14.662 - goto out; 14.663 - } 14.664 - 14.665 - for (i = 1; i <= info.max_vcpu_id; i++) { 14.666 - xc_vcpuinfo_t vinfo; 14.667 - if ((xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) && 14.668 - vinfo.online) 14.669 - vcpumap |= 1ULL << i; 14.670 - } 14.671 - 14.672 - chunk.vcpumap = vcpumap; 14.673 - if(!write_exact(io_fd, &chunk, sizeof(chunk))) { 14.674 - ERROR("Error when writing to state file (errno %d)", errno); 14.675 - goto out; 14.676 - } 14.677 - } 14.678 - 14.679 - /* Zero terminate */ 14.680 - i = 0; 14.681 - if ( !write_exact(io_fd, &i, sizeof(int)) ) 14.682 - { 14.683 - ERROR("Error when writing to state file (6)"); 14.684 - goto out; 14.685 - } 14.686 - 14.687 - /* Save magic-page locations. 
*/ 14.688 - memset(magic_pfns, 0, sizeof(magic_pfns)); 14.689 - xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN, 14.690 - (unsigned long *)&magic_pfns[0]); 14.691 - xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN, 14.692 - (unsigned long *)&magic_pfns[1]); 14.693 - xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN, 14.694 - (unsigned long *)&magic_pfns[2]); 14.695 - if ( !write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) ) 14.696 - { 14.697 - ERROR("Error when writing to state file (7)"); 14.698 - goto out; 14.699 - } 14.700 - 14.701 - /* save vcpu/vmcs contexts */ 14.702 - for ( i = 0; i < nr_vcpus; i++ ) 14.703 - { 14.704 - if ( !(vcpumap & (1ULL << i)) ) 14.705 - continue; 14.706 - 14.707 - if ( xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) ) 14.708 - { 14.709 - ERROR("HVM:Could not get vcpu context"); 14.710 - goto out; 14.711 - } 14.712 - 14.713 - DPRINTF("write vcpu %d context.\n", i); 14.714 - if ( !write_exact(io_fd, &(ctxt), sizeof(ctxt)) ) 14.715 - { 14.716 - ERROR("write vcpu context failed!\n"); 14.717 - goto out; 14.718 - } 14.719 - } 14.720 - 14.721 - if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf, 14.722 - hvm_buf_size)) == -1 ) 14.723 - { 14.724 - ERROR("HVM:Could not get hvm buffer"); 14.725 - goto out; 14.726 - } 14.727 - 14.728 - if ( !write_exact(io_fd, &rec_size, sizeof(uint32_t)) ) 14.729 - { 14.730 - ERROR("error write hvm buffer size"); 14.731 - goto out; 14.732 - } 14.733 - 14.734 - if ( !write_exact(io_fd, hvm_buf, rec_size) ) 14.735 - { 14.736 - ERROR("write HVM info failed!\n"); 14.737 - goto out; 14.738 - } 14.739 - 14.740 - /* Success! */ 14.741 - rc = 0; 14.742 - 14.743 - out: 14.744 - 14.745 - if ( live ) 14.746 - { 14.747 - if ( xc_shadow_control(xc_handle, dom, XEN_DOMCTL_SHADOW_OP_OFF, 14.748 - NULL, 0, NULL, 0, NULL) < 0 ) 14.749 - DPRINTF("Warning - couldn't disable shadow mode"); 14.750 - } 14.751 - 14.752 - free(hvm_buf); 14.753 - free(pfn_batch); 14.754 - free(to_send); 14.755 - free(to_skip); 14.756 - 14.757 - return !!rc; 14.758 -}
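Both save paths above decide per page whether to transmit it using the to_send/to_skip (and, for PV, to_fix) bitmaps built on the BITMAP_ENTRY/BITMAP_SHIFT macros. The following minimal standalone sketch demonstrates that decision: the macros and the three-clause predicate are taken from this changeset, while should_send(), main(), the toy 64-page guest, and the sample bit settings are illustrative only.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define BITS_PER_LONG (sizeof(unsigned long) * 8)
    #define BITMAP_ENTRY(_nr,_bmap) \
        ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
    #define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)

    static int test_bit(int nr, void *addr)
    {
        return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
    }
    static void set_bit(int nr, void *addr)
    {
        BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
    }
    static void clear_bit(int nr, void *addr)
    {
        BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
    }

    /* The batch-selection predicate from the save loops: send a page if it
     * is dirty and not re-dirtied since the peek, or, on the final (paused)
     * pass, if it is still dirty or queued for fixup. */
    static int should_send(int n, void *to_send, void *to_skip,
                           void *to_fix, int last_iter)
    {
        return (test_bit(n, to_send) && !test_bit(n, to_skip)) ||
               (test_bit(n, to_send) && last_iter) ||
               (test_bit(n, to_fix) && last_iter);
    }

    int main(void)
    {
        enum { NPAGES = 64 };            /* toy guest size, illustrative */
        size_t sz = ((NPAGES + BITS_PER_LONG - 1) / BITS_PER_LONG)
                    * sizeof(unsigned long);
        unsigned long *to_send = malloc(sz);
        unsigned long *to_skip = calloc(1, sz);
        unsigned long *to_fix = calloc(1, sz);

        memset(to_send, 0xff, sz); /* first pass: treat every page as dirty */
        set_bit(3, to_skip);       /* page 3 re-dirtied since the peek */
        clear_bit(7, to_send);     /* page 7 wasn't mapped when scanned... */
        set_bit(7, to_fix);        /* ...so it is deferred to the last pass */

        printf("page 3: now=%d last_iter=%d\n",
               should_send(3, to_send, to_skip, to_fix, 0),
               should_send(3, to_send, to_skip, to_fix, 1));
        printf("page 7: now=%d last_iter=%d\n",
               should_send(7, to_send, to_skip, to_fix, 0),
               should_send(7, to_send, to_skip, to_fix, 1));
        free(to_send); free(to_skip); free(to_fix);
        return 0;
    }

Page 3 is deferred while the guest keeps dirtying it but is sent once the domain is suspended; page 7 is picked up on the last pass purely through the fixup bitmap.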
15.1 --- a/tools/libxc/xc_linux_save.c Wed Apr 11 07:30:02 2007 -0600 15.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 15.3 @@ -1,1414 +0,0 @@ 15.4 -/****************************************************************************** 15.5 - * xc_linux_save.c 15.6 - * 15.7 - * Save the state of a running Linux session. 15.8 - * 15.9 - * Copyright (c) 2003, K A Fraser. 15.10 - */ 15.11 - 15.12 -#include <inttypes.h> 15.13 -#include <time.h> 15.14 -#include <stdlib.h> 15.15 -#include <unistd.h> 15.16 -#include <sys/time.h> 15.17 - 15.18 -#include "xc_private.h" 15.19 -#include "xc_dom.h" 15.20 -#include "xg_private.h" 15.21 -#include "xg_save_restore.h" 15.22 - 15.23 -/* 15.24 -** Default values for important tuning parameters. Can override by passing 15.25 -** non-zero replacement values to xc_linux_save(). 15.26 -** 15.27 -** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too. 15.28 -** 15.29 -*/ 15.30 -#define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */ 15.31 -#define DEF_MAX_FACTOR 3 /* never send more than 3x p2m_size */ 15.32 - 15.33 -/* max mfn of the whole machine */ 15.34 -static unsigned long max_mfn; 15.35 - 15.36 -/* virtual starting address of the hypervisor */ 15.37 -static unsigned long hvirt_start; 15.38 - 15.39 -/* #levels of page tables used by the current guest */ 15.40 -static unsigned int pt_levels; 15.41 - 15.42 -/* number of pfns this guest has (i.e. number of entries in the P2M) */ 15.43 -static unsigned long p2m_size; 15.44 - 15.45 -/* Live mapping of the table mapping each PFN to its current MFN. */ 15.46 -static xen_pfn_t *live_p2m = NULL; 15.47 - 15.48 -/* Live mapping of system MFN to PFN table. */ 15.49 -static xen_pfn_t *live_m2p = NULL; 15.50 -static unsigned long m2p_mfn0; 15.51 - 15.52 -/* grep fodder: machine_to_phys */ 15.53 - 15.54 -#define mfn_to_pfn(_mfn) live_m2p[(_mfn)] 15.55 - 15.56 -/* 15.57 - * Returns TRUE if the given machine frame number has a unique mapping 15.58 - * in the guest's pseudophysical map. 15.59 - */ 15.60 -#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \ 15.61 - (((_mfn) < (max_mfn)) && \ 15.62 - ((mfn_to_pfn(_mfn) < (p2m_size)) && \ 15.63 - (live_p2m[mfn_to_pfn(_mfn)] == (_mfn)))) 15.64 - 15.65 -/* Returns TRUE if MFN is successfully converted to a PFN. */ 15.66 -#define translate_mfn_to_pfn(_pmfn) \ 15.67 -({ \ 15.68 - unsigned long mfn = *(_pmfn); \ 15.69 - int _res = 1; \ 15.70 - if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) \ 15.71 - _res = 0; \ 15.72 - else \ 15.73 - *(_pmfn) = mfn_to_pfn(mfn); \ 15.74 - _res; \ 15.75 -}) 15.76 - 15.77 -/* 15.78 -** During (live) save/migrate, we maintain a number of bitmaps to track 15.79 -** which pages we have to send, to fixup, and to skip. 15.80 -*/ 15.81 - 15.82 -#define BITS_PER_LONG (sizeof(unsigned long) * 8) 15.83 -#define BITMAP_SIZE ((p2m_size + BITS_PER_LONG - 1) / 8) 15.84 - 15.85 -#define BITMAP_ENTRY(_nr,_bmap) \ 15.86 - ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG] 15.87 - 15.88 -#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG) 15.89 - 15.90 -static inline int test_bit (int nr, volatile void * addr) 15.91 -{ 15.92 - return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1; 15.93 -} 15.94 - 15.95 -static inline void clear_bit (int nr, volatile void * addr) 15.96 -{ 15.97 - BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr)); 15.98 -} 15.99 - 15.100 -static inline void set_bit ( int nr, volatile void * addr) 15.101 -{ 15.102 - BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr)); 15.103 -} 15.104 - 15.105 -/* Returns the hamming weight (i.e. 
the number of bits set) in an N-bit word */ 15.106 -static inline unsigned int hweight32(unsigned int w) 15.107 -{ 15.108 - unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555); 15.109 - res = (res & 0x33333333) + ((res >> 2) & 0x33333333); 15.110 - res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F); 15.111 - res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF); 15.112 - return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF); 15.113 -} 15.114 - 15.115 -static inline int count_bits ( int nr, volatile void *addr) 15.116 -{ 15.117 - int i, count = 0; 15.118 - volatile unsigned long *p = (volatile unsigned long *)addr; 15.119 - /* We know that the array is padded to unsigned long. */ 15.120 - for ( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ ) 15.121 - count += hweight32(*p); 15.122 - return count; 15.123 -} 15.124 - 15.125 -static inline int permute( int i, int nr, int order_nr ) 15.126 -{ 15.127 - /* Need a simple permutation function so that we scan pages in a 15.128 - pseudo random order, enabling us to get a better estimate of 15.129 - the domain's page dirtying rate as we go (there are often 15.130 - contiguous ranges of pfns that have similar behaviour, and we 15.131 - want to mix them up). */ 15.132 - 15.133 - /* e.g. nr->order 15->4 16->4 17->5 */ 15.134 - /* 512MB domain, 128k pages, order 17 */ 15.135 - 15.136 - /* 15.137 - QPONMLKJIHGFEDCBA 15.138 - QPONMLKJIH 15.139 - GFEDCBA 15.140 - */ 15.141 - 15.142 - /* 15.143 - QPONMLKJIHGFEDCBA 15.144 - EDCBA 15.145 - QPONM 15.146 - LKJIHGF 15.147 - */ 15.148 - 15.149 - do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); } 15.150 - while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */ 15.151 - 15.152 - return i; 15.153 -} 15.154 - 15.155 -static uint64_t tv_to_us(struct timeval *new) 15.156 -{ 15.157 - return (new->tv_sec * 1000000) + new->tv_usec; 15.158 -} 15.159 - 15.160 -static uint64_t llgettimeofday(void) 15.161 -{ 15.162 - struct timeval now; 15.163 - gettimeofday(&now, NULL); 15.164 - return tv_to_us(&now); 15.165 -} 15.166 - 15.167 -static uint64_t tv_delta(struct timeval *new, struct timeval *old) 15.168 -{ 15.169 - return (((new->tv_sec - old->tv_sec)*1000000) + 15.170 - (new->tv_usec - old->tv_usec)); 15.171 -} 15.172 - 15.173 -static int noncached_write(int fd, int live, void *buffer, int len) 15.174 -{ 15.175 - static int write_count = 0; 15.176 - 15.177 - int rc = write(fd,buffer,len); 15.178 - 15.179 - write_count += len; 15.180 - if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) ) 15.181 - { 15.182 - /* Time to discard cache - don't care if this fails */ 15.183 - discard_file_cache(fd, 0 /* no flush */); 15.184 - write_count = 0; 15.185 - } 15.186 - 15.187 - return rc; 15.188 -} 15.189 - 15.190 -#ifdef ADAPTIVE_SAVE 15.191 - 15.192 -/* 15.193 -** We control the rate at which we transmit (or save) to minimize impact 15.194 -** on running domains (including the target if we're doing live migrate). 
15.195 -*/ 15.196 - 15.197 -#define MAX_MBIT_RATE 500 /* maximum transmit rate for migrate */ 15.198 -#define START_MBIT_RATE 100 /* initial transmit rate for migrate */ 15.199 - 15.200 -/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */ 15.201 -#define RATE_TO_BTU 781250 15.202 - 15.203 -/* Amount in bytes we allow ourselves to send in a burst */ 15.204 -#define BURST_BUDGET (100*1024) 15.205 - 15.206 -/* We keep track of the current and previous transmission rate */ 15.207 -static int mbit_rate, ombit_rate = 0; 15.208 - 15.209 -/* Have we reached the maximum transmission rate? */ 15.210 -#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE) 15.211 - 15.212 -static inline void initialize_mbit_rate() 15.213 -{ 15.214 - mbit_rate = START_MBIT_RATE; 15.215 -} 15.216 - 15.217 -static int ratewrite(int io_fd, int live, void *buf, int n) 15.218 -{ 15.219 - static int budget = 0; 15.220 - static int burst_time_us = -1; 15.221 - static struct timeval last_put = { 0 }; 15.222 - struct timeval now; 15.223 - struct timespec delay; 15.224 - long long delta; 15.225 - 15.226 - if ( START_MBIT_RATE == 0 ) 15.227 - return noncached_write(io_fd, live, buf, n); 15.228 - 15.229 - budget -= n; 15.230 - if ( budget < 0 ) 15.231 - { 15.232 - if ( mbit_rate != ombit_rate ) 15.233 - { 15.234 - burst_time_us = RATE_TO_BTU / mbit_rate; 15.235 - ombit_rate = mbit_rate; 15.236 - DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n", 15.237 - mbit_rate, BURST_BUDGET, burst_time_us); 15.238 - } 15.239 - if ( last_put.tv_sec == 0 ) 15.240 - { 15.241 - budget += BURST_BUDGET; 15.242 - gettimeofday(&last_put, NULL); 15.243 - } 15.244 - else 15.245 - { 15.246 - while ( budget < 0 ) 15.247 - { 15.248 - gettimeofday(&now, NULL); 15.249 - delta = tv_delta(&now, &last_put); 15.250 - while ( delta > burst_time_us ) 15.251 - { 15.252 - budget += BURST_BUDGET; 15.253 - last_put.tv_usec += burst_time_us; 15.254 - if ( last_put.tv_usec > 1000000 ) 15.255 - { 15.256 - last_put.tv_usec -= 1000000; 15.257 - last_put.tv_sec++; 15.258 - } 15.259 - delta -= burst_time_us; 15.260 - } 15.261 - if ( budget > 0 ) 15.262 - break; 15.263 - delay.tv_sec = 0; 15.264 - delay.tv_nsec = 1000 * (burst_time_us - delta); 15.265 - while ( delay.tv_nsec > 0 ) 15.266 - if ( nanosleep(&delay, &delay) == 0 ) 15.267 - break; 15.268 - } 15.269 - } 15.270 - } 15.271 - return noncached_write(io_fd, live, buf, n); 15.272 -} 15.273 - 15.274 -#else /* ! 
ADAPTIVE SAVE */ 15.275 - 15.276 -#define RATE_IS_MAX() (0) 15.277 -#define ratewrite(_io_fd, _live, _buf, _n) noncached_write((_io_fd), (_live), (_buf), (_n)) 15.278 -#define initialize_mbit_rate() 15.279 - 15.280 -#endif 15.281 - 15.282 -static inline ssize_t write_exact(int fd, void *buf, size_t count) 15.283 -{ 15.284 - return (write(fd, buf, count) == count); 15.285 -} 15.286 - 15.287 -static int print_stats(int xc_handle, uint32_t domid, int pages_sent, 15.288 - xc_shadow_op_stats_t *stats, int print) 15.289 -{ 15.290 - static struct timeval wall_last; 15.291 - static long long d0_cpu_last; 15.292 - static long long d1_cpu_last; 15.293 - 15.294 - struct timeval wall_now; 15.295 - long long wall_delta; 15.296 - long long d0_cpu_now, d0_cpu_delta; 15.297 - long long d1_cpu_now, d1_cpu_delta; 15.298 - 15.299 - gettimeofday(&wall_now, NULL); 15.300 - 15.301 - d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000; 15.302 - d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000; 15.303 - 15.304 - if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) ) 15.305 - DPRINTF("ARRHHH!!\n"); 15.306 - 15.307 - wall_delta = tv_delta(&wall_now,&wall_last)/1000; 15.308 - if ( wall_delta == 0 ) 15.309 - wall_delta = 1; 15.310 - 15.311 - d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000; 15.312 - d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000; 15.313 - 15.314 - if ( print ) 15.315 - DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, " 15.316 - "dirtied %dMb/s %" PRId32 " pages\n", 15.317 - wall_delta, 15.318 - (int)((d0_cpu_delta*100)/wall_delta), 15.319 - (int)((d1_cpu_delta*100)/wall_delta), 15.320 - (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))), 15.321 - (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))), 15.322 - stats->dirty_count); 15.323 - 15.324 -#ifdef ADAPTIVE_SAVE 15.325 - if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate ) 15.326 - { 15.327 - mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) 15.328 - + 50; 15.329 - if ( mbit_rate > MAX_MBIT_RATE ) 15.330 - mbit_rate = MAX_MBIT_RATE; 15.331 - } 15.332 -#endif 15.333 - 15.334 - d0_cpu_last = d0_cpu_now; 15.335 - d1_cpu_last = d1_cpu_now; 15.336 - wall_last = wall_now; 15.337 - 15.338 - return 0; 15.339 -} 15.340 - 15.341 - 15.342 -static int analysis_phase(int xc_handle, uint32_t domid, int p2m_size, 15.343 - unsigned long *arr, int runs) 15.344 -{ 15.345 - long long start, now; 15.346 - xc_shadow_op_stats_t stats; 15.347 - int j; 15.348 - 15.349 - start = llgettimeofday(); 15.350 - 15.351 - for ( j = 0; j < runs; j++ ) 15.352 - { 15.353 - int i; 15.354 - 15.355 - xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN, 15.356 - arr, p2m_size, NULL, 0, NULL); 15.357 - DPRINTF("#Flush\n"); 15.358 - for ( i = 0; i < 40; i++ ) 15.359 - { 15.360 - usleep(50000); 15.361 - now = llgettimeofday(); 15.362 - xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK, 15.363 - NULL, 0, NULL, 0, &stats); 15.364 - DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n", 15.365 - ((now-start)+500)/1000, 15.366 - stats.fault_count, stats.dirty_count); 15.367 - } 15.368 - } 15.369 - 15.370 - return -1; 15.371 -} 15.372 - 15.373 - 15.374 -static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd, 15.375 - int dom, xc_dominfo_t *info, 15.376 - vcpu_guest_context_t *ctxt) 15.377 -{ 15.378 - int i = 0; 15.379 - 15.380 - if ( !(*suspend)(dom) ) 15.381 - { 15.382 - ERROR("Suspend request failed"); 15.383 - return -1; 15.384 - } 15.385 - 15.386 - 
retry: 15.387 - 15.388 - if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 ) 15.389 - { 15.390 - ERROR("Could not get domain info"); 15.391 - return -1; 15.392 - } 15.393 - 15.394 - if ( xc_vcpu_getcontext(xc_handle, dom, 0, ctxt) ) 15.395 - ERROR("Could not get vcpu context"); 15.396 - 15.397 - 15.398 - if ( info->dying ) 15.399 - { 15.400 - ERROR("domain is dying"); 15.401 - return -1; 15.402 - } 15.403 - 15.404 - if ( info->crashed ) 15.405 - { 15.406 - ERROR("domain has crashed"); 15.407 - return -1; 15.408 - } 15.409 - 15.410 - if ( info->shutdown ) 15.411 - { 15.412 - switch ( info->shutdown_reason ) 15.413 - { 15.414 - case SHUTDOWN_poweroff: 15.415 - case SHUTDOWN_reboot: 15.416 - ERROR("domain has shut down"); 15.417 - return -1; 15.418 - case SHUTDOWN_suspend: 15.419 - return 0; 15.420 - case SHUTDOWN_crash: 15.421 - ERROR("domain has crashed"); 15.422 - return -1; 15.423 - } 15.424 - } 15.425 - 15.426 - if ( info->paused ) 15.427 - { 15.428 - /* Try unpausing domain, wait, and retest. */ 15.429 - xc_domain_unpause( xc_handle, dom ); 15.430 - ERROR("Domain was paused. Wait and re-test."); 15.431 - usleep(10000); /* 10ms */ 15.432 - goto retry; 15.433 - } 15.434 - 15.435 - if ( ++i < 100 ) 15.436 - { 15.437 - ERROR("Retry suspend domain"); 15.438 - usleep(10000); /* 10ms */ 15.439 - goto retry; 15.440 - } 15.441 - 15.442 - ERROR("Unable to suspend domain."); 15.443 - 15.444 - return -1; 15.445 -} 15.446 - 15.447 -/* 15.448 -** Map the top-level page of MFNs from the guest. The guest might not have 15.449 -** finished resuming from a previous restore operation, so we wait a while for 15.450 -** it to update the MFN to a reasonable value. 15.451 -*/ 15.452 -static void *map_frame_list_list(int xc_handle, uint32_t dom, 15.453 - shared_info_t *shinfo) 15.454 -{ 15.455 - int count = 100; 15.456 - void *p; 15.457 - 15.458 - while ( count-- && (shinfo->arch.pfn_to_mfn_frame_list_list == 0) ) 15.459 - usleep(10000); 15.460 - 15.461 - if ( shinfo->arch.pfn_to_mfn_frame_list_list == 0 ) 15.462 - { 15.463 - ERROR("Timed out waiting for frame list to be updated."); 15.464 - return NULL; 15.465 - } 15.466 - 15.467 - p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ, 15.468 - shinfo->arch.pfn_to_mfn_frame_list_list); 15.469 - if ( p == NULL ) 15.470 - ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno); 15.471 - 15.472 - return p; 15.473 -} 15.474 - 15.475 -/* 15.476 -** During transfer (or in the state file), all page-table pages must be 15.477 -** converted into a 'canonical' form where references to actual mfns 15.478 -** are replaced with references to the corresponding pfns. 15.479 -** 15.480 -** This function performs the appropriate conversion, taking into account 15.481 -** which entries do not require canonicalization (in particular, those 15.482 -** entries which map the virtual address reserved for the hypervisor). 15.483 -*/ 15.484 -static int canonicalize_pagetable(unsigned long type, unsigned long pfn, 15.485 - const void *spage, void *dpage) 15.486 -{ 15.487 - 15.488 - int i, pte_last, xen_start, xen_end, race = 0; 15.489 - uint64_t pte; 15.490 - 15.491 - /* 15.492 - ** We need to determine which entries in this page table hold 15.493 - ** reserved hypervisor mappings. This depends on the current 15.494 - ** page table type as well as the number of paging levels. 15.495 - */ 15.496 - xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 
4 : 8); 15.497 - 15.498 - if ( (pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) ) 15.499 - xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT); 15.500 - 15.501 - if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) ) 15.502 - xen_start = L3_PAGETABLE_ENTRIES_PAE; 15.503 - 15.504 - /* 15.505 - ** in PAE only the L2 mapping the top 1GB contains Xen mappings. 15.506 - ** We can spot this by looking for the guest linear mapping which 15.507 - ** Xen always ensures is present in that L2. Guests must ensure 15.508 - ** that this check will fail for other L2s. 15.509 - */ 15.510 - if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) ) 15.511 - { 15.512 - int hstart; 15.513 - uint64_t he; 15.514 - 15.515 - hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff; 15.516 - he = ((const uint64_t *) spage)[hstart]; 15.517 - 15.518 - if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 ) 15.519 - { 15.520 - /* hvirt starts with xen stuff... */ 15.521 - xen_start = hstart; 15.522 - } 15.523 - else if ( hvirt_start != 0xf5800000 ) 15.524 - { 15.525 - /* old L2s from before hole was shrunk... */ 15.526 - hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff; 15.527 - he = ((const uint64_t *) spage)[hstart]; 15.528 - if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 ) 15.529 - xen_start = hstart; 15.530 - } 15.531 - } 15.532 - 15.533 - if ( (pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) ) 15.534 - { 15.535 - /* 15.536 - ** XXX SMH: should compute these from hvirt_start (which we have) 15.537 - ** and hvirt_end (which we don't) 15.538 - */ 15.539 - xen_start = 256; 15.540 - xen_end = 272; 15.541 - } 15.542 - 15.543 - /* Now iterate through the page table, canonicalizing each PTE */ 15.544 - for (i = 0; i < pte_last; i++ ) 15.545 - { 15.546 - unsigned long pfn, mfn; 15.547 - 15.548 - if ( pt_levels == 2 ) 15.549 - pte = ((const uint32_t*)spage)[i]; 15.550 - else 15.551 - pte = ((const uint64_t*)spage)[i]; 15.552 - 15.553 - if ( (i >= xen_start) && (i < xen_end) ) 15.554 - pte = 0; 15.555 - 15.556 - if ( pte & _PAGE_PRESENT ) 15.557 - { 15.558 - mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86; 15.559 - if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) 15.560 - { 15.561 - /* This will happen if the type info is stale which 15.562 - is quite feasible under live migration */ 15.563 - pfn = 0; /* zap it - we'll retransmit this page later */ 15.564 - race = 1; /* inform the caller of race; fatal if !live */ 15.565 - } 15.566 - else 15.567 - pfn = mfn_to_pfn(mfn); 15.568 - 15.569 - pte &= ~MADDR_MASK_X86; 15.570 - pte |= (uint64_t)pfn << PAGE_SHIFT; 15.571 - 15.572 - /* 15.573 - * PAE guest L3Es can contain these flags when running on 15.574 - * a 64bit hypervisor. We zap these here to avoid any 15.575 - * surprise at restore time... 
15.576 - */ 15.577 - if ( (pt_levels == 3) && 15.578 - (type == XEN_DOMCTL_PFINFO_L3TAB) && 15.579 - (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) ) 15.580 - pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED); 15.581 - } 15.582 - 15.583 - if ( pt_levels == 2 ) 15.584 - ((uint32_t*)dpage)[i] = pte; 15.585 - else 15.586 - ((uint64_t*)dpage)[i] = pte; 15.587 - } 15.588 - 15.589 - return race; 15.590 -} 15.591 - 15.592 -static xen_pfn_t *xc_map_m2p(int xc_handle, 15.593 - unsigned long max_mfn, 15.594 - int prot) 15.595 -{ 15.596 - struct xen_machphys_mfn_list xmml; 15.597 - privcmd_mmap_entry_t *entries; 15.598 - unsigned long m2p_chunks, m2p_size; 15.599 - xen_pfn_t *m2p; 15.600 - xen_pfn_t *extent_start; 15.601 - int i, rc; 15.602 - 15.603 - m2p_size = M2P_SIZE(max_mfn); 15.604 - m2p_chunks = M2P_CHUNKS(max_mfn); 15.605 - 15.606 - xmml.max_extents = m2p_chunks; 15.607 - if ( !(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t))) ) 15.608 - { 15.609 - ERROR("failed to allocate space for m2p mfns"); 15.610 - return NULL; 15.611 - } 15.612 - set_xen_guest_handle(xmml.extent_start, extent_start); 15.613 - 15.614 - if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) || 15.615 - (xmml.nr_extents != m2p_chunks) ) 15.616 - { 15.617 - ERROR("xc_get_m2p_mfns"); 15.618 - return NULL; 15.619 - } 15.620 - 15.621 - if ( (m2p = mmap(NULL, m2p_size, prot, 15.622 - MAP_SHARED, xc_handle, 0)) == MAP_FAILED ) 15.623 - { 15.624 - ERROR("failed to mmap m2p"); 15.625 - return NULL; 15.626 - } 15.627 - 15.628 - if ( !(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t))) ) 15.629 - { 15.630 - ERROR("failed to allocate space for mmap entries"); 15.631 - return NULL; 15.632 - } 15.633 - 15.634 - for ( i = 0; i < m2p_chunks; i++ ) 15.635 - { 15.636 - entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE)); 15.637 - entries[i].mfn = extent_start[i]; 15.638 - entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT; 15.639 - } 15.640 - 15.641 - if ( (rc = xc_map_foreign_ranges(xc_handle, DOMID_XEN, 15.642 - entries, m2p_chunks)) < 0 ) 15.643 - { 15.644 - ERROR("xc_mmap_foreign_ranges failed (rc = %d)", rc); 15.645 - return NULL; 15.646 - } 15.647 - 15.648 - m2p_mfn0 = entries[0].mfn; 15.649 - 15.650 - free(extent_start); 15.651 - free(entries); 15.652 - 15.653 - return m2p; 15.654 -} 15.655 - 15.656 -int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, 15.657 - uint32_t max_factor, uint32_t flags, int (*suspend)(int)) 15.658 -{ 15.659 - xc_dominfo_t info; 15.660 - 15.661 - int rc = 1, i, j, last_iter, iter = 0; 15.662 - int live = (flags & XCFLAGS_LIVE); 15.663 - int debug = (flags & XCFLAGS_DEBUG); 15.664 - int race = 0, sent_last_iter, skip_this_iter; 15.665 - 15.666 - /* The new domain's shared-info frame number. */ 15.667 - unsigned long shared_info_frame; 15.668 - 15.669 - /* A copy of the CPU context of the guest. */ 15.670 - vcpu_guest_context_t ctxt; 15.671 - 15.672 - /* A table containg the type of each PFN (/not/ MFN!). */ 15.673 - unsigned long *pfn_type = NULL; 15.674 - unsigned long *pfn_batch = NULL; 15.675 - 15.676 - /* A temporary mapping, and a copy, of one frame of guest memory. */ 15.677 - char page[PAGE_SIZE]; 15.678 - 15.679 - /* Double and single indirect references to the live P2M table */ 15.680 - xen_pfn_t *live_p2m_frame_list_list = NULL; 15.681 - xen_pfn_t *live_p2m_frame_list = NULL; 15.682 - 15.683 - /* A copy of the pfn-to-mfn table frame list. 
*/ 15.684 - xen_pfn_t *p2m_frame_list = NULL; 15.685 - 15.686 - /* Live mapping of shared info structure */ 15.687 - shared_info_t *live_shinfo = NULL; 15.688 - 15.689 - /* base of the region in which domain memory is mapped */ 15.690 - unsigned char *region_base = NULL; 15.691 - 15.692 - /* power of 2 order of p2m_size */ 15.693 - int order_nr; 15.694 - 15.695 - /* bitmap of pages: 15.696 - - that should be sent this iteration (unless later marked as skip); 15.697 - - to skip this iteration because already dirty; 15.698 - - to fixup by sending at the end if not already resent; */ 15.699 - unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL; 15.700 - 15.701 - xc_shadow_op_stats_t stats; 15.702 - 15.703 - unsigned long needed_to_fix = 0; 15.704 - unsigned long total_sent = 0; 15.705 - 15.706 - uint64_t vcpumap = 1ULL; 15.707 - 15.708 - /* If no explicit control parameters given, use defaults */ 15.709 - max_iters = max_iters ? : DEF_MAX_ITERS; 15.710 - max_factor = max_factor ? : DEF_MAX_FACTOR; 15.711 - 15.712 - initialize_mbit_rate(); 15.713 - 15.714 - if ( !get_platform_info(xc_handle, dom, 15.715 - &max_mfn, &hvirt_start, &pt_levels) ) 15.716 - { 15.717 - ERROR("Unable to get platform info."); 15.718 - return 1; 15.719 - } 15.720 - 15.721 - if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 ) 15.722 - { 15.723 - ERROR("Could not get domain info"); 15.724 - return 1; 15.725 - } 15.726 - 15.727 - if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) ) 15.728 - { 15.729 - ERROR("Could not get vcpu context"); 15.730 - goto out; 15.731 - } 15.732 - shared_info_frame = info.shared_info_frame; 15.733 - 15.734 - /* Map the shared info frame */ 15.735 - if ( !(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, 15.736 - PROT_READ, shared_info_frame)) ) 15.737 - { 15.738 - ERROR("Couldn't map live_shinfo"); 15.739 - goto out; 15.740 - } 15.741 - 15.742 - p2m_size = live_shinfo->arch.max_pfn; 15.743 - 15.744 - live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom, 15.745 - live_shinfo); 15.746 - if ( !live_p2m_frame_list_list ) 15.747 - goto out; 15.748 - 15.749 - live_p2m_frame_list = 15.750 - xc_map_foreign_batch(xc_handle, dom, PROT_READ, 15.751 - live_p2m_frame_list_list, 15.752 - P2M_FLL_ENTRIES); 15.753 - if ( !live_p2m_frame_list ) 15.754 - { 15.755 - ERROR("Couldn't map p2m_frame_list"); 15.756 - goto out; 15.757 - } 15.758 - 15.759 - /* Map all the frames of the pfn->mfn table. For migrate to succeed, 15.760 - the guest must not change which frames are used for this purpose. 15.761 - (its not clear why it would want to change them, and we'll be OK 15.762 - from a safety POV anyhow. */ 15.763 - 15.764 - live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ, 15.765 - live_p2m_frame_list, 15.766 - P2M_FL_ENTRIES); 15.767 - if ( !live_p2m ) 15.768 - { 15.769 - ERROR("Couldn't map p2m table"); 15.770 - goto out; 15.771 - } 15.772 - 15.773 - /* Setup the mfn_to_pfn table mapping */ 15.774 - if ( !(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ)) ) 15.775 - { 15.776 - ERROR("Failed to map live M2P table"); 15.777 - goto out; 15.778 - } 15.779 - 15.780 - 15.781 - /* Get a local copy of the live_P2M_frame_list */ 15.782 - if ( !(p2m_frame_list = malloc(P2M_FL_SIZE)) ) 15.783 - { 15.784 - ERROR("Couldn't allocate p2m_frame_list array"); 15.785 - goto out; 15.786 - } 15.787 - memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE); 15.788 - 15.789 - /* Canonicalise the pfn-to-mfn table frame-number list. 
*/ 15.790 - for ( i = 0; i < p2m_size; i += fpp ) 15.791 - { 15.792 - if ( !translate_mfn_to_pfn(&p2m_frame_list[i/fpp]) ) 15.793 - { 15.794 - ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys"); 15.795 - ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64, i, i/fpp, 15.796 - (uint64_t)p2m_frame_list[i/fpp]); 15.797 - goto out; 15.798 - } 15.799 - } 15.800 - 15.801 - /* Domain is still running at this point */ 15.802 - if ( live ) 15.803 - { 15.804 - /* Live suspend. Enable log-dirty mode. */ 15.805 - if ( xc_shadow_control(xc_handle, dom, 15.806 - XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY, 15.807 - NULL, 0, NULL, 0, NULL) < 0 ) 15.808 - { 15.809 - ERROR("Couldn't enable shadow mode"); 15.810 - goto out; 15.811 - } 15.812 - } 15.813 - else 15.814 - { 15.815 - /* This is a non-live suspend. Suspend the domain .*/ 15.816 - if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt) ) 15.817 - { 15.818 - ERROR("Domain appears not to have suspended"); 15.819 - goto out; 15.820 - } 15.821 - } 15.822 - 15.823 - last_iter = !live; 15.824 - 15.825 - /* pretend we sent all the pages last iteration */ 15.826 - sent_last_iter = p2m_size; 15.827 - 15.828 - /* calculate the power of 2 order of p2m_size, e.g. 15.829 - 15->4 16->4 17->5 */ 15.830 - for ( i = p2m_size-1, order_nr = 0; i ; i >>= 1, order_nr++ ) 15.831 - continue; 15.832 - 15.833 - /* Setup to_send / to_fix and to_skip bitmaps */ 15.834 - to_send = malloc(BITMAP_SIZE); 15.835 - to_fix = calloc(1, BITMAP_SIZE); 15.836 - to_skip = malloc(BITMAP_SIZE); 15.837 - 15.838 - if ( !to_send || !to_fix || !to_skip ) 15.839 - { 15.840 - ERROR("Couldn't allocate to_send array"); 15.841 - goto out; 15.842 - } 15.843 - 15.844 - memset(to_send, 0xff, BITMAP_SIZE); 15.845 - 15.846 - if ( lock_pages(to_send, BITMAP_SIZE) ) 15.847 - { 15.848 - ERROR("Unable to lock to_send"); 15.849 - return 1; 15.850 - } 15.851 - 15.852 - /* (to fix is local only) */ 15.853 - if ( lock_pages(to_skip, BITMAP_SIZE) ) 15.854 - { 15.855 - ERROR("Unable to lock to_skip"); 15.856 - return 1; 15.857 - } 15.858 - 15.859 - analysis_phase(xc_handle, dom, p2m_size, to_skip, 0); 15.860 - 15.861 - /* We want zeroed memory so use calloc rather than malloc. */ 15.862 - pfn_type = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type)); 15.863 - pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch)); 15.864 - if ( (pfn_type == NULL) || (pfn_batch == NULL) ) 15.865 - { 15.866 - ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays"); 15.867 - errno = ENOMEM; 15.868 - goto out; 15.869 - } 15.870 - 15.871 - if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) ) 15.872 - { 15.873 - ERROR("Unable to lock"); 15.874 - goto out; 15.875 - } 15.876 - 15.877 - /* 15.878 - * Quick belt and braces sanity check. 15.879 - */ 15.880 - { 15.881 - int err=0; 15.882 - unsigned long mfn; 15.883 - for ( i = 0; i < p2m_size; i++ ) 15.884 - { 15.885 - mfn = live_p2m[i]; 15.886 - if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) ) 15.887 - { 15.888 - DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i, 15.889 - mfn, mfn_to_pfn(mfn)); 15.890 - err++; 15.891 - } 15.892 - } 15.893 - DPRINTF("Had %d unexplained entries in p2m table\n", err); 15.894 - } 15.895 - 15.896 - /* Start writing out the saved-domain record. 
*/ 15.897 - if ( !write_exact(io_fd, &p2m_size, sizeof(unsigned long)) ) 15.898 - { 15.899 - ERROR("write: p2m_size"); 15.900 - goto out; 15.901 - } 15.902 - 15.903 - /* 15.904 - * Write an extended-info structure to inform the restore code that 15.905 - * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off 15.906 - * slow paths in the restore code. 15.907 - */ 15.908 - if ( (pt_levels == 3) && 15.909 - (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3)) ) 15.910 - { 15.911 - unsigned long signature = ~0UL; 15.912 - uint32_t tot_sz = sizeof(struct vcpu_guest_context) + 8; 15.913 - uint32_t chunk_sz = sizeof(struct vcpu_guest_context); 15.914 - char chunk_sig[] = "vcpu"; 15.915 - if ( !write_exact(io_fd, &signature, sizeof(signature)) || 15.916 - !write_exact(io_fd, &tot_sz, sizeof(tot_sz)) || 15.917 - !write_exact(io_fd, &chunk_sig, 4) || 15.918 - !write_exact(io_fd, &chunk_sz, sizeof(chunk_sz)) || 15.919 - !write_exact(io_fd, &ctxt, sizeof(ctxt)) ) 15.920 - { 15.921 - ERROR("write: extended info"); 15.922 - goto out; 15.923 - } 15.924 - } 15.925 - 15.926 - if ( !write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE) ) 15.927 - { 15.928 - ERROR("write: p2m_frame_list"); 15.929 - goto out; 15.930 - } 15.931 - 15.932 - print_stats(xc_handle, dom, 0, &stats, 0); 15.933 - 15.934 - /* Now write out each data page, canonicalising page tables as we go... */ 15.935 - for ( ; ; ) 15.936 - { 15.937 - unsigned int prev_pc, sent_this_iter, N, batch; 15.938 - 15.939 - iter++; 15.940 - sent_this_iter = 0; 15.941 - skip_this_iter = 0; 15.942 - prev_pc = 0; 15.943 - N = 0; 15.944 - 15.945 - DPRINTF("Saving memory pages: iter %d 0%%", iter); 15.946 - 15.947 - while ( N < p2m_size ) 15.948 - { 15.949 - unsigned int this_pc = (N * 100) / p2m_size; 15.950 - int rc; 15.951 - 15.952 - if ( (this_pc - prev_pc) >= 5 ) 15.953 - { 15.954 - DPRINTF("\b\b\b\b%3d%%", this_pc); 15.955 - prev_pc = this_pc; 15.956 - } 15.957 - 15.958 - if ( !last_iter ) 15.959 - { 15.960 - /* Slightly wasteful to peek the whole array evey time, 15.961 - but this is fast enough for the moment. */ 15.962 - rc = xc_shadow_control( 15.963 - xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip, 15.964 - p2m_size, NULL, 0, NULL); 15.965 - if ( rc != p2m_size ) 15.966 - { 15.967 - ERROR("Error peeking shadow bitmap"); 15.968 - goto out; 15.969 - } 15.970 - } 15.971 - 15.972 - /* load pfn_type[] with the mfn of all the pages we're doing in 15.973 - this batch. */ 15.974 - for ( batch = 0; 15.975 - (batch < MAX_BATCH_SIZE) && (N < p2m_size); 15.976 - N++ ) 15.977 - { 15.978 - int n = permute(N, p2m_size, order_nr); 15.979 - 15.980 - if ( debug ) 15.981 - DPRINTF("%d pfn= %08lx mfn= %08lx %d [mfn]= %08lx\n", 15.982 - iter, (unsigned long)n, live_p2m[n], 15.983 - test_bit(n, to_send), 15.984 - mfn_to_pfn(live_p2m[n]&0xFFFFF)); 15.985 - 15.986 - if ( !last_iter && 15.987 - test_bit(n, to_send) && 15.988 - test_bit(n, to_skip) ) 15.989 - skip_this_iter++; /* stats keeping */ 15.990 - 15.991 - if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) || 15.992 - (test_bit(n, to_send) && last_iter) || 15.993 - (test_bit(n, to_fix) && last_iter)) ) 15.994 - continue; 15.995 - 15.996 - /* 15.997 - ** we get here if: 15.998 - ** 1. page is marked to_send & hasn't already been re-dirtied 15.999 - ** 2. (ignore to_skip in last iteration) 15.1000 - ** 3. 
add in pages that still need fixup (net bufs) 15.1001 - */ 15.1002 - 15.1003 - pfn_batch[batch] = n; 15.1004 - pfn_type[batch] = live_p2m[n]; 15.1005 - 15.1006 - if ( !is_mapped(pfn_type[batch]) ) 15.1007 - { 15.1008 - /* 15.1009 - ** not currently in psuedo-physical map -- set bit 15.1010 - ** in to_fix since we must send this page in last_iter 15.1011 - ** unless its sent sooner anyhow, or it never enters 15.1012 - ** pseudo-physical map (e.g. for ballooned down domains) 15.1013 - */ 15.1014 - set_bit(n, to_fix); 15.1015 - continue; 15.1016 - } 15.1017 - 15.1018 - if ( last_iter && 15.1019 - test_bit(n, to_fix) && 15.1020 - !test_bit(n, to_send) ) 15.1021 - { 15.1022 - needed_to_fix++; 15.1023 - DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n", 15.1024 - iter, n, pfn_type[batch]); 15.1025 - } 15.1026 - 15.1027 - clear_bit(n, to_fix); 15.1028 - 15.1029 - batch++; 15.1030 - } 15.1031 - 15.1032 - if ( batch == 0 ) 15.1033 - goto skip; /* vanishingly unlikely... */ 15.1034 - 15.1035 - region_base = xc_map_foreign_batch( 15.1036 - xc_handle, dom, PROT_READ, pfn_type, batch); 15.1037 - if ( region_base == NULL ) 15.1038 - { 15.1039 - ERROR("map batch failed"); 15.1040 - goto out; 15.1041 - } 15.1042 - 15.1043 - for ( j = 0; j < batch; j++ ) 15.1044 - ((uint32_t *)pfn_type)[j] = pfn_type[j]; 15.1045 - if ( xc_get_pfn_type_batch(xc_handle, dom, batch, 15.1046 - (uint32_t *)pfn_type) ) 15.1047 - { 15.1048 - ERROR("get_pfn_type_batch failed"); 15.1049 - goto out; 15.1050 - } 15.1051 - for ( j = batch-1; j >= 0; j-- ) 15.1052 - pfn_type[j] = ((uint32_t *)pfn_type)[j]; 15.1053 - 15.1054 - for ( j = 0; j < batch; j++ ) 15.1055 - { 15.1056 - 15.1057 - if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) == 15.1058 - XEN_DOMCTL_PFINFO_XTAB ) 15.1059 - { 15.1060 - DPRINTF("type fail: page %i mfn %08lx\n", j, pfn_type[j]); 15.1061 - continue; 15.1062 - } 15.1063 - 15.1064 - if ( debug ) 15.1065 - DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx" 15.1066 - " sum= %08lx\n", 15.1067 - iter, 15.1068 - (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) | 15.1069 - pfn_batch[j], 15.1070 - pfn_type[j], 15.1071 - mfn_to_pfn(pfn_type[j] & 15.1072 - ~XEN_DOMCTL_PFINFO_LTAB_MASK), 15.1073 - csum_page(region_base + (PAGE_SIZE*j))); 15.1074 - 15.1075 - /* canonicalise mfn->pfn */ 15.1076 - pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) | 15.1077 - pfn_batch[j]; 15.1078 - } 15.1079 - 15.1080 - if ( !write_exact(io_fd, &batch, sizeof(unsigned int)) ) 15.1081 - { 15.1082 - ERROR("Error when writing to state file (2) (errno %d)", 15.1083 - errno); 15.1084 - goto out; 15.1085 - } 15.1086 - 15.1087 - if ( !write_exact(io_fd, pfn_type, sizeof(unsigned long)*j) ) 15.1088 - { 15.1089 - ERROR("Error when writing to state file (3) (errno %d)", 15.1090 - errno); 15.1091 - goto out; 15.1092 - } 15.1093 - 15.1094 - /* entering this loop, pfn_type is now in pfns (Not mfns) */ 15.1095 - for ( j = 0; j < batch; j++ ) 15.1096 - { 15.1097 - unsigned long pfn, pagetype; 15.1098 - void *spage = (char *)region_base + (PAGE_SIZE*j); 15.1099 - 15.1100 - pfn = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK; 15.1101 - pagetype = pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK; 15.1102 - 15.1103 - /* write out pages in batch */ 15.1104 - if ( pagetype == XEN_DOMCTL_PFINFO_XTAB ) 15.1105 - continue; 15.1106 - 15.1107 - pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK; 15.1108 - 15.1109 - if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) && 15.1110 - (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) ) 15.1111 - { 15.1112 - /* We have a pagetable page: need to rewrite it. 
*/ 15.1113 - race = 15.1114 - canonicalize_pagetable(pagetype, pfn, spage, page); 15.1115 - 15.1116 - if ( race && !live ) 15.1117 - { 15.1118 - ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn, 15.1119 - pagetype); 15.1120 - goto out; 15.1121 - } 15.1122 - 15.1123 - if ( ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE ) 15.1124 - { 15.1125 - ERROR("Error when writing to state file (4)" 15.1126 - " (errno %d)", errno); 15.1127 - goto out; 15.1128 - } 15.1129 - } 15.1130 - else 15.1131 - { 15.1132 - /* We have a normal page: just write it directly. */ 15.1133 - if ( ratewrite(io_fd, live, spage, PAGE_SIZE) != 15.1134 - PAGE_SIZE ) 15.1135 - { 15.1136 - ERROR("Error when writing to state file (5)" 15.1137 - " (errno %d)", errno); 15.1138 - goto out; 15.1139 - } 15.1140 - } 15.1141 - } /* end of the write out for this batch */ 15.1142 - 15.1143 - sent_this_iter += batch; 15.1144 - 15.1145 - munmap(region_base, batch*PAGE_SIZE); 15.1146 - 15.1147 - } /* end of this while loop for this iteration */ 15.1148 - 15.1149 - skip: 15.1150 - 15.1151 - total_sent += sent_this_iter; 15.1152 - 15.1153 - DPRINTF("\r %d: sent %d, skipped %d, ", 15.1154 - iter, sent_this_iter, skip_this_iter ); 15.1155 - 15.1156 - if ( last_iter ) 15.1157 - { 15.1158 - print_stats( xc_handle, dom, sent_this_iter, &stats, 1); 15.1159 - 15.1160 - DPRINTF("Total pages sent= %ld (%.2fx)\n", 15.1161 - total_sent, ((float)total_sent)/p2m_size ); 15.1162 - DPRINTF("(of which %ld were fixups)\n", needed_to_fix ); 15.1163 - } 15.1164 - 15.1165 - if ( last_iter && debug ) 15.1166 - { 15.1167 - int minusone = -1; 15.1168 - memset(to_send, 0xff, BITMAP_SIZE); 15.1169 - debug = 0; 15.1170 - DPRINTF("Entering debug resend-all mode\n"); 15.1171 - 15.1172 - /* send "-1" to put receiver into debug mode */ 15.1173 - if ( !write_exact(io_fd, &minusone, sizeof(int)) ) 15.1174 - { 15.1175 - ERROR("Error when writing to state file (6) (errno %d)", 15.1176 - errno); 15.1177 - goto out; 15.1178 - } 15.1179 - 15.1180 - continue; 15.1181 - } 15.1182 - 15.1183 - if ( last_iter ) 15.1184 - break; 15.1185 - 15.1186 - if ( live ) 15.1187 - { 15.1188 - if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) || 15.1189 - (iter >= max_iters) || 15.1190 - (sent_this_iter+skip_this_iter < 50) || 15.1191 - (total_sent > p2m_size*max_factor) ) 15.1192 - { 15.1193 - DPRINTF("Start last iteration\n"); 15.1194 - last_iter = 1; 15.1195 - 15.1196 - if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info, 15.1197 - &ctxt) ) 15.1198 - { 15.1199 - ERROR("Domain appears not to have suspended"); 15.1200 - goto out; 15.1201 - } 15.1202 - 15.1203 - DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n", 15.1204 - info.shared_info_frame, 15.1205 - (unsigned long)ctxt.user_regs.eip, 15.1206 - (unsigned long)ctxt.user_regs.edx); 15.1207 - } 15.1208 - 15.1209 - if ( xc_shadow_control(xc_handle, dom, 15.1210 - XEN_DOMCTL_SHADOW_OP_CLEAN, to_send, 15.1211 - p2m_size, NULL, 0, &stats) != p2m_size ) 15.1212 - { 15.1213 - ERROR("Error flushing shadow PT"); 15.1214 - goto out; 15.1215 - } 15.1216 - 15.1217 - sent_last_iter = sent_this_iter; 15.1218 - 15.1219 - print_stats(xc_handle, dom, sent_this_iter, &stats, 1); 15.1220 - 15.1221 - } 15.1222 - } /* end of infinite for loop */ 15.1223 - 15.1224 - DPRINTF("All memory is saved\n"); 15.1225 - 15.1226 - { 15.1227 - struct { 15.1228 - int minustwo; 15.1229 - int max_vcpu_id; 15.1230 - uint64_t vcpumap; 15.1231 - } chunk = { -2, info.max_vcpu_id }; 15.1232 - 15.1233 - if ( info.max_vcpu_id >= 64 ) 15.1234 - { 15.1235 - 
ERROR("Too many VCPUS in guest!"); 15.1236 - goto out; 15.1237 - } 15.1238 - 15.1239 - for ( i = 1; i <= info.max_vcpu_id; i++ ) 15.1240 - { 15.1241 - xc_vcpuinfo_t vinfo; 15.1242 - if ( (xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) && 15.1243 - vinfo.online ) 15.1244 - vcpumap |= 1ULL << i; 15.1245 - } 15.1246 - 15.1247 - chunk.vcpumap = vcpumap; 15.1248 - if ( !write_exact(io_fd, &chunk, sizeof(chunk)) ) 15.1249 - { 15.1250 - ERROR("Error when writing to state file (errno %d)", errno); 15.1251 - goto out; 15.1252 - } 15.1253 - } 15.1254 - 15.1255 - /* Zero terminate */ 15.1256 - i = 0; 15.1257 - if ( !write_exact(io_fd, &i, sizeof(int)) ) 15.1258 - { 15.1259 - ERROR("Error when writing to state file (6') (errno %d)", errno); 15.1260 - goto out; 15.1261 - } 15.1262 - 15.1263 - /* Send through a list of all the PFNs that were not in map at the close */ 15.1264 - { 15.1265 - unsigned int i,j; 15.1266 - unsigned long pfntab[1024]; 15.1267 - 15.1268 - for ( i = 0, j = 0; i < p2m_size; i++ ) 15.1269 - { 15.1270 - if ( !is_mapped(live_p2m[i]) ) 15.1271 - j++; 15.1272 - } 15.1273 - 15.1274 - if ( !write_exact(io_fd, &j, sizeof(unsigned int)) ) 15.1275 - { 15.1276 - ERROR("Error when writing to state file (6a) (errno %d)", errno); 15.1277 - goto out; 15.1278 - } 15.1279 - 15.1280 - for ( i = 0, j = 0; i < p2m_size; ) 15.1281 - { 15.1282 - if ( !is_mapped(live_p2m[i]) ) 15.1283 - pfntab[j++] = i; 15.1284 - 15.1285 - i++; 15.1286 - if ( (j == 1024) || (i == p2m_size) ) 15.1287 - { 15.1288 - if ( !write_exact(io_fd, &pfntab, sizeof(unsigned long)*j) ) 15.1289 - { 15.1290 - ERROR("Error when writing to state file (6b) (errno %d)", 15.1291 - errno); 15.1292 - goto out; 15.1293 - } 15.1294 - j = 0; 15.1295 - } 15.1296 - } 15.1297 - } 15.1298 - 15.1299 - /* Canonicalise the suspend-record frame number. */ 15.1300 - if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) ) 15.1301 - { 15.1302 - ERROR("Suspend record is not in range of pseudophys map"); 15.1303 - goto out; 15.1304 - } 15.1305 - 15.1306 - for ( i = 0; i <= info.max_vcpu_id; i++ ) 15.1307 - { 15.1308 - if ( !(vcpumap & (1ULL << i)) ) 15.1309 - continue; 15.1310 - 15.1311 - if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) ) 15.1312 - { 15.1313 - ERROR("No context for VCPU%d", i); 15.1314 - goto out; 15.1315 - } 15.1316 - 15.1317 - /* Canonicalise each GDT frame number. */ 15.1318 - for ( j = 0; (512*j) < ctxt.gdt_ents; j++ ) 15.1319 - { 15.1320 - if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[j]) ) 15.1321 - { 15.1322 - ERROR("GDT frame is not in range of pseudophys map"); 15.1323 - goto out; 15.1324 - } 15.1325 - } 15.1326 - 15.1327 - /* Canonicalise the page table base pointer. */ 15.1328 - if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[3])) ) 15.1329 - { 15.1330 - ERROR("PT base is not in range of pseudophys map"); 15.1331 - goto out; 15.1332 - } 15.1333 - ctxt.ctrlreg[3] = 15.1334 - xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[3]))); 15.1335 - 15.1336 - /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */ 15.1337 - if ( (pt_levels == 4) && ctxt.ctrlreg[1] ) 15.1338 - { 15.1339 - if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[1])) ) 15.1340 - { 15.1341 - ERROR("PT base is not in range of pseudophys map"); 15.1342 - goto out; 15.1343 - } 15.1344 - /* Least-significant bit means 'valid PFN'. 
*/ 15.1345 - ctxt.ctrlreg[1] = 1 | 15.1346 - xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[1]))); 15.1347 - } 15.1348 - 15.1349 - if ( !write_exact(io_fd, &ctxt, sizeof(ctxt)) ) 15.1350 - { 15.1351 - ERROR("Error when writing to state file (1) (errno %d)", errno); 15.1352 - goto out; 15.1353 - } 15.1354 - } 15.1355 - 15.1356 - /* 15.1357 - * Reset the MFN to be a known-invalid value. See map_frame_list_list(). 15.1358 - */ 15.1359 - memcpy(page, live_shinfo, PAGE_SIZE); 15.1360 - ((shared_info_t *)page)->arch.pfn_to_mfn_frame_list_list = 0; 15.1361 - if ( !write_exact(io_fd, page, PAGE_SIZE) ) 15.1362 - { 15.1363 - ERROR("Error when writing to state file (1) (errno %d)", errno); 15.1364 - goto out; 15.1365 - } 15.1366 - 15.1367 - /* Success! */ 15.1368 - rc = 0; 15.1369 - 15.1370 - out: 15.1371 - 15.1372 - if ( live ) 15.1373 - { 15.1374 - if ( xc_shadow_control(xc_handle, dom, 15.1375 - XEN_DOMCTL_SHADOW_OP_OFF, 15.1376 - NULL, 0, NULL, 0, NULL) < 0 ) 15.1377 - DPRINTF("Warning - couldn't disable shadow mode"); 15.1378 - } 15.1379 - 15.1380 - /* Flush last write and discard cache for file. */ 15.1381 - discard_file_cache(io_fd, 1 /* flush */); 15.1382 - 15.1383 - if ( live_shinfo ) 15.1384 - munmap(live_shinfo, PAGE_SIZE); 15.1385 - 15.1386 - if ( live_p2m_frame_list_list ) 15.1387 - munmap(live_p2m_frame_list_list, PAGE_SIZE); 15.1388 - 15.1389 - if ( live_p2m_frame_list ) 15.1390 - munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE); 15.1391 - 15.1392 - if ( live_p2m ) 15.1393 - munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT)); 15.1394 - 15.1395 - if ( live_m2p ) 15.1396 - munmap(live_m2p, M2P_SIZE(max_mfn)); 15.1397 - 15.1398 - free(pfn_type); 15.1399 - free(pfn_batch); 15.1400 - free(to_send); 15.1401 - free(to_fix); 15.1402 - free(to_skip); 15.1403 - 15.1404 - DPRINTF("Save exit rc=%d\n",rc); 15.1405 - 15.1406 - return !!rc; 15.1407 -} 15.1408 - 15.1409 -/* 15.1410 - * Local variables: 15.1411 - * mode: C 15.1412 - * c-set-style: "BSD" 15.1413 - * c-basic-offset: 4 15.1414 - * tab-width: 4 15.1415 - * indent-tabs-mode: nil 15.1416 - * End: 15.1417 - */
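The whole of xc_linux_save() is deleted above, but the logic is not lost: it moves, together with the HVM path, into the merged xc_domain_save() implementation. The trickiest part of it, the page-table canonicalisation, reduces to the following per-PTE step. This is a minimal sketch with a hypothetical helper name; the masks and helpers are those used by the deleted code.

    /* Sketch: canonicalise one 64-bit PTE, i.e. replace its machine
     * frame number (mfn) with the corresponding pseudo-physical frame
     * number (pfn) so the state file is machine-independent. */
    static uint64_t canonicalize_pte(uint64_t pte, int *race)
    {
        unsigned long mfn, pfn;

        if ( !(pte & _PAGE_PRESENT) )
            return pte;

        mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
        if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
        {
            pfn = 0;   /* stale type info: zap it, retransmit the page later */
            *race = 1; /* caller treats this as fatal unless the save is live */
        }
        else
            pfn = mfn_to_pfn(mfn);

        pte &= ~MADDR_MASK_X86;
        return pte | ((uint64_t)pfn << PAGE_SHIFT);
    }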
16.1 --- a/tools/libxc/xenguest.h Wed Apr 11 07:30:02 2007 -0600 16.2 +++ b/tools/libxc/xenguest.h Wed Apr 11 15:45:29 2007 +0100 16.3 @@ -16,26 +16,19 @@ 16.4 16.5 16.6 /** 16.7 - * This function will save a domain running Linux. 16.8 + * This function will save a running domain. 16.9 * 16.10 * @parm xc_handle a handle to an open hypervisor interface 16.11 * @parm fd the file descriptor to save a domain to 16.12 * @parm dom the id of the domain 16.13 * @return 0 on success, -1 on failure 16.14 */ 16.15 -int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, 16.16 - uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */, 16.17 - int (*suspend)(int domid)); 16.18 +int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, 16.19 + uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */, 16.20 + int (*suspend)(int domid), int hvm, 16.21 + void *(*init_qemu_maps)(int, unsigned), /* HVM only */ 16.22 + void (*qemu_flip_buffer)(int, int)); /* HVM only */ 16.23 16.24 -/** 16.25 - * This function will save a hvm domain running unmodified guest. 16.26 - * @return 0 on success, -1 on failure 16.27 - */ 16.28 -int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, 16.29 - uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */, 16.30 - int (*suspend)(int domid), 16.31 - void *(*init_qemu_maps)(int, unsigned), 16.32 - void (*qemu_flip_buffer)(int, int)); 16.33 16.34 /** 16.35 * This function will restore a saved domain.
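With this change a caller no longer picks a PV or an HVM entry point; both go through xc_domain_save(), with an hvm flag and two callbacks that matter only for HVM guests. A sketch of a call site, assuming suspend, init_qemu_maps and qemu_flip_buffer are provided by the caller exactly as in tools/xcutils/xc_save.c below; zero for max_iters and max_factor selects the built-in defaults:

    /* Sketch: one save call covers both PV and HVM domains. */
    ret = xc_domain_save(xc_handle, io_fd, domid,
                         0 /* max_iters: default */,
                         0 /* max_factor: default */,
                         XCFLAGS_LIVE, &suspend,
                         is_hvm /* e.g. !!(flags & XCFLAGS_HVM) */,
                         &init_qemu_maps, &qemu_flip_buffer);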
17.1 --- a/tools/libxc/xg_private.c Wed Apr 11 07:30:02 2007 -0600 17.2 +++ b/tools/libxc/xg_private.c Wed Apr 11 15:45:29 2007 +0100 17.3 @@ -198,17 +198,6 @@ unsigned long csum_page(void *page) 17.4 return -1; 17.5 } 17.6 17.7 -__attribute__((weak)) 17.8 - int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, 17.9 - uint32_t max_factor, uint32_t flags, 17.10 - int (*suspend)(int domid), 17.11 - void *(*init_qemu_maps)(int, unsigned), 17.12 - void (*qemu_flip_buffer)(int, int)) 17.13 -{ 17.14 - errno = ENOSYS; 17.15 - return -1; 17.16 -} 17.17 - 17.18 __attribute__((weak)) int xc_get_hvm_param( 17.19 int handle, domid_t dom, int param, unsigned long *value) 17.20 {
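The weak stub removed here existed so that libxc clients would still link on builds that did not compile the HVM save code; with a single xc_domain_save() there is nothing left to stub out. The pattern itself, shown with a hypothetical function name, is simply:

    #include <errno.h>

    /* Sketch: a weak definition satisfies the linker and is silently
     * overridden by any strong definition built elsewhere. */
    __attribute__((weak)) int optional_feature(void)
    {
        errno = ENOSYS;  /* behave as "not implemented" when nothing overrides us */
        return -1;
    }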
18.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 18.2 +++ b/tools/pygrub/src/LiloConf.py Wed Apr 11 15:45:29 2007 +0100 18.3 @@ -0,0 +1,147 @@ 18.4 +# 18.5 +#LiloConf.py 18.6 +# 18.7 + 18.8 +import sys, re, os 18.9 +import logging 18.10 +import GrubConf 18.11 + 18.12 +class LiloImage(object): 18.13 + def __init__(self, lines, path): 18.14 + self.reset(lines, path) 18.15 + 18.16 + def __repr__(self): 18.17 + return ("title: %s\n" 18.18 + " root: %s\n" 18.19 + " kernel: %s\n" 18.20 + " args: %s\n" 18.21 + " initrd: %s\n" %(self.title, self.root, self.kernel, 18.22 + self.args, self.initrd)) 18.23 + def reset(self, lines, path): 18.24 + self._root = self._initrd = self._kernel = self._args = None 18.25 + self.title = "" 18.26 + self.lines = [] 18.27 + self.path = path 18.28 + map(self.set_from_line, lines) 18.29 + self.root = "" # dummy 18.30 + 18.31 + def set_from_line(self, line, replace = None): 18.32 + (com, arg) = GrubConf.grub_exact_split(line, 2) 18.33 + 18.34 + if self.commands.has_key(com): 18.35 + if self.commands[com] is not None: 18.36 + exec("%s = r\'%s\'" %(self.commands[com], re.sub('^"(.+)"$', r"\1", arg.strip()))) 18.37 + else: 18.38 + logging.info("Ignored image directive %s" %(com,)) 18.39 + else: 18.40 + logging.warning("Unknown image directive %s" %(com,)) 18.41 + 18.42 + # now put the line in the list of lines 18.43 + if replace is None: 18.44 + self.lines.append(line) 18.45 + else: 18.46 + self.lines.pop(replace) 18.47 + self.lines.insert(replace, line) 18.48 + 18.49 + def set_kernel(self, val): 18.50 + self._kernel = (None, self.path + "/" + val) 18.51 + def get_kernel(self): 18.52 + return self._kernel 18.53 + kernel = property(get_kernel, set_kernel) 18.54 + 18.55 + def set_initrd(self, val): 18.56 + self._initrd = (None, self.path + "/" + val) 18.57 + def get_initrd(self): 18.58 + return self._initrd 18.59 + initrd = property(get_initrd, set_initrd) 18.60 + 18.61 + # set up command handlers 18.62 + commands = { "label": "self.title", 18.63 + "root": "self.root", 18.64 + "rootnoverify": "self.root", 18.65 + "image": "self.kernel", 18.66 + "initrd": "self.initrd", 18.67 + "append": "self.args", 18.68 + "read-only": None, 18.69 + "chainloader": None, 18.70 + "module": None} 18.71 + 18.72 +class LiloConfigFile(object): 18.73 + def __init__(self, fn = None): 18.74 + self.filename = fn 18.75 + self.images = [] 18.76 + self.timeout = -1 18.77 + self._default = 0 18.78 + 18.79 + if fn is not None: 18.80 + self.parse() 18.81 + 18.82 + def parse(self, buf = None): 18.83 + if buf is None: 18.84 + if self.filename is None: 18.85 + raise ValueError, "No config file defined to parse!" 
18.86 +
18.87 +            f = open(self.filename, 'r')
18.88 +            lines = f.readlines()
18.89 +            f.close()
18.90 +        else:
18.91 +            lines = buf.split("\n")
18.92 +
18.93 +        path = os.path.dirname(self.filename)
18.94 +        img = []
18.95 +        for l in lines:
18.96 +            l = l.strip()
18.97 +            # skip blank lines
18.98 +            if len(l) == 0:
18.99 +                continue
18.100 +            # skip comments
18.101 +            if l.startswith('#'):
18.102 +                continue
18.103 +            # new image
18.104 +            if l.startswith("image"):
18.105 +                if len(img) > 0:
18.106 +                    self.add_image(LiloImage(img, path))
18.107 +                img = [l]
18.108 +                continue
18.109 +
18.110 +            if len(img) > 0:
18.111 +                img.append(l)
18.112 +                continue
18.113 +
18.114 +            (com, arg) = GrubConf.grub_exact_split(l, 2)
18.115 +            if self.commands.has_key(com):
18.116 +                if self.commands[com] is not None:
18.117 +                    exec("%s = r\"%s\"" %(self.commands[com], arg.strip()))
18.118 +                else:
18.119 +                    logging.info("Ignored directive %s" %(com,))
18.120 +            else:
18.121 +                logging.warning("Unknown directive %s" %(com,))
18.122 +
18.123 +        if len(img) > 0:
18.124 +            self.add_image(LiloImage(img, path))
18.125 +
18.126 +    def add_image(self, image):
18.127 +        self.images.append(image)
18.128 +
18.129 +    def _get_default(self):
18.130 +        for i in range(0, len(self.images)):
18.131 +            if self.images[i].title == self._default:
18.132 +                return i
18.133 +        return 0
18.134 +    def _set_default(self, val):
18.135 +        self._default = val
18.136 +    default = property(_get_default, _set_default)
18.137 +
18.138 +    commands = { "default": "self.default",
18.139 +                 "timeout": "self.timeout",
18.140 +                 "prompt": None,
18.141 +                 "relocatable": None,
18.142 +                 }
18.143 +
18.144 +if __name__ == "__main__":
18.145 +    if len(sys.argv) < 2:
18.146 +        raise RuntimeError, "Need a lilo.conf to read"
18.147 +    g = LiloConfigFile(sys.argv[1])
18.148 +    for i in g.images:
18.149 +        print i #, i.title, i.root, i.kernel, i.args, i.initrd
18.150 +    print g.default
19.1 --- a/tools/pygrub/src/pygrub Wed Apr 11 07:30:02 2007 -0600 19.2 +++ b/tools/pygrub/src/pygrub Wed Apr 11 15:45:29 2007 +0100 19.3 @@ -16,6 +16,7 @@ 19.4 import os, sys, string, struct, tempfile, re 19.5 import copy 19.6 import logging 19.7 +import platform 19.8 19.9 import curses, _curses, curses.wrapper, curses.textpad, curses.ascii 19.10 import getopt 19.11 @@ -24,6 +25,7 @@ sys.path = [ '/usr/lib/python' ] + sys.p 19.12 19.13 import fsimage 19.14 import grub.GrubConf 19.15 +import grub.LiloConf 19.16 19.17 PYGRUB_VER = 0.5 19.18 19.19 @@ -59,6 +61,13 @@ def get_active_partition(file): 19.20 if struct.unpack("<c", buf[poff:poff+1]) == ('\x80',): 19.21 return buf[poff:poff+16] 19.22 19.23 + # type=0xee: GUID partition table 19.24 + # XXX assume the first partition is active 19.25 + if struct.unpack("<c", buf[poff+4:poff+5]) == ('\xee',): 19.26 + os.lseek(fd, 0x400, 0) 19.27 + buf = os.read(fd, 512) 19.28 + return buf[24:40] # XXX buf[32:40] 19.29 + 19.30 # if there's not a partition marked as active, fall back to 19.31 # the first partition 19.32 return buf[446:446+16] 19.33 @@ -346,7 +355,13 @@ class Grub: 19.34 if not os.access(fn, os.R_OK): 19.35 raise RuntimeError, "Unable to access %s" %(fn,) 19.36 19.37 - self.cf = grub.GrubConf.GrubConfigFile() 19.38 + if platform.machine() == 'ia64': 19.39 + self.cf = grub.LiloConf.LiloConfigFile() 19.40 + file_list = ("/efi/redhat/elilo.conf",) 19.41 + else: 19.42 + self.cf = grub.GrubConf.GrubConfigFile() 19.43 + file_list = ("/boot/grub/menu.lst", "/boot/grub/grub.conf", 19.44 + "/grub/menu.lst", "/grub/grub.conf") 19.45 19.46 if not fs: 19.47 # set the config file and parse it 19.48 @@ -354,18 +369,15 @@ class Grub: 19.49 self.cf.parse() 19.50 return 19.51 19.52 - grubfile = None 19.53 - for f in ("/boot/grub/menu.lst", "/boot/grub/grub.conf", 19.54 - "/grub/menu.lst", "/grub/grub.conf"): 19.55 + for f in file_list: 19.56 if fs.file_exists(f): 19.57 - grubfile = f 19.58 + self.cf.filename = f 19.59 break 19.60 - if grubfile is None: 19.61 - raise RuntimeError, "we couldn't find grub config file in the image provided." 19.62 - f = fs.open_file(grubfile) 19.63 + if self.cf.filename is None: 19.64 + raise RuntimeError, "couldn't find bootloader config file in the image provided." 19.65 + f = fs.open_file(self.cf.filename) 19.66 buf = f.read() 19.67 del f 19.68 - # then parse the grub config 19.69 self.cf.parse(buf) 19.70 19.71 def run(self):
20.1 --- a/tools/python/xen/xend/XendCheckpoint.py Wed Apr 11 07:30:02 2007 -0600 20.2 +++ b/tools/python/xen/xend/XendCheckpoint.py Wed Apr 11 15:45:29 2007 +0100 20.3 @@ -75,13 +75,6 @@ def save(fd, dominfo, network, live, dst 20.4 20.5 image_cfg = dominfo.info.get('image', {}) 20.6 hvm = dominfo.info.is_hvm() 20.7 - stdvga = 0 20.8 - 20.9 - if hvm: 20.10 - log.info("save hvm domain") 20.11 - if dominfo.info['platform'].has_key('stdvga'): 20.12 - if dominfo.info['platform']['stdvga'] == 1: 20.13 - stdvga = 1 20.14 20.15 # xc_save takes three customization parameters: maxit, max_f, and 20.16 # flags the last controls whether or not save is 'live', while the
21.1 --- a/tools/python/xen/xend/server/DevController.py Wed Apr 11 07:30:02 2007 -0600 21.2 +++ b/tools/python/xen/xend/server/DevController.py Wed Apr 11 15:45:29 2007 +0100 21.3 @@ -223,6 +223,7 @@ class DevController: 21.4 xstransact.Remove(backpath) 21.5 xstransact.Remove(frontpath) 21.6 21.7 + self.vm._removeVm("device/%s/%d" % (self.deviceClass, devid)) 21.8 21.9 def configurations(self): 21.10 return map(self.configuration, self.deviceIDs())
22.1 --- a/tools/python/xen/xend/server/netif.py Wed Apr 11 07:30:02 2007 -0600 22.2 +++ b/tools/python/xen/xend/server/netif.py Wed Apr 11 15:45:29 2007 +0100 22.3 @@ -88,46 +88,6 @@ def parseRate(ratestr): 22.4 return "%lu,%lu" % (bytes_per_interval, interval_usecs) 22.5 22.6 22.7 -write_rate_G_re = re.compile('^([0-9]+)000000000(B/s@[0-9]+us)$') 22.8 -write_rate_M_re = re.compile('^([0-9]+)000000(B/s@[0-9]+us)$') 22.9 -write_rate_K_re = re.compile('^([0-9]+)000(B/s@[0-9]+us)$') 22.10 -write_rate_s_re = re.compile('^([0-9]+[GMK]?B/s@[0-9]+)000000us$') 22.11 -write_rate_m_re = re.compile('^([0-9]+[GMK]?B/s@[0-9]+)000us$') 22.12 - 22.13 -def formatRate(rate): 22.14 - (bytes_per_interval, interval_usecs) = map(long, rate.split(',')) 22.15 - 22.16 - if interval_usecs != 0: 22.17 - bytes_per_second = (bytes_per_interval * 1000 * 1000) / interval_usecs 22.18 - else: 22.19 - bytes_per_second = 0xffffffffL 22.20 - 22.21 - ratestr = "%uB/s@%uus" % (bytes_per_second, interval_usecs) 22.22 - 22.23 - # look for '000's 22.24 - m = write_rate_G_re.match(ratestr) 22.25 - if m: 22.26 - ratestr = m.group(1) + "G" + m.group(2) 22.27 - else: 22.28 - m = write_rate_M_re.match(ratestr) 22.29 - if m: 22.30 - ratestr = m.group(1) + "M" + m.group(2) 22.31 - else: 22.32 - m = write_rate_K_re.match(ratestr) 22.33 - if m: 22.34 - ratestr = m.group(1) + "K" + m.group(2) 22.35 - 22.36 - m = write_rate_s_re.match(ratestr) 22.37 - if m: 22.38 - ratestr = m.group(1) + "s" 22.39 - else: 22.40 - m = write_rate_m_re.match(ratestr) 22.41 - if m: 22.42 - ratestr = m.group(1) + "ms" 22.43 - 22.44 - return ratestr 22.45 - 22.46 - 22.47 class NetifController(DevController): 22.48 """Network interface controller. Handles all network devices for a domain. 22.49 """ 22.50 @@ -138,8 +98,7 @@ class NetifController(DevController): 22.51 def getDeviceDetails(self, config): 22.52 """@see DevController.getDeviceDetails""" 22.53 22.54 - script = os.path.join(xoptions.network_script_dir, 22.55 - config.get('script', xoptions.get_vif_script())) 22.56 + script = config.get('script', xoptions.get_vif_script()) 22.57 typ = config.get('type') 22.58 bridge = config.get('bridge') 22.59 mac = config.get('mac') 22.60 @@ -149,24 +108,17 @@ class NetifController(DevController): 22.61 ipaddr = config.get('ip') 22.62 model = config.get('model') 22.63 22.64 - devid = self.allocateDeviceID() 22.65 - 22.66 if not typ: 22.67 typ = xoptions.netback_type 22.68 - 22.69 + 22.70 if not mac: 22.71 mac = randomMAC() 22.72 22.73 + devid = self.allocateDeviceID() 22.74 + 22.75 back = { 'script' : script, 22.76 'mac' : mac, 22.77 - 'handle' : "%i" % devid, 22.78 'type' : typ } 22.79 - 22.80 - if typ == 'ioemu': 22.81 - front = {} 22.82 - else: 22.83 - front = { 'handle' : "%i" % devid, 22.84 - 'mac' : mac } 22.85 if ipaddr: 22.86 back['ip'] = ipaddr 22.87 if bridge: 22.88 @@ -174,12 +126,26 @@ class NetifController(DevController): 22.89 if vifname: 22.90 back['vifname'] = vifname 22.91 if rate: 22.92 - back['rate'] = parseRate(rate) 22.93 + back['rate'] = rate 22.94 if uuid: 22.95 back['uuid'] = uuid 22.96 if model: 22.97 back['model'] = model 22.98 22.99 + config_path = "device/%s/%d/" % (self.deviceClass, devid) 22.100 + for x in back: 22.101 + self.vm._writeVm(config_path + x, back[x]) 22.102 + 22.103 + back['handle'] = "%i" % devid 22.104 + back['script'] = os.path.join(xoptions.network_script_dir, script) 22.105 + if rate: 22.106 + back['rate'] = parseRate(rate) 22.107 + 22.108 + front = {} 22.109 + if typ != 'ioemu': 22.110 + front = { 'handle' : "%i" % 
devid, 22.111 + 'mac' : mac } 22.112 + 22.113 return (devid, back, front) 22.114 22.115 22.116 @@ -187,14 +153,17 @@ class NetifController(DevController): 22.117 """@see DevController.configuration""" 22.118 22.119 result = DevController.getDeviceConfiguration(self, devid) 22.120 - devinfo = self.readBackend(devid, 'script', 'ip', 'bridge', 22.121 - 'mac', 'type', 'vifname', 'rate', 22.122 - 'uuid', 'model') 22.123 + 22.124 + config_path = "device/%s/%d/" % (self.deviceClass, devid) 22.125 + devinfo = () 22.126 + for x in ( 'script', 'ip', 'bridge', 'mac', 22.127 + 'type', 'vifname', 'rate', 'uuid', 'model' ): 22.128 + y = self.vm._readVm(config_path + x) 22.129 + devinfo += (y,) 22.130 (script, ip, bridge, mac, typ, vifname, rate, uuid, model) = devinfo 22.131 22.132 if script: 22.133 - network_script_dir = xoptions.network_script_dir + os.sep 22.134 - result['script'] = script.replace(network_script_dir, "") 22.135 + result['script'] = script 22.136 if ip: 22.137 result['ip'] = ip 22.138 if bridge: 22.139 @@ -206,11 +175,10 @@ class NetifController(DevController): 22.140 if vifname: 22.141 result['vifname'] = vifname 22.142 if rate: 22.143 - result['rate'] = formatRate(rate) 22.144 + result['rate'] = rate 22.145 if uuid: 22.146 result['uuid'] = uuid 22.147 if model: 22.148 result['model'] = model 22.149 22.150 return result 22.151 -
23.1 --- a/tools/xcutils/xc_save.c Wed Apr 11 07:30:02 2007 -0600 23.2 +++ b/tools/xcutils/xc_save.c Wed Apr 11 15:45:29 2007 +0100 23.3 @@ -174,12 +174,9 @@ main(int argc, char **argv) 23.4 max_f = atoi(argv[4]); 23.5 flags = atoi(argv[5]); 23.6 23.7 - if (flags & XCFLAGS_HVM) 23.8 - ret = xc_hvm_save(xc_fd, io_fd, domid, maxit, max_f, flags, 23.9 - &suspend, &init_qemu_maps, &qemu_flip_buffer); 23.10 - else 23.11 - ret = xc_linux_save(xc_fd, io_fd, domid, maxit, max_f, flags, 23.12 - &suspend); 23.13 + ret = xc_domain_save(xc_fd, io_fd, domid, maxit, max_f, flags, 23.14 + &suspend, !!(flags & XCFLAGS_HVM), 23.15 + &init_qemu_maps, &qemu_flip_buffer); 23.16 23.17 xc_interface_close(xc_fd); 23.18
24.1 --- a/unmodified_drivers/linux-2.6/platform-pci/evtchn.c Wed Apr 11 07:30:02 2007 -0600 24.2 +++ b/unmodified_drivers/linux-2.6/platform-pci/evtchn.c Wed Apr 11 15:45:29 2007 +0100 24.3 @@ -28,8 +28,10 @@ 24.4 * IN THE SOFTWARE. 24.5 */ 24.6 24.7 +#include <linux/config.h> 24.8 #include <linux/module.h> 24.9 #include <linux/kernel.h> 24.10 +#include <linux/spinlock.h> 24.11 #include <xen/evtchn.h> 24.12 #include <xen/interface/hvm/ioreq.h> 24.13 #include <xen/features.h> 24.14 @@ -41,29 +43,37 @@ 24.15 24.16 void *shared_info_area; 24.17 24.18 -static DEFINE_MUTEX(irq_evtchn_mutex); 24.19 - 24.20 #define is_valid_evtchn(x) ((x) != 0) 24.21 #define evtchn_from_irq(x) (irq_evtchn[irq].evtchn) 24.22 24.23 static struct { 24.24 + spinlock_t lock; 24.25 irqreturn_t(*handler) (int, void *, struct pt_regs *); 24.26 void *dev_id; 24.27 int evtchn; 24.28 int close:1; /* close on unbind_from_irqhandler()? */ 24.29 int inuse:1; 24.30 + int in_handler:1; 24.31 } irq_evtchn[256]; 24.32 static int evtchn_to_irq[NR_EVENT_CHANNELS] = { 24.33 [0 ... NR_EVENT_CHANNELS-1] = -1 }; 24.34 24.35 -static int find_unbound_irq(void) 24.36 +static DEFINE_SPINLOCK(irq_alloc_lock); 24.37 + 24.38 +static int alloc_xen_irq(void) 24.39 { 24.40 static int warned; 24.41 int irq; 24.42 24.43 - for (irq = 0; irq < ARRAY_SIZE(irq_evtchn); irq++) 24.44 - if (!irq_evtchn[irq].inuse) 24.45 - return irq; 24.46 + spin_lock(&irq_alloc_lock); 24.47 + 24.48 + for (irq = 1; irq < ARRAY_SIZE(irq_evtchn); irq++) { 24.49 + if (irq_evtchn[irq].inuse) 24.50 + continue; 24.51 + irq_evtchn[irq].inuse = 1; 24.52 + spin_unlock(&irq_alloc_lock); 24.53 + return irq; 24.54 + } 24.55 24.56 if (!warned) { 24.57 warned = 1; 24.58 @@ -71,9 +81,18 @@ static int find_unbound_irq(void) 24.59 "increase irq_evtchn[] size in evtchn.c.\n"); 24.60 } 24.61 24.62 + spin_unlock(&irq_alloc_lock); 24.63 + 24.64 return -ENOSPC; 24.65 } 24.66 24.67 +static void free_xen_irq(int irq) 24.68 +{ 24.69 + spin_lock(&irq_alloc_lock); 24.70 + irq_evtchn[irq].inuse = 0; 24.71 + spin_unlock(&irq_alloc_lock); 24.72 +} 24.73 + 24.74 int irq_to_evtchn_port(int irq) 24.75 { 24.76 return irq_evtchn[irq].evtchn; 24.77 @@ -93,8 +112,7 @@ void unmask_evtchn(int port) 24.78 shared_info_t *s = shared_info_area; 24.79 vcpu_info_t *vcpu_info; 24.80 24.81 - preempt_disable(); 24.82 - cpu = smp_processor_id(); 24.83 + cpu = get_cpu(); 24.84 vcpu_info = &s->vcpu_info[cpu]; 24.85 24.86 /* Slow path (hypercall) if this is a non-local port. 
We only 24.87 @@ -103,7 +121,7 @@ void unmask_evtchn(int port) 24.88 evtchn_unmask_t op = { .port = port }; 24.89 (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, 24.90 &op); 24.91 - preempt_enable(); 24.92 + put_cpu(); 24.93 return; 24.94 } 24.95 24.96 @@ -121,7 +139,8 @@ void unmask_evtchn(int port) 24.97 if (!vcpu_info->evtchn_upcall_mask) 24.98 force_evtchn_callback(); 24.99 } 24.100 - preempt_enable(); 24.101 + 24.102 + put_cpu(); 24.103 } 24.104 EXPORT_SYMBOL(unmask_evtchn); 24.105 24.106 @@ -135,20 +154,19 @@ int bind_listening_port_to_irqhandler( 24.107 struct evtchn_alloc_unbound alloc_unbound; 24.108 int err, irq; 24.109 24.110 - mutex_lock(&irq_evtchn_mutex); 24.111 + irq = alloc_xen_irq(); 24.112 + if (irq < 0) 24.113 + return irq; 24.114 24.115 - irq = find_unbound_irq(); 24.116 - if (irq < 0) { 24.117 - mutex_unlock(&irq_evtchn_mutex); 24.118 - return irq; 24.119 - } 24.120 + spin_lock_irq(&irq_evtchn[irq].lock); 24.121 24.122 alloc_unbound.dom = DOMID_SELF; 24.123 alloc_unbound.remote_dom = remote_domain; 24.124 err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, 24.125 &alloc_unbound); 24.126 if (err) { 24.127 - mutex_unlock(&irq_evtchn_mutex); 24.128 + spin_unlock_irq(&irq_evtchn[irq].lock); 24.129 + free_xen_irq(irq); 24.130 return err; 24.131 } 24.132 24.133 @@ -156,13 +174,13 @@ int bind_listening_port_to_irqhandler( 24.134 irq_evtchn[irq].dev_id = dev_id; 24.135 irq_evtchn[irq].evtchn = alloc_unbound.port; 24.136 irq_evtchn[irq].close = 1; 24.137 - irq_evtchn[irq].inuse = 1; 24.138 24.139 evtchn_to_irq[alloc_unbound.port] = irq; 24.140 24.141 unmask_evtchn(alloc_unbound.port); 24.142 24.143 - mutex_unlock(&irq_evtchn_mutex); 24.144 + spin_unlock_irq(&irq_evtchn[irq].lock); 24.145 + 24.146 return irq; 24.147 } 24.148 EXPORT_SYMBOL(bind_listening_port_to_irqhandler); 24.149 @@ -176,34 +194,34 @@ int bind_caller_port_to_irqhandler( 24.150 { 24.151 int irq; 24.152 24.153 - mutex_lock(&irq_evtchn_mutex); 24.154 + irq = alloc_xen_irq(); 24.155 + if (irq < 0) 24.156 + return irq; 24.157 24.158 - irq = find_unbound_irq(); 24.159 - if (irq < 0) { 24.160 - mutex_unlock(&irq_evtchn_mutex); 24.161 - return irq; 24.162 - } 24.163 + spin_lock_irq(&irq_evtchn[irq].lock); 24.164 24.165 irq_evtchn[irq].handler = handler; 24.166 irq_evtchn[irq].dev_id = dev_id; 24.167 irq_evtchn[irq].evtchn = caller_port; 24.168 irq_evtchn[irq].close = 0; 24.169 - irq_evtchn[irq].inuse = 1; 24.170 24.171 evtchn_to_irq[caller_port] = irq; 24.172 24.173 unmask_evtchn(caller_port); 24.174 24.175 - mutex_unlock(&irq_evtchn_mutex); 24.176 + spin_unlock_irq(&irq_evtchn[irq].lock); 24.177 + 24.178 return irq; 24.179 } 24.180 EXPORT_SYMBOL(bind_caller_port_to_irqhandler); 24.181 24.182 void unbind_from_irqhandler(unsigned int irq, void *dev_id) 24.183 { 24.184 - int evtchn = evtchn_from_irq(irq); 24.185 + int evtchn; 24.186 24.187 - mutex_lock(&irq_evtchn_mutex); 24.188 + spin_lock_irq(&irq_evtchn[irq].lock); 24.189 + 24.190 + evtchn = evtchn_from_irq(irq); 24.191 24.192 if (is_valid_evtchn(evtchn)) { 24.193 evtchn_to_irq[irq] = -1; 24.194 @@ -216,21 +234,28 @@ void unbind_from_irqhandler(unsigned int 24.195 24.196 irq_evtchn[irq].handler = NULL; 24.197 irq_evtchn[irq].evtchn = 0; 24.198 - irq_evtchn[irq].inuse = 0; 24.199 + 24.200 + spin_unlock_irq(&irq_evtchn[irq].lock); 24.201 24.202 - mutex_unlock(&irq_evtchn_mutex); 24.203 + while (irq_evtchn[irq].in_handler) 24.204 + cpu_relax(); 24.205 + 24.206 + free_xen_irq(irq); 24.207 } 24.208 EXPORT_SYMBOL(unbind_from_irqhandler); 24.209 24.210 void 
notify_remote_via_irq(int irq) 24.211 { 24.212 - int evtchn = evtchn_from_irq(irq); 24.213 + int evtchn; 24.214 + 24.215 + evtchn = evtchn_from_irq(irq); 24.216 if (is_valid_evtchn(evtchn)) 24.217 notify_remote_via_evtchn(evtchn); 24.218 } 24.219 EXPORT_SYMBOL(notify_remote_via_irq); 24.220 24.221 -irqreturn_t evtchn_interrupt(int irq, void *dev_id, struct pt_regs *regs) 24.222 +static irqreturn_t evtchn_interrupt(int irq, void *dev_id, 24.223 + struct pt_regs *regs) 24.224 { 24.225 unsigned int l1i, port; 24.226 /* XXX: All events are bound to vcpu0 but irq may be redirected. */ 24.227 @@ -249,13 +274,30 @@ irqreturn_t evtchn_interrupt(int irq, vo 24.228 while ((l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i])) { 24.229 port = (l1i * BITS_PER_LONG) + __ffs(l2); 24.230 synch_clear_bit(port, &s->evtchn_pending[0]); 24.231 + 24.232 irq = evtchn_to_irq[port]; 24.233 - if ((irq >= 0) && 24.234 - ((handler = irq_evtchn[irq].handler) != NULL)) 24.235 - handler(irq, irq_evtchn[irq].dev_id, regs); 24.236 - else 24.237 - printk(KERN_WARNING "unexpected event channel " 24.238 - "upcall on port %d!\n", port); 24.239 + if (irq < 0) 24.240 + continue; 24.241 + 24.242 + spin_lock(&irq_evtchn[irq].lock); 24.243 + handler = irq_evtchn[irq].handler; 24.244 + dev_id = irq_evtchn[irq].dev_id; 24.245 + if (unlikely(handler == NULL)) { 24.246 + printk("Xen IRQ%d (port %d) has no handler!\n", 24.247 + irq, port); 24.248 + spin_unlock(&irq_evtchn[irq].lock); 24.249 + continue; 24.250 + } 24.251 + irq_evtchn[irq].in_handler = 1; 24.252 + spin_unlock(&irq_evtchn[irq].lock); 24.253 + 24.254 + local_irq_enable(); 24.255 + handler(irq, irq_evtchn[irq].dev_id, regs); 24.256 + local_irq_disable(); 24.257 + 24.258 + spin_lock(&irq_evtchn[irq].lock); 24.259 + irq_evtchn[irq].in_handler = 0; 24.260 + spin_unlock(&irq_evtchn[irq].lock); 24.261 } 24.262 } 24.263 24.264 @@ -268,16 +310,6 @@ void force_evtchn_callback(void) 24.265 } 24.266 EXPORT_SYMBOL(force_evtchn_callback); 24.267 24.268 -void irq_suspend(void) 24.269 -{ 24.270 - mutex_lock(&irq_evtchn_mutex); 24.271 -} 24.272 - 24.273 -void irq_suspend_cancel(void) 24.274 -{ 24.275 - mutex_unlock(&irq_evtchn_mutex); 24.276 -} 24.277 - 24.278 void irq_resume(void) 24.279 { 24.280 int evtchn, irq; 24.281 @@ -289,6 +321,16 @@ void irq_resume(void) 24.282 24.283 for (irq = 0; irq < ARRAY_SIZE(irq_evtchn); irq++) 24.284 irq_evtchn[irq].evtchn = 0; 24.285 +} 24.286 24.287 - mutex_unlock(&irq_evtchn_mutex); 24.288 +int xen_irq_init(struct pci_dev *pdev) 24.289 +{ 24.290 + int irq; 24.291 + 24.292 + for (irq = 0; irq < ARRAY_SIZE(irq_evtchn); irq++) 24.293 + spin_lock_init(&irq_evtchn[irq].lock); 24.294 + 24.295 + return request_irq(pdev->irq, evtchn_interrupt, 24.296 + SA_SHIRQ | SA_SAMPLE_RANDOM | SA_INTERRUPT, 24.297 + "xen-platform-pci", pdev); 24.298 }
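The rework above trades the global irq_evtchn_mutex for a per-IRQ spinlock plus an in_handler flag: the interrupt path publishes in_handler under the lock and runs the handler unlocked, while unbind_from_irqhandler() clears the handler under the lock and then spins until any in-flight invocation drains. Condensed to the handshake alone (a sketch over the driver's own fields, not a drop-in replacement):

    /* Dispatch side (evtchn_interrupt). */
    spin_lock(&irq_evtchn[irq].lock);
    handler = irq_evtchn[irq].handler;
    dev_id  = irq_evtchn[irq].dev_id;
    irq_evtchn[irq].in_handler = 1;
    spin_unlock(&irq_evtchn[irq].lock);

    handler(irq, dev_id, regs);          /* runs without the lock held */

    spin_lock(&irq_evtchn[irq].lock);
    irq_evtchn[irq].in_handler = 0;
    spin_unlock(&irq_evtchn[irq].lock);

    /* Teardown side (unbind_from_irqhandler). */
    spin_lock_irq(&irq_evtchn[irq].lock);
    irq_evtchn[irq].handler = NULL;
    spin_unlock_irq(&irq_evtchn[irq].lock);

    while (irq_evtchn[irq].in_handler)   /* wait out a live handler */
        cpu_relax();
    free_xen_irq(irq);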
25.1 --- a/unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c Wed Apr 11 07:30:02 2007 -0600
25.2 +++ b/unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c Wed Apr 11 15:45:29 2007 +0100
25.3 @@ -1,24 +1,81 @@
25.4 #include <linux/config.h>
25.5 +#include <linux/stop_machine.h>
25.6 +#include <xen/evtchn.h>
25.7 +#include <xen/gnttab.h>
25.8 #include <xen/xenbus.h>
25.9 #include "platform-pci.h"
25.10 #include <asm/hypervisor.h>
25.11
25.12 -int __xen_suspend(int fast_suspend)
25.13 +/*
25.14 + * Spinning prevents, for example, APs from touching grant table entries
25.15 + * while the shared grant table is not mapped into the address space
25.16 + * immediately after resume.
25.17 + */
25.18 +static void ap_suspend(void *_ap_spin)
25.19 +{
25.20 +    int *ap_spin = _ap_spin;
25.21 +
25.22 +    BUG_ON(!irqs_disabled());
25.23 +
25.24 +    while (*ap_spin) {
25.25 +        cpu_relax();
25.26 +        HYPERVISOR_yield();
25.27 +    }
25.28 +}
25.29 +
25.30 +static int bp_suspend(void)
25.31 {
25.32     int suspend_cancelled;
25.33
25.34 -    xenbus_suspend();
25.35 -    platform_pci_suspend();
25.36 +    BUG_ON(!irqs_disabled());
25.37
25.38     suspend_cancelled = HYPERVISOR_shutdown(SHUTDOWN_suspend);
25.39
25.40 -    if (suspend_cancelled) {
25.41 -        platform_pci_suspend_cancel();
25.42 +    if (!suspend_cancelled) {
25.43 +        platform_pci_resume();
25.44 +        gnttab_resume();
25.45 +        irq_resume();
25.46 +    }
25.47 +
25.48 +    return suspend_cancelled;
25.49 +}
25.50 +
25.51 +int __xen_suspend(int fast_suspend)
25.52 +{
25.53 +    int err, suspend_cancelled, ap_spin;
25.54 +
25.55 +    xenbus_suspend();
25.56 +
25.57 +    preempt_disable();
25.58 +
25.59 +    /* Prevent any races with evtchn_interrupt() handler. */
25.60 +    disable_irq(xen_platform_pdev->irq);
25.61 +
25.62 +    ap_spin = 1;
25.63 +    smp_mb();
25.64 +
25.65 +    err = smp_call_function(ap_suspend, &ap_spin, 0, 0);
25.66 +    if (err < 0) {
25.67 +        preempt_enable();
25.68         xenbus_suspend_cancel();
25.69 -    } else {
25.70 -        platform_pci_resume();
25.71 +        return err;
25.72 +    }
25.73 +
25.74 +    local_irq_disable();
25.75 +    suspend_cancelled = bp_suspend();
25.76 +    local_irq_enable();
25.77 +
25.78 +    smp_mb();
25.79 +    ap_spin = 0;
25.80 +
25.81 +    enable_irq(xen_platform_pdev->irq);
25.82 +
25.83 +    preempt_enable();
25.84 +
25.85 +    if (!suspend_cancelled)
25.86         xenbus_resume();
25.87 -    }
25.88 +    else
25.89 +        xenbus_suspend_cancel();
25.90
25.91     return 0;
25.92 }
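__xen_suspend() now quiesces the application processors itself instead of relying on irq_suspend(): the boot processor raises ap_spin, parks every other CPU in ap_suspend() via smp_call_function(), issues the suspend hypercall with interrupts disabled, and releases the APs only once bp_suspend() has remapped the grant table and resumed event channels. Reduced to the flag protocol (a sketch; the "cancel" label is hypothetical, and the smp_mb() calls order the flag writes against the IPI and the APs' reads):

    ap_spin = 1;
    smp_mb();
    if (smp_call_function(ap_suspend, &ap_spin, 0, 0) < 0)
        goto cancel;                   /* could not park the APs */

    local_irq_disable();
    suspend_cancelled = bp_suspend();  /* hypercall plus resume fix-ups */
    local_irq_enable();

    smp_mb();
    ap_spin = 0;                       /* APs leave their relax/yield loop */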
26.1 --- a/unmodified_drivers/linux-2.6/platform-pci/platform-compat.c Wed Apr 11 07:30:02 2007 -0600 26.2 +++ b/unmodified_drivers/linux-2.6/platform-pci/platform-compat.c Wed Apr 11 15:45:29 2007 +0100 26.3 @@ -12,11 +12,10 @@ static int system_state = 1; 26.4 EXPORT_SYMBOL(system_state); 26.5 #endif 26.6 26.7 -static inline void ctrl_alt_del(void) 26.8 +void ctrl_alt_del(void) 26.9 { 26.10 kill_proc(1, SIGINT, 1); /* interrupt init */ 26.11 } 26.12 -EXPORT_SYMBOL(ctrl_alt_del); 26.13 26.14 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8) 26.15 size_t strcspn(const char *s, const char *reject)
27.1 --- a/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c Wed Apr 11 07:30:02 2007 -0600 27.2 +++ b/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c Wed Apr 11 15:45:29 2007 +0100 27.3 @@ -40,7 +40,6 @@ 27.4 #include <xen/interface/hvm/params.h> 27.5 #include <xen/features.h> 27.6 #include <xen/evtchn.h> 27.7 -#include <xen/gnttab.h> 27.8 #ifdef __ia64__ 27.9 #include <asm/xen/xencomm.h> 27.10 #endif 27.11 @@ -62,6 +61,8 @@ MODULE_AUTHOR("ssmith@xensource.com"); 27.12 MODULE_DESCRIPTION("Xen platform PCI device"); 27.13 MODULE_LICENSE("GPL"); 27.14 27.15 +struct pci_dev *xen_platform_pdev; 27.16 + 27.17 static unsigned long shared_info_frame; 27.18 static uint64_t callback_via; 27.19 27.20 @@ -89,8 +90,6 @@ static int __devinit init_xen_info(void) 27.21 if (shared_info_area == NULL) 27.22 panic("can't map shared info\n"); 27.23 27.24 - gnttab_init(); 27.25 - 27.26 return 0; 27.27 } 27.28 27.29 @@ -199,8 +198,10 @@ static int set_callback_via(uint64_t via 27.30 return HYPERVISOR_hvm_op(HVMOP_set_param, &a); 27.31 } 27.32 27.33 +int xen_irq_init(struct pci_dev *pdev); 27.34 int xenbus_init(void); 27.35 int xen_reboot_init(void); 27.36 +int gnttab_init(void); 27.37 27.38 static int __devinit platform_pci_init(struct pci_dev *pdev, 27.39 const struct pci_device_id *ent) 27.40 @@ -209,6 +210,10 @@ static int __devinit platform_pci_init(s 27.41 long ioaddr, iolen; 27.42 long mmio_addr, mmio_len; 27.43 27.44 + if (xen_platform_pdev) 27.45 + return -EBUSY; 27.46 + xen_platform_pdev = pdev; 27.47 + 27.48 i = pci_enable_device(pdev); 27.49 if (i) 27.50 return i; 27.51 @@ -249,9 +254,10 @@ static int __devinit platform_pci_init(s 27.52 if ((ret = init_xen_info())) 27.53 goto out; 27.54 27.55 - if ((ret = request_irq(pdev->irq, evtchn_interrupt, 27.56 - SA_SHIRQ | SA_SAMPLE_RANDOM, 27.57 - "xen-platform-pci", pdev))) 27.58 + if ((ret = gnttab_init())) 27.59 + goto out; 27.60 + 27.61 + if ((ret = xen_irq_init(pdev))) 27.62 goto out; 27.63 27.64 if ((ret = set_callback_via(callback_via))) 27.65 @@ -292,18 +298,6 @@ static struct pci_driver platform_driver 27.66 27.67 static int pci_device_registered; 27.68 27.69 -void platform_pci_suspend(void) 27.70 -{ 27.71 - gnttab_suspend(); 27.72 - irq_suspend(); 27.73 -} 27.74 - 27.75 -void platform_pci_suspend_cancel(void) 27.76 -{ 27.77 - irq_suspend_cancel(); 27.78 - gnttab_resume(); 27.79 -} 27.80 - 27.81 void platform_pci_resume(void) 27.82 { 27.83 struct xen_add_to_physmap xatp; 27.84 @@ -319,12 +313,8 @@ void platform_pci_resume(void) 27.85 if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) 27.86 BUG(); 27.87 27.88 - irq_resume(); 27.89 - 27.90 if (set_callback_via(callback_via)) 27.91 printk("platform_pci_resume failure!\n"); 27.92 - 27.93 - gnttab_resume(); 27.94 } 27.95 27.96 static int __init platform_pci_module_init(void)
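gnttab_init() moves out of init_xen_info() and the IRQ is now requested through xen_irq_init(), giving the probe path a strict order: shared info first, then grant tables, then the interrupt. Presumably this guarantees that everything evtchn_interrupt() might touch exists before request_irq() makes the handler callable; the resulting sequence, with error unwinding elided, is:

    /* Sketch of the probe-time ordering after this change. */
    if ((ret = init_xen_info()))     /* map the shared info area */
        goto out;
    if ((ret = gnttab_init()))       /* grant tables ready ... */
        goto out;
    if ((ret = xen_irq_init(pdev)))  /* ... before the IRQ can fire */
        goto out;
    if ((ret = set_callback_via(callback_via)))
        goto out;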
28.1 --- a/unmodified_drivers/linux-2.6/platform-pci/platform-pci.h Wed Apr 11 07:30:02 2007 -0600 28.2 +++ b/unmodified_drivers/linux-2.6/platform-pci/platform-pci.h Wed Apr 11 15:45:29 2007 +0100 28.3 @@ -22,16 +22,11 @@ 28.4 #ifndef _XEN_PLATFORM_PCI_H 28.5 #define _XEN_PLATFORM_PCI_H 28.6 28.7 -#include <linux/interrupt.h> 28.8 +#include <linux/pci.h> 28.9 28.10 unsigned long alloc_xen_mmio(unsigned long len); 28.11 -int gnttab_init(void); 28.12 -irqreturn_t evtchn_interrupt(int irq, void *dev_id, struct pt_regs *regs); 28.13 -void irq_suspend(void); 28.14 -void irq_suspend_cancel(void); 28.15 - 28.16 -void platform_pci_suspend(void); 28.17 -void platform_pci_suspend_cancel(void); 28.18 void platform_pci_resume(void); 28.19 28.20 +extern struct pci_dev *xen_platform_pdev; 28.21 + 28.22 #endif /* _XEN_PLATFORM_PCI_H */