ia64/xen-unstable

changeset 14811:db4fcb609383

Merge with xen-ia64-unstable.hg
author kfraser@localhost.localdomain
date Wed Apr 11 15:45:29 2007 +0100 (2007-04-11)
parents 3d356a2b1c75 0d92cd901f80
children 38204c93428e
files tools/libxc/xc_hvm_save.c tools/libxc/xc_linux_save.c
line diff
     1.1 --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/block.h	Wed Apr 11 07:30:02 2007 -0600
     1.2 +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/block.h	Wed Apr 11 15:45:29 2007 +0100
     1.3 @@ -56,20 +56,6 @@
     1.4  #include <asm/atomic.h>
     1.5  #include <asm/uaccess.h>
     1.6  
     1.7 -#if 1
     1.8 -#define IPRINTK(fmt, args...)				\
     1.9 -	printk(KERN_INFO "xen_blk: " fmt, ##args)
    1.10 -#else
    1.11 -#define IPRINTK(fmt, args...) ((void)0)
    1.12 -#endif
    1.13 -
    1.14 -#if 1
    1.15 -#define WPRINTK(fmt, args...)				\
    1.16 -	printk(KERN_WARNING "xen_blk: " fmt, ##args)
    1.17 -#else
    1.18 -#define WPRINTK(fmt, args...) ((void)0)
    1.19 -#endif
    1.20 -
    1.21  #define DPRINTK(_f, _a...) pr_debug(_f, ## _a)
    1.22  
    1.23  #if 0
     2.1 --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c	Wed Apr 11 07:30:02 2007 -0600
     2.2 +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c	Wed Apr 11 15:45:29 2007 +0100
     2.3 @@ -128,14 +128,12 @@ xlbd_alloc_major_info(int major, int min
     2.4  		break;
     2.5  	}
     2.6  
     2.7 -	printk("Registering block device major %i\n", ptr->major);
     2.8  	if (register_blkdev(ptr->major, ptr->type->devname)) {
     2.9 -		WPRINTK("can't get major %d with name %s\n",
    2.10 -			ptr->major, ptr->type->devname);
    2.11  		kfree(ptr);
    2.12  		return NULL;
    2.13  	}
    2.14  
    2.15 +	printk("xen-vbd: registered block device major %i\n", ptr->major);
    2.16  	major_info[index] = ptr;
    2.17  	return ptr;
    2.18  }
     3.1 --- a/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c	Wed Apr 11 07:30:02 2007 -0600
     3.2 +++ b/linux-2.6-xen-sparse/drivers/xen/core/gnttab.c	Wed Apr 11 15:45:29 2007 +0100
     3.3 @@ -60,9 +60,6 @@ static grant_ref_t gnttab_free_head;
     3.4  static DEFINE_SPINLOCK(gnttab_list_lock);
     3.5  
     3.6  static struct grant_entry *shared;
     3.7 -#ifndef CONFIG_XEN
     3.8 -static unsigned long resume_frames;
     3.9 -#endif
    3.10  
    3.11  static struct gnttab_free_callback *gnttab_free_callback_list;
    3.12  
    3.13 @@ -514,6 +511,8 @@ int gnttab_suspend(void)
    3.14  
    3.15  #include <platform-pci.h>
    3.16  
    3.17 +static unsigned long resume_frames;
    3.18 +
    3.19  static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
    3.20  {
    3.21  	struct xen_add_to_physmap xatp;
    3.22 @@ -543,23 +542,17 @@ int gnttab_resume(void)
    3.23  	if (max_nr_gframes < nr_gframes)
    3.24  		return -ENOSYS;
    3.25  
    3.26 -	resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
    3.27 +	if (!resume_frames) {
    3.28 +		resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
    3.29 +		shared = ioremap(resume_frames, PAGE_SIZE * max_nr_gframes);
    3.30 +		if (shared == NULL) {
     3.31 +			printk("failed to ioremap gnttab shared frames\n");
    3.32 +			return -1;
    3.33 +		}
    3.34 +	}
    3.35  
    3.36  	gnttab_map(0, nr_gframes - 1);
    3.37  
    3.38 -	shared = ioremap(resume_frames, PAGE_SIZE * max_nr_gframes);
    3.39 -	if (shared == NULL) {
    3.40 -		printk("error to ioremap gnttab share frames\n");
    3.41 -		return -1;
    3.42 -	}
    3.43 -
    3.44 -	return 0;
    3.45 -}
    3.46 -
    3.47 -int gnttab_suspend(void)
    3.48 -{
    3.49 -	iounmap(shared);
    3.50 -	resume_frames = 0;
    3.51  	return 0;
    3.52  }
    3.53  
    3.54 @@ -624,7 +617,6 @@ int __devinit gnttab_init(void)
    3.55  	gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
    3.56  	gnttab_free_head  = NR_RESERVED_ENTRIES;
    3.57  
    3.58 -	printk("Grant table initialized\n");
    3.59  	return 0;
    3.60  
    3.61   ini_nomem:
     4.1 --- a/linux-2.6-xen-sparse/drivers/xen/core/machine_reboot.c	Wed Apr 11 07:30:02 2007 -0600
     4.2 +++ b/linux-2.6-xen-sparse/drivers/xen/core/machine_reboot.c	Wed Apr 11 15:45:29 2007 +0100
     4.3 @@ -209,6 +209,8 @@ int __xen_suspend(int fast_suspend)
     4.4  	if (fast_suspend) {
     4.5  		xenbus_suspend();
     4.6  		err = stop_machine_run(take_machine_down, &fast_suspend, 0);
     4.7 +		if (err < 0)
     4.8 +			xenbus_suspend_cancel();
     4.9  	} else {
    4.10  		err = take_machine_down(&fast_suspend);
    4.11  	}
     5.1 --- a/tools/blktap/drivers/block-qcow.c	Wed Apr 11 07:30:02 2007 -0600
     5.2 +++ b/tools/blktap/drivers/block-qcow.c	Wed Apr 11 15:45:29 2007 +0100
     5.3 @@ -949,8 +949,14 @@ int tdqcow_open (struct disk_driver *dd,
     5.4  		goto fail;
     5.5  	}
     5.6  	init_fds(dd);
     5.7 -	s->fd_end = (final_cluster == 0 ? (s->l1_table_offset + l1_table_size) : 
     5.8 -				(final_cluster + s->cluster_size));
     5.9 +
    5.10 +	if (!final_cluster)
    5.11 +		s->fd_end = s->l1_table_offset + l1_table_size;
    5.12 +	else {
    5.13 +		s->fd_end = lseek64(fd, 0, SEEK_END);
    5.14 +		if (s->fd_end == (off64_t)-1)
    5.15 +			goto fail;
    5.16 +	}
    5.17  
    5.18  	return 0;
    5.19  	
     6.1 --- a/tools/ioemu/hw/pc.c	Wed Apr 11 07:30:02 2007 -0600
     6.2 +++ b/tools/ioemu/hw/pc.c	Wed Apr 11 15:45:29 2007 +0100
     6.3 @@ -902,7 +902,6 @@ static void pc_init1(uint64_t ram_size, 
     6.4      if (pci_enabled && acpi_enabled) {
     6.5          piix4_pm_init(pci_bus, piix3_devfn + 3);
     6.6      }
     6.7 -#endif /* !CONFIG_DM */
     6.8  
     6.9  #if 0
    6.10      /* ??? Need to figure out some way for the user to
    6.11 @@ -921,6 +920,17 @@ static void pc_init1(uint64_t ram_size, 
    6.12          lsi_scsi_attach(scsi, bdrv, -1);
    6.13      }
    6.14  #endif
    6.15 +#else
    6.16 +    if (pci_enabled) {
    6.17 +        void *scsi;
    6.18 +
    6.19 +        scsi = lsi_scsi_init(pci_bus, -1);
    6.20 +        for (i = 0; i < MAX_SCSI_DISKS ; i++) {
    6.21 +            if (bs_table[i + MAX_DISKS]) 
    6.22 +                lsi_scsi_attach(scsi, bs_table[i + MAX_DISKS], -1);
    6.23 +        }
    6.24 +    }
    6.25 +#endif /* !CONFIG_DM */
    6.26      /* must be done after all PCI devices are instanciated */
    6.27      /* XXX: should be done in the Bochs BIOS */
    6.28      if (pci_enabled) {
     7.1 --- a/tools/ioemu/vl.c	Wed Apr 11 07:30:02 2007 -0600
     7.2 +++ b/tools/ioemu/vl.c	Wed Apr 11 15:45:29 2007 +0100
     7.3 @@ -116,7 +116,7 @@ char phys_ram_file[1024];
     7.4  void *ioport_opaque[MAX_IOPORTS];
     7.5  IOPortReadFunc *ioport_read_table[3][MAX_IOPORTS];
     7.6  IOPortWriteFunc *ioport_write_table[3][MAX_IOPORTS];
     7.7 -BlockDriverState *bs_table[MAX_DISKS], *fd_table[MAX_FD];
     7.8 +BlockDriverState *bs_table[MAX_DISKS+MAX_SCSI_DISKS], *fd_table[MAX_FD];
     7.9  int vga_ram_size;
    7.10  int bios_size;
    7.11  static DisplayState display_state;
     8.1 --- a/tools/ioemu/vl.h	Wed Apr 11 07:30:02 2007 -0600
     8.2 +++ b/tools/ioemu/vl.h	Wed Apr 11 15:45:29 2007 +0100
     8.3 @@ -818,8 +818,9 @@ int vnc_start_viewer(int port);
     8.4  
     8.5  /* ide.c */
     8.6  #define MAX_DISKS 4
     8.7 +#define MAX_SCSI_DISKS 7
     8.8  
     8.9 -extern BlockDriverState *bs_table[MAX_DISKS];
    8.10 +extern BlockDriverState *bs_table[MAX_DISKS+MAX_SCSI_DISKS];
    8.11  
    8.12  void isa_ide_init(int iobase, int iobase2, int irq,
    8.13                    BlockDriverState *hd0, BlockDriverState *hd1);
     9.1 --- a/tools/ioemu/xenstore.c	Wed Apr 11 07:30:02 2007 -0600
     9.2 +++ b/tools/ioemu/xenstore.c	Wed Apr 11 15:45:29 2007 +0100
     9.3 @@ -30,11 +30,11 @@ static int pasprintf(char **buf, const c
     9.4      int ret = 0;
     9.5  
     9.6      if (*buf)
     9.7 -	free(*buf);
     9.8 +        free(*buf);
     9.9      va_start(ap, fmt);
    9.10      if (vasprintf(buf, fmt, ap) == -1) {
    9.11 -	buf = NULL;
    9.12 -	ret = -1;
    9.13 +        buf = NULL;
    9.14 +        ret = -1;
    9.15      }
    9.16      va_end(ap);
    9.17      return ret;
    9.18 @@ -45,11 +45,11 @@ static void insert_media(void *opaque)
    9.19      int i;
    9.20  
    9.21      for (i = 0; i < MAX_DISKS; i++) {
    9.22 -	if (media_filename[i] && bs_table[i]) {
    9.23 -	    do_change(bs_table[i]->device_name, media_filename[i]);
    9.24 -	    free(media_filename[i]);
    9.25 -	    media_filename[i] = NULL;
    9.26 -	}
    9.27 +        if (media_filename[i] && bs_table[i]) {
    9.28 +            do_change(bs_table[i]->device_name, media_filename[i]);
    9.29 +            free(media_filename[i]);
    9.30 +            media_filename[i] = NULL;
    9.31 +        }
    9.32      }
    9.33  }
    9.34  
    9.35 @@ -57,7 +57,7 @@ void xenstore_check_new_media_present(in
    9.36  {
    9.37  
    9.38      if (insert_timer == NULL)
    9.39 -	insert_timer = qemu_new_timer(rt_clock, insert_media, NULL);
    9.40 +        insert_timer = qemu_new_timer(rt_clock, insert_media, NULL);
    9.41      qemu_mod_timer(insert_timer, qemu_get_clock(rt_clock) + timeout);
    9.42  }
    9.43  
    9.44 @@ -82,8 +82,8 @@ void xenstore_parse_domain_config(int do
    9.45      char **e = NULL;
    9.46      char *buf = NULL, *path;
    9.47      char *fpath = NULL, *bpath = NULL,
    9.48 -         *dev = NULL, *params = NULL, *type = NULL;
    9.49 -    int i;
    9.50 +        *dev = NULL, *params = NULL, *type = NULL;
    9.51 +    int i, is_scsi;
    9.52      unsigned int len, num, hd_index;
    9.53  
    9.54      for(i = 0; i < MAX_DISKS; i++)
    9.55 @@ -91,8 +91,8 @@ void xenstore_parse_domain_config(int do
    9.56  
    9.57      xsh = xs_daemon_open();
    9.58      if (xsh == NULL) {
    9.59 -	fprintf(logfile, "Could not contact xenstore for domain config\n");
    9.60 -	return;
    9.61 +        fprintf(logfile, "Could not contact xenstore for domain config\n");
    9.62 +        return;
    9.63      }
    9.64  
    9.65      path = xs_get_domain_path(xsh, domid);
    9.66 @@ -102,59 +102,60 @@ void xenstore_parse_domain_config(int do
    9.67      }
    9.68  
    9.69      if (pasprintf(&buf, "%s/device/vbd", path) == -1)
    9.70 -	goto out;
    9.71 +        goto out;
    9.72  
    9.73      e = xs_directory(xsh, XBT_NULL, buf, &num);
    9.74      if (e == NULL)
    9.75 -	goto out;
    9.76 +        goto out;
    9.77  
    9.78      for (i = 0; i < num; i++) {
    9.79 -	/* read the backend path */
    9.80 -	if (pasprintf(&buf, "%s/device/vbd/%s/backend", path, e[i]) == -1)
    9.81 -	    continue;
    9.82 -	free(bpath);
    9.83 +        /* read the backend path */
    9.84 +        if (pasprintf(&buf, "%s/device/vbd/%s/backend", path, e[i]) == -1)
    9.85 +            continue;
    9.86 +        free(bpath);
    9.87          bpath = xs_read(xsh, XBT_NULL, buf, &len);
    9.88 -	if (bpath == NULL)
    9.89 -	    continue;
    9.90 -	/* read the name of the device */
    9.91 -	if (pasprintf(&buf, "%s/dev", bpath) == -1)
    9.92 -	    continue;
    9.93 -	free(dev);
    9.94 -	dev = xs_read(xsh, XBT_NULL, buf, &len);
    9.95 -	if (dev == NULL)
    9.96 -	    continue;
    9.97 -	if (strncmp(dev, "hd", 2) || strlen(dev) != 3)
    9.98 -	    continue;
    9.99 -	hd_index = dev[2] - 'a';
   9.100 -	if (hd_index >= MAX_DISKS)
   9.101 -	    continue;
   9.102 -	/* read the type of the device */
   9.103 -	if (pasprintf(&buf, "%s/device/vbd/%s/device-type", path, e[i]) == -1)
   9.104 -	    continue;
   9.105 -	free(type);
   9.106 -	type = xs_read(xsh, XBT_NULL, buf, &len);
   9.107 -	if (pasprintf(&buf, "%s/params", bpath) == -1)
   9.108 -	    continue;
   9.109 -	free(params);
   9.110 -	params = xs_read(xsh, XBT_NULL, buf, &len);
   9.111 -	if (params == NULL)
   9.112 -	    continue;
   9.113 +        if (bpath == NULL)
   9.114 +            continue;
   9.115 +        /* read the name of the device */
   9.116 +        if (pasprintf(&buf, "%s/dev", bpath) == -1)
   9.117 +            continue;
   9.118 +        free(dev);
   9.119 +        dev = xs_read(xsh, XBT_NULL, buf, &len);
   9.120 +        if (dev == NULL)
   9.121 +            continue;
   9.122 +        is_scsi = !strncmp(dev, "sd", 2);
   9.123 +        if ((strncmp(dev, "hd", 2) && !is_scsi) || strlen(dev) != 3 )
   9.124 +            continue;
   9.125 +        hd_index = dev[2] - 'a';
   9.126 +        if (hd_index >= (is_scsi ? MAX_SCSI_DISKS : MAX_DISKS))
   9.127 +            continue;
   9.128 +        /* read the type of the device */
   9.129 +        if (pasprintf(&buf, "%s/device/vbd/%s/device-type", path, e[i]) == -1)
   9.130 +            continue;
   9.131 +        free(type);
   9.132 +        type = xs_read(xsh, XBT_NULL, buf, &len);
   9.133 +        if (pasprintf(&buf, "%s/params", bpath) == -1)
   9.134 +            continue;
   9.135 +        free(params);
   9.136 +        params = xs_read(xsh, XBT_NULL, buf, &len);
   9.137 +        if (params == NULL)
   9.138 +            continue;
   9.139          /* 
   9.140           * check if device has a phantom vbd; the phantom is hooked
   9.141           * to the frontend device (for ease of cleanup), so lookup 
   9.142           * the frontend device, and see if there is a phantom_vbd
   9.143           * if there is, we will use resolution as the filename
   9.144           */
   9.145 -	if (pasprintf(&buf, "%s/device/vbd/%s/phantom_vbd", path, e[i]) == -1)
   9.146 -	    continue;
   9.147 -	free(fpath);
   9.148 +        if (pasprintf(&buf, "%s/device/vbd/%s/phantom_vbd", path, e[i]) == -1)
   9.149 +            continue;
   9.150 +        free(fpath);
   9.151          fpath = xs_read(xsh, XBT_NULL, buf, &len);
   9.152 -	if (fpath) {
   9.153 -	    if (pasprintf(&buf, "%s/dev", fpath) == -1)
   9.154 -	        continue;
   9.155 -	    free(params);
   9.156 +        if (fpath) {
   9.157 +            if (pasprintf(&buf, "%s/dev", fpath) == -1)
   9.158 +                continue;
   9.159 +            free(params);
   9.160              params = xs_read(xsh, XBT_NULL, buf , &len);
   9.161 -	    if (params) {
   9.162 +            if (params) {
   9.163                  /* 
   9.164                   * wait for device, on timeout silently fail because we will 
   9.165                   * fail to open below
   9.166 @@ -163,19 +164,20 @@ void xenstore_parse_domain_config(int do
   9.167              }
   9.168          }
   9.169  
   9.170 -	bs_table[hd_index] = bdrv_new(dev);
   9.171 -	/* check if it is a cdrom */
   9.172 -	if (type && !strcmp(type, "cdrom")) {
   9.173 -	    bdrv_set_type_hint(bs_table[hd_index], BDRV_TYPE_CDROM);
   9.174 -	    if (pasprintf(&buf, "%s/params", bpath) != -1)
   9.175 -		xs_watch(xsh, buf, dev);
   9.176 -	}
   9.177 -	/* open device now if media present */
   9.178 -	if (params[0]) {
   9.179 -            if (bdrv_open(bs_table[hd_index], params, 0 /* snapshot */) < 0)
   9.180 +        bs_table[hd_index + (is_scsi ? MAX_DISKS : 0)] = bdrv_new(dev);
   9.181 +        /* check if it is a cdrom */
   9.182 +        if (type && !strcmp(type, "cdrom")) {
   9.183 +            bdrv_set_type_hint(bs_table[hd_index], BDRV_TYPE_CDROM);
   9.184 +            if (pasprintf(&buf, "%s/params", bpath) != -1)
   9.185 +                xs_watch(xsh, buf, dev);
   9.186 +        }
   9.187 +        /* open device now if media present */
   9.188 +        if (params[0]) {
   9.189 +            if (bdrv_open(bs_table[hd_index + (is_scsi ? MAX_DISKS : 0)],
   9.190 +                          params, 0 /* snapshot */) < 0)
   9.191                  fprintf(stderr, "qemu: could not open hard disk image '%s'\n",
   9.192                          params);
   9.193 -	}
   9.194 +        }
   9.195      }
   9.196  
   9.197      /* Set a watch for log-dirty requests from the migration tools */
   9.198 @@ -199,7 +201,7 @@ void xenstore_parse_domain_config(int do
   9.199  int xenstore_fd(void)
   9.200  {
   9.201      if (xsh)
   9.202 -	return xs_fileno(xsh);
   9.203 +        return xs_fileno(xsh);
   9.204      return -1;
   9.205  }
   9.206  
   9.207 @@ -316,7 +318,7 @@ void xenstore_process_event(void *opaque
   9.208  
   9.209      vec = xs_read_watch(xsh, &num);
   9.210      if (!vec)
   9.211 -	return;
   9.212 +        return;
   9.213  
   9.214      if (!strcmp(vec[XS_WATCH_TOKEN], "logdirty")) {
   9.215          xenstore_process_logdirty_event();
   9.216 @@ -324,23 +326,23 @@ void xenstore_process_event(void *opaque
   9.217      }
   9.218  
   9.219      if (strncmp(vec[XS_WATCH_TOKEN], "hd", 2) ||
   9.220 -	strlen(vec[XS_WATCH_TOKEN]) != 3)
   9.221 -	goto out;
   9.222 +        strlen(vec[XS_WATCH_TOKEN]) != 3)
   9.223 +        goto out;
   9.224      hd_index = vec[XS_WATCH_TOKEN][2] - 'a';
   9.225      image = xs_read(xsh, XBT_NULL, vec[XS_WATCH_PATH], &len);
   9.226      if (image == NULL || !strcmp(image, bs_table[hd_index]->filename))
   9.227 -	goto out;		/* gone or identical */
   9.228 +        goto out;  /* gone or identical */
   9.229  
   9.230      do_eject(0, vec[XS_WATCH_TOKEN]);
   9.231      bs_table[hd_index]->filename[0] = 0;
   9.232      if (media_filename[hd_index]) {
   9.233 -	free(media_filename[hd_index]);
   9.234 -	media_filename[hd_index] = NULL;
   9.235 +        free(media_filename[hd_index]);
   9.236 +        media_filename[hd_index] = NULL;
   9.237      }
   9.238  
   9.239      if (image[0]) {
   9.240 -	media_filename[hd_index] = strdup(image);
   9.241 -	xenstore_check_new_media_present(5000);
   9.242 +        media_filename[hd_index] = strdup(image);
   9.243 +        xenstore_check_new_media_present(5000);
   9.244      }
   9.245  
   9.246   out:
   9.247 @@ -354,7 +356,7 @@ void xenstore_write_vncport(int display)
   9.248      char *portstr = NULL;
   9.249  
   9.250      if (xsh == NULL)
   9.251 -	return;
   9.252 +        return;
   9.253  
   9.254      path = xs_get_domain_path(xsh, domid);
   9.255      if (path == NULL) {
   9.256 @@ -363,10 +365,10 @@ void xenstore_write_vncport(int display)
   9.257      }
   9.258  
   9.259      if (pasprintf(&buf, "%s/console/vnc-port", path) == -1)
   9.260 -	goto out;
   9.261 +        goto out;
   9.262  
   9.263      if (pasprintf(&portstr, "%d", 5900 + display) == -1)
   9.264 -	goto out;
   9.265 +        goto out;
   9.266  
   9.267      if (xs_write(xsh, XBT_NULL, buf, portstr, strlen(portstr)) == 0)
   9.268          fprintf(logfile, "xs_write() vncport failed\n");
   9.269 @@ -383,41 +385,41 @@ int xenstore_read_vncpasswd(int domid)
   9.270      unsigned int i, len, rc = 0;
   9.271  
   9.272      if (xsh == NULL) {
   9.273 -	return -1;
   9.274 +        return -1;
   9.275      }
   9.276  
   9.277      path = xs_get_domain_path(xsh, domid);
   9.278      if (path == NULL) {
   9.279 -	fprintf(logfile, "xs_get_domain_path() error. domid %d.\n", domid);
   9.280 -	return -1;
   9.281 +        fprintf(logfile, "xs_get_domain_path() error. domid %d.\n", domid);
   9.282 +        return -1;
   9.283      }
   9.284  
   9.285      pasprintf(&buf, "%s/vm", path);
   9.286      uuid = xs_read(xsh, XBT_NULL, buf, &len);
   9.287      if (uuid == NULL) {
   9.288 -	fprintf(logfile, "xs_read(): uuid get error. %s.\n", buf);
   9.289 -	free(path);
   9.290 -	return -1;
   9.291 +        fprintf(logfile, "xs_read(): uuid get error. %s.\n", buf);
   9.292 +        free(path);
   9.293 +        return -1;
   9.294      }
   9.295  
   9.296      pasprintf(&buf, "%s/vncpasswd", uuid);
   9.297      passwd = xs_read(xsh, XBT_NULL, buf, &len);
   9.298      if (passwd == NULL) {
   9.299 -	fprintf(logfile, "xs_read(): vncpasswd get error. %s.\n", buf);
   9.300 -	free(uuid);
   9.301 -	free(path);
   9.302 -	return rc;
   9.303 +        fprintf(logfile, "xs_read(): vncpasswd get error. %s.\n", buf);
   9.304 +        free(uuid);
   9.305 +        free(path);
   9.306 +        return rc;
   9.307      }
   9.308  
   9.309      for (i=0; i<len && i<63; i++) {
   9.310 -	vncpasswd[i] = passwd[i];
   9.311 -	passwd[i] = '\0';
   9.312 +        vncpasswd[i] = passwd[i];
   9.313 +        passwd[i] = '\0';
   9.314      }
   9.315      vncpasswd[len] = '\0';
   9.316      pasprintf(&buf, "%s/vncpasswd", uuid);
   9.317      if (xs_write(xsh, XBT_NULL, buf, passwd, len) == 0) {
   9.318 -	fprintf(logfile, "xs_write() vncpasswd failed.\n");
   9.319 -	rc = -1;
   9.320 +        fprintf(logfile, "xs_write() vncpasswd failed.\n");
   9.321 +        rc = -1;
   9.322      }
   9.323  
   9.324      free(passwd);
   9.325 @@ -443,7 +445,7 @@ char **xenstore_domain_get_devices(struc
   9.326          goto out;
   9.327  
   9.328      if (pasprintf(&buf, "%s/device/%s", path,devtype) == -1)
   9.329 -	goto out;
   9.330 +        goto out;
   9.331  
   9.332      e = xs_directory(handle, XBT_NULL, buf, num);
   9.333  
   9.334 @@ -496,13 +498,13 @@ char *xenstore_backend_read_variable(str
   9.335  
   9.336      buf = get_device_variable_path(devtype, inst, var);
   9.337      if (NULL == buf)
   9.338 -	goto out;
   9.339 +        goto out;
   9.340  
   9.341      value = xs_read(handle, XBT_NULL, buf, &len);
   9.342  
   9.343      free(buf);
   9.344  
   9.345 -out:
   9.346 + out:
   9.347      return value;
   9.348  }
   9.349  
   9.350 @@ -569,27 +571,27 @@ char *xenstore_vm_read(int domid, char *
   9.351      char *buf = NULL, *path = NULL, *value = NULL;
   9.352  
   9.353      if (xsh == NULL)
   9.354 -	goto out;
   9.355 +        goto out;
   9.356  
   9.357      path = xs_get_domain_path(xsh, domid);
   9.358      if (path == NULL) {
   9.359 -	fprintf(logfile, "xs_get_domain_path(%d): error\n", domid);
   9.360 -	goto out;
   9.361 +        fprintf(logfile, "xs_get_domain_path(%d): error\n", domid);
   9.362 +        goto out;
   9.363      }
   9.364  
   9.365      pasprintf(&buf, "%s/vm", path);
   9.366      free(path);
   9.367      path = xs_read(xsh, XBT_NULL, buf, NULL);
   9.368      if (path == NULL) {
   9.369 -	fprintf(logfile, "xs_read(%s): read error\n", buf);
   9.370 -	goto out;
   9.371 +        fprintf(logfile, "xs_read(%s): read error\n", buf);
   9.372 +        goto out;
   9.373      }
   9.374  
   9.375      pasprintf(&buf, "%s/%s", path, key);
   9.376      value = xs_read(xsh, XBT_NULL, buf, len);
   9.377      if (value == NULL) {
   9.378 -	fprintf(logfile, "xs_read(%s): read error\n", buf);
   9.379 -	goto out;
   9.380 +        fprintf(logfile, "xs_read(%s): read error\n", buf);
   9.381 +        goto out;
   9.382      }
   9.383  
   9.384   out:
   9.385 @@ -604,27 +606,27 @@ int xenstore_vm_write(int domid, char *k
   9.386      int rc = -1;
   9.387  
   9.388      if (xsh == NULL)
   9.389 -	goto out;
   9.390 +        goto out;
   9.391  
   9.392      path = xs_get_domain_path(xsh, domid);
   9.393      if (path == NULL) {
   9.394 -	fprintf(logfile, "xs_get_domain_path: error\n");
   9.395 -	goto out;
   9.396 +        fprintf(logfile, "xs_get_domain_path: error\n");
   9.397 +        goto out;
   9.398      }
   9.399  
   9.400      pasprintf(&buf, "%s/vm", path);
   9.401      free(path);
   9.402      path = xs_read(xsh, XBT_NULL, buf, NULL);
   9.403      if (path == NULL) {
   9.404 -	fprintf(logfile, "xs_read(%s): read error\n", buf);
   9.405 -	goto out;
   9.406 +        fprintf(logfile, "xs_read(%s): read error\n", buf);
   9.407 +        goto out;
   9.408      }
   9.409  
   9.410      pasprintf(&buf, "%s/%s", path, key);
   9.411      rc = xs_write(xsh, XBT_NULL, buf, value, strlen(value));
   9.412      if (rc) {
   9.413 -	fprintf(logfile, "xs_write(%s, %s): write error\n", buf, key);
   9.414 -	goto out;
   9.415 +        fprintf(logfile, "xs_write(%s, %s): write error\n", buf, key);
   9.416 +        goto out;
   9.417      }
   9.418  
   9.419   out:
    10.1 --- a/tools/libfsimage/fat/fat.h	Wed Apr 11 07:30:02 2007 -0600
    10.2 +++ b/tools/libfsimage/fat/fat.h	Wed Apr 11 15:45:29 2007 +0100
    10.3 @@ -84,17 +84,17 @@ struct fat_bpb {
    10.4  #define FAT_DIRENTRY_LENGTH       32
    10.5  
    10.6  #define FAT_DIRENTRY_ATTRIB(entry) \
    10.7 -  (*((unsigned char *) (entry+11)))
    10.8 +  (*((__u8 *) (entry+11)))
    10.9  #define FAT_DIRENTRY_VALID(entry) \
   10.10 -  ( ((*((unsigned char *) entry)) != 0) \
   10.11 -    && ((*((unsigned char *) entry)) != 0xE5) \
   10.12 +  ( ((*((__u8 *) entry)) != 0) \
   10.13 +    && ((*((__u8 *) entry)) != 0xE5) \
   10.14      && !(FAT_DIRENTRY_ATTRIB(entry) & FAT_ATTRIB_NOT_OK_MASK) )
   10.15  #define FAT_DIRENTRY_FIRST_CLUSTER(entry) \
   10.16 -  ((*((unsigned short *) (entry+26)))+(*((unsigned short *) (entry+20)) << 16))
   10.17 +  ((*((__u16 *) (entry+26)))+(*((__u16 *) (entry+20)) << 16))
   10.18  #define FAT_DIRENTRY_FILELENGTH(entry) \
   10.19 -  (*((unsigned long *) (entry+28)))
   10.20 +  (*((__u32 *) (entry+28)))
   10.21  
   10.22  #define FAT_LONGDIR_ID(entry) \
   10.23 -  (*((unsigned char *) (entry)))
   10.24 +  (*((__u8 *) (entry)))
   10.25  #define FAT_LONGDIR_ALIASCHECKSUM(entry) \
   10.26 -  (*((unsigned char *) (entry+13)))
   10.27 +  (*((__u8 *) (entry+13)))
    11.1 --- a/tools/libxc/Makefile	Wed Apr 11 07:30:02 2007 -0600
    11.2 +++ b/tools/libxc/Makefile	Wed Apr 11 15:45:29 2007 +0100
    11.3 @@ -26,8 +26,8 @@ CTRL_SRCS-$(CONFIG_X86_Linux) += xc_ptra
    11.4  
    11.5  GUEST_SRCS-y :=
    11.6  GUEST_SRCS-y += xg_private.c
    11.7 -GUEST_SRCS-$(CONFIG_MIGRATE) += xc_domain_restore.c xc_linux_save.c
    11.8 -GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c xc_hvm_save.c
    11.9 +GUEST_SRCS-$(CONFIG_MIGRATE) += xc_domain_restore.c xc_domain_save.c
   11.10 +GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c
   11.11  
   11.12  # symlink libelf from xen/common/libelf/
   11.13  LIBELF_SRCS := libelf-tools.c libelf-loader.c
    12.1 --- a/tools/libxc/ia64/xc_ia64_linux_save.c	Wed Apr 11 07:30:02 2007 -0600
    12.2 +++ b/tools/libxc/ia64/xc_ia64_linux_save.c	Wed Apr 11 15:45:29 2007 +0100
    12.3 @@ -134,8 +134,10 @@ retry:
    12.4  }
    12.5  
    12.6  int
    12.7 -xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
    12.8 -              uint32_t max_factor, uint32_t flags, int (*suspend)(int))
    12.9 +xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
   12.10 +               uint32_t max_factor, uint32_t flags, int (*suspend)(int),
   12.11 +               int hvm, void *(*init_qemu_maps)(int, unsigned),
   12.12 +               void (*qemu_flip_buffer)(int, int))
   12.13  {
   12.14      DECLARE_DOMCTL;
   12.15      xc_dominfo_t info;
    13.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.2 +++ b/tools/libxc/xc_domain_save.c	Wed Apr 11 15:45:29 2007 +0100
    13.3 @@ -0,0 +1,1609 @@
    13.4 +/******************************************************************************
     13.5 + * xc_domain_save.c
    13.6 + *
    13.7 + * Save the state of a running Linux session.
    13.8 + *
    13.9 + * Copyright (c) 2003, K A Fraser.
   13.10 + */
   13.11 +
   13.12 +#include <inttypes.h>
   13.13 +#include <time.h>
   13.14 +#include <stdlib.h>
   13.15 +#include <unistd.h>
   13.16 +#include <sys/time.h>
   13.17 +
   13.18 +#include "xc_private.h"
   13.19 +#include "xc_dom.h"
   13.20 +#include "xg_private.h"
   13.21 +#include "xg_save_restore.h"
   13.22 +
   13.23 +#include <xen/hvm/params.h>
   13.24 +#include <xen/hvm/e820.h>
   13.25 +
   13.26 +/*
   13.27 +** Default values for important tuning parameters. Can override by passing
   13.28 +** non-zero replacement values to xc_domain_save().
   13.29 +**
   13.30 +** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
   13.31 +**
   13.32 +*/
   13.33 +#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop   */
   13.34 +#define DEF_MAX_FACTOR   3   /* never send more than 3x p2m_size  */
   13.35 +
   13.36 +/* max mfn of the whole machine */
   13.37 +static unsigned long max_mfn;
   13.38 +
   13.39 +/* virtual starting address of the hypervisor */
   13.40 +static unsigned long hvirt_start;
   13.41 +
   13.42 +/* #levels of page tables used by the current guest */
   13.43 +static unsigned int pt_levels;
   13.44 +
   13.45 +/* HVM: shared-memory bitmaps for getting log-dirty bits from qemu-dm */
   13.46 +static unsigned long *qemu_bitmaps[2];
   13.47 +static int qemu_active;
   13.48 +static int qemu_non_active;
   13.49 +
   13.50 +/* number of pfns this guest has (i.e. number of entries in the P2M) */
   13.51 +static unsigned long p2m_size;
   13.52 +
   13.53 +/* Live mapping of the table mapping each PFN to its current MFN. */
   13.54 +static xen_pfn_t *live_p2m = NULL;
   13.55 +
   13.56 +/* Live mapping of system MFN to PFN table. */
   13.57 +static xen_pfn_t *live_m2p = NULL;
   13.58 +static unsigned long m2p_mfn0;
   13.59 +
   13.60 +/* grep fodder: machine_to_phys */
   13.61 +
   13.62 +#define mfn_to_pfn(_mfn) live_m2p[(_mfn)]
   13.63 +
   13.64 +/*
   13.65 + * Returns TRUE if the given machine frame number has a unique mapping
   13.66 + * in the guest's pseudophysical map.
   13.67 + */
   13.68 +#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)          \
   13.69 +    (((_mfn) < (max_mfn)) &&                    \
   13.70 +     ((mfn_to_pfn(_mfn) < (p2m_size)) &&        \
   13.71 +      (live_p2m[mfn_to_pfn(_mfn)] == (_mfn))))
   13.72 +
   13.73 +/* Returns TRUE if MFN is successfully converted to a PFN. */
   13.74 +#define translate_mfn_to_pfn(_pmfn)                             \
   13.75 +({                                                              \
   13.76 +    unsigned long mfn = *(_pmfn);                               \
   13.77 +    int _res = 1;                                               \
   13.78 +    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )                       \
   13.79 +        _res = 0;                                               \
   13.80 +    else                                                        \
   13.81 +        *(_pmfn) = mfn_to_pfn(mfn);                             \
   13.82 +    _res;                                                       \
   13.83 +})
   13.84 +
   13.85 +/*
   13.86 +** During (live) save/migrate, we maintain a number of bitmaps to track
   13.87 +** which pages we have to send, to fixup, and to skip.
   13.88 +*/
   13.89 +
   13.90 +#define BITS_PER_LONG (sizeof(unsigned long) * 8)
   13.91 +#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
   13.92 +#define BITMAP_SIZE   (BITS_TO_LONGS(p2m_size) * sizeof(unsigned long))
   13.93 +
   13.94 +#define BITMAP_ENTRY(_nr,_bmap) \
   13.95 +   ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
   13.96 +
   13.97 +#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
   13.98 +
   13.99 +static inline int test_bit (int nr, volatile void * addr)
  13.100 +{
  13.101 +    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
  13.102 +}
  13.103 +
  13.104 +static inline void clear_bit (int nr, volatile void * addr)
  13.105 +{
  13.106 +    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
  13.107 +}
  13.108 +
  13.109 +static inline void set_bit ( int nr, volatile void * addr)
  13.110 +{
  13.111 +    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
  13.112 +}
  13.113 +
  13.114 +/* Returns the hamming weight (i.e. the number of bits set) in a N-bit word */
  13.115 +static inline unsigned int hweight32(unsigned int w)
  13.116 +{
  13.117 +    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
  13.118 +    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
  13.119 +    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
  13.120 +    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
  13.121 +    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
  13.122 +}
  13.123 +
  13.124 +static inline int count_bits ( int nr, volatile void *addr)
  13.125 +{
  13.126 +    int i, count = 0;
  13.127 +    volatile unsigned long *p = (volatile unsigned long *)addr;
  13.128 +    /* We know that the array is padded to unsigned long. */
  13.129 +    for ( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
  13.130 +        count += hweight32(*p);
  13.131 +    return count;
  13.132 +}
  13.133 +
  13.134 +static inline int permute( int i, int nr, int order_nr  )
  13.135 +{
  13.136 +    /* Need a simple permutation function so that we scan pages in a
  13.137 +       pseudo random order, enabling us to get a better estimate of
  13.138 +       the domain's page dirtying rate as we go (there are often
  13.139 +       contiguous ranges of pfns that have similar behaviour, and we
  13.140 +       want to mix them up. */
  13.141 +
  13.142 +    /* e.g. nr->oder 15->4 16->4 17->5 */
  13.143 +    /* 512MB domain, 128k pages, order 17 */
  13.144 +
  13.145 +    /*
  13.146 +      QPONMLKJIHGFEDCBA
  13.147 +             QPONMLKJIH
  13.148 +      GFEDCBA
  13.149 +     */
  13.150 +
  13.151 +    /*
  13.152 +      QPONMLKJIHGFEDCBA
  13.153 +                  EDCBA
  13.154 +             QPONM
  13.155 +      LKJIHGF
  13.156 +      */
  13.157 +
  13.158 +    do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
  13.159 +    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
  13.160 +
  13.161 +    return i;
  13.162 +}
  13.163 +
  13.164 +static uint64_t tv_to_us(struct timeval *new)
  13.165 +{
  13.166 +    return (new->tv_sec * 1000000) + new->tv_usec;
  13.167 +}
  13.168 +
  13.169 +static uint64_t llgettimeofday(void)
  13.170 +{
  13.171 +    struct timeval now;
  13.172 +    gettimeofday(&now, NULL);
  13.173 +    return tv_to_us(&now);
  13.174 +}
  13.175 +
  13.176 +static uint64_t tv_delta(struct timeval *new, struct timeval *old)
  13.177 +{
  13.178 +    return (((new->tv_sec - old->tv_sec)*1000000) +
  13.179 +            (new->tv_usec - old->tv_usec));
  13.180 +}
  13.181 +
  13.182 +static int noncached_write(int fd, int live, void *buffer, int len) 
  13.183 +{
  13.184 +    static int write_count = 0;
  13.185 +
  13.186 +    int rc = write(fd,buffer,len);
  13.187 +
  13.188 +    write_count += len;
  13.189 +    if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
  13.190 +    {
  13.191 +        /* Time to discard cache - dont care if this fails */
  13.192 +        discard_file_cache(fd, 0 /* no flush */);
  13.193 +        write_count = 0;
  13.194 +    }
  13.195 +
  13.196 +    return rc;
  13.197 +}
  13.198 +
#ifdef ADAPTIVE_SAVE

/*
** We control the rate at which we transmit (or save) to minimize impact
** on running domains (including the target if we're doing live migrate).
*/

#define MAX_MBIT_RATE    500      /* maximum transmit rate for migrate */
#define START_MBIT_RATE  100      /* initial transmit rate for migrate */

/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
#define RATE_TO_BTU      781250

/* Amount in bytes we allow ourselves to send in a burst */
#define BURST_BUDGET (100*1024)

/* We keep track of the current and previous transmission rate */
static int mbit_rate, ombit_rate = 0;

/* Have we reached the maximum transmission rate? */
#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)

/* Reset the transmit rate to its starting value. */
static inline void initialize_mbit_rate()
{
    mbit_rate = START_MBIT_RATE;
}
  13.225 +
  13.226 +static int ratewrite(int io_fd, int live, void *buf, int n)
  13.227 +{
  13.228 +    static int budget = 0;
  13.229 +    static int burst_time_us = -1;
  13.230 +    static struct timeval last_put = { 0 };
  13.231 +    struct timeval now;
  13.232 +    struct timespec delay;
  13.233 +    long long delta;
  13.234 +
  13.235 +    if ( START_MBIT_RATE == 0 )
  13.236 +        return noncached_write(io_fd, live, buf, n);
  13.237 +
  13.238 +    budget -= n;
  13.239 +    if ( budget < 0 )
  13.240 +    {
  13.241 +        if ( mbit_rate != ombit_rate )
  13.242 +        {
  13.243 +            burst_time_us = RATE_TO_BTU / mbit_rate;
  13.244 +            ombit_rate = mbit_rate;
  13.245 +            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
  13.246 +                    mbit_rate, BURST_BUDGET, burst_time_us);
  13.247 +        }
  13.248 +        if ( last_put.tv_sec == 0 )
  13.249 +        {
  13.250 +            budget += BURST_BUDGET;
  13.251 +            gettimeofday(&last_put, NULL);
  13.252 +        }
  13.253 +        else
  13.254 +        {
  13.255 +            while ( budget < 0 )
  13.256 +            {
  13.257 +                gettimeofday(&now, NULL);
  13.258 +                delta = tv_delta(&now, &last_put);
  13.259 +                while ( delta > burst_time_us )
  13.260 +                {
  13.261 +                    budget += BURST_BUDGET;
  13.262 +                    last_put.tv_usec += burst_time_us;
  13.263 +                    if ( last_put.tv_usec > 1000000 
  13.264 +                    {
  13.265 +                        last_put.tv_usec -= 1000000;
  13.266 +                        last_put.tv_sec++;
  13.267 +                    }
  13.268 +                    delta -= burst_time_us;
  13.269 +                }
  13.270 +                if ( budget > 0 )
  13.271 +                    break;
  13.272 +                delay.tv_sec = 0;
  13.273 +                delay.tv_nsec = 1000 * (burst_time_us - delta);
  13.274 +                while ( delay.tv_nsec > 0 )
  13.275 +                    if ( nanosleep(&delay, &delay) == 0 )
  13.276 +                        break;
  13.277 +            }
  13.278 +        }
  13.279 +    }
  13.280 +    return noncached_write(io_fd, live, buf, n);
  13.281 +}
  13.282 +
#else /* ! ADAPTIVE SAVE */

/* Rate limiting compiled out: writes go straight through to
** noncached_write() and the rate is never reported as maxed out. */
#define RATE_IS_MAX() (0)
#define ratewrite(_io_fd, _live, _buf, _n) noncached_write((_io_fd), (_live), (_buf), (_n))
#define initialize_mbit_rate()

#endif
  13.290 +
  13.291 +static inline ssize_t write_exact(int fd, void *buf, size_t count)
  13.292 +{
  13.293 +    return (write(fd, buf, count) == count);
  13.294 +}
  13.295 +
/*
** Print one line of save/migrate progress statistics: wall-clock delta
** since the previous call, dom0 and target CPU usage, transmit rate and
** dirtying rate.  Keeps per-call state in statics, so successive calls
** report deltas.  Under ADAPTIVE_SAVE, also raises mbit_rate to track
** the observed dirtying rate (capped at MAX_MBIT_RATE).
** Always returns 0.
*/
static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
                       xc_shadow_op_stats_t *stats, int print)
{
    /* Values sampled on the previous call (deltas are computed from these). */
    static struct timeval wall_last;
    static long long      d0_cpu_last;
    static long long      d1_cpu_last;

    struct timeval        wall_now;
    long long             wall_delta;
    long long             d0_cpu_now, d0_cpu_delta;
    long long             d1_cpu_now, d1_cpu_delta;

    gettimeofday(&wall_now, NULL);

    /* Per-domain CPU usage; presumably ns from Xen, /1000 -> us.  TODO confirm. */
    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;

    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
        DPRINTF("ARRHHH!!\n");

    /* Wall-clock delta in ms; clamp to 1 to avoid division by zero below. */
    wall_delta = tv_delta(&wall_now,&wall_last)/1000;
    if ( wall_delta == 0 )
        wall_delta = 1;

    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;

    if ( print )
        DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
                "dirtied %dMb/s %" PRId32 " pages\n",
                wall_delta,
                (int)((d0_cpu_delta*100)/wall_delta),
                (int)((d1_cpu_delta*100)/wall_delta),
                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
                stats->dirty_count);

#ifdef ADAPTIVE_SAVE
    /* If the guest dirties pages faster than we transmit, raise the
       transmit rate (with 50 Mb/s headroom), capped at the maximum. */
    if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
    {
        mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
            + 50;
        if ( mbit_rate > MAX_MBIT_RATE )
            mbit_rate = MAX_MBIT_RATE;
    }
#endif

    /* Remember this sample for the next call's deltas. */
    d0_cpu_last = d0_cpu_now;
    d1_cpu_last = d1_cpu_now;
    wall_last   = wall_now;

    return 0;
}
  13.349 +
  13.350 +
/*
** Instrumentation pass: for 'runs' iterations, clear the dirty-page log
** and then sample the fault/dirty counters 40 times at 50ms intervals,
** printing each sample.  Lets us observe the domain's page-dirtying
** behaviour before the real save loop starts.
**
** NOTE(review): always returns -1; the caller ignores the result.
*/
static int analysis_phase(int xc_handle, uint32_t domid, int p2m_size,
                          unsigned long *arr, int runs)
{
    long long start, now;
    xc_shadow_op_stats_t stats;
    int j;

    start = llgettimeofday();

    for ( j = 0; j < runs; j++ )
    {
        int i;

        /* Reset the dirty log (bitmap read into 'arr') and counters. */
        xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
                          arr, p2m_size, NULL, 0, NULL);
        DPRINTF("#Flush\n");
        for ( i = 0; i < 40; i++ )
        {
            usleep(50000);
            now = llgettimeofday();
            /* Peek at the stats without clearing them. */
            xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
                              NULL, 0, NULL, 0, &stats);
            DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
                    ((now-start)+500)/1000,
                    stats.fault_count, stats.dirty_count);
        }
    }

    return -1;
}
  13.381 +
  13.382 +
/*
** Ask the domain to suspend (via the supplied 'suspend' callback) and
** poll its state until it reports SHUTDOWN_suspend.  A paused domain is
** unpaused and re-tested; otherwise we retry up to 100 times, sleeping
** 10ms between attempts.
**
** Returns 0 once the domain has suspended, -1 on any failure.
*/
static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
                             int dom, xc_dominfo_t *info,
                             vcpu_guest_context_t *ctxt)
{
    int i = 0;  /* number of retries so far */

    if ( !(*suspend)(dom) )
    {
        ERROR("Suspend request failed");
        return -1;
    }

 retry:

    if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
    {
        ERROR("Could not get domain info");
        return -1;
    }

    /* NOTE(review): a getcontext failure is only logged, not fatal. */
    if ( xc_vcpu_getcontext(xc_handle, dom, 0, ctxt) )
        ERROR("Could not get vcpu context");


    if ( info->dying )
    {
        ERROR("domain is dying");
        return -1;
    }

    if ( info->crashed )
    {
        ERROR("domain has crashed");
        return -1;
    }

    if ( info->shutdown )
    {
        switch ( info->shutdown_reason )
        {
        case SHUTDOWN_poweroff:
        case SHUTDOWN_reboot:
            ERROR("domain has shut down");
            return -1;
        case SHUTDOWN_suspend:
            /* This is the state we are waiting for. */
            return 0;
        case SHUTDOWN_crash:
            ERROR("domain has crashed");
            return -1;
        }
    }

    if ( info->paused )
    {
        /* Try unpausing domain, wait, and retest. */
        xc_domain_unpause( xc_handle, dom );
        ERROR("Domain was paused. Wait and re-test.");
        usleep(10000); /* 10ms */
        goto retry;
    }

    /* Domain still running normally: give it more time to suspend. */
    if ( ++i < 100 )
    {
        ERROR("Retry suspend domain");
        usleep(10000); /* 10ms */
        goto retry;
    }

    ERROR("Unable to suspend domain.");

    return -1;
}
  13.455 +
  13.456 +/*
  13.457 +** Map the top-level page of MFNs from the guest. The guest might not have
  13.458 +** finished resuming from a previous restore operation, so we wait a while for
  13.459 +** it to update the MFN to a reasonable value.
  13.460 +*/
  13.461 +static void *map_frame_list_list(int xc_handle, uint32_t dom,
  13.462 +                                 shared_info_t *shinfo)
  13.463 +{
  13.464 +    int count = 100;
  13.465 +    void *p;
  13.466 +
  13.467 +    while ( count-- && (shinfo->arch.pfn_to_mfn_frame_list_list == 0) )
  13.468 +        usleep(10000);
  13.469 +
  13.470 +    if ( shinfo->arch.pfn_to_mfn_frame_list_list == 0 )
  13.471 +    {
  13.472 +        ERROR("Timed out waiting for frame list updated.");
  13.473 +        return NULL;
  13.474 +    }
  13.475 +
  13.476 +    p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
  13.477 +                             shinfo->arch.pfn_to_mfn_frame_list_list);
  13.478 +    if ( p == NULL )
  13.479 +        ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);
  13.480 +
  13.481 +    return p;
  13.482 +}
  13.483 +
  13.484 +/*
  13.485 +** During transfer (or in the state file), all page-table pages must be
  13.486 +** converted into a 'canonical' form where references to actual mfns
  13.487 +** are replaced with references to the corresponding pfns.
  13.488 +**
  13.489 +** This function performs the appropriate conversion, taking into account
  13.490 +** which entries do not require canonicalization (in particular, those
  13.491 +** entries which map the virtual address reserved for the hypervisor).
  13.492 +*/
/*
** Rewrite one page-table page from 'spage' into 'dpage', replacing each
** present PTE's MFN with the corresponding PFN, and zeroing the entries
** that map the reserved Xen virtual-address hole.
**
** Returns non-zero if a stale-type race was detected (an MFN outside the
** pseudophys map was zapped); the caller treats that as fatal unless the
** save is live.
*/
static int canonicalize_pagetable(unsigned long type, unsigned long pfn,
                           const void *spage, void *dpage)
{

    int i, pte_last, xen_start, xen_end, race = 0; 
    uint64_t pte;

    /*
    ** We need to determine which entries in this page table hold
    ** reserved hypervisor mappings. This depends on the current
    ** page table type as well as the number of paging levels.
    */
    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);

    /* 32-bit non-PAE: Xen mappings occupy the top of the L2. */
    if ( (pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);

    /* PAE: L3 slots at or above the PAE boundary are reserved. */
    if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
        xen_start = L3_PAGETABLE_ENTRIES_PAE;

    /*
    ** in PAE only the L2 mapping the top 1GB contains Xen mappings.
    ** We can spot this by looking for the guest linear mapping which
    ** Xen always ensures is present in that L2. Guests must ensure
    ** that this check will fail for other L2s.
    */
    if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
    {
        int hstart;
        uint64_t he;

        hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
        he = ((const uint64_t *) spage)[hstart];

        /* m2p_mfn0 is the first MFN of the M2P table: its presence in
           this slot identifies the linear (M2P) mapping Xen installs. */
        if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
        {
            /* hvirt starts with xen stuff... */
            xen_start = hstart;
        }
        else if ( hvirt_start != 0xf5800000 )
        {
            /* old L2s from before hole was shrunk... */
            hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
            he = ((const uint64_t *) spage)[hstart];
            if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
                xen_start = hstart;
        }
    }

    if ( (pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
    {
        /*
        ** XXX SMH: should compute these from hvirt_start (which we have)
        ** and hvirt_end (which we don't)
        */
        xen_start = 256;
        xen_end   = 272;
    }

    /* Now iterate through the page table, canonicalizing each PTE */
    for (i = 0; i < pte_last; i++ )
    {
        /* NOTE(review): this 'pfn' shadows the function parameter. */
        unsigned long pfn, mfn;

        if ( pt_levels == 2 )
            pte = ((const uint32_t*)spage)[i];
        else
            pte = ((const uint64_t*)spage)[i];

        /* Entries in the Xen hole are not the guest's to keep. */
        if ( (i >= xen_start) && (i < xen_end) )
            pte = 0;

        if ( pte & _PAGE_PRESENT )
        {
            mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
            if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
            {
                /* This will happen if the type info is stale which
                   is quite feasible under live migration */
                pfn  = 0;  /* zap it - we'll retransmit this page later */
                race = 1;  /* inform the caller of race; fatal if !live */ 
            }
            else
                pfn = mfn_to_pfn(mfn);

            /* Swap the machine frame number for the pseudophys one. */
            pte &= ~MADDR_MASK_X86;
            pte |= (uint64_t)pfn << PAGE_SHIFT;

            /*
             * PAE guest L3Es can contain these flags when running on
             * a 64bit hypervisor. We zap these here to avoid any
             * surprise at restore time...
             */
            if ( (pt_levels == 3) &&
                 (type == XEN_DOMCTL_PFINFO_L3TAB) &&
                 (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) )
                pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
        }

        if ( pt_levels == 2 )
            ((uint32_t*)dpage)[i] = pte;
        else
            ((uint64_t*)dpage)[i] = pte;
    }

    return race;
}
  13.600 +
  13.601 +static xen_pfn_t *xc_map_m2p(int xc_handle,
  13.602 +                                 unsigned long max_mfn,
  13.603 +                                 int prot)
  13.604 +{
  13.605 +    struct xen_machphys_mfn_list xmml;
  13.606 +    privcmd_mmap_entry_t *entries;
  13.607 +    unsigned long m2p_chunks, m2p_size;
  13.608 +    xen_pfn_t *m2p;
  13.609 +    xen_pfn_t *extent_start;
  13.610 +    int i, rc;
  13.611 +
  13.612 +    m2p_size   = M2P_SIZE(max_mfn);
  13.613 +    m2p_chunks = M2P_CHUNKS(max_mfn);
  13.614 +
  13.615 +    xmml.max_extents = m2p_chunks;
  13.616 +    if ( !(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t))) )
  13.617 +    {
  13.618 +        ERROR("failed to allocate space for m2p mfns");
  13.619 +        return NULL;
  13.620 +    }
  13.621 +    set_xen_guest_handle(xmml.extent_start, extent_start);
  13.622 +
  13.623 +    if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
  13.624 +         (xmml.nr_extents != m2p_chunks) )
  13.625 +    {
  13.626 +        ERROR("xc_get_m2p_mfns");
  13.627 +        return NULL;
  13.628 +    }
  13.629 +
  13.630 +    if ( (m2p = mmap(NULL, m2p_size, prot,
  13.631 +                     MAP_SHARED, xc_handle, 0)) == MAP_FAILED )
  13.632 +    {
  13.633 +        ERROR("failed to mmap m2p");
  13.634 +        return NULL;
  13.635 +    }
  13.636 +
  13.637 +    if ( !(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t))) )
  13.638 +    {
  13.639 +        ERROR("failed to allocate space for mmap entries");
  13.640 +        return NULL;
  13.641 +    }
  13.642 +
  13.643 +    for ( i = 0; i < m2p_chunks; i++ )
  13.644 +    {
  13.645 +        entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
  13.646 +        entries[i].mfn = extent_start[i];
  13.647 +        entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
  13.648 +    }
  13.649 +
  13.650 +    if ( (rc = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
  13.651 +                                     entries, m2p_chunks)) < 0 )
  13.652 +    {
  13.653 +        ERROR("xc_mmap_foreign_ranges failed (rc = %d)", rc);
  13.654 +        return NULL;
  13.655 +    }
  13.656 +
  13.657 +    m2p_mfn0 = entries[0].mfn;
  13.658 +
  13.659 +    free(extent_start);
  13.660 +    free(entries);
  13.661 +
  13.662 +    return m2p;
  13.663 +}
  13.664 +
  13.665 +
/*
** Map the guest's live P2M table and write its canonicalized (mfn->pfn)
** frame-number list to the save stream, preceded -- for PAE guests that
** understand extended CR3 -- by an extended-info chunk containing the
** vcpu context.
**
** Returns the live P2M mapping on success (caller unmaps), NULL on
** failure.
*/
static xen_pfn_t *map_and_save_p2m_table(int xc_handle, 
                                         int io_fd, 
                                         uint32_t dom,
                                         vcpu_guest_context_t *ctxt,
                                         unsigned long p2m_size,
                                         shared_info_t *live_shinfo)
{
    /* Double and single indirect references to the live P2M table */
    xen_pfn_t *live_p2m_frame_list_list = NULL;
    xen_pfn_t *live_p2m_frame_list = NULL;

    /* A copy of the pfn-to-mfn table frame list. */
    xen_pfn_t *p2m_frame_list = NULL;

    /* The mapping of the live p2m table itself */
    xen_pfn_t *p2m = NULL;

    int i, success = 0;

    live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom,
                                                   live_shinfo);
    if ( !live_p2m_frame_list_list )
        goto out;

    live_p2m_frame_list =
        xc_map_foreign_batch(xc_handle, dom, PROT_READ,
                             live_p2m_frame_list_list,
                             P2M_FLL_ENTRIES);
    if ( !live_p2m_frame_list )
    {
        ERROR("Couldn't map p2m_frame_list");
        goto out;
    }


    /* Map all the frames of the pfn->mfn table. For migrate to succeed,
       the guest must not change which frames are used for this purpose.
       (its not clear why it would want to change them, and we'll be OK
       from a safety POV anyhow. */

    p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
                               live_p2m_frame_list,
                               P2M_FL_ENTRIES);
    if ( !p2m )
    {
        ERROR("Couldn't map p2m table");
        goto out;
    }
    live_p2m = p2m; /* So that translation macros will work */
    
    /* Get a local copy of the live_P2M_frame_list */
    if ( !(p2m_frame_list = malloc(P2M_FL_SIZE)) )
    {
        ERROR("Couldn't allocate p2m_frame_list array");
        goto out;
    }
    memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);

    /* Canonicalise the pfn-to-mfn table frame-number list. */
    for ( i = 0; i < p2m_size; i += fpp )
    {
        if ( !translate_mfn_to_pfn(&p2m_frame_list[i/fpp]) )
        {
            ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
            /* NOTE(review): "%ld" is given the int expression i/fpp --
               should probably be "%d"; confirm before trusting output. */
            ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64, i, i/fpp,
                  (uint64_t)p2m_frame_list[i/fpp]);
            goto out;
        }
    }

    /*
     * Write an extended-info structure to inform the restore code that
     * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
     * slow paths in the restore code.
     */
    if ( (pt_levels == 3) &&
         (ctxt->vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3)) )
    {
        /* Signature ~0UL cannot be a valid p2m frame number, so the
           restore side can distinguish this chunk from frame data. */
        unsigned long signature = ~0UL;
        uint32_t tot_sz   = sizeof(struct vcpu_guest_context) + 8;
        uint32_t chunk_sz = sizeof(struct vcpu_guest_context);
        char chunk_sig[]  = "vcpu";
        if ( !write_exact(io_fd, &signature, sizeof(signature)) ||
             !write_exact(io_fd, &tot_sz,    sizeof(tot_sz)) ||
             !write_exact(io_fd, &chunk_sig, 4) ||
             !write_exact(io_fd, &chunk_sz,  sizeof(chunk_sz)) ||
             !write_exact(io_fd, ctxt,       sizeof(*ctxt)) )
        {
            ERROR("write: extended info");
            goto out;
        }
    }

    if ( !write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE) )
    {
        ERROR("write: p2m_frame_list");
        goto out;
    }    

    success = 1;

 out:
    
    /* On success the p2m mapping is returned to the caller; only unmap
       it here if something failed.
       NOTE(review): ROUNDUP's second argument is PAGE_SHIFT (a shift
       count) -- confirm the macro's semantics. */
    if ( !success && p2m )
        munmap(p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));

    if ( live_p2m_frame_list_list )
        munmap(live_p2m_frame_list_list, PAGE_SIZE);

    if ( live_p2m_frame_list )
        munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);

    if ( p2m_frame_list ) 
        free(p2m_frame_list);

    return success ? p2m : NULL;
}
  13.783 +
  13.784 +
  13.785 +
  13.786 +int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
  13.787 +                   uint32_t max_factor, uint32_t flags, int (*suspend)(int),
  13.788 +                   int hvm, void *(*init_qemu_maps)(int, unsigned), 
  13.789 +                   void (*qemu_flip_buffer)(int, int))
  13.790 +{
  13.791 +    xc_dominfo_t info;
  13.792 +
  13.793 +    int rc = 1, i, j, last_iter, iter = 0;
  13.794 +    int live  = (flags & XCFLAGS_LIVE);
  13.795 +    int debug = (flags & XCFLAGS_DEBUG);
  13.796 +    int race = 0, sent_last_iter, skip_this_iter;
  13.797 +
  13.798 +    /* The new domain's shared-info frame number. */
  13.799 +    unsigned long shared_info_frame;
  13.800 +
  13.801 +    /* A copy of the CPU context of the guest. */
  13.802 +    vcpu_guest_context_t ctxt;
  13.803 +
  13.804 +    /* A table containing the type of each PFN (/not/ MFN!). */
  13.805 +    unsigned long *pfn_type = NULL;
  13.806 +    unsigned long *pfn_batch = NULL;
  13.807 +
  13.808 +    /* A copy of one frame of guest memory. */
  13.809 +    char page[PAGE_SIZE];
  13.810 +
  13.811 +    /* Live mapping of shared info structure */
  13.812 +    shared_info_t *live_shinfo = NULL;
  13.813 +
  13.814 +    /* base of the region in which domain memory is mapped */
  13.815 +    unsigned char *region_base = NULL;
  13.816 +
  13.817 +    /* power of 2 order of p2m_size */
  13.818 +    int order_nr;
  13.819 +
  13.820 +    /* bitmap of pages:
  13.821 +       - that should be sent this iteration (unless later marked as skip);
  13.822 +       - to skip this iteration because already dirty;
  13.823 +       - to fixup by sending at the end if not already resent; */
  13.824 +    unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
  13.825 +
  13.826 +    xc_shadow_op_stats_t stats;
  13.827 +
  13.828 +    unsigned long needed_to_fix = 0;
  13.829 +    unsigned long total_sent    = 0;
  13.830 +
  13.831 +    uint64_t vcpumap = 1ULL;
  13.832 +
  13.833 +    /* HVM: a buffer for holding HVM context */
  13.834 +    uint32_t hvm_buf_size = 0;
  13.835 +    uint8_t *hvm_buf = NULL;
  13.836 +
  13.837 +    /* HVM: magic frames for ioreqs and xenstore comms. */
  13.838 +    uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
  13.839 +
  13.840 +    /* If no explicit control parameters given, use defaults */
  13.841 +    max_iters  = max_iters  ? : DEF_MAX_ITERS;
  13.842 +    max_factor = max_factor ? : DEF_MAX_FACTOR;
  13.843 +
  13.844 +    initialize_mbit_rate();
  13.845 +
  13.846 +    if ( !get_platform_info(xc_handle, dom,
  13.847 +                            &max_mfn, &hvirt_start, &pt_levels) )
  13.848 +    {
  13.849 +        ERROR("Unable to get platform info.");
  13.850 +        return 1;
  13.851 +    }
  13.852 +
  13.853 +    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
  13.854 +    {
  13.855 +        ERROR("Could not get domain info");
  13.856 +        return 1;
  13.857 +    }
  13.858 +
  13.859 +    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
  13.860 +    {
  13.861 +        ERROR("Could not get vcpu context");
  13.862 +        goto out;
  13.863 +    }
  13.864 +    shared_info_frame = info.shared_info_frame;
  13.865 +
  13.866 +    /* Map the shared info frame */
  13.867 +    if ( !hvm )
  13.868 +    {
  13.869 +        live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  13.870 +                                           PROT_READ, shared_info_frame);
  13.871 +        if ( !live_shinfo )
  13.872 +        {
  13.873 +            ERROR("Couldn't map live_shinfo");
  13.874 +            goto out;
  13.875 +        }
  13.876 +    }
  13.877 +
  13.878 +    /* Get the size of the P2M table */
  13.879 +    p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom);
  13.880 +
  13.881 +    /* Domain is still running at this point */
  13.882 +    if ( live )
  13.883 +    {
  13.884 +        /* Live suspend. Enable log-dirty mode. */
  13.885 +        if ( xc_shadow_control(xc_handle, dom,
  13.886 +                               XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
  13.887 +                               NULL, 0, NULL, 0, NULL) < 0 )
  13.888 +        {
  13.889 +            ERROR("Couldn't enable shadow mode");
  13.890 +            goto out;
  13.891 +        }
  13.892 +
  13.893 +        if ( hvm )
  13.894 +        {
  13.895 +            /* Get qemu-dm logging dirty pages too */
  13.896 +            void *seg = init_qemu_maps(dom, BITMAP_SIZE);
  13.897 +            qemu_bitmaps[0] = seg;
  13.898 +            qemu_bitmaps[1] = seg + BITMAP_SIZE;
  13.899 +            qemu_active = 0;
  13.900 +            qemu_non_active = 1;
  13.901 +        }
  13.902 +    }
  13.903 +    else
  13.904 +    {
  13.905 +        /* This is a non-live suspend. Suspend the domain .*/
  13.906 +        if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt) )
  13.907 +        {
  13.908 +            ERROR("Domain appears not to have suspended");
  13.909 +            goto out;
  13.910 +        }
  13.911 +    }
  13.912 +
  13.913 +    last_iter = !live;
  13.914 +
  13.915 +    /* pretend we sent all the pages last iteration */
  13.916 +    sent_last_iter = p2m_size;
  13.917 +
  13.918 +    /* calculate the power of 2 order of p2m_size, e.g.
  13.919 +       15->4 16->4 17->5 */
  13.920 +    for ( i = p2m_size-1, order_nr = 0; i ; i >>= 1, order_nr++ )
  13.921 +        continue;
  13.922 +
  13.923 +    /* Setup to_send / to_fix and to_skip bitmaps */
  13.924 +    to_send = malloc(BITMAP_SIZE);
  13.925 +    to_fix  = calloc(1, BITMAP_SIZE);
  13.926 +    to_skip = malloc(BITMAP_SIZE);
  13.927 +
  13.928 +    if ( !to_send || !to_fix || !to_skip )
  13.929 +    {
  13.930 +        ERROR("Couldn't allocate to_send array");
  13.931 +        goto out;
  13.932 +    }
  13.933 +
  13.934 +    memset(to_send, 0xff, BITMAP_SIZE);
  13.935 +
  13.936 +    if ( lock_pages(to_send, BITMAP_SIZE) )
  13.937 +    {
  13.938 +        ERROR("Unable to lock to_send");
  13.939 +        return 1;
  13.940 +    }
  13.941 +
  13.942 +    /* (to fix is local only) */
  13.943 +    if ( lock_pages(to_skip, BITMAP_SIZE) )
  13.944 +    {
  13.945 +        ERROR("Unable to lock to_skip");
  13.946 +        return 1;
  13.947 +    }
  13.948 +
  13.949 +    if ( hvm ) 
  13.950 +    {
  13.951 +        /* Need another buffer for HVM context */
  13.952 +        hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0);
  13.953 +        if ( hvm_buf_size == -1 )
  13.954 +        {
  13.955 +            ERROR("Couldn't get HVM context size from Xen");
  13.956 +            goto out;
  13.957 +        }
  13.958 +        hvm_buf = malloc(hvm_buf_size);
  13.959 +        if ( !hvm_buf )
  13.960 +        {
  13.961 +            ERROR("Couldn't allocate memory");
  13.962 +            goto out;
  13.963 +        }
  13.964 +    }
  13.965 +
  13.966 +    analysis_phase(xc_handle, dom, p2m_size, to_skip, 0);
  13.967 +
  13.968 +    /* We want zeroed memory so use calloc rather than malloc. */
  13.969 +    pfn_type   = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type));
  13.970 +    pfn_batch  = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
  13.971 +    if ( (pfn_type == NULL) || (pfn_batch == NULL) )
  13.972 +    {
  13.973 +        ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
  13.974 +        errno = ENOMEM;
  13.975 +        goto out;
  13.976 +    }
  13.977 +
  13.978 +    if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
  13.979 +    {
  13.980 +        ERROR("Unable to lock");
  13.981 +        goto out;
  13.982 +    }
  13.983 +
  13.984 +    /* Setup the mfn_to_pfn table mapping */
  13.985 +    if ( !(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ)) )
  13.986 +    {
  13.987 +        ERROR("Failed to map live M2P table");
  13.988 +        goto out;
  13.989 +    }
  13.990 +
  13.991 +    /* Start writing out the saved-domain record. */
  13.992 +    if ( !write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
  13.993 +    {
  13.994 +        ERROR("write: p2m_size");
  13.995 +        goto out;
  13.996 +    }
  13.997 +
  13.998 +    if ( !hvm )
  13.999 +    {
 13.1000 +        int err = 0;
 13.1001 +        unsigned long mfn;
 13.1002 +
 13.1003 +        /* Map the P2M table, and write the list of P2M frames */
 13.1004 +        live_p2m = map_and_save_p2m_table(xc_handle, io_fd, dom, 
 13.1005 +                                          &ctxt, p2m_size, live_shinfo);
 13.1006 +        if ( live_p2m == NULL )
 13.1007 +        {
 13.1008 +            ERROR("Failed to map/save the p2m frame list");
 13.1009 +            goto out;
 13.1010 +        }
 13.1011 +
 13.1012 +        /*
 13.1013 +         * Quick belt and braces sanity check.
 13.1014 +         */
 13.1015 +        
 13.1016 +        for ( i = 0; i < p2m_size; i++ )
 13.1017 +        {
 13.1018 +            mfn = live_p2m[i];
 13.1019 +            if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) )
 13.1020 +            {
 13.1021 +                DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
 13.1022 +                        mfn, mfn_to_pfn(mfn));
 13.1023 +                err++;
 13.1024 +            }
 13.1025 +        }
 13.1026 +        DPRINTF("Had %d unexplained entries in p2m table\n", err);
 13.1027 +    }
 13.1028 +
 13.1029 +    print_stats(xc_handle, dom, 0, &stats, 0);
 13.1030 +
 13.1031 +    /* Now write out each data page, canonicalising page tables as we go... */
 13.1032 +    for ( ; ; )
 13.1033 +    {
 13.1034 +        unsigned int prev_pc, sent_this_iter, N, batch;
 13.1035 +
 13.1036 +        iter++;
 13.1037 +        sent_this_iter = 0;
 13.1038 +        skip_this_iter = 0;
 13.1039 +        prev_pc = 0;
 13.1040 +        N = 0;
 13.1041 +
 13.1042 +        DPRINTF("Saving memory pages: iter %d   0%%", iter);
 13.1043 +
 13.1044 +        while ( N < p2m_size )
 13.1045 +        {
 13.1046 +            unsigned int this_pc = (N * 100) / p2m_size;
 13.1047 +            int rc;
 13.1048 +
 13.1049 +            if ( (this_pc - prev_pc) >= 5 )
 13.1050 +            {
 13.1051 +                DPRINTF("\b\b\b\b%3d%%", this_pc);
 13.1052 +                prev_pc = this_pc;
 13.1053 +            }
 13.1054 +
 13.1055 +            if ( !last_iter )
 13.1056 +            {
 13.1057 +                /* Slightly wasteful to peek the whole array evey time,
 13.1058 +                   but this is fast enough for the moment. */
 13.1059 +                rc = xc_shadow_control(
 13.1060 +                    xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip, 
 13.1061 +                    p2m_size, NULL, 0, NULL);
 13.1062 +                if ( rc != p2m_size )
 13.1063 +                {
 13.1064 +                    ERROR("Error peeking shadow bitmap");
 13.1065 +                    goto out;
 13.1066 +                }
 13.1067 +            }
 13.1068 +
 13.1069 +            /* load pfn_type[] with the mfn of all the pages we're doing in
 13.1070 +               this batch. */
 13.1071 +            for  ( batch = 0;
 13.1072 +                   (batch < MAX_BATCH_SIZE) && (N < p2m_size);
 13.1073 +                   N++ )
 13.1074 +            {
 13.1075 +                int n = permute(N, p2m_size, order_nr);
 13.1076 +
 13.1077 +                if ( debug )
 13.1078 +                    DPRINTF("%d pfn= %08lx mfn= %08lx %d  [mfn]= %08lx\n",
 13.1079 +                            iter, (unsigned long)n, hvm ? 0 : live_p2m[n],
 13.1080 +                            test_bit(n, to_send),
 13.1081 +                            hvm ? 0 : mfn_to_pfn(live_p2m[n]&0xFFFFF));
 13.1082 +
 13.1083 +                if ( !last_iter &&
 13.1084 +                     test_bit(n, to_send) &&
 13.1085 +                     test_bit(n, to_skip) )
 13.1086 +                    skip_this_iter++; /* stats keeping */
 13.1087 +
 13.1088 +                if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
 13.1089 +                       (test_bit(n, to_send) && last_iter) ||
 13.1090 +                       (test_bit(n, to_fix)  && last_iter)) )
 13.1091 +                    continue;
 13.1092 +
 13.1093 +                /* Skip PFNs that aren't really there */
 13.1094 +                if ( hvm && ((n >= 0xa0 && n < 0xc0) /* VGA hole */
 13.1095 +                             || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT) 
 13.1096 +                                 && n < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */ )
 13.1097 +                    continue;
 13.1098 +
 13.1099 +                /*
 13.1100 +                ** we get here if:
 13.1101 +                **  1. page is marked to_send & hasn't already been re-dirtied
 13.1102 +                **  2. (ignore to_skip in last iteration)
 13.1103 +                **  3. add in pages that still need fixup (net bufs)
 13.1104 +                */
 13.1105 +
 13.1106 +                pfn_batch[batch] = n;
 13.1107 +
 13.1108 +                /* Hypercall interfaces operate in PFNs for HVM guests
 13.1109 +                * and MFNs for PV guests */
 13.1110 +                if ( hvm ) 
 13.1111 +                    pfn_type[batch] = n;
 13.1112 +                else
 13.1113 +                    pfn_type[batch] = live_p2m[n];
 13.1114 +                    
 13.1115 +                if ( !is_mapped(pfn_type[batch]) )
 13.1116 +                {
 13.1117 +                    /*
 13.1118 +                    ** not currently in psuedo-physical map -- set bit
 13.1119 +                    ** in to_fix since we must send this page in last_iter
 13.1120 +                    ** unless its sent sooner anyhow, or it never enters
 13.1121 +                    ** pseudo-physical map (e.g. for ballooned down doms)
 13.1122 +                    */
 13.1123 +                    set_bit(n, to_fix);
 13.1124 +                    continue;
 13.1125 +                }
 13.1126 +
 13.1127 +                if ( last_iter &&
 13.1128 +                     test_bit(n, to_fix) &&
 13.1129 +                     !test_bit(n, to_send) )
 13.1130 +                {
 13.1131 +                    needed_to_fix++;
 13.1132 +                    DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
 13.1133 +                            iter, n, pfn_type[batch]);
 13.1134 +                }
 13.1135 +                
 13.1136 +                clear_bit(n, to_fix);
 13.1137 +                
 13.1138 +                batch++;
 13.1139 +            }
 13.1140 +
 13.1141 +            if ( batch == 0 )
 13.1142 +                goto skip; /* vanishingly unlikely... */
 13.1143 +
 13.1144 +            region_base = xc_map_foreign_batch(
 13.1145 +                xc_handle, dom, PROT_READ, pfn_type, batch);
 13.1146 +            if ( region_base == NULL )
 13.1147 +            {
 13.1148 +                ERROR("map batch failed");
 13.1149 +                goto out;
 13.1150 +            }
 13.1151 +
 13.1152 +            if ( !hvm )
 13.1153 +            {
 13.1154 +                /* Get page types */
 13.1155 +                for ( j = 0; j < batch; j++ )
 13.1156 +                    ((uint32_t *)pfn_type)[j] = pfn_type[j];
 13.1157 +                if ( xc_get_pfn_type_batch(xc_handle, dom, batch,
 13.1158 +                                           (uint32_t *)pfn_type) )
 13.1159 +                {
 13.1160 +                    ERROR("get_pfn_type_batch failed");
 13.1161 +                    goto out;
 13.1162 +                }
 13.1163 +                for ( j = batch-1; j >= 0; j-- )
 13.1164 +                    pfn_type[j] = ((uint32_t *)pfn_type)[j];
 13.1165 +
 13.1166 +                for ( j = 0; j < batch; j++ )
 13.1167 +                {
 13.1168 +                    
 13.1169 +                    if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
 13.1170 +                         XEN_DOMCTL_PFINFO_XTAB )
 13.1171 +                    {
 13.1172 +                        DPRINTF("type fail: page %i mfn %08lx\n", 
 13.1173 +                                j, pfn_type[j]);
 13.1174 +                        continue;
 13.1175 +                    }
 13.1176 +                    
 13.1177 +                    if ( debug )
 13.1178 +                        DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
 13.1179 +                                " sum= %08lx\n",
 13.1180 +                                iter,
 13.1181 +                                (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
 13.1182 +                                pfn_batch[j],
 13.1183 +                                pfn_type[j],
 13.1184 +                                mfn_to_pfn(pfn_type[j] &
 13.1185 +                                           ~XEN_DOMCTL_PFINFO_LTAB_MASK),
 13.1186 +                                csum_page(region_base + (PAGE_SIZE*j)));
 13.1187 +                    
 13.1188 +                    /* canonicalise mfn->pfn */
 13.1189 +                    pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
 13.1190 +                        pfn_batch[j];
 13.1191 +                }
 13.1192 +            }
 13.1193 +
 13.1194 +            if ( !write_exact(io_fd, &batch, sizeof(unsigned int)) )
 13.1195 +            {
 13.1196 +                ERROR("Error when writing to state file (2) (errno %d)",
 13.1197 +                      errno);
 13.1198 +                goto out;
 13.1199 +            }
 13.1200 +
 13.1201 +            if ( !write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) )
 13.1202 +            {
 13.1203 +                ERROR("Error when writing to state file (3) (errno %d)",
 13.1204 +                      errno);
 13.1205 +                goto out;
 13.1206 +            }
 13.1207 +
 13.1208 +            /* entering this loop, pfn_type is now in pfns (Not mfns) */
 13.1209 +            for ( j = 0; j < batch; j++ )
 13.1210 +            {
 13.1211 +                unsigned long pfn, pagetype;
 13.1212 +                void *spage = (char *)region_base + (PAGE_SIZE*j);
 13.1213 +
 13.1214 +                pfn      = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
 13.1215 +                pagetype = pfn_type[j] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
 13.1216 +
 13.1217 +                /* write out pages in batch */
 13.1218 +                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
 13.1219 +                    continue;
 13.1220 +
 13.1221 +                pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
 13.1222 +
 13.1223 +                if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
 13.1224 +                     (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
 13.1225 +                {
 13.1226 +                    /* We have a pagetable page: need to rewrite it. */
 13.1227 +                    race = 
 13.1228 +                        canonicalize_pagetable(pagetype, pfn, spage, page); 
 13.1229 +
 13.1230 +                    if ( race && !live )
 13.1231 +                    {
 13.1232 +                        ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn,
 13.1233 +                              pagetype);
 13.1234 +                        goto out;
 13.1235 +                    }
 13.1236 +
 13.1237 +                    if ( ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE )
 13.1238 +                    {
 13.1239 +                        ERROR("Error when writing to state file (4)"
 13.1240 +                              " (errno %d)", errno);
 13.1241 +                        goto out;
 13.1242 +                    }
 13.1243 +                }
 13.1244 +                else
 13.1245 +                {
 13.1246 +                    /* We have a normal page: just write it directly. */
 13.1247 +                    if ( ratewrite(io_fd, live, spage, PAGE_SIZE) !=
 13.1248 +                         PAGE_SIZE )
 13.1249 +                    {
 13.1250 +                        ERROR("Error when writing to state file (5)"
 13.1251 +                              " (errno %d)", errno);
 13.1252 +                        goto out;
 13.1253 +                    }
 13.1254 +                }
 13.1255 +            } /* end of the write out for this batch */
 13.1256 +
 13.1257 +            sent_this_iter += batch;
 13.1258 +
 13.1259 +            munmap(region_base, batch*PAGE_SIZE);
 13.1260 +
 13.1261 +        } /* end of this while loop for this iteration */
 13.1262 +
 13.1263 +      skip:
 13.1264 +
 13.1265 +        total_sent += sent_this_iter;
 13.1266 +
 13.1267 +        DPRINTF("\r %d: sent %d, skipped %d, ",
 13.1268 +                iter, sent_this_iter, skip_this_iter );
 13.1269 +
 13.1270 +        if ( last_iter )
 13.1271 +        {
 13.1272 +            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
 13.1273 +
 13.1274 +            DPRINTF("Total pages sent= %ld (%.2fx)\n",
 13.1275 +                    total_sent, ((float)total_sent)/p2m_size );
 13.1276 +            DPRINTF("(of which %ld were fixups)\n", needed_to_fix  );
 13.1277 +        }
 13.1278 +
 13.1279 +        if ( last_iter && debug )
 13.1280 +        {
 13.1281 +            int minusone = -1;
 13.1282 +            memset(to_send, 0xff, BITMAP_SIZE);
 13.1283 +            debug = 0;
 13.1284 +            DPRINTF("Entering debug resend-all mode\n");
 13.1285 +
 13.1286 +            /* send "-1" to put receiver into debug mode */
 13.1287 +            if ( !write_exact(io_fd, &minusone, sizeof(int)) )
 13.1288 +            {
 13.1289 +                ERROR("Error when writing to state file (6) (errno %d)",
 13.1290 +                      errno);
 13.1291 +                goto out;
 13.1292 +            }
 13.1293 +
 13.1294 +            continue;
 13.1295 +        }
 13.1296 +
 13.1297 +        if ( last_iter )
 13.1298 +            break;
 13.1299 +
 13.1300 +        if ( live )
 13.1301 +        {
 13.1302 +            if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
 13.1303 +                 (iter >= max_iters) ||
 13.1304 +                 (sent_this_iter+skip_this_iter < 50) ||
 13.1305 +                 (total_sent > p2m_size*max_factor) )
 13.1306 +            {
 13.1307 +                DPRINTF("Start last iteration\n");
 13.1308 +                last_iter = 1;
 13.1309 +
 13.1310 +                if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
 13.1311 +                                       &ctxt) )
 13.1312 +                {
 13.1313 +                    ERROR("Domain appears not to have suspended");
 13.1314 +                    goto out;
 13.1315 +                }
 13.1316 +
 13.1317 +                DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
 13.1318 +                        info.shared_info_frame,
 13.1319 +                        (unsigned long)ctxt.user_regs.eip,
 13.1320 +                        (unsigned long)ctxt.user_regs.edx);
 13.1321 +            }
 13.1322 +
 13.1323 +            if ( xc_shadow_control(xc_handle, dom, 
 13.1324 +                                   XEN_DOMCTL_SHADOW_OP_CLEAN, to_send, 
 13.1325 +                                   p2m_size, NULL, 0, &stats) != p2m_size )
 13.1326 +            {
 13.1327 +                ERROR("Error flushing shadow PT");
 13.1328 +                goto out;
 13.1329 +            }
 13.1330 +
 13.1331 +            if ( hvm ) 
 13.1332 +            {
 13.1333 +                /* Pull in the dirty bits from qemu-dm too */
 13.1334 +                if ( !last_iter )
 13.1335 +                {
 13.1336 +                    qemu_active = qemu_non_active;
 13.1337 +                    qemu_non_active = qemu_active ? 0 : 1;
 13.1338 +                    qemu_flip_buffer(dom, qemu_active);
 13.1339 +                    for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
 13.1340 +                    {
 13.1341 +                        to_send[j] |= qemu_bitmaps[qemu_non_active][j];
 13.1342 +                        qemu_bitmaps[qemu_non_active][j] = 0;
 13.1343 +                    }
 13.1344 +                }
 13.1345 +                else
 13.1346 +                {
 13.1347 +                    for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
 13.1348 +                        to_send[j] |= qemu_bitmaps[qemu_active][j];
 13.1349 +                }
 13.1350 +            }
 13.1351 +
 13.1352 +            sent_last_iter = sent_this_iter;
 13.1353 +
 13.1354 +            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
 13.1355 +
 13.1356 +        }
 13.1357 +    } /* end of infinite for loop */
 13.1358 +
 13.1359 +    DPRINTF("All memory is saved\n");
 13.1360 +
 13.1361 +    {
 13.1362 +        struct {
 13.1363 +            int minustwo;
 13.1364 +            int max_vcpu_id;
 13.1365 +            uint64_t vcpumap;
 13.1366 +        } chunk = { -2, info.max_vcpu_id };
 13.1367 +
 13.1368 +        if ( info.max_vcpu_id >= 64 )
 13.1369 +        {
 13.1370 +            ERROR("Too many VCPUS in guest!");
 13.1371 +            goto out;
 13.1372 +        }
 13.1373 +
 13.1374 +        for ( i = 1; i <= info.max_vcpu_id; i++ )
 13.1375 +        {
 13.1376 +            xc_vcpuinfo_t vinfo;
 13.1377 +            if ( (xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
 13.1378 +                 vinfo.online )
 13.1379 +                vcpumap |= 1ULL << i;
 13.1380 +        }
 13.1381 +
 13.1382 +        chunk.vcpumap = vcpumap;
 13.1383 +        if ( !write_exact(io_fd, &chunk, sizeof(chunk)) )
 13.1384 +        {
 13.1385 +            ERROR("Error when writing to state file (errno %d)", errno);
 13.1386 +            goto out;
 13.1387 +        }
 13.1388 +    }
 13.1389 +
 13.1390 +    /* Zero terminate */
 13.1391 +    i = 0;
 13.1392 +    if ( !write_exact(io_fd, &i, sizeof(int)) )
 13.1393 +    {
 13.1394 +        ERROR("Error when writing to state file (6') (errno %d)", errno);
 13.1395 +        goto out;
 13.1396 +    }
 13.1397 +
 13.1398 +    if ( hvm ) 
 13.1399 +    {
 13.1400 +        uint32_t rec_size;
 13.1401 +
 13.1402 +        /* Save magic-page locations. */
 13.1403 +        memset(magic_pfns, 0, sizeof(magic_pfns));
 13.1404 +        xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
 13.1405 +                         (unsigned long *)&magic_pfns[0]);
 13.1406 +        xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
 13.1407 +                         (unsigned long *)&magic_pfns[1]);
 13.1408 +        xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
 13.1409 +                         (unsigned long *)&magic_pfns[2]);
 13.1410 +        if ( !write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
 13.1411 +        {
 13.1412 +            ERROR("Error when writing to state file (7)");
 13.1413 +            goto out;
 13.1414 +        }
 13.1415 +
 13.1416 +        /* Save vcpu contexts */
 13.1417 +
 13.1418 +        for ( i = 0; i <= info.max_vcpu_id; i++ )
 13.1419 +        {
 13.1420 +            if ( !(vcpumap & (1ULL << i)) )
 13.1421 +                continue;
 13.1422 +            
 13.1423 +            if ( xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
 13.1424 +            {
 13.1425 +                ERROR("HVM:Could not get vcpu context");
 13.1426 +                goto out;
 13.1427 +            }
 13.1428 +            
 13.1429 +            DPRINTF("write vcpu %d context.\n", i); 
 13.1430 +            if ( !write_exact(io_fd, &(ctxt), sizeof(ctxt)) )
 13.1431 +            {
 13.1432 +                ERROR("write vcpu context failed!\n");
 13.1433 +                goto out;
 13.1434 +            }
 13.1435 +        }
 13.1436 +
 13.1437 +        /* Get HVM context from Xen and save it too */
 13.1438 +        if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf, 
 13.1439 +                                                  hvm_buf_size)) == -1 )
 13.1440 +        {
 13.1441 +            ERROR("HVM:Could not get hvm buffer");
 13.1442 +            goto out;
 13.1443 +        }
 13.1444 +        
 13.1445 +        if ( !write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
 13.1446 +        {
 13.1447 +            ERROR("error write hvm buffer size");
 13.1448 +            goto out;
 13.1449 +        }
 13.1450 +        
 13.1451 +        if ( !write_exact(io_fd, hvm_buf, rec_size) )
 13.1452 +        {
 13.1453 +            ERROR("write HVM info failed!\n");
 13.1454 +            goto out;
 13.1455 +        }
 13.1456 +        
 13.1457 +        /* HVM guests are done now */
 13.1458 +        rc = 0;
 13.1459 +        goto out;
 13.1460 +    }
 13.1461 +
 13.1462 +    /* PV guests only from now on */
 13.1463 +
 13.1464 +    /* Send through a list of all the PFNs that were not in map at the close */
 13.1465 +    {
 13.1466 +        unsigned int i,j;
 13.1467 +        unsigned long pfntab[1024];
 13.1468 +
 13.1469 +        for ( i = 0, j = 0; i < p2m_size; i++ )
 13.1470 +        {
 13.1471 +            if ( !is_mapped(live_p2m[i]) )
 13.1472 +                j++;
 13.1473 +        }
 13.1474 +
 13.1475 +        if ( !write_exact(io_fd, &j, sizeof(unsigned int)) )
 13.1476 +        {
 13.1477 +            ERROR("Error when writing to state file (6a) (errno %d)", errno);
 13.1478 +            goto out;
 13.1479 +        }
 13.1480 +
 13.1481 +        for ( i = 0, j = 0; i < p2m_size; )
 13.1482 +        {
 13.1483 +            if ( !is_mapped(live_p2m[i]) )
 13.1484 +                pfntab[j++] = i;
 13.1485 +
 13.1486 +            i++;
 13.1487 +            if ( (j == 1024) || (i == p2m_size) )
 13.1488 +            {
 13.1489 +                if ( !write_exact(io_fd, &pfntab, sizeof(unsigned long)*j) )
 13.1490 +                {
 13.1491 +                    ERROR("Error when writing to state file (6b) (errno %d)",
 13.1492 +                          errno);
 13.1493 +                    goto out;
 13.1494 +                }
 13.1495 +                j = 0;
 13.1496 +            }
 13.1497 +        }
 13.1498 +    }
 13.1499 +
 13.1500 +    /* Canonicalise the suspend-record frame number. */
 13.1501 +    if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) )
 13.1502 +    {
 13.1503 +        ERROR("Suspend record is not in range of pseudophys map");
 13.1504 +        goto out;
 13.1505 +    }
 13.1506 +
 13.1507 +    for ( i = 0; i <= info.max_vcpu_id; i++ )
 13.1508 +    {
 13.1509 +        if ( !(vcpumap & (1ULL << i)) )
 13.1510 +            continue;
 13.1511 +
 13.1512 +        if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
 13.1513 +        {
 13.1514 +            ERROR("No context for VCPU%d", i);
 13.1515 +            goto out;
 13.1516 +        }
 13.1517 +
 13.1518 +        /* Canonicalise each GDT frame number. */
 13.1519 +        for ( j = 0; (512*j) < ctxt.gdt_ents; j++ )
 13.1520 +        {
 13.1521 +            if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[j]) )
 13.1522 +            {
 13.1523 +                ERROR("GDT frame is not in range of pseudophys map");
 13.1524 +                goto out;
 13.1525 +            }
 13.1526 +        }
 13.1527 +
 13.1528 +        /* Canonicalise the page table base pointer. */
 13.1529 +        if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[3])) )
 13.1530 +        {
 13.1531 +            ERROR("PT base is not in range of pseudophys map");
 13.1532 +            goto out;
 13.1533 +        }
 13.1534 +        ctxt.ctrlreg[3] = 
 13.1535 +            xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[3])));
 13.1536 +
 13.1537 +        /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
 13.1538 +        if ( (pt_levels == 4) && ctxt.ctrlreg[1] )
 13.1539 +        {
 13.1540 +            if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[1])) )
 13.1541 +            {
 13.1542 +                ERROR("PT base is not in range of pseudophys map");
 13.1543 +                goto out;
 13.1544 +            }
 13.1545 +            /* Least-significant bit means 'valid PFN'. */
 13.1546 +            ctxt.ctrlreg[1] = 1 |
 13.1547 +                xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[1])));
 13.1548 +        }
 13.1549 +
 13.1550 +        if ( !write_exact(io_fd, &ctxt, sizeof(ctxt)) )
 13.1551 +        {
 13.1552 +            ERROR("Error when writing to state file (1) (errno %d)", errno);
 13.1553 +            goto out;
 13.1554 +        }
 13.1555 +    }
 13.1556 +
 13.1557 +    /*
 13.1558 +     * Reset the MFN to be a known-invalid value. See map_frame_list_list().
 13.1559 +     */
 13.1560 +    memcpy(page, live_shinfo, PAGE_SIZE);
 13.1561 +    ((shared_info_t *)page)->arch.pfn_to_mfn_frame_list_list = 0;
 13.1562 +    if ( !write_exact(io_fd, page, PAGE_SIZE) )
 13.1563 +    {
 13.1564 +        ERROR("Error when writing to state file (1) (errno %d)", errno);
 13.1565 +        goto out;
 13.1566 +    }
 13.1567 +
 13.1568 +    /* Success! */
 13.1569 +    rc = 0;
 13.1570 +
 13.1571 + out:
 13.1572 +
 13.1573 +    if ( live )
 13.1574 +    {
 13.1575 +        if ( xc_shadow_control(xc_handle, dom, 
 13.1576 +                               XEN_DOMCTL_SHADOW_OP_OFF,
 13.1577 +                               NULL, 0, NULL, 0, NULL) < 0 )
 13.1578 +            DPRINTF("Warning - couldn't disable shadow mode");
 13.1579 +    }
 13.1580 +
 13.1581 +    /* Flush last write and discard cache for file. */
 13.1582 +    discard_file_cache(io_fd, 1 /* flush */);
 13.1583 +
 13.1584 +    if ( live_shinfo )
 13.1585 +        munmap(live_shinfo, PAGE_SIZE);
 13.1586 +
 13.1587 +    if ( live_p2m )
 13.1588 +        munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
 13.1589 +
 13.1590 +    if ( live_m2p )
 13.1591 +        munmap(live_m2p, M2P_SIZE(max_mfn));
 13.1592 +
 13.1593 +    free(pfn_type);
 13.1594 +    free(pfn_batch);
 13.1595 +    free(to_send);
 13.1596 +    free(to_fix);
 13.1597 +    free(to_skip);
 13.1598 +
 13.1599 +    DPRINTF("Save exit rc=%d\n",rc);
 13.1600 +
 13.1601 +    return !!rc;
 13.1602 +}
 13.1603 +
 13.1604 +/*
 13.1605 + * Local variables:
 13.1606 + * mode: C
 13.1607 + * c-set-style: "BSD"
 13.1608 + * c-basic-offset: 4
 13.1609 + * tab-width: 4
 13.1610 + * indent-tabs-mode: nil
 13.1611 + * End:
 13.1612 + */
    14.1 --- a/tools/libxc/xc_hvm_save.c	Wed Apr 11 07:30:02 2007 -0600
    14.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.3 @@ -1,755 +0,0 @@
    14.4 -/******************************************************************************
    14.5 - * xc_hvm_save.c
    14.6 - *
    14.7 - * Save the state of a running HVM guest.
    14.8 - *
    14.9 - * Copyright (c) 2003, K A Fraser.
   14.10 - * Copyright (c) 2006 Intel Corperation
   14.11 - * rewriten for hvm guest by Zhai Edwin <edwin.zhai@intel.com>
   14.12 - *
   14.13 - * This program is free software; you can redistribute it and/or modify it
   14.14 - * under the terms and conditions of the GNU General Public License,
   14.15 - * version 2, as published by the Free Software Foundation.
   14.16 - *
   14.17 - * This program is distributed in the hope it will be useful, but WITHOUT
   14.18 - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   14.19 - * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   14.20 - * more details.
   14.21 - *
   14.22 - * You should have received a copy of the GNU General Public License along with
   14.23 - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
   14.24 - * Place - Suite 330, Boston, MA 02111-1307 USA.
   14.25 - *
   14.26 - */
   14.27 -
   14.28 -#include <inttypes.h>
   14.29 -#include <time.h>
   14.30 -#include <stdlib.h>
   14.31 -#include <unistd.h>
   14.32 -#include <sys/time.h>
   14.33 -
   14.34 -#include "xc_private.h"
   14.35 -#include "xg_private.h"
   14.36 -#include "xg_save_restore.h"
   14.37 -
   14.38 -#include <xen/hvm/e820.h>
   14.39 -#include <xen/hvm/params.h>
   14.40 -
   14.41 -/*
   14.42 -** Default values for important tuning parameters. Can override by passing
   14.43 -** non-zero replacement values to xc_hvm_save().
   14.44 -**
   14.45 -** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
   14.46 -**
   14.47 -*/
   14.48 -#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop   */
   14.49 -#define DEF_MAX_FACTOR   3   /* never send more than 3x nr_pfns   */
   14.50 -
   14.51 -/* Shared-memory bitmaps for getting log-dirty bits from qemu */
   14.52 -static unsigned long *qemu_bitmaps[2];
   14.53 -static int qemu_active;
   14.54 -static int qemu_non_active;
   14.55 -
   14.56 -/*
   14.57 -** During (live) save/migrate, we maintain a number of bitmaps to track
   14.58 -** which pages we have to send, to fixup, and to skip.
   14.59 -*/
   14.60 -
   14.61 -#define BITS_PER_LONG (sizeof(unsigned long) * 8)
   14.62 -#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
   14.63 -#define BITMAP_SIZE   (BITS_TO_LONGS(pfn_array_size) * sizeof(unsigned long))
   14.64 -
   14.65 -#define BITMAP_ENTRY(_nr,_bmap) \
   14.66 -   ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
   14.67 -
   14.68 -#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
   14.69 -
   14.70 -static inline int test_bit (int nr, volatile void * addr)
   14.71 -{
   14.72 -    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
   14.73 -}
   14.74 -
   14.75 -static inline void clear_bit (int nr, volatile void * addr)
   14.76 -{
   14.77 -    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
   14.78 -}
   14.79 -
   14.80 -static inline int permute( int i, int nr, int order_nr  )
   14.81 -{
   14.82 -    /* Need a simple permutation function so that we scan pages in a
   14.83 -       pseudo random order, enabling us to get a better estimate of
   14.84 -       the domain's page dirtying rate as we go (there are often
   14.85 -       contiguous ranges of pfns that have similar behaviour, and we
   14.86 -       want to mix them up. */
   14.87 -
   14.88 -    /* e.g. nr->oder 15->4 16->4 17->5 */
   14.89 -    /* 512MB domain, 128k pages, order 17 */
   14.90 -
   14.91 -    /*
   14.92 -      QPONMLKJIHGFEDCBA
   14.93 -             QPONMLKJIH
   14.94 -      GFEDCBA
   14.95 -     */
   14.96 -
   14.97 -    /*
   14.98 -      QPONMLKJIHGFEDCBA
   14.99 -                  EDCBA
  14.100 -             QPONM
  14.101 -      LKJIHGF
  14.102 -      */
  14.103 -
  14.104 -    do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
  14.105 -    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
  14.106 -
  14.107 -    return i;
  14.108 -}
  14.109 -
  14.110 -
  14.111 -static uint64_t tv_to_us(struct timeval *new)
  14.112 -{
  14.113 -    return (new->tv_sec * 1000000) + new->tv_usec;
  14.114 -}
  14.115 -
  14.116 -static uint64_t llgettimeofday(void)
  14.117 -{
  14.118 -    struct timeval now;
  14.119 -    gettimeofday(&now, NULL);
  14.120 -    return tv_to_us(&now);
  14.121 -}
  14.122 -
  14.123 -static uint64_t tv_delta(struct timeval *new, struct timeval *old)
  14.124 -{
  14.125 -    return (((new->tv_sec - old->tv_sec)*1000000) +
  14.126 -            (new->tv_usec - old->tv_usec));
  14.127 -}
  14.128 -
  14.129 -
  14.130 -#define RATE_IS_MAX() (0)
  14.131 -#define ratewrite(_io_fd, _buf, _n) write((_io_fd), (_buf), (_n))
  14.132 -#define initialize_mbit_rate()
  14.133 -
  14.134 -static inline ssize_t write_exact(int fd, void *buf, size_t count)
  14.135 -{
  14.136 -    return (write(fd, buf, count) == count);
  14.137 -}
  14.138 -
  14.139 -static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
  14.140 -                       xc_shadow_op_stats_t *stats, int print)
  14.141 -{
  14.142 -    static struct timeval wall_last;
  14.143 -    static long long      d0_cpu_last;
  14.144 -    static long long      d1_cpu_last;
  14.145 -
  14.146 -    struct timeval        wall_now;
  14.147 -    long long             wall_delta;
  14.148 -    long long             d0_cpu_now, d0_cpu_delta;
  14.149 -    long long             d1_cpu_now, d1_cpu_delta;
  14.150 -
  14.151 -    gettimeofday(&wall_now, NULL);
  14.152 -
  14.153 -    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
  14.154 -    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
  14.155 -
  14.156 -    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
  14.157 -        DPRINTF("ARRHHH!!\n");
  14.158 -
  14.159 -    wall_delta = tv_delta(&wall_now,&wall_last)/1000;
  14.160 -    if ( wall_delta == 0 )
  14.161 -        wall_delta = 1;
  14.162 -
  14.163 -    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
  14.164 -    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
  14.165 -
  14.166 -    if ( print )
  14.167 -        DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
  14.168 -                "dirtied %dMb/s %" PRId32 " pages\n",
  14.169 -                wall_delta,
  14.170 -                (int)((d0_cpu_delta*100)/wall_delta),
  14.171 -                (int)((d1_cpu_delta*100)/wall_delta),
  14.172 -                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
  14.173 -                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
  14.174 -                stats->dirty_count);
  14.175 -
  14.176 -    d0_cpu_last = d0_cpu_now;
  14.177 -    d1_cpu_last = d1_cpu_now;
  14.178 -    wall_last   = wall_now;
  14.179 -
  14.180 -    return 0;
  14.181 -}
  14.182 -
  14.183 -static int analysis_phase(int xc_handle, uint32_t domid, int pfn_array_size,
  14.184 -                          unsigned long *arr, int runs)
  14.185 -{
  14.186 -    long long start, now;
  14.187 -    xc_shadow_op_stats_t stats;
  14.188 -    int j;
  14.189 -
  14.190 -    start = llgettimeofday();
  14.191 -
  14.192 -    for ( j = 0; j < runs; j++ )
  14.193 -    {
  14.194 -        int i;
  14.195 -
  14.196 -        xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
  14.197 -                          arr, pfn_array_size, NULL, 0, NULL);
  14.198 -        DPRINTF("#Flush\n");
  14.199 -        for ( i = 0; i < 40; i++ )
  14.200 -        {
  14.201 -            usleep(50000);
  14.202 -            now = llgettimeofday();
  14.203 -            xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
  14.204 -                              NULL, 0, NULL, 0, &stats);
  14.205 -            DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
  14.206 -                    ((now-start)+500)/1000,
  14.207 -                    stats.fault_count, stats.dirty_count);
  14.208 -        }
  14.209 -    }
  14.210 -
  14.211 -    return -1;
  14.212 -}
  14.213 -
  14.214 -static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
  14.215 -                             int dom, xc_dominfo_t *info,
  14.216 -                             vcpu_guest_context_t *ctxt)
  14.217 -{
  14.218 -    int i = 0;
  14.219 -
  14.220 -    if ( !(*suspend)(dom) )
  14.221 -    {
  14.222 -        ERROR("Suspend request failed");
  14.223 -        return -1;
  14.224 -    }
  14.225 -
  14.226 - retry:
  14.227 -
  14.228 -    if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
  14.229 -    {
  14.230 -        ERROR("Could not get domain info");
  14.231 -        return -1;
  14.232 -    }
  14.233 -
  14.234 -    if ( xc_vcpu_getcontext(xc_handle, dom, 0, ctxt) )
  14.235 -        ERROR("Could not get vcpu context");
  14.236 -
  14.237 -    if ( info->shutdown && (info->shutdown_reason == SHUTDOWN_suspend) )
  14.238 -        return 0; /* success */
  14.239 -
  14.240 -    if ( info->paused )
  14.241 -    {
  14.242 -        /* Try unpausing domain, wait, and retest. */
  14.243 -        xc_domain_unpause( xc_handle, dom );
  14.244 -        ERROR("Domain was paused. Wait and re-test.");
  14.245 -        usleep(10000);  /* 10ms */
  14.246 -        goto retry;
  14.247 -    }
  14.248 -
  14.249 -    if ( ++i < 100 )
  14.250 -    {
  14.251 -        ERROR("Retry suspend domain.");
  14.252 -        usleep(10000); /* 10ms */
  14.253 -        goto retry;
  14.254 -    }
  14.255 -
  14.256 -    ERROR("Unable to suspend domain.");
  14.257 -
  14.258 -    return -1;
  14.259 -}
  14.260 -
  14.261 -int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
  14.262 -                uint32_t max_factor, uint32_t flags, int (*suspend)(int),
  14.263 -                void *(*init_qemu_maps)(int, unsigned), 
  14.264 -                void (*qemu_flip_buffer)(int, int))
  14.265 -{
  14.266 -    xc_dominfo_t info;
  14.267 -
  14.268 -    int rc = 1, i, j, last_iter, iter = 0;
  14.269 -    int live  = !!(flags & XCFLAGS_LIVE);
  14.270 -    int debug = !!(flags & XCFLAGS_DEBUG);
  14.271 -    int sent_last_iter, skip_this_iter;
  14.272 -
  14.273 -    /* The highest guest-physical frame number used by the current guest */
  14.274 -    unsigned long max_pfn;
  14.275 -
  14.276 -    /* The size of an array big enough to contain all guest pfns */
  14.277 -    unsigned long pfn_array_size;
  14.278 -
  14.279 -    /* Magic frames: ioreqs and xenstore comms. */
  14.280 -    uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
  14.281 -
  14.282 -    /* A copy of the CPU context of the guest. */
  14.283 -    vcpu_guest_context_t ctxt;
  14.284 -
  14.285 -    /* A table containg the PFNs (/not/ MFN!) to map. */
  14.286 -    xen_pfn_t *pfn_batch = NULL;
  14.287 -
  14.288 -    /* A copy of hvm domain context buffer*/
  14.289 -    uint32_t hvm_buf_size;
  14.290 -    uint8_t *hvm_buf = NULL;
  14.291 -
  14.292 -    /* base of the region in which domain memory is mapped */
  14.293 -    unsigned char *region_base = NULL;
  14.294 -
  14.295 -    uint32_t rec_size, nr_vcpus;
  14.296 -
  14.297 -    /* power of 2 order of pfn_array_size */
  14.298 -    int order_nr;
  14.299 -
  14.300 -    /* bitmap of pages:
  14.301 -       - that should be sent this iteration (unless later marked as skip);
  14.302 -       - to skip this iteration because already dirty; */
  14.303 -    unsigned long *to_send = NULL, *to_skip = NULL;
  14.304 -
  14.305 -    xc_shadow_op_stats_t stats;
  14.306 -
  14.307 -    unsigned long total_sent = 0;
  14.308 -
  14.309 -    uint64_t vcpumap = 1ULL;
  14.310 -
  14.311 -    DPRINTF("xc_hvm_save: dom=%d, max_iters=%d, max_factor=%d, flags=0x%x, "
  14.312 -            "live=%d, debug=%d.\n", dom, max_iters, max_factor, flags,
  14.313 -            live, debug);
  14.314 -    
  14.315 -    /* If no explicit control parameters given, use defaults */
  14.316 -    max_iters  = max_iters  ? : DEF_MAX_ITERS;
  14.317 -    max_factor = max_factor ? : DEF_MAX_FACTOR;
  14.318 -
  14.319 -    initialize_mbit_rate();
  14.320 -
  14.321 -    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
  14.322 -    {
  14.323 -        ERROR("HVM: Could not get domain info");
  14.324 -        return 1;
  14.325 -    }
  14.326 -    nr_vcpus = info.nr_online_vcpus;
  14.327 -
  14.328 -    if ( mlock(&ctxt, sizeof(ctxt)) )
  14.329 -    {
  14.330 -        ERROR("HVM: Unable to mlock ctxt");
  14.331 -        return 1;
  14.332 -    }
  14.333 -
  14.334 -    /* Only have to worry about vcpu 0 even for SMP */
  14.335 -    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
  14.336 -    {
  14.337 -        ERROR("HVM: Could not get vcpu context");
  14.338 -        goto out;
  14.339 -    }
  14.340 -
  14.341 -    DPRINTF("saved hvm domain info: max_memkb=0x%lx, nr_pages=0x%lx\n",
  14.342 -            info.max_memkb, info.nr_pages); 
  14.343 -
  14.344 -    if ( live )
  14.345 -    {
  14.346 -        /* Live suspend. Enable log-dirty mode. */
  14.347 -        if ( xc_shadow_control(xc_handle, dom,
  14.348 -                               XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
  14.349 -                               NULL, 0, NULL, 0, NULL) < 0 )
  14.350 -        {
  14.351 -            ERROR("Couldn't enable shadow mode");
  14.352 -            goto out;
  14.353 -        }
  14.354 -    }
  14.355 -    else
  14.356 -    {
  14.357 -        /* This is a non-live suspend. Suspend the domain .*/
  14.358 -        if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt) )
  14.359 -        {
  14.360 -            ERROR("HVM Domain appears not to have suspended");
  14.361 -            goto out;
  14.362 -        }
  14.363 -    }
  14.364 -
  14.365 -    last_iter = !live;
  14.366 -
  14.367 -    max_pfn = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom);
  14.368 -
  14.369 -    DPRINTF("after 1st handle hvm domain max_pfn=0x%lx, "
  14.370 -            "max_memkb=0x%lx, live=%d.\n",
  14.371 -            max_pfn, info.max_memkb, live);
  14.372 -
  14.373 -    /* Size of any array that covers 0 ... max_pfn */
  14.374 -    pfn_array_size = max_pfn + 1;
  14.375 -    if ( !write_exact(io_fd, &pfn_array_size, sizeof(unsigned long)) )
  14.376 -    {
  14.377 -        ERROR("Error when writing to state file (1)");
  14.378 -        goto out;
  14.379 -    }
  14.380 -
  14.381 -    /* pretend we sent all the pages last iteration */
  14.382 -    sent_last_iter = pfn_array_size;
  14.383 -
  14.384 -    /* calculate the power of 2 order of pfn_array_size, e.g.
  14.385 -       15->4 16->4 17->5 */
  14.386 -    for ( i = pfn_array_size-1, order_nr = 0; i ; i >>= 1, order_nr++ )
  14.387 -        continue;
  14.388 -
  14.389 -    /* Setup to_send / to_fix and to_skip bitmaps */
  14.390 -    to_send = malloc(BITMAP_SIZE);
  14.391 -    to_skip = malloc(BITMAP_SIZE);
  14.392 -
  14.393 -    if ( live )
  14.394 -    {
  14.395 -        /* Get qemu-dm logging dirty pages too */
  14.396 -        void *seg = init_qemu_maps(dom, BITMAP_SIZE);
  14.397 -        qemu_bitmaps[0] = seg;
  14.398 -        qemu_bitmaps[1] = seg + BITMAP_SIZE;
  14.399 -        qemu_active = 0;
  14.400 -        qemu_non_active = 1;
  14.401 -    }
  14.402 -
  14.403 -    hvm_buf_size = xc_domain_hvm_getcontext(xc_handle, dom, 0, 0);
  14.404 -    if ( hvm_buf_size == -1 )
  14.405 -    {
  14.406 -        ERROR("Couldn't get HVM context size from Xen");
  14.407 -        goto out;
  14.408 -    }
  14.409 -    hvm_buf = malloc(hvm_buf_size);
  14.410 -
  14.411 -    if ( !to_send || !to_skip || !hvm_buf )
  14.412 -    {
  14.413 -        ERROR("Couldn't allocate memory");
  14.414 -        goto out;
  14.415 -    }
  14.416 -
  14.417 -    memset(to_send, 0xff, BITMAP_SIZE);
  14.418 -
  14.419 -    if ( lock_pages(to_send, BITMAP_SIZE) )
  14.420 -    {
  14.421 -        ERROR("Unable to lock to_send");
  14.422 -        return 1;
  14.423 -    }
  14.424 -
  14.425 -    /* (to fix is local only) */
  14.426 -    if ( lock_pages(to_skip, BITMAP_SIZE) )
  14.427 -    {
  14.428 -        ERROR("Unable to lock to_skip");
  14.429 -        return 1;
  14.430 -    }
  14.431 -
  14.432 -    analysis_phase(xc_handle, dom, pfn_array_size, to_skip, 0);
  14.433 -
  14.434 -    /* We want zeroed memory so use calloc rather than malloc. */
  14.435 -    pfn_batch = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
  14.436 -    if ( pfn_batch == NULL )
  14.437 -    {
  14.438 -        ERROR("failed to alloc memory for pfn_batch array");
  14.439 -        errno = ENOMEM;
  14.440 -        goto out;
  14.441 -    }
  14.442 -
  14.443 -    for ( ; ; )
  14.444 -    {
  14.445 -        unsigned int prev_pc, sent_this_iter, N, batch;
  14.446 -
  14.447 -        iter++;
  14.448 -        sent_this_iter = 0;
  14.449 -        skip_this_iter = 0;
  14.450 -        prev_pc = 0;
  14.451 -        N=0;
  14.452 -
  14.453 -        DPRINTF("Saving memory pages: iter %d   0%%", iter);
  14.454 -
  14.455 -        while ( N < pfn_array_size )
  14.456 -        {
  14.457 -            unsigned int this_pc = (N * 100) / pfn_array_size;
  14.458 -            int rc;
  14.459 -
  14.460 -            if ( (this_pc - prev_pc) >= 5 )
  14.461 -            {
  14.462 -                DPRINTF("\b\b\b\b%3d%%", this_pc);
  14.463 -                prev_pc = this_pc;
  14.464 -            }
  14.465 -
  14.466 -            if ( !last_iter )
  14.467 -            {
  14.468 -                /* Slightly wasteful to peek the whole array evey time,
  14.469 -                   but this is fast enough for the moment. */
  14.470 -                rc = xc_shadow_control(
  14.471 -                    xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip, 
  14.472 -                    pfn_array_size, NULL, 0, NULL);
  14.473 -                if ( rc != pfn_array_size )
  14.474 -                {
  14.475 -                    ERROR("Error peeking shadow bitmap");
  14.476 -                    goto out;
  14.477 -                }
  14.478 -            }
  14.479 -
  14.480 -            /* load pfn_batch[] with the mfn of all the pages we're doing in
  14.481 -               this batch. */
  14.482 -            for ( batch = 0;
  14.483 -                  (batch < MAX_BATCH_SIZE) && (N < pfn_array_size);
  14.484 -                  N++ )
  14.485 -            {
  14.486 -                int n = permute(N, pfn_array_size, order_nr);
  14.487 -
  14.488 -                if ( 0 && debug )
  14.489 -                    DPRINTF("%d pfn= %08lx %d \n",
  14.490 -                            iter, (unsigned long)n, test_bit(n, to_send));
  14.491 -
  14.492 -                if ( !last_iter &&
  14.493 -                     test_bit(n, to_send) &&
  14.494 -                     test_bit(n, to_skip) )
  14.495 -                    skip_this_iter++; /* stats keeping */
  14.496 -
  14.497 -                if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
  14.498 -                       (test_bit(n, to_send) && last_iter)) )
  14.499 -                    continue;
  14.500 -
  14.501 -                /* Skip PFNs that aren't really there */
  14.502 -                if ( (n >= 0xa0 && n < 0xc0) /* VGA hole */
  14.503 -                     || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT) &&
  14.504 -                         n < (1ULL << 32) >> PAGE_SHIFT) /* 4G MMIO hole */ )
  14.505 -                    continue;
  14.506 -
  14.507 -                /*
  14.508 -                ** we get here if:
  14.509 -                **  1. page is marked to_send & hasn't already been re-dirtied
  14.510 -                **  2. (ignore to_skip in last iteration)
  14.511 -                */
  14.512 -
  14.513 -                pfn_batch[batch] = n;
  14.514 -
  14.515 -                batch++;
  14.516 -            }
  14.517 -
  14.518 -            if ( batch == 0 )
  14.519 -                goto skip; /* vanishingly unlikely... */
  14.520 -
  14.521 -            region_base = xc_map_foreign_batch(
  14.522 -                xc_handle, dom, PROT_READ, pfn_batch, batch);
  14.523 -            if ( region_base == 0 )
  14.524 -            {
  14.525 -                ERROR("map batch failed");
  14.526 -                goto out;
  14.527 -            }
  14.528 -
  14.529 -            /* write num of pfns */
  14.530 -            if ( !write_exact(io_fd, &batch, sizeof(unsigned int)) )
  14.531 -            {
  14.532 -                ERROR("Error when writing to state file (2)");
  14.533 -                goto out;
  14.534 -            }
  14.535 -
  14.536 -            /* write all the pfns */
  14.537 -            if ( !write_exact(io_fd, pfn_batch, sizeof(unsigned long)*batch) )
  14.538 -            {
  14.539 -                ERROR("Error when writing to state file (3)");
  14.540 -                goto out;
  14.541 -            }
  14.542 -
  14.543 -            for ( j = 0; j < batch; j++ )
  14.544 -            {
  14.545 -                if ( pfn_batch[j] & XEN_DOMCTL_PFINFO_LTAB_MASK )
  14.546 -                    continue;
  14.547 -                if ( ratewrite(io_fd, region_base + j*PAGE_SIZE,
  14.548 -                               PAGE_SIZE) != PAGE_SIZE )
  14.549 -                {
  14.550 -                    ERROR("ERROR when writing to state file (4)");
  14.551 -                    goto out;
  14.552 -                }
  14.553 -            }
  14.554 -
  14.555 -            sent_this_iter += batch;
  14.556 -
  14.557 -            munmap(region_base, batch*PAGE_SIZE);
  14.558 -
  14.559 -        } /* end of this while loop for this iteration */
  14.560 -
  14.561 -      skip:
  14.562 -
  14.563 -        total_sent += sent_this_iter;
  14.564 -
  14.565 -        DPRINTF("\r %d: sent %d, skipped %d, ",
  14.566 -                iter, sent_this_iter, skip_this_iter );
  14.567 -
  14.568 -        if ( last_iter )
  14.569 -        {
  14.570 -            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
  14.571 -            DPRINTF("Total pages sent= %ld (%.2fx)\n",
  14.572 -                    total_sent, ((float)total_sent)/pfn_array_size );
  14.573 -        }
  14.574 -
  14.575 -        if ( last_iter && debug )
  14.576 -        {
  14.577 -            int minusone = -1;
  14.578 -            memset(to_send, 0xff, BITMAP_SIZE);
  14.579 -            debug = 0;
  14.580 -            DPRINTF("Entering debug resend-all mode\n");
  14.581 -
  14.582 -            /* send "-1" to put receiver into debug mode */
  14.583 -            if ( !write_exact(io_fd, &minusone, sizeof(int)) )
  14.584 -            {
  14.585 -                ERROR("Error when writing to state file (6)");
  14.586 -                goto out;
  14.587 -            }
  14.588 -
  14.589 -            continue;
  14.590 -        }
  14.591 -
  14.592 -        if ( last_iter )
  14.593 -            break;
  14.594 -
  14.595 -        if ( live )
  14.596 -        {
  14.597 -            if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
  14.598 -                 (iter >= max_iters) ||
  14.599 -                 (sent_this_iter+skip_this_iter < 50) ||
  14.600 -                 (total_sent > pfn_array_size*max_factor) )
  14.601 -            {
  14.602 -                DPRINTF("Start last iteration for HVM domain\n");
  14.603 -                last_iter = 1;
  14.604 -
  14.605 -                if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
  14.606 -                                       &ctxt))
  14.607 -                {
  14.608 -                    ERROR("Domain appears not to have suspended");
  14.609 -                    goto out;
  14.610 -                }
  14.611 -
  14.612 -                DPRINTF("SUSPEND eip %08lx edx %08lx\n",
  14.613 -                        (unsigned long)ctxt.user_regs.eip,
  14.614 -                        (unsigned long)ctxt.user_regs.edx);
  14.615 -            }
  14.616 -
  14.617 -            if ( xc_shadow_control(xc_handle, dom, 
  14.618 -                                   XEN_DOMCTL_SHADOW_OP_CLEAN, to_send, 
  14.619 -                                   pfn_array_size, NULL, 
  14.620 -                                   0, &stats) != pfn_array_size )
  14.621 -            {
  14.622 -                ERROR("Error flushing shadow PT");
  14.623 -                goto out;
  14.624 -            }
  14.625 -
  14.626 -            /* Pull in the dirty bits from qemu too */
  14.627 -            if ( !last_iter )
  14.628 -            {
  14.629 -                qemu_active = qemu_non_active;
  14.630 -                qemu_non_active = qemu_active ? 0 : 1;
  14.631 -                qemu_flip_buffer(dom, qemu_active);
  14.632 -                for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
  14.633 -                {
  14.634 -                    to_send[j] |= qemu_bitmaps[qemu_non_active][j];
  14.635 -                    qemu_bitmaps[qemu_non_active][j] = 0;
  14.636 -                }
  14.637 -            }
  14.638 -            else
  14.639 -            {
  14.640 -                for ( j = 0; j < BITMAP_SIZE / sizeof(unsigned long); j++ )
  14.641 -                    to_send[j] |= qemu_bitmaps[qemu_active][j];
  14.642 -            }
  14.643 -
  14.644 -            sent_last_iter = sent_this_iter;
  14.645 -
  14.646 -            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
  14.647 -        }
  14.648 -    } /* end of while 1 */
  14.649 -
  14.650 -
  14.651 -    DPRINTF("All HVM memory is saved\n");
  14.652 -
  14.653 -    {
  14.654 -        struct {
  14.655 -            int minustwo;
  14.656 -            int max_vcpu_id;
  14.657 -            uint64_t vcpumap;
  14.658 -        } chunk = { -2, info.max_vcpu_id };
  14.659 -
  14.660 -        if (info.max_vcpu_id >= 64) {
  14.661 -            ERROR("Too many VCPUS in guest!");
  14.662 -            goto out;
  14.663 -        }
  14.664 -
  14.665 -        for (i = 1; i <= info.max_vcpu_id; i++) {
  14.666 -            xc_vcpuinfo_t vinfo;
  14.667 -            if ((xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
  14.668 -                vinfo.online)
  14.669 -                vcpumap |= 1ULL << i;
  14.670 -        }
  14.671 -
  14.672 -        chunk.vcpumap = vcpumap;
  14.673 -        if(!write_exact(io_fd, &chunk, sizeof(chunk))) {
  14.674 -            ERROR("Error when writing to state file (errno %d)", errno);
  14.675 -            goto out;
  14.676 -        }
  14.677 -    }
  14.678 -
  14.679 -    /* Zero terminate */
  14.680 -    i = 0;
  14.681 -    if ( !write_exact(io_fd, &i, sizeof(int)) )
  14.682 -    {
  14.683 -        ERROR("Error when writing to state file (6)");
  14.684 -        goto out;
  14.685 -    }
  14.686 -
  14.687 -    /* Save magic-page locations. */
  14.688 -    memset(magic_pfns, 0, sizeof(magic_pfns));
  14.689 -    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
  14.690 -                     (unsigned long *)&magic_pfns[0]);
  14.691 -    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
  14.692 -                     (unsigned long *)&magic_pfns[1]);
  14.693 -    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
  14.694 -                     (unsigned long *)&magic_pfns[2]);
  14.695 -    if ( !write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
  14.696 -    {
  14.697 -        ERROR("Error when writing to state file (7)");
  14.698 -        goto out;
  14.699 -    }
  14.700 -
  14.701 -    /* save vcpu/vmcs contexts */
  14.702 -    for ( i = 0; i < nr_vcpus; i++ )
  14.703 -    {
  14.704 -        if ( !(vcpumap & (1ULL << i)) )
  14.705 -            continue;
  14.706 -
  14.707 -        if ( xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
  14.708 -        {
  14.709 -            ERROR("HVM:Could not get vcpu context");
  14.710 -            goto out;
  14.711 -        }
  14.712 -
  14.713 -        DPRINTF("write vcpu %d context.\n", i); 
  14.714 -        if ( !write_exact(io_fd, &(ctxt), sizeof(ctxt)) )
  14.715 -        {
  14.716 -            ERROR("write vcpu context failed!\n");
  14.717 -            goto out;
  14.718 -        }
  14.719 -    }
  14.720 -
  14.721 -    if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf, 
  14.722 -                                              hvm_buf_size)) == -1 )
  14.723 -    {
  14.724 -        ERROR("HVM:Could not get hvm buffer");
  14.725 -        goto out;
  14.726 -    }
  14.727 -
  14.728 -    if ( !write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
  14.729 -    {
  14.730 -        ERROR("error write hvm buffer size");
  14.731 -        goto out;
  14.732 -    }
  14.733 -
  14.734 -    if ( !write_exact(io_fd, hvm_buf, rec_size) )
  14.735 -    {
  14.736 -        ERROR("write HVM info failed!\n");
  14.737 -        goto out;
  14.738 -    }
  14.739 -
  14.740 -    /* Success! */
  14.741 -    rc = 0;
  14.742 -
  14.743 - out:
  14.744 -
  14.745 -    if ( live )
  14.746 -    {
  14.747 -        if ( xc_shadow_control(xc_handle, dom, XEN_DOMCTL_SHADOW_OP_OFF,
  14.748 -                               NULL, 0, NULL, 0, NULL) < 0 )
  14.749 -            DPRINTF("Warning - couldn't disable shadow mode");
  14.750 -    }
  14.751 -
  14.752 -    free(hvm_buf);
  14.753 -    free(pfn_batch);
  14.754 -    free(to_send);
  14.755 -    free(to_skip);
  14.756 -
  14.757 -    return !!rc;
  14.758 -}
    15.1 --- a/tools/libxc/xc_linux_save.c	Wed Apr 11 07:30:02 2007 -0600
    15.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    15.3 @@ -1,1414 +0,0 @@
    15.4 -/******************************************************************************
    15.5 - * xc_linux_save.c
    15.6 - *
    15.7 - * Save the state of a running Linux session.
    15.8 - *
    15.9 - * Copyright (c) 2003, K A Fraser.
   15.10 - */
   15.11 -
   15.12 -#include <inttypes.h>
   15.13 -#include <time.h>
   15.14 -#include <stdlib.h>
   15.15 -#include <unistd.h>
   15.16 -#include <sys/time.h>
   15.17 -
   15.18 -#include "xc_private.h"
   15.19 -#include "xc_dom.h"
   15.20 -#include "xg_private.h"
   15.21 -#include "xg_save_restore.h"
   15.22 -
   15.23 -/*
   15.24 -** Default values for important tuning parameters. Can override by passing
   15.25 -** non-zero replacement values to xc_linux_save().
   15.26 -**
   15.27 -** XXX SMH: should consider if want to be able to override MAX_MBIT_RATE too.
   15.28 -**
   15.29 -*/
   15.30 -#define DEF_MAX_ITERS   29   /* limit us to 30 times round loop   */
   15.31 -#define DEF_MAX_FACTOR   3   /* never send more than 3x p2m_size  */
   15.32 -
   15.33 -/* max mfn of the whole machine */
   15.34 -static unsigned long max_mfn;
   15.35 -
   15.36 -/* virtual starting address of the hypervisor */
   15.37 -static unsigned long hvirt_start;
   15.38 -
   15.39 -/* #levels of page tables used by the current guest */
   15.40 -static unsigned int pt_levels;
   15.41 -
   15.42 -/* number of pfns this guest has (i.e. number of entries in the P2M) */
   15.43 -static unsigned long p2m_size;
   15.44 -
   15.45 -/* Live mapping of the table mapping each PFN to its current MFN. */
   15.46 -static xen_pfn_t *live_p2m = NULL;
   15.47 -
   15.48 -/* Live mapping of system MFN to PFN table. */
   15.49 -static xen_pfn_t *live_m2p = NULL;
   15.50 -static unsigned long m2p_mfn0;
   15.51 -
   15.52 -/* grep fodder: machine_to_phys */
   15.53 -
   15.54 -#define mfn_to_pfn(_mfn) live_m2p[(_mfn)]
   15.55 -
   15.56 -/*
   15.57 - * Returns TRUE if the given machine frame number has a unique mapping
   15.58 - * in the guest's pseudophysical map.
   15.59 - */
   15.60 -#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn)          \
   15.61 -    (((_mfn) < (max_mfn)) &&                    \
   15.62 -     ((mfn_to_pfn(_mfn) < (p2m_size)) &&        \
   15.63 -      (live_p2m[mfn_to_pfn(_mfn)] == (_mfn))))
   15.64 -
   15.65 -/* Returns TRUE if MFN is successfully converted to a PFN. */
   15.66 -#define translate_mfn_to_pfn(_pmfn)                             \
   15.67 -({                                                              \
   15.68 -    unsigned long mfn = *(_pmfn);                               \
   15.69 -    int _res = 1;                                               \
   15.70 -    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )                       \
   15.71 -        _res = 0;                                               \
   15.72 -    else                                                        \
   15.73 -        *(_pmfn) = mfn_to_pfn(mfn);                             \
   15.74 -    _res;                                                       \
   15.75 -})
   15.76 -
   15.77 -/*
   15.78 -** During (live) save/migrate, we maintain a number of bitmaps to track
   15.79 -** which pages we have to send, to fixup, and to skip.
   15.80 -*/
   15.81 -
   15.82 -#define BITS_PER_LONG (sizeof(unsigned long) * 8)
   15.83 -#define BITMAP_SIZE   ((p2m_size + BITS_PER_LONG - 1) / 8)
   15.84 -
   15.85 -#define BITMAP_ENTRY(_nr,_bmap) \
   15.86 -   ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
   15.87 -
   15.88 -#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
   15.89 -
   15.90 -static inline int test_bit (int nr, volatile void * addr)
   15.91 -{
   15.92 -    return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
   15.93 -}
   15.94 -
   15.95 -static inline void clear_bit (int nr, volatile void * addr)
   15.96 -{
   15.97 -    BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr));
   15.98 -}
   15.99 -
  15.100 -static inline void set_bit ( int nr, volatile void * addr)
  15.101 -{
  15.102 -    BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr));
  15.103 -}
  15.104 -
  15.105 -/* Returns the hamming weight (i.e. the number of bits set) in a N-bit word */
  15.106 -static inline unsigned int hweight32(unsigned int w)
  15.107 -{
  15.108 -    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
  15.109 -    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
  15.110 -    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
  15.111 -    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
  15.112 -    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
  15.113 -}
  15.114 -
  15.115 -static inline int count_bits ( int nr, volatile void *addr)
  15.116 -{
  15.117 -    int i, count = 0;
  15.118 -    volatile unsigned long *p = (volatile unsigned long *)addr;
  15.119 -    /* We know that the array is padded to unsigned long. */
  15.120 -    for ( i = 0; i < (nr / (sizeof(unsigned long)*8)); i++, p++ )
  15.121 -        count += hweight32(*p);
  15.122 -    return count;
  15.123 -}
  15.124 -
  15.125 -static inline int permute( int i, int nr, int order_nr  )
  15.126 -{
  15.127 -    /* Need a simple permutation function so that we scan pages in a
  15.128 -       pseudo random order, enabling us to get a better estimate of
  15.129 -       the domain's page dirtying rate as we go (there are often
  15.130 -       contiguous ranges of pfns that have similar behaviour, and we
  15.131 -       want to mix them up. */
  15.132 -
  15.133 -    /* e.g. nr->oder 15->4 16->4 17->5 */
  15.134 -    /* 512MB domain, 128k pages, order 17 */
  15.135 -
  15.136 -    /*
  15.137 -      QPONMLKJIHGFEDCBA
  15.138 -             QPONMLKJIH
  15.139 -      GFEDCBA
  15.140 -     */
  15.141 -
  15.142 -    /*
  15.143 -      QPONMLKJIHGFEDCBA
  15.144 -                  EDCBA
  15.145 -             QPONM
  15.146 -      LKJIHGF
  15.147 -      */
  15.148 -
  15.149 -    do { i = ((i>>(order_nr-10)) | ( i<<10 ) ) & ((1<<order_nr)-1); }
  15.150 -    while ( i >= nr ); /* this won't ever loop if nr is a power of 2 */
  15.151 -
  15.152 -    return i;
  15.153 -}
  15.154 -
  15.155 -static uint64_t tv_to_us(struct timeval *new)
  15.156 -{
  15.157 -    return (new->tv_sec * 1000000) + new->tv_usec;
  15.158 -}
  15.159 -
  15.160 -static uint64_t llgettimeofday(void)
  15.161 -{
  15.162 -    struct timeval now;
  15.163 -    gettimeofday(&now, NULL);
  15.164 -    return tv_to_us(&now);
  15.165 -}
  15.166 -
  15.167 -static uint64_t tv_delta(struct timeval *new, struct timeval *old)
  15.168 -{
  15.169 -    return (((new->tv_sec - old->tv_sec)*1000000) +
  15.170 -            (new->tv_usec - old->tv_usec));
  15.171 -}
  15.172 -
  15.173 -static int noncached_write(int fd, int live, void *buffer, int len) 
  15.174 -{
  15.175 -    static int write_count = 0;
  15.176 -
  15.177 -    int rc = write(fd,buffer,len);
  15.178 -
  15.179 -    write_count += len;
  15.180 -    if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
  15.181 -    {
  15.182 -        /* Time to discard cache - dont care if this fails */
  15.183 -        discard_file_cache(fd, 0 /* no flush */);
  15.184 -        write_count = 0;
  15.185 -    }
  15.186 -
  15.187 -    return rc;
  15.188 -}
  15.189 -
  15.190 -#ifdef ADAPTIVE_SAVE
  15.191 -
  15.192 -/*
  15.193 -** We control the rate at which we transmit (or save) to minimize impact
  15.194 -** on running domains (including the target if we're doing live migrate).
  15.195 -*/
  15.196 -
  15.197 -#define MAX_MBIT_RATE    500      /* maximum transmit rate for migrate */
  15.198 -#define START_MBIT_RATE  100      /* initial transmit rate for migrate */
  15.199 -
  15.200 -/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
  15.201 -#define RATE_TO_BTU      781250
  15.202 -
  15.203 -/* Amount in bytes we allow ourselves to send in a burst */
  15.204 -#define BURST_BUDGET (100*1024)
  15.205 -
  15.206 -/* We keep track of the current and previous transmission rate */
  15.207 -static int mbit_rate, ombit_rate = 0;
  15.208 -
  15.209 -/* Have we reached the maximum transmission rate? */
  15.210 -#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
  15.211 -
  15.212 -static inline void initialize_mbit_rate()
  15.213 -{
  15.214 -    mbit_rate = START_MBIT_RATE;
  15.215 -}
  15.216 -
  15.217 -static int ratewrite(int io_fd, int live, void *buf, int n)
  15.218 -{
  15.219 -    static int budget = 0;
  15.220 -    static int burst_time_us = -1;
  15.221 -    static struct timeval last_put = { 0 };
  15.222 -    struct timeval now;
  15.223 -    struct timespec delay;
  15.224 -    long long delta;
  15.225 -
  15.226 -    if ( START_MBIT_RATE == 0 )
  15.227 -        return noncached_write(io_fd, live, buf, n);
  15.228 -
  15.229 -    budget -= n;
  15.230 -    if ( budget < 0 )
  15.231 -    {
  15.232 -        if ( mbit_rate != ombit_rate )
  15.233 -        {
  15.234 -            burst_time_us = RATE_TO_BTU / mbit_rate;
  15.235 -            ombit_rate = mbit_rate;
  15.236 -            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
  15.237 -                    mbit_rate, BURST_BUDGET, burst_time_us);
  15.238 -        }
  15.239 -        if ( last_put.tv_sec == 0 )
  15.240 -        {
  15.241 -            budget += BURST_BUDGET;
  15.242 -            gettimeofday(&last_put, NULL);
  15.243 -        }
  15.244 -        else
  15.245 -        {
  15.246 -            while ( budget < 0 )
  15.247 -            {
  15.248 -                gettimeofday(&now, NULL);
  15.249 -                delta = tv_delta(&now, &last_put);
  15.250 -                while ( delta > burst_time_us )
  15.251 -                {
  15.252 -                    budget += BURST_BUDGET;
  15.253 -                    last_put.tv_usec += burst_time_us;
   15.254 -                    if ( last_put.tv_usec > 1000000 )
  15.255 -                    {
  15.256 -                        last_put.tv_usec -= 1000000;
  15.257 -                        last_put.tv_sec++;
  15.258 -                    }
  15.259 -                    delta -= burst_time_us;
  15.260 -                }
  15.261 -                if ( budget > 0 )
  15.262 -                    break;
  15.263 -                delay.tv_sec = 0;
  15.264 -                delay.tv_nsec = 1000 * (burst_time_us - delta);
  15.265 -                while ( delay.tv_nsec > 0 )
  15.266 -                    if ( nanosleep(&delay, &delay) == 0 )
  15.267 -                        break;
  15.268 -            }
  15.269 -        }
  15.270 -    }
  15.271 -    return noncached_write(io_fd, live, buf, n);
  15.272 -}
  15.273 -
  15.274 -#else /* ! ADAPTIVE SAVE */
  15.275 -
  15.276 -#define RATE_IS_MAX() (0)
  15.277 -#define ratewrite(_io_fd, _live, _buf, _n) noncached_write((_io_fd), (_live), (_buf), (_n))
  15.278 -#define initialize_mbit_rate()
  15.279 -
  15.280 -#endif
  15.281 -
  15.282 -static inline ssize_t write_exact(int fd, void *buf, size_t count)
  15.283 -{
  15.284 -    return (write(fd, buf, count) == count);
  15.285 -}
  15.286 -
  15.287 -static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
  15.288 -                       xc_shadow_op_stats_t *stats, int print)
  15.289 -{
  15.290 -    static struct timeval wall_last;
  15.291 -    static long long      d0_cpu_last;
  15.292 -    static long long      d1_cpu_last;
  15.293 -
  15.294 -    struct timeval        wall_now;
  15.295 -    long long             wall_delta;
  15.296 -    long long             d0_cpu_now, d0_cpu_delta;
  15.297 -    long long             d1_cpu_now, d1_cpu_delta;
  15.298 -
  15.299 -    gettimeofday(&wall_now, NULL);
  15.300 -
  15.301 -    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
  15.302 -    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;
  15.303 -
  15.304 -    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
  15.305 -        DPRINTF("ARRHHH!!\n");
  15.306 -
  15.307 -    wall_delta = tv_delta(&wall_now,&wall_last)/1000;
  15.308 -    if ( wall_delta == 0 )
  15.309 -        wall_delta = 1;
  15.310 -
  15.311 -    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
  15.312 -    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
  15.313 -
  15.314 -    if ( print )
  15.315 -        DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
  15.316 -                "dirtied %dMb/s %" PRId32 " pages\n",
  15.317 -                wall_delta,
  15.318 -                (int)((d0_cpu_delta*100)/wall_delta),
  15.319 -                (int)((d1_cpu_delta*100)/wall_delta),
  15.320 -                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
  15.321 -                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
  15.322 -                stats->dirty_count);
  15.323 -
  15.324 -#ifdef ADAPTIVE_SAVE
  15.325 -    if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
  15.326 -    {
  15.327 -        mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
  15.328 -            + 50;
  15.329 -        if ( mbit_rate > MAX_MBIT_RATE )
  15.330 -            mbit_rate = MAX_MBIT_RATE;
  15.331 -    }
  15.332 -#endif
  15.333 -
  15.334 -    d0_cpu_last = d0_cpu_now;
  15.335 -    d1_cpu_last = d1_cpu_now;
  15.336 -    wall_last   = wall_now;
  15.337 -
  15.338 -    return 0;
  15.339 -}
  15.340 -
  15.341 -
  15.342 -static int analysis_phase(int xc_handle, uint32_t domid, int p2m_size,
  15.343 -                          unsigned long *arr, int runs)
  15.344 -{
  15.345 -    long long start, now;
  15.346 -    xc_shadow_op_stats_t stats;
  15.347 -    int j;
  15.348 -
  15.349 -    start = llgettimeofday();
  15.350 -
  15.351 -    for ( j = 0; j < runs; j++ )
  15.352 -    {
  15.353 -        int i;
  15.354 -
  15.355 -        xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
  15.356 -                          arr, p2m_size, NULL, 0, NULL);
  15.357 -        DPRINTF("#Flush\n");
  15.358 -        for ( i = 0; i < 40; i++ )
  15.359 -        {
  15.360 -            usleep(50000);
  15.361 -            now = llgettimeofday();
  15.362 -            xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_PEEK,
  15.363 -                              NULL, 0, NULL, 0, &stats);
  15.364 -            DPRINTF("now= %lld faults= %"PRId32" dirty= %"PRId32"\n",
  15.365 -                    ((now-start)+500)/1000,
  15.366 -                    stats.fault_count, stats.dirty_count);
  15.367 -        }
  15.368 -    }
  15.369 -
  15.370 -    return -1;
  15.371 -}
  15.372 -
  15.373 -
  15.374 -static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
  15.375 -                             int dom, xc_dominfo_t *info,
  15.376 -                             vcpu_guest_context_t *ctxt)
  15.377 -{
  15.378 -    int i = 0;
  15.379 -
  15.380 -    if ( !(*suspend)(dom) )
  15.381 -    {
  15.382 -        ERROR("Suspend request failed");
  15.383 -        return -1;
  15.384 -    }
  15.385 -
  15.386 - retry:
  15.387 -
  15.388 -    if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
  15.389 -    {
  15.390 -        ERROR("Could not get domain info");
  15.391 -        return -1;
  15.392 -    }
  15.393 -
  15.394 -    if ( xc_vcpu_getcontext(xc_handle, dom, 0, ctxt) )
  15.395 -        ERROR("Could not get vcpu context");
  15.396 -
  15.397 -
  15.398 -    if ( info->dying )
  15.399 -    {
  15.400 -        ERROR("domain is dying");
  15.401 -        return -1;
  15.402 -    }
  15.403 -
  15.404 -    if ( info->crashed )
  15.405 -    {
  15.406 -        ERROR("domain has crashed");
  15.407 -        return -1;
  15.408 -    }
  15.409 -
  15.410 -    if ( info->shutdown )
  15.411 -    {
  15.412 -        switch ( info->shutdown_reason )
  15.413 -        {
  15.414 -        case SHUTDOWN_poweroff:
  15.415 -        case SHUTDOWN_reboot:
  15.416 -            ERROR("domain has shut down");
  15.417 -            return -1;
  15.418 -        case SHUTDOWN_suspend:
  15.419 -            return 0;
  15.420 -        case SHUTDOWN_crash:
  15.421 -            ERROR("domain has crashed");
  15.422 -            return -1;
  15.423 -        }
  15.424 -    }
  15.425 -
  15.426 -    if ( info->paused )
  15.427 -    {
  15.428 -        /* Try unpausing domain, wait, and retest. */
  15.429 -        xc_domain_unpause( xc_handle, dom );
  15.430 -        ERROR("Domain was paused. Wait and re-test.");
  15.431 -        usleep(10000); /* 10ms */
  15.432 -        goto retry;
  15.433 -    }
  15.434 -
  15.435 -    if ( ++i < 100 )
  15.436 -    {
  15.437 -        ERROR("Retry suspend domain");
  15.438 -        usleep(10000); /* 10ms */
  15.439 -        goto retry;
  15.440 -    }
  15.441 -
  15.442 -    ERROR("Unable to suspend domain.");
  15.443 -
  15.444 -    return -1;
  15.445 -}
  15.446 -
  15.447 -/*
  15.448 -** Map the top-level page of MFNs from the guest. The guest might not have
  15.449 -** finished resuming from a previous restore operation, so we wait a while for
  15.450 -** it to update the MFN to a reasonable value.
  15.451 -*/
  15.452 -static void *map_frame_list_list(int xc_handle, uint32_t dom,
  15.453 -                                 shared_info_t *shinfo)
  15.454 -{
  15.455 -    int count = 100;
  15.456 -    void *p;
  15.457 -
  15.458 -    while ( count-- && (shinfo->arch.pfn_to_mfn_frame_list_list == 0) )
  15.459 -        usleep(10000);
  15.460 -
  15.461 -    if ( shinfo->arch.pfn_to_mfn_frame_list_list == 0 )
  15.462 -    {
  15.463 -        ERROR("Timed out waiting for frame list updated.");
  15.464 -        return NULL;
  15.465 -    }
  15.466 -
  15.467 -    p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ,
  15.468 -                             shinfo->arch.pfn_to_mfn_frame_list_list);
  15.469 -    if ( p == NULL )
  15.470 -        ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno);
  15.471 -
  15.472 -    return p;
  15.473 -}
  15.474 -
  15.475 -/*
  15.476 -** During transfer (or in the state file), all page-table pages must be
  15.477 -** converted into a 'canonical' form where references to actual mfns
  15.478 -** are replaced with references to the corresponding pfns.
  15.479 -**
  15.480 -** This function performs the appropriate conversion, taking into account
  15.481 -** which entries do not require canonicalization (in particular, those
  15.482 -** entries which map the virtual address reserved for the hypervisor).
  15.483 -*/
  15.484 -static int canonicalize_pagetable(unsigned long type, unsigned long pfn,
  15.485 -                           const void *spage, void *dpage)
  15.486 -{
  15.487 -
  15.488 -    int i, pte_last, xen_start, xen_end, race = 0; 
  15.489 -    uint64_t pte;
  15.490 -
  15.491 -    /*
  15.492 -    ** We need to determine which entries in this page table hold
  15.493 -    ** reserved hypervisor mappings. This depends on the current
  15.494 -    ** page table type as well as the number of paging levels.
  15.495 -    */
  15.496 -    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2) ? 4 : 8);
  15.497 -
  15.498 -    if ( (pt_levels == 2) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
  15.499 -        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT);
  15.500 -
  15.501 -    if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
  15.502 -        xen_start = L3_PAGETABLE_ENTRIES_PAE;
  15.503 -
  15.504 -    /*
  15.505 -    ** in PAE only the L2 mapping the top 1GB contains Xen mappings.
  15.506 -    ** We can spot this by looking for the guest linear mapping which
  15.507 -    ** Xen always ensures is present in that L2. Guests must ensure
  15.508 -    ** that this check will fail for other L2s.
  15.509 -    */
  15.510 -    if ( (pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
  15.511 -    {
  15.512 -        int hstart;
  15.513 -        uint64_t he;
  15.514 -
  15.515 -        hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
  15.516 -        he = ((const uint64_t *) spage)[hstart];
  15.517 -
  15.518 -        if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
  15.519 -        {
  15.520 -            /* hvirt starts with xen stuff... */
  15.521 -            xen_start = hstart;
  15.522 -        }
  15.523 -        else if ( hvirt_start != 0xf5800000 )
  15.524 -        {
  15.525 -            /* old L2s from before hole was shrunk... */
  15.526 -            hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
  15.527 -            he = ((const uint64_t *) spage)[hstart];
  15.528 -            if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
  15.529 -                xen_start = hstart;
  15.530 -        }
  15.531 -    }
  15.532 -
  15.533 -    if ( (pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
  15.534 -    {
  15.535 -        /*
  15.536 -        ** XXX SMH: should compute these from hvirt_start (which we have)
  15.537 -        ** and hvirt_end (which we don't)
  15.538 -        */
  15.539 -        xen_start = 256;
  15.540 -        xen_end   = 272;
  15.541 -    }
  15.542 -
  15.543 -    /* Now iterate through the page table, canonicalizing each PTE */
  15.544 -    for (i = 0; i < pte_last; i++ )
  15.545 -    {
  15.546 -        unsigned long pfn, mfn;
  15.547 -
  15.548 -        if ( pt_levels == 2 )
  15.549 -            pte = ((const uint32_t*)spage)[i];
  15.550 -        else
  15.551 -            pte = ((const uint64_t*)spage)[i];
  15.552 -
  15.553 -        if ( (i >= xen_start) && (i < xen_end) )
  15.554 -            pte = 0;
  15.555 -
  15.556 -        if ( pte & _PAGE_PRESENT )
  15.557 -        {
  15.558 -            mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
  15.559 -            if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
  15.560 -            {
  15.561 -                /* This will happen if the type info is stale which
  15.562 -                   is quite feasible under live migration */
  15.563 -                pfn  = 0;  /* zap it - we'll retransmit this page later */
  15.564 -                race = 1;  /* inform the caller of race; fatal if !live */ 
  15.565 -            }
  15.566 -            else
  15.567 -                pfn = mfn_to_pfn(mfn);
  15.568 -
  15.569 -            pte &= ~MADDR_MASK_X86;
  15.570 -            pte |= (uint64_t)pfn << PAGE_SHIFT;
  15.571 -
  15.572 -            /*
  15.573 -             * PAE guest L3Es can contain these flags when running on
  15.574 -             * a 64bit hypervisor. We zap these here to avoid any
  15.575 -             * surprise at restore time...
  15.576 -             */
  15.577 -            if ( (pt_levels == 3) &&
  15.578 -                 (type == XEN_DOMCTL_PFINFO_L3TAB) &&
  15.579 -                 (pte & (_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED)) )
  15.580 -                pte &= ~(_PAGE_USER|_PAGE_RW|_PAGE_ACCESSED);
  15.581 -        }
  15.582 -
  15.583 -        if ( pt_levels == 2 )
  15.584 -            ((uint32_t*)dpage)[i] = pte;
  15.585 -        else
  15.586 -            ((uint64_t*)dpage)[i] = pte;
  15.587 -    }
  15.588 -
  15.589 -    return race;
  15.590 -}
  15.591 -
  15.592 -static xen_pfn_t *xc_map_m2p(int xc_handle,
  15.593 -                                 unsigned long max_mfn,
  15.594 -                                 int prot)
  15.595 -{
  15.596 -    struct xen_machphys_mfn_list xmml;
  15.597 -    privcmd_mmap_entry_t *entries;
  15.598 -    unsigned long m2p_chunks, m2p_size;
  15.599 -    xen_pfn_t *m2p;
  15.600 -    xen_pfn_t *extent_start;
  15.601 -    int i, rc;
  15.602 -
  15.603 -    m2p_size   = M2P_SIZE(max_mfn);
  15.604 -    m2p_chunks = M2P_CHUNKS(max_mfn);
  15.605 -
  15.606 -    xmml.max_extents = m2p_chunks;
  15.607 -    if ( !(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t))) )
  15.608 -    {
  15.609 -        ERROR("failed to allocate space for m2p mfns");
  15.610 -        return NULL;
  15.611 -    }
  15.612 -    set_xen_guest_handle(xmml.extent_start, extent_start);
  15.613 -
  15.614 -    if ( xc_memory_op(xc_handle, XENMEM_machphys_mfn_list, &xmml) ||
  15.615 -         (xmml.nr_extents != m2p_chunks) )
  15.616 -    {
  15.617 -        ERROR("xc_get_m2p_mfns");
  15.618 -        return NULL;
  15.619 -    }
  15.620 -
  15.621 -    if ( (m2p = mmap(NULL, m2p_size, prot,
  15.622 -                     MAP_SHARED, xc_handle, 0)) == MAP_FAILED )
  15.623 -    {
  15.624 -        ERROR("failed to mmap m2p");
  15.625 -        return NULL;
  15.626 -    }
  15.627 -
  15.628 -    if ( !(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t))) )
  15.629 -    {
  15.630 -        ERROR("failed to allocate space for mmap entries");
  15.631 -        return NULL;
  15.632 -    }
  15.633 -
  15.634 -    for ( i = 0; i < m2p_chunks; i++ )
  15.635 -    {
  15.636 -        entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE));
  15.637 -        entries[i].mfn = extent_start[i];
  15.638 -        entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
  15.639 -    }
  15.640 -
  15.641 -    if ( (rc = xc_map_foreign_ranges(xc_handle, DOMID_XEN,
  15.642 -                                     entries, m2p_chunks)) < 0 )
  15.643 -    {
  15.644 -        ERROR("xc_mmap_foreign_ranges failed (rc = %d)", rc);
  15.645 -        return NULL;
  15.646 -    }
  15.647 -
  15.648 -    m2p_mfn0 = entries[0].mfn;
  15.649 -
  15.650 -    free(extent_start);
  15.651 -    free(entries);
  15.652 -
  15.653 -    return m2p;
  15.654 -}
  15.655 -
  15.656 -int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
  15.657 -                  uint32_t max_factor, uint32_t flags, int (*suspend)(int))
  15.658 -{
  15.659 -    xc_dominfo_t info;
  15.660 -
  15.661 -    int rc = 1, i, j, last_iter, iter = 0;
  15.662 -    int live  = (flags & XCFLAGS_LIVE);
  15.663 -    int debug = (flags & XCFLAGS_DEBUG);
  15.664 -    int race = 0, sent_last_iter, skip_this_iter;
  15.665 -
  15.666 -    /* The new domain's shared-info frame number. */
  15.667 -    unsigned long shared_info_frame;
  15.668 -
  15.669 -    /* A copy of the CPU context of the guest. */
  15.670 -    vcpu_guest_context_t ctxt;
  15.671 -
  15.672 -    /* A table containg the type of each PFN (/not/ MFN!). */
  15.673 -    unsigned long *pfn_type = NULL;
  15.674 -    unsigned long *pfn_batch = NULL;
  15.675 -
  15.676 -    /* A temporary mapping, and a copy, of one frame of guest memory. */
  15.677 -    char page[PAGE_SIZE];
  15.678 -
  15.679 -    /* Double and single indirect references to the live P2M table */
  15.680 -    xen_pfn_t *live_p2m_frame_list_list = NULL;
  15.681 -    xen_pfn_t *live_p2m_frame_list = NULL;
  15.682 -
  15.683 -    /* A copy of the pfn-to-mfn table frame list. */
  15.684 -    xen_pfn_t *p2m_frame_list = NULL;
  15.685 -
  15.686 -    /* Live mapping of shared info structure */
  15.687 -    shared_info_t *live_shinfo = NULL;
  15.688 -
  15.689 -    /* base of the region in which domain memory is mapped */
  15.690 -    unsigned char *region_base = NULL;
  15.691 -
  15.692 -    /* power of 2 order of p2m_size */
  15.693 -    int order_nr;
  15.694 -
  15.695 -    /* bitmap of pages:
  15.696 -       - that should be sent this iteration (unless later marked as skip);
  15.697 -       - to skip this iteration because already dirty;
  15.698 -       - to fixup by sending at the end if not already resent; */
  15.699 -    unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL;
  15.700 -
  15.701 -    xc_shadow_op_stats_t stats;
  15.702 -
  15.703 -    unsigned long needed_to_fix = 0;
  15.704 -    unsigned long total_sent    = 0;
  15.705 -
  15.706 -    uint64_t vcpumap = 1ULL;
  15.707 -
  15.708 -    /* If no explicit control parameters given, use defaults */
  15.709 -    max_iters  = max_iters  ? : DEF_MAX_ITERS;
  15.710 -    max_factor = max_factor ? : DEF_MAX_FACTOR;
  15.711 -
  15.712 -    initialize_mbit_rate();
  15.713 -
  15.714 -    if ( !get_platform_info(xc_handle, dom,
  15.715 -                            &max_mfn, &hvirt_start, &pt_levels) )
  15.716 -    {
  15.717 -        ERROR("Unable to get platform info.");
  15.718 -        return 1;
  15.719 -    }
  15.720 -
  15.721 -    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
  15.722 -    {
  15.723 -        ERROR("Could not get domain info");
  15.724 -        return 1;
  15.725 -    }
  15.726 -
  15.727 -    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
  15.728 -    {
  15.729 -        ERROR("Could not get vcpu context");
  15.730 -        goto out;
  15.731 -    }
  15.732 -    shared_info_frame = info.shared_info_frame;
  15.733 -
  15.734 -    /* Map the shared info frame */
  15.735 -    if ( !(live_shinfo = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
  15.736 -                                              PROT_READ, shared_info_frame)) )
  15.737 -    {
  15.738 -        ERROR("Couldn't map live_shinfo");
  15.739 -        goto out;
  15.740 -    }
  15.741 -
  15.742 -    p2m_size = live_shinfo->arch.max_pfn;
  15.743 -
  15.744 -    live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom,
  15.745 -                                                   live_shinfo);
  15.746 -    if ( !live_p2m_frame_list_list )
  15.747 -        goto out;
  15.748 -
  15.749 -    live_p2m_frame_list =
  15.750 -        xc_map_foreign_batch(xc_handle, dom, PROT_READ,
  15.751 -                             live_p2m_frame_list_list,
  15.752 -                             P2M_FLL_ENTRIES);
  15.753 -    if ( !live_p2m_frame_list )
  15.754 -    {
  15.755 -        ERROR("Couldn't map p2m_frame_list");
  15.756 -        goto out;
  15.757 -    }
  15.758 -
  15.759 -    /* Map all the frames of the pfn->mfn table. For migrate to succeed,
  15.760 -       the guest must not change which frames are used for this purpose.
  15.761 -       (its not clear why it would want to change them, and we'll be OK
  15.762 -       from a safety POV anyhow. */
  15.763 -
  15.764 -    live_p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ,
  15.765 -                                    live_p2m_frame_list,
  15.766 -                                    P2M_FL_ENTRIES);
  15.767 -    if ( !live_p2m )
  15.768 -    {
  15.769 -        ERROR("Couldn't map p2m table");
  15.770 -        goto out;
  15.771 -    }
  15.772 -
  15.773 -    /* Setup the mfn_to_pfn table mapping */
  15.774 -    if ( !(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ)) )
  15.775 -    {
  15.776 -        ERROR("Failed to map live M2P table");
  15.777 -        goto out;
  15.778 -    }
  15.779 -
  15.780 -
  15.781 -    /* Get a local copy of the live_P2M_frame_list */
  15.782 -    if ( !(p2m_frame_list = malloc(P2M_FL_SIZE)) )
  15.783 -    {
  15.784 -        ERROR("Couldn't allocate p2m_frame_list array");
  15.785 -        goto out;
  15.786 -    }
  15.787 -    memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);
  15.788 -
  15.789 -    /* Canonicalise the pfn-to-mfn table frame-number list. */
  15.790 -    for ( i = 0; i < p2m_size; i += fpp )
  15.791 -    {
  15.792 -        if ( !translate_mfn_to_pfn(&p2m_frame_list[i/fpp]) )
  15.793 -        {
  15.794 -            ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys");
  15.795 -            ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64, i, i/fpp,
  15.796 -                  (uint64_t)p2m_frame_list[i/fpp]);
  15.797 -            goto out;
  15.798 -        }
  15.799 -    }
  15.800 -
  15.801 -    /* Domain is still running at this point */
  15.802 -    if ( live )
  15.803 -    {
  15.804 -        /* Live suspend. Enable log-dirty mode. */
  15.805 -        if ( xc_shadow_control(xc_handle, dom,
  15.806 -                               XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
  15.807 -                               NULL, 0, NULL, 0, NULL) < 0 )
  15.808 -        {
  15.809 -            ERROR("Couldn't enable shadow mode");
  15.810 -            goto out;
  15.811 -        }
  15.812 -    }
  15.813 -    else
  15.814 -    {
  15.815 -        /* This is a non-live suspend. Suspend the domain .*/
  15.816 -        if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info, &ctxt) )
  15.817 -        {
  15.818 -            ERROR("Domain appears not to have suspended");
  15.819 -            goto out;
  15.820 -        }
  15.821 -    }
  15.822 -
  15.823 -    last_iter = !live;
  15.824 -
  15.825 -    /* pretend we sent all the pages last iteration */
  15.826 -    sent_last_iter = p2m_size;
  15.827 -
  15.828 -    /* calculate the power of 2 order of p2m_size, e.g.
  15.829 -       15->4 16->4 17->5 */
  15.830 -    for ( i = p2m_size-1, order_nr = 0; i ; i >>= 1, order_nr++ )
  15.831 -        continue;
  15.832 -
  15.833 -    /* Setup to_send / to_fix and to_skip bitmaps */
  15.834 -    to_send = malloc(BITMAP_SIZE);
  15.835 -    to_fix  = calloc(1, BITMAP_SIZE);
  15.836 -    to_skip = malloc(BITMAP_SIZE);
  15.837 -
  15.838 -    if ( !to_send || !to_fix || !to_skip )
  15.839 -    {
  15.840 -        ERROR("Couldn't allocate to_send array");
  15.841 -        goto out;
  15.842 -    }
  15.843 -
  15.844 -    memset(to_send, 0xff, BITMAP_SIZE);
  15.845 -
  15.846 -    if ( lock_pages(to_send, BITMAP_SIZE) )
  15.847 -    {
  15.848 -        ERROR("Unable to lock to_send");
  15.849 -        return 1;
  15.850 -    }
  15.851 -
  15.852 -    /* (to fix is local only) */
  15.853 -    if ( lock_pages(to_skip, BITMAP_SIZE) )
  15.854 -    {
  15.855 -        ERROR("Unable to lock to_skip");
  15.856 -        return 1;
  15.857 -    }
  15.858 -
  15.859 -    analysis_phase(xc_handle, dom, p2m_size, to_skip, 0);
  15.860 -
  15.861 -    /* We want zeroed memory so use calloc rather than malloc. */
  15.862 -    pfn_type   = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type));
  15.863 -    pfn_batch  = calloc(MAX_BATCH_SIZE, sizeof(*pfn_batch));
  15.864 -    if ( (pfn_type == NULL) || (pfn_batch == NULL) )
  15.865 -    {
  15.866 -        ERROR("failed to alloc memory for pfn_type and/or pfn_batch arrays");
  15.867 -        errno = ENOMEM;
  15.868 -        goto out;
  15.869 -    }
  15.870 -
  15.871 -    if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
  15.872 -    {
  15.873 -        ERROR("Unable to lock");
  15.874 -        goto out;
  15.875 -    }
  15.876 -
  15.877 -    /*
  15.878 -     * Quick belt and braces sanity check.
  15.879 -     */
  15.880 -    {
  15.881 -        int err=0;
  15.882 -        unsigned long mfn;
  15.883 -        for ( i = 0; i < p2m_size; i++ )
  15.884 -        {
  15.885 -            mfn = live_p2m[i];
  15.886 -            if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) )
  15.887 -            {
  15.888 -                DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i,
  15.889 -                        mfn, mfn_to_pfn(mfn));
  15.890 -                err++;
  15.891 -            }
  15.892 -        }
  15.893 -        DPRINTF("Had %d unexplained entries in p2m table\n", err);
  15.894 -    }
  15.895 -
  15.896 -    /* Start writing out the saved-domain record. */
  15.897 -    if ( !write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
  15.898 -    {
  15.899 -        ERROR("write: p2m_size");
  15.900 -        goto out;
  15.901 -    }
  15.902 -
  15.903 -    /*
  15.904 -     * Write an extended-info structure to inform the restore code that
  15.905 -     * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
  15.906 -     * slow paths in the restore code.
  15.907 -     */
  15.908 -    if ( (pt_levels == 3) &&
  15.909 -         (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3)) )
  15.910 -    {
  15.911 -        unsigned long signature = ~0UL;
  15.912 -        uint32_t tot_sz   = sizeof(struct vcpu_guest_context) + 8;
  15.913 -        uint32_t chunk_sz = sizeof(struct vcpu_guest_context);
  15.914 -        char chunk_sig[]  = "vcpu";
  15.915 -        if ( !write_exact(io_fd, &signature, sizeof(signature)) ||
  15.916 -             !write_exact(io_fd, &tot_sz,    sizeof(tot_sz)) ||
  15.917 -             !write_exact(io_fd, &chunk_sig, 4) ||
  15.918 -             !write_exact(io_fd, &chunk_sz,  sizeof(chunk_sz)) ||
  15.919 -             !write_exact(io_fd, &ctxt,      sizeof(ctxt)) )
  15.920 -        {
  15.921 -            ERROR("write: extended info");
  15.922 -            goto out;
  15.923 -        }
  15.924 -    }
  15.925 -
  15.926 -    if ( !write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE) )
  15.927 -    {
  15.928 -        ERROR("write: p2m_frame_list");
  15.929 -        goto out;
  15.930 -    }
  15.931 -
  15.932 -    print_stats(xc_handle, dom, 0, &stats, 0);
  15.933 -
  15.934 -    /* Now write out each data page, canonicalising page tables as we go... */
  15.935 -    for ( ; ; )
  15.936 -    {
  15.937 -        unsigned int prev_pc, sent_this_iter, N, batch;
  15.938 -
  15.939 -        iter++;
  15.940 -        sent_this_iter = 0;
  15.941 -        skip_this_iter = 0;
  15.942 -        prev_pc = 0;
  15.943 -        N = 0;
  15.944 -
  15.945 -        DPRINTF("Saving memory pages: iter %d   0%%", iter);
  15.946 -
  15.947 -        while ( N < p2m_size )
  15.948 -        {
  15.949 -            unsigned int this_pc = (N * 100) / p2m_size;
  15.950 -            int rc;
  15.951 -
  15.952 -            if ( (this_pc - prev_pc) >= 5 )
  15.953 -            {
  15.954 -                DPRINTF("\b\b\b\b%3d%%", this_pc);
  15.955 -                prev_pc = this_pc;
  15.956 -            }
  15.957 -
  15.958 -            if ( !last_iter )
  15.959 -            {
  15.960 -                /* Slightly wasteful to peek the whole array evey time,
  15.961 -                   but this is fast enough for the moment. */
  15.962 -                rc = xc_shadow_control(
  15.963 -                    xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, to_skip, 
  15.964 -                    p2m_size, NULL, 0, NULL);
  15.965 -                if ( rc != p2m_size )
  15.966 -                {
  15.967 -                    ERROR("Error peeking shadow bitmap");
  15.968 -                    goto out;
  15.969 -                }
  15.970 -            }
  15.971 -
  15.972 -            /* load pfn_type[] with the mfn of all the pages we're doing in
  15.973 -               this batch. */
  15.974 -            for  ( batch = 0;
  15.975 -                   (batch < MAX_BATCH_SIZE) && (N < p2m_size);
  15.976 -                   N++ )
  15.977 -            {
  15.978 -                int n = permute(N, p2m_size, order_nr);
  15.979 -
  15.980 -                if ( debug )
  15.981 -                    DPRINTF("%d pfn= %08lx mfn= %08lx %d  [mfn]= %08lx\n",
  15.982 -                            iter, (unsigned long)n, live_p2m[n],
  15.983 -                            test_bit(n, to_send),
  15.984 -                            mfn_to_pfn(live_p2m[n]&0xFFFFF));
  15.985 -
  15.986 -                if ( !last_iter &&
  15.987 -                     test_bit(n, to_send) &&
  15.988 -                     test_bit(n, to_skip) )
  15.989 -                    skip_this_iter++; /* stats keeping */
  15.990 -
  15.991 -                if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
  15.992 -                       (test_bit(n, to_send) && last_iter) ||
  15.993 -                       (test_bit(n, to_fix)  && last_iter)) )
  15.994 -                    continue;
  15.995 -
  15.996 -                /*
  15.997 -                ** we get here if:
  15.998 -                **  1. page is marked to_send & hasn't already been re-dirtied
  15.999 -                **  2. (ignore to_skip in last iteration)
 15.1000 -                **  3. add in pages that still need fixup (net bufs)
 15.1001 -                */
 15.1002 -
 15.1003 -                pfn_batch[batch] = n;
 15.1004 -                pfn_type[batch]  = live_p2m[n];
 15.1005 -
 15.1006 -                if ( !is_mapped(pfn_type[batch]) )
 15.1007 -                {
 15.1008 -                    /*
 15.1009 -                    ** not currently in psuedo-physical map -- set bit
 15.1010 -                    ** in to_fix since we must send this page in last_iter
 15.1011 -                    ** unless its sent sooner anyhow, or it never enters
 15.1012 -                    ** pseudo-physical map (e.g. for ballooned down domains)
 15.1013 -                    */
 15.1014 -                    set_bit(n, to_fix);
 15.1015 -                    continue;
 15.1016 -                }
 15.1017 -
 15.1018 -                if ( last_iter &&
 15.1019 -                     test_bit(n, to_fix) &&
 15.1020 -                     !test_bit(n, to_send) )
 15.1021 -                {
 15.1022 -                    needed_to_fix++;
 15.1023 -                    DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
 15.1024 -                            iter, n, pfn_type[batch]);
 15.1025 -                }
 15.1026 -
 15.1027 -                clear_bit(n, to_fix);
 15.1028 -
 15.1029 -                batch++;
 15.1030 -            }
 15.1031 -
 15.1032 -            if ( batch == 0 )
 15.1033 -                goto skip; /* vanishingly unlikely... */
 15.1034 -
 15.1035 -            region_base = xc_map_foreign_batch(
 15.1036 -                xc_handle, dom, PROT_READ, pfn_type, batch);
 15.1037 -            if ( region_base == NULL )
 15.1038 -            {
 15.1039 -                ERROR("map batch failed");
 15.1040 -                goto out;
 15.1041 -            }
 15.1042 -
 15.1043 -            for ( j = 0; j < batch; j++ )
 15.1044 -                ((uint32_t *)pfn_type)[j] = pfn_type[j];
 15.1045 -            if ( xc_get_pfn_type_batch(xc_handle, dom, batch,
 15.1046 -                                       (uint32_t *)pfn_type) )
 15.1047 -            {
 15.1048 -                ERROR("get_pfn_type_batch failed");
 15.1049 -                goto out;
 15.1050 -            }
 15.1051 -            for ( j = batch-1; j >= 0; j-- )
 15.1052 -                pfn_type[j] = ((uint32_t *)pfn_type)[j];
 15.1053 -
 15.1054 -            for ( j = 0; j < batch; j++ )
 15.1055 -            {
 15.1056 -
 15.1057 -                if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) ==
 15.1058 -                     XEN_DOMCTL_PFINFO_XTAB )
 15.1059 -                {
 15.1060 -                    DPRINTF("type fail: page %i mfn %08lx\n", j, pfn_type[j]);
 15.1061 -                    continue;
 15.1062 -                }
 15.1063 -
 15.1064 -                if ( debug )
 15.1065 -                    DPRINTF("%d pfn= %08lx mfn= %08lx [mfn]= %08lx"
 15.1066 -                            " sum= %08lx\n",
 15.1067 -                            iter,
 15.1068 -                            (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
 15.1069 -                            pfn_batch[j],
 15.1070 -                            pfn_type[j],
 15.1071 -                            mfn_to_pfn(pfn_type[j] &
 15.1072 -                                       ~XEN_DOMCTL_PFINFO_LTAB_MASK),
 15.1073 -                            csum_page(region_base + (PAGE_SIZE*j)));
 15.1074 -
 15.1075 -                /* canonicalise mfn->pfn */
 15.1076 -                pfn_type[j] = (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) |
 15.1077 -                    pfn_batch[j];
 15.1078 -            }
 15.1079 -
 15.1080 -            if ( !write_exact(io_fd, &batch, sizeof(unsigned int)) )
 15.1081 -            {
 15.1082 -                ERROR("Error when writing to state file (2) (errno %d)",
 15.1083 -                      errno);
 15.1084 -                goto out;
 15.1085 -            }
 15.1086 -
 15.1087 -            if ( !write_exact(io_fd, pfn_type, sizeof(unsigned long)*j) )
 15.1088 -            {
 15.1089 -                ERROR("Error when writing to state file (3) (errno %d)",
 15.1090 -                      errno);
 15.1091 -                goto out;
 15.1092 -            }
 15.1093 -
 15.1094 -            /* entering this loop, pfn_type is now in pfns (Not mfns) */
 15.1095 -            for ( j = 0; j < batch; j++ )
 15.1096 -            {
 15.1097 -                unsigned long pfn, pagetype;
 15.1098 -                void *spage = (char *)region_base + (PAGE_SIZE*j);
 15.1099 -
 15.1100 -                pfn      = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
 15.1101 -                pagetype = pfn_type[j] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
 15.1102 -
 15.1103 -                /* write out pages in batch */
 15.1104 -                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
 15.1105 -                    continue;
 15.1106 -
 15.1107 -                pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
 15.1108 -
 15.1109 -                if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
 15.1110 -                     (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
 15.1111 -                {
 15.1112 -                    /* We have a pagetable page: need to rewrite it. */
 15.1113 -                    race = 
 15.1114 -                        canonicalize_pagetable(pagetype, pfn, spage, page); 
 15.1115 -
 15.1116 -                    if ( race && !live )
 15.1117 -                    {
 15.1118 -                        ERROR("Fatal PT race (pfn %lx, type %08lx)", pfn,
 15.1119 -                              pagetype);
 15.1120 -                        goto out;
 15.1121 -                    }
 15.1122 -
 15.1123 -                    if ( ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE )
 15.1124 -                    {
 15.1125 -                        ERROR("Error when writing to state file (4)"
 15.1126 -                              " (errno %d)", errno);
 15.1127 -                        goto out;
 15.1128 -                    }
 15.1129 -                }
 15.1130 -                else
 15.1131 -                {
 15.1132 -                    /* We have a normal page: just write it directly. */
 15.1133 -                    if ( ratewrite(io_fd, live, spage, PAGE_SIZE) !=
 15.1134 -                         PAGE_SIZE )
 15.1135 -                    {
 15.1136 -                        ERROR("Error when writing to state file (5)"
 15.1137 -                              " (errno %d)", errno);
 15.1138 -                        goto out;
 15.1139 -                    }
 15.1140 -                }
 15.1141 -            } /* end of the write out for this batch */
 15.1142 -
 15.1143 -            sent_this_iter += batch;
 15.1144 -
 15.1145 -            munmap(region_base, batch*PAGE_SIZE);
 15.1146 -
 15.1147 -        } /* end of this while loop for this iteration */
 15.1148 -
 15.1149 -      skip:
 15.1150 -
 15.1151 -        total_sent += sent_this_iter;
 15.1152 -
 15.1153 -        DPRINTF("\r %d: sent %d, skipped %d, ",
 15.1154 -                iter, sent_this_iter, skip_this_iter );
 15.1155 -
 15.1156 -        if ( last_iter )
 15.1157 -        {
 15.1158 -            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
 15.1159 -
 15.1160 -            DPRINTF("Total pages sent= %ld (%.2fx)\n",
 15.1161 -                    total_sent, ((float)total_sent)/p2m_size );
 15.1162 -            DPRINTF("(of which %ld were fixups)\n", needed_to_fix  );
 15.1163 -        }
 15.1164 -
 15.1165 -        if ( last_iter && debug )
 15.1166 -        {
 15.1167 -            int minusone = -1;
 15.1168 -            memset(to_send, 0xff, BITMAP_SIZE);
 15.1169 -            debug = 0;
 15.1170 -            DPRINTF("Entering debug resend-all mode\n");
 15.1171 -
 15.1172 -            /* send "-1" to put receiver into debug mode */
 15.1173 -            if ( !write_exact(io_fd, &minusone, sizeof(int)) )
 15.1174 -            {
 15.1175 -                ERROR("Error when writing to state file (6) (errno %d)",
 15.1176 -                      errno);
 15.1177 -                goto out;
 15.1178 -            }
 15.1179 -
 15.1180 -            continue;
 15.1181 -        }
 15.1182 -
 15.1183 -        if ( last_iter )
 15.1184 -            break;
 15.1185 -
 15.1186 -        if ( live )
 15.1187 -        {
 15.1188 -            if ( ((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) ||
 15.1189 -                 (iter >= max_iters) ||
 15.1190 -                 (sent_this_iter+skip_this_iter < 50) ||
 15.1191 -                 (total_sent > p2m_size*max_factor) )
 15.1192 -            {
 15.1193 -                DPRINTF("Start last iteration\n");
 15.1194 -                last_iter = 1;
 15.1195 -
 15.1196 -                if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info,
 15.1197 -                                       &ctxt) )
 15.1198 -                {
 15.1199 -                    ERROR("Domain appears not to have suspended");
 15.1200 -                    goto out;
 15.1201 -                }
 15.1202 -
 15.1203 -                DPRINTF("SUSPEND shinfo %08lx eip %08lx edx %08lx\n",
 15.1204 -                        info.shared_info_frame,
 15.1205 -                        (unsigned long)ctxt.user_regs.eip,
 15.1206 -                        (unsigned long)ctxt.user_regs.edx);
 15.1207 -            }
 15.1208 -
 15.1209 -            if ( xc_shadow_control(xc_handle, dom, 
 15.1210 -                                   XEN_DOMCTL_SHADOW_OP_CLEAN, to_send, 
 15.1211 -                                   p2m_size, NULL, 0, &stats) != p2m_size )
 15.1212 -            {
 15.1213 -                ERROR("Error flushing shadow PT");
 15.1214 -                goto out;
 15.1215 -            }
 15.1216 -
 15.1217 -            sent_last_iter = sent_this_iter;
 15.1218 -
 15.1219 -            print_stats(xc_handle, dom, sent_this_iter, &stats, 1);
 15.1220 -
 15.1221 -        }
 15.1222 -    } /* end of infinite for loop */
 15.1223 -
 15.1224 -    DPRINTF("All memory is saved\n");
 15.1225 -
 15.1226 -    {
 15.1227 -        struct {
 15.1228 -            int minustwo;
 15.1229 -            int max_vcpu_id;
 15.1230 -            uint64_t vcpumap;
 15.1231 -        } chunk = { -2, info.max_vcpu_id };
 15.1232 -
 15.1233 -        if ( info.max_vcpu_id >= 64 )
 15.1234 -        {
 15.1235 -            ERROR("Too many VCPUS in guest!");
 15.1236 -            goto out;
 15.1237 -        }
 15.1238 -
 15.1239 -        for ( i = 1; i <= info.max_vcpu_id; i++ )
 15.1240 -        {
 15.1241 -            xc_vcpuinfo_t vinfo;
 15.1242 -            if ( (xc_vcpu_getinfo(xc_handle, dom, i, &vinfo) == 0) &&
 15.1243 -                 vinfo.online )
 15.1244 -                vcpumap |= 1ULL << i;
 15.1245 -        }
 15.1246 -
 15.1247 -        chunk.vcpumap = vcpumap;
 15.1248 -        if ( !write_exact(io_fd, &chunk, sizeof(chunk)) )
 15.1249 -        {
 15.1250 -            ERROR("Error when writing to state file (errno %d)", errno);
 15.1251 -            goto out;
 15.1252 -        }
 15.1253 -    }
 15.1254 -
 15.1255 -    /* Zero terminate */
 15.1256 -    i = 0;
 15.1257 -    if ( !write_exact(io_fd, &i, sizeof(int)) )
 15.1258 -    {
 15.1259 -        ERROR("Error when writing to state file (6') (errno %d)", errno);
 15.1260 -        goto out;
 15.1261 -    }
 15.1262 -
 15.1263 -    /* Send through a list of all the PFNs that were not in map at the close */
 15.1264 -    {
 15.1265 -        unsigned int i,j;
 15.1266 -        unsigned long pfntab[1024];
 15.1267 -
 15.1268 -        for ( i = 0, j = 0; i < p2m_size; i++ )
 15.1269 -        {
 15.1270 -            if ( !is_mapped(live_p2m[i]) )
 15.1271 -                j++;
 15.1272 -        }
 15.1273 -
 15.1274 -        if ( !write_exact(io_fd, &j, sizeof(unsigned int)) )
 15.1275 -        {
 15.1276 -            ERROR("Error when writing to state file (6a) (errno %d)", errno);
 15.1277 -            goto out;
 15.1278 -        }
 15.1279 -
 15.1280 -        for ( i = 0, j = 0; i < p2m_size; )
 15.1281 -        {
 15.1282 -            if ( !is_mapped(live_p2m[i]) )
 15.1283 -                pfntab[j++] = i;
 15.1284 -
 15.1285 -            i++;
 15.1286 -            if ( (j == 1024) || (i == p2m_size) )
 15.1287 -            {
 15.1288 -                if ( !write_exact(io_fd, &pfntab, sizeof(unsigned long)*j) )
 15.1289 -                {
 15.1290 -                    ERROR("Error when writing to state file (6b) (errno %d)",
 15.1291 -                          errno);
 15.1292 -                    goto out;
 15.1293 -                }
 15.1294 -                j = 0;
 15.1295 -            }
 15.1296 -        }
 15.1297 -    }
 15.1298 -
 15.1299 -    /* Canonicalise the suspend-record frame number. */
 15.1300 -    if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) )
 15.1301 -    {
 15.1302 -        ERROR("Suspend record is not in range of pseudophys map");
 15.1303 -        goto out;
 15.1304 -    }
 15.1305 -
 15.1306 -    for ( i = 0; i <= info.max_vcpu_id; i++ )
 15.1307 -    {
 15.1308 -        if ( !(vcpumap & (1ULL << i)) )
 15.1309 -            continue;
 15.1310 -
 15.1311 -        if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
 15.1312 -        {
 15.1313 -            ERROR("No context for VCPU%d", i);
 15.1314 -            goto out;
 15.1315 -        }
 15.1316 -
 15.1317 -        /* Canonicalise each GDT frame number. */
 15.1318 -        for ( j = 0; (512*j) < ctxt.gdt_ents; j++ )
 15.1319 -        {
 15.1320 -            if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[j]) )
 15.1321 -            {
 15.1322 -                ERROR("GDT frame is not in range of pseudophys map");
 15.1323 -                goto out;
 15.1324 -            }
 15.1325 -        }
 15.1326 -
 15.1327 -        /* Canonicalise the page table base pointer. */
 15.1328 -        if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[3])) )
 15.1329 -        {
 15.1330 -            ERROR("PT base is not in range of pseudophys map");
 15.1331 -            goto out;
 15.1332 -        }
 15.1333 -        ctxt.ctrlreg[3] = 
 15.1334 -            xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[3])));
 15.1335 -
 15.1336 -        /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */
 15.1337 -        if ( (pt_levels == 4) && ctxt.ctrlreg[1] )
 15.1338 -        {
 15.1339 -            if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[1])) )
 15.1340 -            {
 15.1341 -                ERROR("PT base is not in range of pseudophys map");
 15.1342 -                goto out;
 15.1343 -            }
 15.1344 -            /* Least-significant bit means 'valid PFN'. */
 15.1345 -            ctxt.ctrlreg[1] = 1 |
 15.1346 -                xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[1])));
 15.1347 -        }
 15.1348 -
 15.1349 -        if ( !write_exact(io_fd, &ctxt, sizeof(ctxt)) )
 15.1350 -        {
 15.1351 -            ERROR("Error when writing to state file (1) (errno %d)", errno);
 15.1352 -            goto out;
 15.1353 -        }
 15.1354 -    }
 15.1355 -
 15.1356 -    /*
 15.1357 -     * Reset the MFN to be a known-invalid value. See map_frame_list_list().
 15.1358 -     */
 15.1359 -    memcpy(page, live_shinfo, PAGE_SIZE);
 15.1360 -    ((shared_info_t *)page)->arch.pfn_to_mfn_frame_list_list = 0;
 15.1361 -    if ( !write_exact(io_fd, page, PAGE_SIZE) )
 15.1362 -    {
 15.1363 -        ERROR("Error when writing to state file (1) (errno %d)", errno);
 15.1364 -        goto out;
 15.1365 -    }
 15.1366 -
 15.1367 -    /* Success! */
 15.1368 -    rc = 0;
 15.1369 -
 15.1370 - out:
 15.1371 -
 15.1372 -    if ( live )
 15.1373 -    {
 15.1374 -        if ( xc_shadow_control(xc_handle, dom, 
 15.1375 -                               XEN_DOMCTL_SHADOW_OP_OFF,
 15.1376 -                               NULL, 0, NULL, 0, NULL) < 0 )
 15.1377 -            DPRINTF("Warning - couldn't disable shadow mode");
 15.1378 -    }
 15.1379 -
 15.1380 -    /* Flush last write and discard cache for file. */
 15.1381 -    discard_file_cache(io_fd, 1 /* flush */);
 15.1382 -
 15.1383 -    if ( live_shinfo )
 15.1384 -        munmap(live_shinfo, PAGE_SIZE);
 15.1385 -
 15.1386 -    if ( live_p2m_frame_list_list )
 15.1387 -        munmap(live_p2m_frame_list_list, PAGE_SIZE);
 15.1388 -
 15.1389 -    if ( live_p2m_frame_list )
 15.1390 -        munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
 15.1391 -
 15.1392 -    if ( live_p2m )
 15.1393 -        munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT));
 15.1394 -
 15.1395 -    if ( live_m2p )
 15.1396 -        munmap(live_m2p, M2P_SIZE(max_mfn));
 15.1397 -
 15.1398 -    free(pfn_type);
 15.1399 -    free(pfn_batch);
 15.1400 -    free(to_send);
 15.1401 -    free(to_fix);
 15.1402 -    free(to_skip);
 15.1403 -
 15.1404 -    DPRINTF("Save exit rc=%d\n",rc);
 15.1405 -
 15.1406 -    return !!rc;
 15.1407 -}
 15.1408 -
 15.1409 -/*
 15.1410 - * Local variables:
 15.1411 - * mode: C
 15.1412 - * c-set-style: "BSD"
 15.1413 - * c-basic-offset: 4
 15.1414 - * tab-width: 4
 15.1415 - * indent-tabs-mode: nil
 15.1416 - * End:
 15.1417 - */
    16.1 --- a/tools/libxc/xenguest.h	Wed Apr 11 07:30:02 2007 -0600
    16.2 +++ b/tools/libxc/xenguest.h	Wed Apr 11 15:45:29 2007 +0100
    16.3 @@ -16,26 +16,19 @@
    16.4  
    16.5  
    16.6  /**
    16.7 - * This function will save a domain running Linux.
    16.8 + * This function will save a running domain.
    16.9   *
   16.10   * @parm xc_handle a handle to an open hypervisor interface
   16.11   * @parm fd the file descriptor to save a domain to
   16.12   * @parm dom the id of the domain
   16.13   * @return 0 on success, -1 on failure
   16.14   */
   16.15 -int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
   16.16 -                  uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
   16.17 -                  int (*suspend)(int domid));
   16.18 +int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
   16.19 +                   uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
   16.20 +                   int (*suspend)(int domid), int hvm,
   16.21 +                   void *(*init_qemu_maps)(int, unsigned),  /* HVM only */
   16.22 +                   void (*qemu_flip_buffer)(int, int));     /* HVM only */
   16.23  
   16.24 -/**
   16.25 - * This function will save a hvm domain running unmodified guest.
   16.26 - * @return 0 on success, -1 on failure
   16.27 - */
   16.28 -int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
   16.29 -                uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
   16.30 -                int (*suspend)(int domid),  
   16.31 -                void *(*init_qemu_maps)(int, unsigned), 
   16.32 -                void (*qemu_flip_buffer)(int, int));
   16.33  
   16.34  /**
   16.35   * This function will restore a saved domain.
    17.1 --- a/tools/libxc/xg_private.c	Wed Apr 11 07:30:02 2007 -0600
    17.2 +++ b/tools/libxc/xg_private.c	Wed Apr 11 15:45:29 2007 +0100
    17.3 @@ -198,17 +198,6 @@ unsigned long csum_page(void *page)
    17.4      return -1;
    17.5  }
    17.6  
    17.7 -__attribute__((weak)) 
    17.8 -    int xc_hvm_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
    17.9 -                    uint32_t max_factor, uint32_t flags,
   17.10 -                    int (*suspend)(int domid), 
   17.11 -                    void *(*init_qemu_maps)(int, unsigned), 
   17.12 -                    void (*qemu_flip_buffer)(int, int))
   17.13 -{
   17.14 -    errno = ENOSYS;
   17.15 -    return -1;
   17.16 -}
   17.17 -
   17.18  __attribute__((weak)) int xc_get_hvm_param(
   17.19      int handle, domid_t dom, int param, unsigned long *value)
   17.20  {
    18.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    18.2 +++ b/tools/pygrub/src/LiloConf.py	Wed Apr 11 15:45:29 2007 +0100
    18.3 @@ -0,0 +1,147 @@
    18.4 +#
    18.5 +#LiloConf.py
    18.6 +#
    18.7 +
    18.8 +import sys, re, os
    18.9 +import logging
   18.10 +import GrubConf
   18.11 +
class LiloImage(object):
    # One "image=" stanza parsed from a lilo/elilo configuration file.
    # The raw config lines are retained in self.lines so the stanza can
    # be reproduced or edited in place later.
    def __init__(self, lines, path):
        self.reset(lines, path)

    def __repr__(self):
        return ("title: %s\n"
                "  root: %s\n"
                "  kernel: %s\n"
                "  args: %s\n"
                "  initrd: %s\n" %(self.title, self.root, self.kernel,
                                   self.args, self.initrd))
    def reset(self, lines, path):
        # Clear all parsed state, then re-parse the given stanza lines.
        self._root = self._initrd = self._kernel = self._args = None
        self.title = ""
        self.lines = []
        self.path = path
        # map() is used purely for its side effect of feeding every line
        # through set_from_line (Python 2 idiom).
        map(self.set_from_line, lines)
        self.root = "" # dummy

    def set_from_line(self, line, replace = None):
        (com, arg) = GrubConf.grub_exact_split(line, 2)

        if self.commands.has_key(com):
            if self.commands[com] is not None:
                # Assign via exec so the table below can name property
                # targets like "self.kernel" as strings; surrounding
                # double quotes are stripped from the argument first.
                exec("%s = r\'%s\'" %(self.commands[com], re.sub('^"(.+)"$', r"\1", arg.strip())))
            else:
                logging.info("Ignored image directive %s" %(com,))
        else:
            logging.warning("Unknown image directive %s" %(com,))

        # now put the line in the list of lines
        if replace is None:
            self.lines.append(line)
        else:
            self.lines.pop(replace)
            self.lines.insert(replace, line)

    def set_kernel(self, val):
        # Stored as a (None, absolute-path) tuple — presumably matching
        # the (partition, path) tuple shape used by GrubConf images;
        # TODO(review): confirm against GrubConf.
        self._kernel = (None, self.path + "/" + val)
    def get_kernel(self):
        return self._kernel
    kernel = property(get_kernel, set_kernel)

    def set_initrd(self, val):
        # Same (None, absolute-path) convention as the kernel above.
        self._initrd = (None, self.path + "/" + val)
    def get_initrd(self):
        return self._initrd
    initrd = property(get_initrd, set_initrd)

    # set up command handlers: maps a config directive to the assignment
    # target used by set_from_line (None = recognized but ignored).
    commands = { "label": "self.title",
                 "root": "self.root",
                 "rootnoverify": "self.root",
                 "image": "self.kernel",
                 "initrd": "self.initrd",
                 "append": "self.args",
                 "read-only": None,
                 "chainloader": None,
                 "module": None}
   18.71 +
   18.72 +class LiloConfigFile(object):
   18.73 +    def __init__(self, fn = None):
   18.74 +        self.filename = fn
   18.75 +        self.images = []
   18.76 +        self.timeout = -1
   18.77 +        self._default = 0
   18.78 +
   18.79 +        if fn is not None:
   18.80 +            self.parse()
   18.81 +
   18.82 +    def parse(self, buf = None):
   18.83 +        if buf is None:
   18.84 +            if self.filename is None:
   18.85 +                raise ValueError, "No config file defined to parse!"
   18.86 +
   18.87 +            f = open(self.filename, 'r')
   18.88 +            lines = f.readlines()
   18.89 +            f.close()
   18.90 +        else:
   18.91 +            lines = buf.split("\n")
   18.92 +
   18.93 +        path = os.path.dirname(self.filename)
   18.94 +        img = []
   18.95 +        for l in lines:
   18.96 +            l = l.strip()
   18.97 +            # skip blank lines
   18.98 +            if len(l) == 0:
   18.99 +                continue
  18.100 +            # skip comments
  18.101 +            if l.startswith('#'):
  18.102 +                continue
  18.103 +            # new image
  18.104 +            if l.startswith("image"):
  18.105 +                if len(img) > 0:
  18.106 +                    self.add_image(LiloImage(img, path))
  18.107 +                img = [l]
  18.108 +                continue
  18.109 +
  18.110 +            if len(img) > 0:
  18.111 +                img.append(l)
  18.112 +                continue
  18.113 +
  18.114 +            (com, arg) = GrubConf.grub_exact_split(l, 2)
  18.115 +            if self.commands.has_key(com):
  18.116 +                if self.commands[com] is not None:
  18.117 +                    exec("%s = r\"%s\"" %(self.commands[com], arg.strip()))
  18.118 +                else:
  18.119 +                    logging.info("Ignored directive %s" %(com,))
  18.120 +            else:
  18.121 +                logging.warning("Unknown directive %s" %(com,))
  18.122 +
  18.123 +        if len(img) > 0:
  18.124 +            self.add_image(LiloImage(img, path))
  18.125 +
  18.126 +    def add_image(self, image):
  18.127 +        self.images.append(image)
  18.128 +
  18.129 +    def _get_default(self):
  18.130 +        for i in range(0, len(self.images) - 1):
  18.131 +            if self.images[i].title == self._default:
  18.132 +                return i
  18.133 +        return 0
  18.134 +    def _set_default(self, val):
  18.135 +        self._default = val
  18.136 +    default = property(_get_default, _set_default)
  18.137 +
  18.138 +    commands = { "default": "self.default",
  18.139 +                 "timeout": "self.timeout",
  18.140 +                 "prompt": None,
  18.141 +                 "relocatable": None,
  18.142 +                 }
  18.143 +
  18.144 +if __name__ == "__main__":
  18.145 +    if sys.argv < 2:
  18.146 +        raise RuntimeError, "Need a grub.conf to read"
  18.147 +    g = LiloConfigFile(sys.argv[1])
  18.148 +    for i in g.images:
  18.149 +        print i #, i.title, i.root, i.kernel, i.args, i.initrd
  18.150 +    print g.default
    19.1 --- a/tools/pygrub/src/pygrub	Wed Apr 11 07:30:02 2007 -0600
    19.2 +++ b/tools/pygrub/src/pygrub	Wed Apr 11 15:45:29 2007 +0100
    19.3 @@ -16,6 +16,7 @@
    19.4  import os, sys, string, struct, tempfile, re
    19.5  import copy
    19.6  import logging
    19.7 +import platform
    19.8  
    19.9  import curses, _curses, curses.wrapper, curses.textpad, curses.ascii
   19.10  import getopt
   19.11 @@ -24,6 +25,7 @@ sys.path = [ '/usr/lib/python' ] + sys.p
   19.12  
   19.13  import fsimage
   19.14  import grub.GrubConf
   19.15 +import grub.LiloConf
   19.16  
   19.17  PYGRUB_VER = 0.5
   19.18  
   19.19 @@ -59,6 +61,13 @@ def get_active_partition(file):
   19.20          if struct.unpack("<c", buf[poff:poff+1]) == ('\x80',):
   19.21              return buf[poff:poff+16]
   19.22  
   19.23 +        # type=0xee: GUID partition table
   19.24 +        # XXX assume the first partition is active
   19.25 +        if struct.unpack("<c", buf[poff+4:poff+5]) == ('\xee',):
   19.26 +            os.lseek(fd, 0x400, 0)
   19.27 +            buf = os.read(fd, 512)
   19.28 +            return buf[24:40] # XXX buf[32:40]
   19.29 +
   19.30      # if there's not a partition marked as active, fall back to
   19.31      # the first partition
   19.32      return buf[446:446+16]
   19.33 @@ -346,7 +355,13 @@ class Grub:
   19.34          if not os.access(fn, os.R_OK):
   19.35              raise RuntimeError, "Unable to access %s" %(fn,)
   19.36  
   19.37 -        self.cf = grub.GrubConf.GrubConfigFile()
   19.38 +        if platform.machine() == 'ia64':
   19.39 +            self.cf = grub.LiloConf.LiloConfigFile()
   19.40 +            file_list = ("/efi/redhat/elilo.conf",)
   19.41 +        else:
   19.42 +            self.cf = grub.GrubConf.GrubConfigFile()
   19.43 +            file_list = ("/boot/grub/menu.lst", "/boot/grub/grub.conf",
   19.44 +                         "/grub/menu.lst", "/grub/grub.conf")
   19.45  
   19.46          if not fs:
   19.47              # set the config file and parse it
   19.48 @@ -354,18 +369,15 @@ class Grub:
   19.49              self.cf.parse()
   19.50              return
   19.51  
   19.52 -        grubfile = None
   19.53 -        for f in ("/boot/grub/menu.lst", "/boot/grub/grub.conf",
   19.54 -                  "/grub/menu.lst", "/grub/grub.conf"):
   19.55 +        for f in file_list:
   19.56              if fs.file_exists(f):
   19.57 -                grubfile = f
   19.58 +                self.cf.filename = f
   19.59                  break
   19.60 -        if grubfile is None:
   19.61 -            raise RuntimeError, "we couldn't find grub config file in the image provided."
   19.62 -        f = fs.open_file(grubfile)
   19.63 +        if self.cf.filename is None:
   19.64 +            raise RuntimeError, "couldn't find bootloader config file in the image provided."
   19.65 +        f = fs.open_file(self.cf.filename)
   19.66          buf = f.read()
   19.67          del f
   19.68 -        # then parse the grub config
   19.69          self.cf.parse(buf)
   19.70  
   19.71      def run(self):
    20.1 --- a/tools/python/xen/xend/XendCheckpoint.py	Wed Apr 11 07:30:02 2007 -0600
    20.2 +++ b/tools/python/xen/xend/XendCheckpoint.py	Wed Apr 11 15:45:29 2007 +0100
    20.3 @@ -75,13 +75,6 @@ def save(fd, dominfo, network, live, dst
    20.4  
    20.5          image_cfg = dominfo.info.get('image', {})
    20.6          hvm = dominfo.info.is_hvm()
    20.7 -        stdvga = 0
    20.8 -
    20.9 -        if hvm:
   20.10 -            log.info("save hvm domain")
   20.11 -            if dominfo.info['platform'].has_key('stdvga'):
   20.12 -                if dominfo.info['platform']['stdvga'] == 1:
   20.13 -                    stdvga = 1
   20.14  
   20.15          # xc_save takes three customization parameters: maxit, max_f, and
   20.16          # flags the last controls whether or not save is 'live', while the
    21.1 --- a/tools/python/xen/xend/server/DevController.py	Wed Apr 11 07:30:02 2007 -0600
    21.2 +++ b/tools/python/xen/xend/server/DevController.py	Wed Apr 11 15:45:29 2007 +0100
    21.3 @@ -223,6 +223,7 @@ class DevController:
    21.4                  xstransact.Remove(backpath)
    21.5              xstransact.Remove(frontpath)
    21.6  
    21.7 +        self.vm._removeVm("device/%s/%d" % (self.deviceClass, devid))
    21.8  
    21.9      def configurations(self):
   21.10          return map(self.configuration, self.deviceIDs())
    22.1 --- a/tools/python/xen/xend/server/netif.py	Wed Apr 11 07:30:02 2007 -0600
    22.2 +++ b/tools/python/xen/xend/server/netif.py	Wed Apr 11 15:45:29 2007 +0100
    22.3 @@ -88,46 +88,6 @@ def parseRate(ratestr):
    22.4      return "%lu,%lu" % (bytes_per_interval, interval_usecs)
    22.5  
    22.6  
    22.7 -write_rate_G_re = re.compile('^([0-9]+)000000000(B/s@[0-9]+us)$')
    22.8 -write_rate_M_re = re.compile('^([0-9]+)000000(B/s@[0-9]+us)$')
    22.9 -write_rate_K_re = re.compile('^([0-9]+)000(B/s@[0-9]+us)$')
   22.10 -write_rate_s_re = re.compile('^([0-9]+[GMK]?B/s@[0-9]+)000000us$')
   22.11 -write_rate_m_re = re.compile('^([0-9]+[GMK]?B/s@[0-9]+)000us$')
   22.12 -
   22.13 -def formatRate(rate):
   22.14 -    (bytes_per_interval, interval_usecs) = map(long, rate.split(','))
   22.15 -
   22.16 -    if interval_usecs != 0:
   22.17 -        bytes_per_second = (bytes_per_interval * 1000 * 1000) / interval_usecs
   22.18 -    else:
   22.19 -        bytes_per_second = 0xffffffffL
   22.20 -
   22.21 -    ratestr = "%uB/s@%uus" % (bytes_per_second, interval_usecs)
   22.22 -
   22.23 -    # look for '000's
   22.24 -    m = write_rate_G_re.match(ratestr)
   22.25 -    if m:
   22.26 -        ratestr = m.group(1) + "G" + m.group(2)
   22.27 -    else:
   22.28 -        m = write_rate_M_re.match(ratestr)
   22.29 -        if m:
   22.30 -            ratestr = m.group(1) + "M" + m.group(2)
   22.31 -        else:
   22.32 -            m = write_rate_K_re.match(ratestr)
   22.33 -            if m:
   22.34 -                ratestr = m.group(1) + "K" + m.group(2)
   22.35 -
   22.36 -    m = write_rate_s_re.match(ratestr)
   22.37 -    if m:
   22.38 -        ratestr = m.group(1) + "s"
   22.39 -    else:
   22.40 -        m = write_rate_m_re.match(ratestr)
   22.41 -        if m:
   22.42 -            ratestr = m.group(1) + "ms"
   22.43 -
   22.44 -    return ratestr
   22.45 -
   22.46 -
   22.47  class NetifController(DevController):
   22.48      """Network interface controller. Handles all network devices for a domain.
   22.49      """
   22.50 @@ -138,8 +98,7 @@ class NetifController(DevController):
   22.51      def getDeviceDetails(self, config):
   22.52          """@see DevController.getDeviceDetails"""
   22.53  
   22.54 -        script = os.path.join(xoptions.network_script_dir,
   22.55 -                              config.get('script', xoptions.get_vif_script()))
   22.56 +        script  = config.get('script', xoptions.get_vif_script())
   22.57          typ     = config.get('type')
   22.58          bridge  = config.get('bridge')
   22.59          mac     = config.get('mac')
   22.60 @@ -149,24 +108,17 @@ class NetifController(DevController):
   22.61          ipaddr  = config.get('ip')
   22.62          model   = config.get('model')
   22.63  
   22.64 -        devid = self.allocateDeviceID()
   22.65 -
   22.66          if not typ:
   22.67              typ = xoptions.netback_type
   22.68 -            
   22.69 +
   22.70          if not mac:
   22.71              mac = randomMAC()
   22.72  
   22.73 +        devid = self.allocateDeviceID()
   22.74 +
   22.75          back = { 'script' : script,
   22.76                   'mac'    : mac,
   22.77 -                 'handle' : "%i" % devid,
   22.78                   'type'   : typ }
   22.79 -
   22.80 -        if typ == 'ioemu':
   22.81 -            front = {}
   22.82 -        else:
   22.83 -            front = { 'handle' : "%i" % devid,
   22.84 -                      'mac'    : mac }
   22.85          if ipaddr:
   22.86              back['ip'] = ipaddr
   22.87          if bridge:
   22.88 @@ -174,12 +126,26 @@ class NetifController(DevController):
   22.89          if vifname:
   22.90              back['vifname'] = vifname
   22.91          if rate:
   22.92 -            back['rate'] = parseRate(rate)
   22.93 +            back['rate'] = rate
   22.94          if uuid:
   22.95              back['uuid'] = uuid
   22.96          if model:
   22.97              back['model'] = model
   22.98  
   22.99 +        config_path = "device/%s/%d/" % (self.deviceClass, devid)
  22.100 +        for x in back:
  22.101 +            self.vm._writeVm(config_path + x, back[x])
  22.102 +
  22.103 +        back['handle'] = "%i" % devid
  22.104 +        back['script'] = os.path.join(xoptions.network_script_dir, script)
  22.105 +        if rate:
  22.106 +            back['rate'] = parseRate(rate)
  22.107 +
  22.108 +        front = {}
  22.109 +        if typ != 'ioemu':
  22.110 +            front = { 'handle' : "%i" % devid,
  22.111 +                      'mac'    : mac }
  22.112 +
  22.113          return (devid, back, front)
  22.114  
  22.115  
  22.116 @@ -187,14 +153,17 @@ class NetifController(DevController):
  22.117          """@see DevController.configuration"""
  22.118  
  22.119          result = DevController.getDeviceConfiguration(self, devid)
  22.120 -        devinfo =  self.readBackend(devid, 'script', 'ip', 'bridge',
  22.121 -                                    'mac', 'type', 'vifname', 'rate',
  22.122 -                                    'uuid', 'model')
  22.123 +
  22.124 +        config_path = "device/%s/%d/" % (self.deviceClass, devid)
  22.125 +        devinfo = ()
  22.126 +        for x in ( 'script', 'ip', 'bridge', 'mac',
  22.127 +                   'type', 'vifname', 'rate', 'uuid', 'model' ):
  22.128 +            y = self.vm._readVm(config_path + x)
  22.129 +            devinfo += (y,)
  22.130          (script, ip, bridge, mac, typ, vifname, rate, uuid, model) = devinfo
  22.131  
  22.132          if script:
  22.133 -            network_script_dir = xoptions.network_script_dir + os.sep
  22.134 -            result['script'] = script.replace(network_script_dir, "")
  22.135 +            result['script'] = script
  22.136          if ip:
  22.137              result['ip'] = ip
  22.138          if bridge:
  22.139 @@ -206,11 +175,10 @@ class NetifController(DevController):
  22.140          if vifname:
  22.141              result['vifname'] = vifname
  22.142          if rate:
  22.143 -            result['rate'] = formatRate(rate)
  22.144 +            result['rate'] = rate
  22.145          if uuid:
  22.146              result['uuid'] = uuid
  22.147          if model:
  22.148              result['model'] = model
  22.149              
  22.150          return result
  22.151 -
    23.1 --- a/tools/xcutils/xc_save.c	Wed Apr 11 07:30:02 2007 -0600
    23.2 +++ b/tools/xcutils/xc_save.c	Wed Apr 11 15:45:29 2007 +0100
    23.3 @@ -174,12 +174,9 @@ main(int argc, char **argv)
    23.4      max_f = atoi(argv[4]);
    23.5      flags = atoi(argv[5]);
    23.6  
    23.7 -    if (flags & XCFLAGS_HVM)
    23.8 -        ret = xc_hvm_save(xc_fd, io_fd, domid, maxit, max_f, flags, 
    23.9 -                          &suspend, &init_qemu_maps, &qemu_flip_buffer);
   23.10 -    else 
   23.11 -        ret = xc_linux_save(xc_fd, io_fd, domid, maxit, max_f, flags, 
   23.12 -                            &suspend);
   23.13 +    ret = xc_domain_save(xc_fd, io_fd, domid, maxit, max_f, flags, 
   23.14 +                         &suspend, !!(flags & XCFLAGS_HVM),
   23.15 +                         &init_qemu_maps, &qemu_flip_buffer);
   23.16  
   23.17      xc_interface_close(xc_fd);
   23.18  
    24.1 --- a/unmodified_drivers/linux-2.6/platform-pci/evtchn.c	Wed Apr 11 07:30:02 2007 -0600
    24.2 +++ b/unmodified_drivers/linux-2.6/platform-pci/evtchn.c	Wed Apr 11 15:45:29 2007 +0100
    24.3 @@ -28,8 +28,10 @@
    24.4   * IN THE SOFTWARE.
    24.5   */
    24.6  
    24.7 +#include <linux/config.h>
    24.8  #include <linux/module.h>
    24.9  #include <linux/kernel.h>
   24.10 +#include <linux/spinlock.h>
   24.11  #include <xen/evtchn.h>
   24.12  #include <xen/interface/hvm/ioreq.h>
   24.13  #include <xen/features.h>
   24.14 @@ -41,29 +43,37 @@
   24.15  
   24.16  void *shared_info_area;
   24.17  
   24.18 -static DEFINE_MUTEX(irq_evtchn_mutex);
   24.19 -
   24.20  #define is_valid_evtchn(x)	((x) != 0)
   24.21  #define evtchn_from_irq(x)	(irq_evtchn[irq].evtchn)
   24.22  
   24.23  static struct {
   24.24 +	spinlock_t lock;
   24.25  	irqreturn_t(*handler) (int, void *, struct pt_regs *);
   24.26  	void *dev_id;
   24.27  	int evtchn;
   24.28  	int close:1; /* close on unbind_from_irqhandler()? */
   24.29  	int inuse:1;
   24.30 +	int in_handler:1;
   24.31  } irq_evtchn[256];
   24.32  static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
   24.33  	[0 ...  NR_EVENT_CHANNELS-1] = -1 };
   24.34  
   24.35 -static int find_unbound_irq(void)
   24.36 +static DEFINE_SPINLOCK(irq_alloc_lock);
   24.37 +
   24.38 +static int alloc_xen_irq(void)
   24.39  {
   24.40  	static int warned;
   24.41  	int irq;
   24.42  
   24.43 -	for (irq = 0; irq < ARRAY_SIZE(irq_evtchn); irq++)
   24.44 -		if (!irq_evtchn[irq].inuse)
   24.45 -			return irq;
   24.46 +	spin_lock(&irq_alloc_lock);
   24.47 +
   24.48 +	for (irq = 1; irq < ARRAY_SIZE(irq_evtchn); irq++) {
   24.49 +		if (irq_evtchn[irq].inuse) 
   24.50 +			continue;
   24.51 +		irq_evtchn[irq].inuse = 1;
   24.52 +		spin_unlock(&irq_alloc_lock);
   24.53 +		return irq;
   24.54 +	}
   24.55  
   24.56  	if (!warned) {
   24.57  		warned = 1;
   24.58 @@ -71,9 +81,18 @@ static int find_unbound_irq(void)
   24.59  		       "increase irq_evtchn[] size in evtchn.c.\n");
   24.60  	}
   24.61  
   24.62 +	spin_unlock(&irq_alloc_lock);
   24.63 +
   24.64  	return -ENOSPC;
   24.65  }
   24.66  
   24.67 +static void free_xen_irq(int irq)
   24.68 +{
   24.69 +	spin_lock(&irq_alloc_lock);
   24.70 +	irq_evtchn[irq].inuse = 0;
   24.71 +	spin_unlock(&irq_alloc_lock);
   24.72 +}
   24.73 +
   24.74  int irq_to_evtchn_port(int irq)
   24.75  {
   24.76  	return irq_evtchn[irq].evtchn;
   24.77 @@ -93,8 +112,7 @@ void unmask_evtchn(int port)
   24.78  	shared_info_t *s = shared_info_area;
   24.79  	vcpu_info_t *vcpu_info;
   24.80  
   24.81 -	preempt_disable();
   24.82 -	cpu = smp_processor_id();
   24.83 +	cpu = get_cpu();
   24.84  	vcpu_info = &s->vcpu_info[cpu];
   24.85  
   24.86  	/* Slow path (hypercall) if this is a non-local port.  We only
   24.87 @@ -103,7 +121,7 @@ void unmask_evtchn(int port)
   24.88  		evtchn_unmask_t op = { .port = port };
   24.89  		(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask,
   24.90  						  &op);
   24.91 -		preempt_enable();
   24.92 +		put_cpu();
   24.93  		return;
   24.94  	}
   24.95  
   24.96 @@ -121,7 +139,8 @@ void unmask_evtchn(int port)
   24.97  		if (!vcpu_info->evtchn_upcall_mask)
   24.98  			force_evtchn_callback();
   24.99  	}
  24.100 -	preempt_enable();
  24.101 +
  24.102 +	put_cpu();
  24.103  }
  24.104  EXPORT_SYMBOL(unmask_evtchn);
  24.105  
  24.106 @@ -135,20 +154,19 @@ int bind_listening_port_to_irqhandler(
  24.107  	struct evtchn_alloc_unbound alloc_unbound;
  24.108  	int err, irq;
  24.109  
  24.110 -	mutex_lock(&irq_evtchn_mutex);
  24.111 +	irq = alloc_xen_irq();
  24.112 +	if (irq < 0)
  24.113 +		return irq;
  24.114  
  24.115 -	irq = find_unbound_irq();
  24.116 -	if (irq < 0) {
  24.117 -		mutex_unlock(&irq_evtchn_mutex);
  24.118 -		return irq;
  24.119 -	}
  24.120 +	spin_lock_irq(&irq_evtchn[irq].lock);
  24.121  
  24.122  	alloc_unbound.dom        = DOMID_SELF;
  24.123  	alloc_unbound.remote_dom = remote_domain;
  24.124  	err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
  24.125  					  &alloc_unbound);
  24.126  	if (err) {
  24.127 -		mutex_unlock(&irq_evtchn_mutex);
  24.128 +		spin_unlock_irq(&irq_evtchn[irq].lock);
  24.129 +		free_xen_irq(irq);
  24.130  		return err;
  24.131  	}
  24.132  
  24.133 @@ -156,13 +174,13 @@ int bind_listening_port_to_irqhandler(
  24.134  	irq_evtchn[irq].dev_id  = dev_id;
  24.135  	irq_evtchn[irq].evtchn  = alloc_unbound.port;
  24.136  	irq_evtchn[irq].close   = 1;
  24.137 -	irq_evtchn[irq].inuse   = 1;
  24.138  
  24.139  	evtchn_to_irq[alloc_unbound.port] = irq;
  24.140  
  24.141  	unmask_evtchn(alloc_unbound.port);
  24.142  
  24.143 -	mutex_unlock(&irq_evtchn_mutex);
  24.144 +	spin_unlock_irq(&irq_evtchn[irq].lock);
  24.145 +
  24.146  	return irq;
  24.147  }
  24.148  EXPORT_SYMBOL(bind_listening_port_to_irqhandler);
  24.149 @@ -176,34 +194,34 @@ int bind_caller_port_to_irqhandler(
  24.150  {
  24.151  	int irq;
  24.152  
  24.153 -	mutex_lock(&irq_evtchn_mutex);
  24.154 +	irq = alloc_xen_irq();
  24.155 +	if (irq < 0)
  24.156 +		return irq;
  24.157  
  24.158 -	irq = find_unbound_irq();
  24.159 -	if (irq < 0) {
  24.160 -		mutex_unlock(&irq_evtchn_mutex);
  24.161 -		return irq;
  24.162 -	}
  24.163 +	spin_lock_irq(&irq_evtchn[irq].lock);
  24.164  
  24.165  	irq_evtchn[irq].handler = handler;
  24.166  	irq_evtchn[irq].dev_id  = dev_id;
  24.167  	irq_evtchn[irq].evtchn  = caller_port;
  24.168  	irq_evtchn[irq].close   = 0;
  24.169 -	irq_evtchn[irq].inuse   = 1;
  24.170  
  24.171  	evtchn_to_irq[caller_port] = irq;
  24.172  
  24.173  	unmask_evtchn(caller_port);
  24.174  
  24.175 -	mutex_unlock(&irq_evtchn_mutex);
  24.176 +	spin_unlock_irq(&irq_evtchn[irq].lock);
  24.177 +
  24.178  	return irq;
  24.179  }
  24.180  EXPORT_SYMBOL(bind_caller_port_to_irqhandler);
  24.181  
  24.182  void unbind_from_irqhandler(unsigned int irq, void *dev_id)
  24.183  {
  24.184 -	int evtchn = evtchn_from_irq(irq);
  24.185 +	int evtchn;
  24.186  
  24.187 -	mutex_lock(&irq_evtchn_mutex);
  24.188 +	spin_lock_irq(&irq_evtchn[irq].lock);
  24.189 +
  24.190 +	evtchn = evtchn_from_irq(irq);
  24.191  
  24.192  	if (is_valid_evtchn(evtchn)) {
  24.193  		evtchn_to_irq[irq] = -1;
  24.194 @@ -216,21 +234,28 @@ void unbind_from_irqhandler(unsigned int
  24.195  
  24.196  	irq_evtchn[irq].handler = NULL;
  24.197  	irq_evtchn[irq].evtchn  = 0;
  24.198 -	irq_evtchn[irq].inuse   = 0;
  24.199 +
  24.200 +	spin_unlock_irq(&irq_evtchn[irq].lock);
  24.201  
  24.202 -	mutex_unlock(&irq_evtchn_mutex);
  24.203 +	while (irq_evtchn[irq].in_handler)
  24.204 +		cpu_relax();
  24.205 +
  24.206 +	free_xen_irq(irq);
  24.207  }
  24.208  EXPORT_SYMBOL(unbind_from_irqhandler);
  24.209  
  24.210  void notify_remote_via_irq(int irq)
  24.211  {
  24.212 -	int evtchn = evtchn_from_irq(irq);
  24.213 +	int evtchn;
  24.214 +
  24.215 +	evtchn = evtchn_from_irq(irq);
  24.216  	if (is_valid_evtchn(evtchn))
  24.217  		notify_remote_via_evtchn(evtchn);
  24.218  }
  24.219  EXPORT_SYMBOL(notify_remote_via_irq);
  24.220  
  24.221 -irqreturn_t evtchn_interrupt(int irq, void *dev_id, struct pt_regs *regs)
  24.222 +static irqreturn_t evtchn_interrupt(int irq, void *dev_id,
  24.223 +				    struct pt_regs *regs)
  24.224  {
  24.225  	unsigned int l1i, port;
  24.226  	/* XXX: All events are bound to vcpu0 but irq may be redirected. */
  24.227 @@ -249,13 +274,30 @@ irqreturn_t evtchn_interrupt(int irq, vo
  24.228  		while ((l2 = s->evtchn_pending[l1i] & ~s->evtchn_mask[l1i])) {
  24.229  			port = (l1i * BITS_PER_LONG) + __ffs(l2);
  24.230  			synch_clear_bit(port, &s->evtchn_pending[0]);
  24.231 +
  24.232  			irq = evtchn_to_irq[port];
  24.233 -			if ((irq >= 0) &&
  24.234 -			    ((handler = irq_evtchn[irq].handler) != NULL))
  24.235 -				handler(irq, irq_evtchn[irq].dev_id, regs);
  24.236 -			else
  24.237 -				printk(KERN_WARNING "unexpected event channel "
  24.238 -				       "upcall on port %d!\n", port);
  24.239 +			if (irq < 0)
  24.240 +				continue;
  24.241 +
  24.242 +			spin_lock(&irq_evtchn[irq].lock);
  24.243 +			handler = irq_evtchn[irq].handler;
  24.244 +			dev_id  = irq_evtchn[irq].dev_id;
  24.245 +			if (unlikely(handler == NULL)) {
  24.246 +				printk("Xen IRQ%d (port %d) has no handler!\n",
  24.247 +				       irq, port);
  24.248 +				spin_unlock(&irq_evtchn[irq].lock);
  24.249 +				continue;
  24.250 +			}
  24.251 +			irq_evtchn[irq].in_handler = 1;
  24.252 +			spin_unlock(&irq_evtchn[irq].lock);
  24.253 +
  24.254 +			local_irq_enable();
  24.255 +			handler(irq, irq_evtchn[irq].dev_id, regs);
  24.256 +			local_irq_disable();
  24.257 +
  24.258 +			spin_lock(&irq_evtchn[irq].lock);
  24.259 +			irq_evtchn[irq].in_handler = 0;
  24.260 +			spin_unlock(&irq_evtchn[irq].lock);
  24.261  		}
  24.262  	}
  24.263  
  24.264 @@ -268,16 +310,6 @@ void force_evtchn_callback(void)
  24.265  }
  24.266  EXPORT_SYMBOL(force_evtchn_callback);
  24.267  
  24.268 -void irq_suspend(void)
  24.269 -{
  24.270 -	mutex_lock(&irq_evtchn_mutex);
  24.271 -}
  24.272 -
  24.273 -void irq_suspend_cancel(void)
  24.274 -{
  24.275 -	mutex_unlock(&irq_evtchn_mutex);
  24.276 -}
  24.277 -
  24.278  void irq_resume(void)
  24.279  {
  24.280  	int evtchn, irq;
  24.281 @@ -289,6 +321,16 @@ void irq_resume(void)
  24.282  
  24.283  	for (irq = 0; irq < ARRAY_SIZE(irq_evtchn); irq++)
  24.284  		irq_evtchn[irq].evtchn = 0;
  24.285 +}
  24.286  
  24.287 -	mutex_unlock(&irq_evtchn_mutex);
  24.288 +int xen_irq_init(struct pci_dev *pdev)
  24.289 +{
  24.290 +	int irq;
  24.291 +
  24.292 +	for (irq = 0; irq < ARRAY_SIZE(irq_evtchn); irq++)
  24.293 +		spin_lock_init(&irq_evtchn[irq].lock);
  24.294 +
  24.295 +	return request_irq(pdev->irq, evtchn_interrupt,
  24.296 +			   SA_SHIRQ | SA_SAMPLE_RANDOM | SA_INTERRUPT,
  24.297 +			   "xen-platform-pci", pdev);
  24.298  }
    25.1 --- a/unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c	Wed Apr 11 07:30:02 2007 -0600
    25.2 +++ b/unmodified_drivers/linux-2.6/platform-pci/machine_reboot.c	Wed Apr 11 15:45:29 2007 +0100
    25.3 @@ -1,24 +1,81 @@
    25.4  #include <linux/config.h>
    25.5 +#include <linux/stop_machine.h>
    25.6 +#include <xen/evtchn.h>
    25.7 +#include <xen/gnttab.h>
    25.8  #include <xen/xenbus.h>
    25.9  #include "platform-pci.h"
   25.10  #include <asm/hypervisor.h>
   25.11  
   25.12 -int __xen_suspend(int fast_suspend)
   25.13 +/*
   25.14 + * Spinning prevents, for example, APs touching grant table entries while
    25.15 + * the shared grant table is not mapped into the address space immediately
   25.16 + * after resume.
   25.17 + */
   25.18 +static void ap_suspend(void *_ap_spin)
   25.19 +{
   25.20 +	int *ap_spin = _ap_spin;
   25.21 +
   25.22 +	BUG_ON(!irqs_disabled());
   25.23 +
   25.24 +	while (*ap_spin) {
   25.25 +		cpu_relax();
   25.26 +		HYPERVISOR_yield();
   25.27 +	}
   25.28 +}
   25.29 +
   25.30 +static int bp_suspend(void)
   25.31  {
   25.32  	int suspend_cancelled;
   25.33  
   25.34 -	xenbus_suspend();
   25.35 -	platform_pci_suspend();
   25.36 +	BUG_ON(!irqs_disabled());
   25.37  
   25.38  	suspend_cancelled = HYPERVISOR_shutdown(SHUTDOWN_suspend);
   25.39  
   25.40 -	if (suspend_cancelled) {
   25.41 -		platform_pci_suspend_cancel();
   25.42 +	if (!suspend_cancelled) {
   25.43 +		platform_pci_resume();
   25.44 +		gnttab_resume();
   25.45 +		irq_resume();
   25.46 +	}
   25.47 +
   25.48 +	return suspend_cancelled;
   25.49 +}
   25.50 +
   25.51 +int __xen_suspend(int fast_suspend)
   25.52 +{
   25.53 +	int err, suspend_cancelled, ap_spin;
   25.54 +
   25.55 +	xenbus_suspend();
   25.56 +
   25.57 +	preempt_disable();
   25.58 +
   25.59 +	/* Prevent any races with evtchn_interrupt() handler. */
   25.60 +	disable_irq(xen_platform_pdev->irq);
   25.61 +
   25.62 +	ap_spin = 1;
   25.63 +	smp_mb();
   25.64 +
   25.65 +	err = smp_call_function(ap_suspend, &ap_spin, 0, 0);
   25.66 +	if (err < 0) {
   25.67 +		preempt_enable();
   25.68  		xenbus_suspend_cancel();
   25.69 -	} else {
   25.70 -		platform_pci_resume();
   25.71 +		return err;
   25.72 +	}
   25.73 +
   25.74 +	local_irq_disable();
   25.75 +	suspend_cancelled = bp_suspend();
   25.76 +	local_irq_enable();
   25.77 +
   25.78 +	smp_mb();
   25.79 +	ap_spin = 0;
   25.80 +
   25.81 +	enable_irq(xen_platform_pdev->irq);
   25.82 +
   25.83 +	preempt_enable();
   25.84 +
   25.85 +	if (!suspend_cancelled)
   25.86  		xenbus_resume();
   25.87 -	}
   25.88 +	else
   25.89 +		xenbus_suspend_cancel();
   25.90  
   25.91  	return 0;
   25.92  }
    26.1 --- a/unmodified_drivers/linux-2.6/platform-pci/platform-compat.c	Wed Apr 11 07:30:02 2007 -0600
    26.2 +++ b/unmodified_drivers/linux-2.6/platform-pci/platform-compat.c	Wed Apr 11 15:45:29 2007 +0100
    26.3 @@ -12,11 +12,10 @@ static int system_state = 1;
    26.4  EXPORT_SYMBOL(system_state);
    26.5  #endif
    26.6  
    26.7 -static inline void ctrl_alt_del(void)
    26.8 +void ctrl_alt_del(void)
    26.9  {
   26.10  	kill_proc(1, SIGINT, 1); /* interrupt init */
   26.11  }
   26.12 -EXPORT_SYMBOL(ctrl_alt_del);
   26.13  
   26.14  #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8)
   26.15  size_t strcspn(const char *s, const char *reject)
    27.1 --- a/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c	Wed Apr 11 07:30:02 2007 -0600
    27.2 +++ b/unmodified_drivers/linux-2.6/platform-pci/platform-pci.c	Wed Apr 11 15:45:29 2007 +0100
    27.3 @@ -40,7 +40,6 @@
    27.4  #include <xen/interface/hvm/params.h>
    27.5  #include <xen/features.h>
    27.6  #include <xen/evtchn.h>
    27.7 -#include <xen/gnttab.h>
    27.8  #ifdef __ia64__
    27.9  #include <asm/xen/xencomm.h>
   27.10  #endif
   27.11 @@ -62,6 +61,8 @@ MODULE_AUTHOR("ssmith@xensource.com");
   27.12  MODULE_DESCRIPTION("Xen platform PCI device");
   27.13  MODULE_LICENSE("GPL");
   27.14  
   27.15 +struct pci_dev *xen_platform_pdev;
   27.16 +
   27.17  static unsigned long shared_info_frame;
   27.18  static uint64_t callback_via;
   27.19  
   27.20 @@ -89,8 +90,6 @@ static int __devinit init_xen_info(void)
   27.21  	if (shared_info_area == NULL)
   27.22  		panic("can't map shared info\n");
   27.23  
   27.24 -	gnttab_init();
   27.25 -
   27.26  	return 0;
   27.27  }
   27.28  
   27.29 @@ -199,8 +198,10 @@ static int set_callback_via(uint64_t via
   27.30  	return HYPERVISOR_hvm_op(HVMOP_set_param, &a);
   27.31  }
   27.32  
   27.33 +int xen_irq_init(struct pci_dev *pdev);
   27.34  int xenbus_init(void);
   27.35  int xen_reboot_init(void);
   27.36 +int gnttab_init(void);
   27.37  
   27.38  static int __devinit platform_pci_init(struct pci_dev *pdev,
   27.39  				       const struct pci_device_id *ent)
   27.40 @@ -209,6 +210,10 @@ static int __devinit platform_pci_init(s
   27.41  	long ioaddr, iolen;
   27.42  	long mmio_addr, mmio_len;
   27.43  
   27.44 +	if (xen_platform_pdev)
   27.45 +		return -EBUSY;
   27.46 +	xen_platform_pdev = pdev;
   27.47 +
   27.48  	i = pci_enable_device(pdev);
   27.49  	if (i)
   27.50  		return i;
   27.51 @@ -249,9 +254,10 @@ static int __devinit platform_pci_init(s
   27.52  	if ((ret = init_xen_info()))
   27.53  		goto out;
   27.54  
   27.55 -	if ((ret = request_irq(pdev->irq, evtchn_interrupt,
   27.56 -			       SA_SHIRQ | SA_SAMPLE_RANDOM,
   27.57 -			       "xen-platform-pci", pdev)))
   27.58 +	if ((ret = gnttab_init()))
   27.59 +		goto out;
   27.60 +
   27.61 +	if ((ret = xen_irq_init(pdev)))
   27.62  		goto out;
   27.63  
   27.64  	if ((ret = set_callback_via(callback_via)))
   27.65 @@ -292,18 +298,6 @@ static struct pci_driver platform_driver
   27.66  
   27.67  static int pci_device_registered;
   27.68  
   27.69 -void platform_pci_suspend(void)
   27.70 -{
   27.71 -	gnttab_suspend();
   27.72 -	irq_suspend();
   27.73 -}
   27.74 -
   27.75 -void platform_pci_suspend_cancel(void)
   27.76 -{
   27.77 -	irq_suspend_cancel();
   27.78 -	gnttab_resume();
   27.79 -}
   27.80 -
   27.81  void platform_pci_resume(void)
   27.82  {
   27.83  	struct xen_add_to_physmap xatp;
   27.84 @@ -319,12 +313,8 @@ void platform_pci_resume(void)
   27.85  	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
   27.86  		BUG();
   27.87  
   27.88 -	irq_resume();
   27.89 -
   27.90  	if (set_callback_via(callback_via))
   27.91  		printk("platform_pci_resume failure!\n");
   27.92 -
   27.93 -	gnttab_resume();
   27.94  }
   27.95  
   27.96  static int __init platform_pci_module_init(void)
    28.1 --- a/unmodified_drivers/linux-2.6/platform-pci/platform-pci.h	Wed Apr 11 07:30:02 2007 -0600
    28.2 +++ b/unmodified_drivers/linux-2.6/platform-pci/platform-pci.h	Wed Apr 11 15:45:29 2007 +0100
    28.3 @@ -22,16 +22,11 @@
    28.4  #ifndef _XEN_PLATFORM_PCI_H
    28.5  #define _XEN_PLATFORM_PCI_H
    28.6  
    28.7 -#include <linux/interrupt.h>
    28.8 +#include <linux/pci.h>
    28.9  
   28.10  unsigned long alloc_xen_mmio(unsigned long len);
   28.11 -int gnttab_init(void);
   28.12 -irqreturn_t evtchn_interrupt(int irq, void *dev_id, struct pt_regs *regs);
   28.13 -void irq_suspend(void);
   28.14 -void irq_suspend_cancel(void);
   28.15 -
   28.16 -void platform_pci_suspend(void);
   28.17 -void platform_pci_suspend_cancel(void);
   28.18  void platform_pci_resume(void);
   28.19  
   28.20 +extern struct pci_dev *xen_platform_pdev;
   28.21 +
   28.22  #endif /* _XEN_PLATFORM_PCI_H */