ia64/xen-unstable

changeset 17940:08f77df14cba

merge with xen-unstable.hg
author Isaku Yamahata <yamahata@valinux.co.jp>
date Wed Jul 02 11:30:37 2008 +0900 (2008-07-02)
parents 11318234588e 19970181d6a4
children ac8bc814faba 40e7329105fa
files tools/libxc/ia64/xc_ia64_linux_restore.c tools/libxc/ia64/xc_ia64_linux_save.c
line diff
     1.1 --- a/docs/ChangeLog	Thu Jun 19 12:48:04 2008 +0900
     1.2 +++ b/docs/ChangeLog	Wed Jul 02 11:30:37 2008 +0900
     1.3 @@ -16,6 +16,15 @@ http://lists.xensource.com/archives/html
     1.4  Xen 3.3 release
     1.5  ---------------
     1.6  
     1.7 +17903: Add greater than 16 xvd device availability
     1.8 +http://xenbits.xensource.com/xen-unstable.hg?rev/0728459b3c8d
     1.9 +
    1.10 +The tools can now attach a disk of the form:
    1.11 +(1<<28) | (device<<8) | partition
    1.12 +to support many more xvd disks and up to 256 partitions.
    1.13 +The linux guest frontend has been expanded to support
    1.14 +this new construct, while legacy guests should just ignore it.
    1.15 +	
    1.16  17538: Add XENPF_set_processor_pminfo
    1.17  http://xenbits.xensource.com/xen-unstable.hg?rev/5bb9093eb0e9
    1.18  
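
The ChangeLog hunk above describes the extended xvd numbering from changeset 17903: bit 28
flags the extended scheme, the disk index sits above bit 8, and the low 8 bits carry the
partition. A minimal sketch of the encode/decode arithmetic (the macro names are
illustrative, not taken from the tree):

    #include <stdio.h>

    /* Extended virtual block device numbers, per the ChangeLog entry:
     * (1<<28) | (device<<8) | partition, up to 256 partitions per disk. */
    #define XVD_EXT_FLAG         (1UL << 28)
    #define XVD_EXT(disk, part)  (XVD_EXT_FLAG | ((unsigned long)(disk) << 8) | (part))
    #define XVD_EXT_DISK(dev)    (((dev) & ~XVD_EXT_FLAG) >> 8)
    #define XVD_EXT_PART(dev)    ((dev) & 0xff)

    int main(void)
    {
        unsigned long dev = XVD_EXT(26, 3);   /* 27th xvd disk, partition 3 */
        printf("dev=%#lx disk=%lu part=%lu\n",
               dev, XVD_EXT_DISK(dev), XVD_EXT_PART(dev));
        return 0;
    }
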
     2.1 --- a/extras/mini-os/arch/x86/mm.c	Thu Jun 19 12:48:04 2008 +0900
     2.2 +++ b/extras/mini-os/arch/x86/mm.c	Wed Jul 02 11:30:37 2008 +0900
     2.3 @@ -528,18 +528,13 @@ void *map_frames_ex(unsigned long *f, un
     2.4  
     2.5  static void clear_bootstrap(void)
     2.6  {
     2.7 -    xen_pfn_t mfns[] = { virt_to_mfn(&shared_info) };
     2.8 -    int n = sizeof(mfns)/sizeof(*mfns);
     2.9      pte_t nullpte = { };
    2.10  
    2.11      /* Use first page as the CoW zero page */
    2.12      memset(&_text, 0, PAGE_SIZE);
    2.13 -    mfn_zero = pfn_to_mfn((unsigned long) &_text);
    2.14 -    if (HYPERVISOR_update_va_mapping((unsigned long) &_text, nullpte, UVMF_INVLPG))
    2.15 -	printk("Unable to unmap first page\n");
    2.16 -
    2.17 -    if (free_physical_pages(mfns, n) != n)
    2.18 -	printk("Unable to free bootstrap pages\n");
    2.19 +    mfn_zero = virt_to_mfn((unsigned long) &_text);
    2.20 +    if (HYPERVISOR_update_va_mapping(0, nullpte, UVMF_INVLPG))
    2.21 +	printk("Unable to unmap NULL page\n");
    2.22  }
    2.23  
    2.24  void arch_init_p2m(unsigned long max_pfn)
     3.1 --- a/extras/mini-os/blkfront.c	Thu Jun 19 12:48:04 2008 +0900
     3.2 +++ b/extras/mini-os/blkfront.c	Wed Jul 02 11:30:37 2008 +0900
     3.3 @@ -125,7 +125,6 @@ struct blkfront_dev *init_blkfront(char 
     3.4  
     3.5      dev->events = NULL;
     3.6  
     3.7 -    // FIXME: proper frees on failures
     3.8  again:
     3.9      err = xenbus_transaction_start(&xbt);
    3.10      if (err) {
     4.1 --- a/extras/mini-os/fbfront.c	Thu Jun 19 12:48:04 2008 +0900
     4.2 +++ b/extras/mini-os/fbfront.c	Wed Jul 02 11:30:37 2008 +0900
     4.3 @@ -100,7 +100,6 @@ struct kbdfront_dev *init_kbdfront(char 
     4.4      s->in_cons = s->in_prod = 0;
     4.5      s->out_cons = s->out_prod = 0;
     4.6  
     4.7 -    // FIXME: proper frees on failures
     4.8  again:
     4.9      err = xenbus_transaction_start(&xbt);
    4.10      if (err) {
    4.11 @@ -408,7 +407,6 @@ struct fbfront_dev *init_fbfront(char *n
    4.12          s->pd[i] = 0;
    4.13  
    4.14  
    4.15 -    // FIXME: proper frees on failures
    4.16  again:
    4.17      err = xenbus_transaction_start(&xbt);
    4.18      if (err) {
     5.1 --- a/extras/mini-os/fs-front.c	Thu Jun 19 12:48:04 2008 +0900
     5.2 +++ b/extras/mini-os/fs-front.c	Wed Jul 02 11:30:37 2008 +0900
     5.3 @@ -136,8 +136,8 @@ static inline void add_id_to_freelist(un
     5.4  again:    
     5.5      old_id = freelist[0];
     5.6      /* Note: temporal inconsistency, since freelist[0] can be changed by someone
     5.7 -     * else, but we are a sole owner of freelist[id], it's OK. */
     5.8 -    freelist[id] = old_id;
     5.9 +     * else, but we are a sole owner of freelist[id + 1], it's OK. */
    5.10 +    freelist[id + 1] = old_id;
    5.11      new_id = id;
    5.12      if(cmpxchg(&freelist[0], old_id, new_id) != old_id)
    5.13      {
    5.14 @@ -154,7 +154,7 @@ static inline unsigned short get_id_from
    5.15  
    5.16  again:    
    5.17      old_id = freelist[0];
    5.18 -    new_id = freelist[old_id];
    5.19 +    new_id = freelist[old_id + 1];
    5.20      if(cmpxchg(&freelist[0], old_id, new_id) != old_id)
    5.21      {
    5.22          printk("Cmpxchg on freelist remove failed.\n");
    5.23 @@ -785,8 +785,8 @@ static void alloc_request_table(struct f
    5.24      printk("Allocating request array for import %d, nr_entries = %d.\n",
    5.25              import->import_id, import->nr_entries);
    5.26      requests = xmalloc_array(struct fs_request, import->nr_entries);
    5.27 -    import->freelist = xmalloc_array(unsigned short, import->nr_entries);
    5.28 -    memset(import->freelist, 0, sizeof(unsigned short) * import->nr_entries);
    5.29 +    import->freelist = xmalloc_array(unsigned short, import->nr_entries + 1);
    5.30 +    memset(import->freelist, 0, sizeof(unsigned short) * (import->nr_entries + 1));
    5.31      for(i=0; i<import->nr_entries; i++)
    5.32      {
    5.33  	/* TODO: that's a lot of memory */
     6.1 --- a/extras/mini-os/lib/sys.c	Thu Jun 19 12:48:04 2008 +0900
     6.2 +++ b/extras/mini-os/lib/sys.c	Wed Jul 02 11:30:37 2008 +0900
     6.3 @@ -686,7 +686,7 @@ static int select_poll(int nfds, fd_set 
     6.4  #ifdef LIBC_VERBOSE
     6.5      static int nb;
     6.6      static int nbread[NOFILE], nbwrite[NOFILE], nbexcept[NOFILE];
     6.7 -    static s64_t lastshown;
     6.8 +    static s_time_t lastshown;
     6.9  
    6.10      nb++;
    6.11  #endif
     7.1 --- a/extras/mini-os/netfront.c	Thu Jun 19 12:48:04 2008 +0900
     7.2 +++ b/extras/mini-os/netfront.c	Wed Jul 02 11:30:37 2008 +0900
     7.3 @@ -38,7 +38,7 @@ struct net_buffer {
     7.4  struct netfront_dev {
     7.5      domid_t dom;
     7.6  
     7.7 -    unsigned short tx_freelist[NET_TX_RING_SIZE];
     7.8 +    unsigned short tx_freelist[NET_TX_RING_SIZE + 1];
     7.9      struct semaphore tx_sem;
    7.10  
    7.11      struct net_buffer rx_buffers[NET_RX_RING_SIZE];
    7.12 @@ -70,14 +70,14 @@ void init_rx_buffers(struct netfront_dev
    7.13  
    7.14  static inline void add_id_to_freelist(unsigned int id,unsigned short* freelist)
    7.15  {
    7.16 -    freelist[id] = freelist[0];
    7.17 +    freelist[id + 1] = freelist[0];
    7.18      freelist[0]  = id;
    7.19  }
    7.20  
    7.21  static inline unsigned short get_id_from_freelist(unsigned short* freelist)
    7.22  {
    7.23      unsigned int id = freelist[0];
    7.24 -    freelist[0] = freelist[id];
    7.25 +    freelist[0] = freelist[id + 1];
    7.26      return id;
    7.27  }
    7.28  
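
The fs-front.c and netfront.c hunks above are the same fix: freelist[0] holds the head of
the free list, so storing id N at freelist[N] would let id 0 alias the head. The new layout
keeps id N at freelist[N + 1] and grows the array by one slot. A self-contained sketch of
the corrected (non-atomic, netfront-style) scheme:

    #include <stdio.h>

    #define RING_SIZE 8

    /* Slot 0 is the list head; id N lives at index N + 1. */
    static unsigned short freelist[RING_SIZE + 1];

    static void add_id_to_freelist(unsigned int id, unsigned short *fl)
    {
        fl[id + 1] = fl[0];
        fl[0] = id;
    }

    static unsigned short get_id_from_freelist(unsigned short *fl)
    {
        unsigned short id = fl[0];
        fl[0] = fl[id + 1];
        return id;
    }

    int main(void)
    {
        unsigned short i;

        for (i = 0; i < RING_SIZE; i++)
            add_id_to_freelist(i, freelist);

        /* Ids come back LIFO, id 0 included, which the old layout could
         * not represent once freelist[0] doubled as the head pointer. */
        for (i = 0; i < RING_SIZE; i++)
            printf("%u ", get_id_from_freelist(freelist));
        printf("\n");
        return 0;
    }
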
     8.1 --- a/stubdom/grub.patches/99minios	Thu Jun 19 12:48:04 2008 +0900
     8.2 +++ b/stubdom/grub.patches/99minios	Wed Jul 02 11:30:37 2008 +0900
     8.3 @@ -832,7 +832,18 @@ Index: grub/stage2/fsys_iso9660.c
     8.4  Index: grub/stage2/fsys_reiserfs.c
     8.5  ===================================================================
     8.6  --- grub.orig/stage2/fsys_reiserfs.c	2008-06-16 15:18:03.410933000 +0100
     8.7 -+++ grub/stage2/fsys_reiserfs.c	2008-06-16 15:18:14.786009000 +0100
     8.8 ++++ grub/stage2/fsys_reiserfs.c	2008-06-20 18:33:52.002100000 +0100
     8.9 +@@ -224,8 +224,8 @@
    8.10 + 
    8.11 + struct disk_child
    8.12 + {
    8.13 +-  unsigned long       dc_block_number;              /* Disk child's block number. */
    8.14 +-  unsigned short      dc_size;		            /* Disk child's used space.   */
    8.15 ++  __u32       dc_block_number;              /* Disk child's block number. */
    8.16 ++  __u16      dc_size;		            /* Disk child's used space.   */
    8.17 + };
    8.18 + 
    8.19 + #define DC_SIZE (sizeof (struct disk_child))
    8.20  @@ -369,7 +369,14 @@
    8.21   static __inline__ unsigned long
    8.22   log2 (unsigned long word)
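
The fsys_reiserfs.c hunk above replaces unsigned long/short with fixed-width types:
DC_SIZE is taken from sizeof(struct disk_child) (see the context above) and presumably has
to match the on-disk layout, so the struct must not grow when pv-grub is built for a
64-bit mini-os. A quick sketch (uint32_t/uint16_t standing in for grub's __u32/__u16)
showing the difference on an LP64 target:

    #include <stdio.h>
    #include <stdint.h>

    /* Fixed-width layout, as in the patched struct disk_child. */
    struct disk_child_fixed {
        uint32_t dc_block_number;
        uint16_t dc_size;
    };

    /* The old declaration; its size depends on the compiler's ABI. */
    struct disk_child_long {
        unsigned long  dc_block_number;
        unsigned short dc_size;
    };

    int main(void)
    {
        /* On LP64 (64-bit build): 8 bytes vs 16 bytes after padding. */
        printf("fixed-width: %zu\n", sizeof(struct disk_child_fixed));
        printf("long-based:  %zu\n", sizeof(struct disk_child_long));
        return 0;
    }
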
     9.1 --- a/stubdom/grub/Makefile	Thu Jun 19 12:48:04 2008 +0900
     9.2 +++ b/stubdom/grub/Makefile	Wed Jul 02 11:30:37 2008 +0900
     9.3 @@ -5,7 +5,7 @@ vpath %.c ../grub-cvs
     9.4  
     9.5  BOOT=boot-$(XEN_TARGET_ARCH).o
     9.6  
     9.7 -DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/libxc -I.
     9.8 +DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/libxc -I$(XEN_ROOT)/tools/include -I.
     9.9  DEF_CPPFLAGS += -I../grub-cvs/stage1
    9.10  DEF_CPPFLAGS += -I../grub-cvs/stage2
    9.11  DEF_CPPFLAGS += -I../grub-cvs/netboot
    10.1 --- a/tools/blktap/drivers/Makefile	Thu Jun 19 12:48:04 2008 +0900
    10.2 +++ b/tools/blktap/drivers/Makefile	Wed Jul 02 11:30:37 2008 +0900
    10.3 @@ -17,8 +17,16 @@ CFLAGS   += -D_GNU_SOURCE
    10.4  CFLAGS   += -Wp,-MD,.$(@F).d
    10.5  DEPS      = .*.d
    10.6  
    10.7 +ifeq ($(shell . ./check_gcrypt),"yes")
    10.8 +CFLAGS += -DUSE_GCRYPT
    10.9 +CRYPT_LIB := -lgcrypt
   10.10 +else
   10.11 +CRYPT_LIB := -lcrypto
   10.12 +$(warning *** libgcrypt not installed: falling back to libcrypto ***)
   10.13 +endif
   10.14 +
   10.15  LDFLAGS_blktapctrl := $(LDFLAGS_libxenctrl) $(LDFLAGS_libxenstore) -L../lib -lblktap
   10.16 -LDFLAGS_img := $(LIBAIO_DIR)/libaio.a -lcrypto -lpthread -lz
   10.17 +LDFLAGS_img := $(LIBAIO_DIR)/libaio.a $(CRYPT_LIB) -lpthread -lz
   10.18  
   10.19  BLK-OBJS-y  := block-aio.o
   10.20  BLK-OBJS-y  += block-sync.o
    11.1 --- a/tools/blktap/drivers/blktapctrl.c	Thu Jun 19 12:48:04 2008 +0900
    11.2 +++ b/tools/blktap/drivers/blktapctrl.c	Wed Jul 02 11:30:37 2008 +0900
    11.3 @@ -127,7 +127,7 @@ static int get_new_dev(int *major, int *
    11.4  	char *devname;
    11.5  	
    11.6  	tr.domid = blkif->domid;
    11.7 -        tr.busid = (unsigned short)blkif->be_id;
    11.8 +        tr.busid = blkif->be_id;
    11.9  	ret = ioctl(ctlfd, BLKTAP_IOCTL_NEWINTF, tr );
   11.10  	
   11.11  	if ( (ret <= 0)||(ret > MAX_TAP_DEV) ) {
    12.1 --- a/tools/blktap/drivers/block-qcow.c	Thu Jun 19 12:48:04 2008 +0900
    12.2 +++ b/tools/blktap/drivers/block-qcow.c	Wed Jul 02 11:30:37 2008 +0900
    12.3 @@ -33,7 +33,6 @@
    12.4  #include <zlib.h>
    12.5  #include <inttypes.h>
    12.6  #include <libaio.h>
    12.7 -#include <openssl/md5.h>
    12.8  #include "bswap.h"
    12.9  #include "aes.h"
   12.10  #include "tapdisk.h"
   12.11 @@ -146,6 +145,35 @@ struct tdqcow_state {
   12.12  
   12.13  static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset);
   12.14  
   12.15 +#ifdef USE_GCRYPT
   12.16 +
   12.17 +#include <gcrypt.h>
   12.18 +
   12.19 +static uint32_t gen_cksum(char *ptr, int len)
   12.20 +{
   12.21 +	int i;
   12.22 +	uint32_t md[4];
   12.23 +
   12.24 +	/* Convert L1 table to big endian */
   12.25 +	for(i = 0; i < len / sizeof(uint64_t); i++) {
   12.26 +		cpu_to_be64s(&((uint64_t*) ptr)[i]);
   12.27 +	}
   12.28 +
   12.29 +	/* Generate checksum */
   12.30 +	gcry_md_hash_buffer(GCRY_MD_MD5, md, ptr, len);
   12.31 +
   12.32 +	/* Convert L1 table back to native endianess */
   12.33 +	for(i = 0; i < len / sizeof(uint64_t); i++) {
   12.34 +		be64_to_cpus(&((uint64_t*) ptr)[i]);
   12.35 +	}
   12.36 +
   12.37 +	return md[0];
   12.38 +}
   12.39 +
   12.40 +#else /* use libcrypto */
   12.41 +
   12.42 +#include <openssl/md5.h>
   12.43 +
   12.44  static uint32_t gen_cksum(char *ptr, int len)
   12.45  {
   12.46  	int i;
   12.47 @@ -153,9 +181,8 @@ static uint32_t gen_cksum(char *ptr, int
   12.48  	uint32_t ret;
   12.49  
   12.50  	md = malloc(MD5_DIGEST_LENGTH);
   12.51 +	if(!md) return 0;
   12.52  
   12.53 -	if(!md) return 0;
   12.54 -	
   12.55  	/* Convert L1 table to big endian */
   12.56  	for(i = 0; i < len / sizeof(uint64_t); i++) {
   12.57  		cpu_to_be64s(&((uint64_t*) ptr)[i]);
   12.58 @@ -176,6 +203,8 @@ static uint32_t gen_cksum(char *ptr, int
   12.59  	return ret;
   12.60  }
   12.61  
   12.62 +#endif
   12.63 +
   12.64  static int get_filesize(char *filename, uint64_t *size, struct stat *st)
   12.65  {
   12.66  	int fd;
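
block-qcow.c now carries two gen_cksum() backends that are meant to be interchangeable:
both MD5-hash the big-endian L1 table and derive a 32-bit checksum from the digest, with
libgcrypt preferred when the Makefile probe finds it. A throwaway sketch, assuming both
libgcrypt and OpenSSL are installed, that checks the two MD5 primitives agree on a buffer
(build with: cc md5check.c -lgcrypt -lcrypto):

    #include <stdio.h>
    #include <string.h>
    #include <gcrypt.h>
    #include <openssl/md5.h>

    int main(void)
    {
        const char buf[] = "qcow L1 table bytes";
        unsigned char d_gcry[16], d_ossl[MD5_DIGEST_LENGTH];

        gcry_check_version(NULL);   /* initialise libgcrypt */
        gcry_md_hash_buffer(GCRY_MD_MD5, d_gcry, buf, sizeof(buf));
        MD5((const unsigned char *)buf, sizeof(buf), d_ossl);

        printf("digests %s\n",
               memcmp(d_gcry, d_ossl, sizeof(d_gcry)) ? "differ" : "match");
        return 0;
    }
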
    13.1 --- a/tools/blktap/drivers/block-qcow2.c	Thu Jun 19 12:48:04 2008 +0900
    13.2 +++ b/tools/blktap/drivers/block-qcow2.c	Wed Jul 02 11:30:37 2008 +0900
    13.3 @@ -254,10 +254,7 @@ static int bdrv_pread(int fd, int64_t of
    13.4   */
    13.5  static int bdrv_pwrite(int fd, int64_t offset, const void *buf, int count)
    13.6  {
    13.7 -	int ret;
    13.8 -	
    13.9 -	ret = lseek(fd, offset, SEEK_SET);
   13.10 -	if (ret != offset) {
   13.11 +	if (lseek(fd, offset, SEEK_SET) == -1) {
   13.12  		DPRINTF("bdrv_pwrite failed seek (%#"PRIx64").\n", offset);
   13.13  		return -1;
   13.14  	}
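
The bdrv_pwrite() change above drops the 'ret != offset' test: lseek() returns an off_t,
and squeezing that into an int truncates offsets beyond 2GB, so the old comparison could
misfire on large images. Testing for (off_t)-1, the documented error return, is the
portable check. A minimal sketch of the pattern on an ordinary file descriptor:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/types.h>

    /* Seek-then-write in the style of bdrv_pwrite(): only (off_t)-1
     * signals a seek failure. */
    static ssize_t pwrite_at(int fd, off_t offset, const void *buf, size_t count)
    {
        if (lseek(fd, offset, SEEK_SET) == (off_t)-1)
            return -1;
        return write(fd, buf, count);
    }

    int main(void)
    {
        const char msg[] = "hello";
        int fd = open("/tmp/pwrite_demo", O_CREAT | O_WRONLY, 0600);

        if (fd < 0 || pwrite_at(fd, 4096, msg, strlen(msg)) < 0) {
            perror("pwrite_at");
            return 1;
        }
        close(fd);
        return 0;
    }
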
    14.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.2 +++ b/tools/blktap/drivers/check_gcrypt	Wed Jul 02 11:30:37 2008 +0900
    14.3 @@ -0,0 +1,14 @@
    14.4 +#!/bin/sh
    14.5 +
    14.6 +cat > .gcrypt.c << EOF
    14.7 +#include <gcrypt.h>
    14.8 +int main(void) { return 0; }
    14.9 +EOF
   14.10 +
   14.11 +if $1 -o .gcrypt .gcrypt.c -lgcrypt 2>/dev/null ; then
   14.12 +  echo "yes"
   14.13 +else
   14.14 +  echo "no"
   14.15 +fi
   14.16 +
   14.17 +rm -f .gcrypt*
    15.1 --- a/tools/blktap/lib/blktaplib.h	Thu Jun 19 12:48:04 2008 +0900
    15.2 +++ b/tools/blktap/lib/blktaplib.h	Wed Jul 02 11:30:37 2008 +0900
    15.3 @@ -161,7 +161,7 @@ typedef struct tapdev_info {
    15.4  
    15.5  typedef struct domid_translate {
    15.6  	unsigned short domid;
    15.7 -	unsigned short busid;
    15.8 +	uint32_t busid;
    15.9  } domid_translate_t ;
   15.10  
   15.11  typedef struct image {
    16.1 --- a/tools/debugger/xenitp/xenitp.c	Thu Jun 19 12:48:04 2008 +0900
    16.2 +++ b/tools/debugger/xenitp/xenitp.c	Wed Jul 02 11:30:37 2008 +0900
    16.3 @@ -58,6 +58,16 @@ static int cur_vcpu;
    16.4  
    16.5  int virt_to_phys (int is_inst, unsigned long vaddr, unsigned long *paddr);
    16.6  
    16.7 +/* wrapper for vcpu_gest_context_any_t */
    16.8 +static int xc_ia64_vcpu_getcontext(int xc_handle,
    16.9 +                                   uint32_t domid,
   16.10 +                                   uint32_t vcpu,
   16.11 +                                   vcpu_guest_context_t *ctxt)
   16.12 +{
   16.13 +    return xc_vcpu_getcontext(xc_handle, domid, vcpu,
   16.14 +                              (vcpu_guest_context_any_t *)ctxt);
   16.15 +}
   16.16 +
   16.17  static inline unsigned int ctx_slot (vcpu_guest_context_t *ctx)
   16.18  {
   16.19      return (ctx->regs.psr >> PSR_RI_SHIFT) & 3;
   16.20 @@ -729,7 +739,7 @@ int wait_domain (int vcpu, vcpu_guest_co
   16.21          fflush (stdout);
   16.22          nanosleep (&ts, NULL);
   16.23      }
   16.24 -    return xc_vcpu_getcontext (xc_handle, domid, vcpu, ctx);
   16.25 +    return xc_ia64_vcpu_getcontext (xc_handle, domid, vcpu, ctx);
   16.26  }
   16.27  
   16.28  int virt_to_phys (int is_inst, unsigned long vaddr, unsigned long *paddr)
   16.29 @@ -945,13 +955,13 @@ char *parse_arg (char **buf)
   16.30      return res;
   16.31  }
   16.32  
   16.33 -vcpu_guest_context_t vcpu_ctx[MAX_VIRT_CPUS];
   16.34 +vcpu_guest_context_any_t vcpu_ctx_any[MAX_VIRT_CPUS];
   16.35  
   16.36  int vcpu_setcontext (int vcpu)
   16.37  {
   16.38      int ret;
   16.39  
   16.40 -    ret = xc_vcpu_setcontext (xc_handle, domid, vcpu, &vcpu_ctx[vcpu]);
   16.41 +    ret = xc_vcpu_setcontext (xc_handle, domid, vcpu, &vcpu_ctx_any[vcpu]);
   16.42      if (ret < 0)
   16.43          perror ("xc_vcpu_setcontext");
   16.44  
   16.45 @@ -1518,7 +1528,7 @@ enum cmd_status do_command (int vcpu, ch
   16.46      int flag_ambiguous;
   16.47  
   16.48      cur_vcpu = vcpu;
   16.49 -    cur_ctx = &vcpu_ctx[vcpu];
   16.50 +    cur_ctx = &vcpu_ctx_any[vcpu].c;
   16.51  
   16.52      /* Handle repeat last-command.  */
   16.53      if (*line == 0) {
   16.54 @@ -1575,7 +1585,7 @@ void xenitp (int vcpu)
   16.55      int ret;
   16.56      struct sigaction sa;
   16.57  
   16.58 -    cur_ctx = &vcpu_ctx[vcpu];
   16.59 +    cur_ctx = &vcpu_ctx_any[vcpu].c;
   16.60  
   16.61      xc_handle = xc_interface_open (); /* for accessing control interface */
   16.62  
   16.63 @@ -1588,9 +1598,9 @@ void xenitp (int vcpu)
   16.64          exit (-1);
   16.65      }
   16.66  
   16.67 -    ret = xc_vcpu_getcontext (xc_handle, domid, vcpu, cur_ctx);
   16.68 +    ret = xc_ia64_vcpu_getcontext (xc_handle, domid, vcpu, cur_ctx);
   16.69      if (ret < 0) {
   16.70 -        perror ("xc_vcpu_getcontext");
   16.71 +        perror ("xc_ia64_vcpu_getcontext");
   16.72          exit (-1);
   16.73      }
   16.74  
    17.1 --- a/tools/examples/xend-config.sxp	Thu Jun 19 12:48:04 2008 +0900
    17.2 +++ b/tools/examples/xend-config.sxp	Wed Jul 02 11:30:37 2008 +0900
    17.3 @@ -242,3 +242,6 @@
    17.4  
    17.5  # Script to run when the label of a resource has changed.
    17.6  #(resource-label-change-script '')
    17.7 +
    17.8 +# Rotation count of qemu-dm log file.
    17.9 +#(qemu-dm-logrotate-count 10)
    18.1 --- a/tools/firmware/hvmloader/hvmloader.c	Thu Jun 19 12:48:04 2008 +0900
    18.2 +++ b/tools/firmware/hvmloader/hvmloader.c	Wed Jul 02 11:30:37 2008 +0900
    18.3 @@ -206,10 +206,12 @@ static void pci_setup(void)
    18.4              pci_writew(devfn, 0x3d, 0x0001);
    18.5              break;
    18.6          case 0x0101:
    18.7 -            /* PIIX3 IDE */
    18.8 -            ASSERT((vendor_id == 0x8086) && (device_id == 0x7010));
    18.9 -            pci_writew(devfn, 0x40, 0x8000); /* enable IDE0 */
   18.10 -            pci_writew(devfn, 0x42, 0x8000); /* enable IDE1 */
   18.11 +            if ( vendor_id == 0x8086 )
   18.12 +            {
   18.13 +                /* Intel ICHs since PIIX3: enable IDE legacy mode. */
   18.14 +                pci_writew(devfn, 0x40, 0x8000); /* enable IDE0 */
   18.15 +                pci_writew(devfn, 0x42, 0x8000); /* enable IDE1 */
   18.16 +            }
   18.17              break;
   18.18          }
   18.19  
    19.1 --- a/tools/firmware/rombios/rombios.c	Thu Jun 19 12:48:04 2008 +0900
    19.2 +++ b/tools/firmware/rombios/rombios.c	Wed Jul 02 11:30:37 2008 +0900
    19.3 @@ -9783,6 +9783,27 @@ smbios_init:
    19.4  
    19.5  #endif
    19.6  
    19.7 +#if BX_TCGBIOS
    19.8 +; The section between the POST entry and the NMI entry is filling up
    19.9 +; and causes crashes if this code was directly there
   19.10 +tcpa_post_part1:
   19.11 +  call _tcpa_acpi_init
   19.12 +
   19.13 +  push dword #0
   19.14 +  call _tcpa_initialize_tpm
   19.15 +  add sp, #4
   19.16 +
   19.17 +  call _tcpa_do_measure_POSTs
   19.18 +  call _tcpa_wake_event     /* specs: 3.2.3.7 */
   19.19 +  ret
   19.20 +
   19.21 +tcpa_post_part2:
   19.22 +  call _tcpa_calling_int19h          /* specs: 8.2.3 step 1 */
   19.23 +  call _tcpa_add_event_separators    /* specs: 8.2.3 step 2 */
   19.24 +  /* we do not call int 19h handler but keep following eventlog */
   19.25 +  call _tcpa_returned_int19h         /* specs: 8.2.3 step 3/7 */
   19.26 +  ret
   19.27 +#endif
   19.28  
   19.29  
   19.30  ;; for 'C' strings and other data, insert them here with
   19.31 @@ -10003,14 +10024,7 @@ post_default_ints:
   19.32    mov  0x0410, ax
   19.33  
   19.34  #if BX_TCGBIOS
   19.35 -  call _tcpa_acpi_init
   19.36 -
   19.37 -  push dword #0
   19.38 -  call _tcpa_initialize_tpm
   19.39 -  add sp, #4
   19.40 -
   19.41 -  call _tcpa_do_measure_POSTs
   19.42 -  call _tcpa_wake_event     /* specs: 3.2.3.7 */
   19.43 +  call tcpa_post_part1
   19.44  #endif
   19.45  
   19.46    ;; Parallel setup
   19.47 @@ -10138,10 +10152,7 @@ post_default_ints:
   19.48    call _interactive_bootkey
   19.49  
   19.50  #if BX_TCGBIOS
   19.51 -  call _tcpa_calling_int19h          /* specs: 8.2.3 step 1 */
   19.52 -  call _tcpa_add_event_separators    /* specs: 8.2.3 step 2 */
   19.53 -  /* we do not call int 19h handler but keep following eventlog */
   19.54 -  call _tcpa_returned_int19h         /* specs: 8.2.3 step 3/7 */
   19.55 +  call tcpa_post_part2
   19.56  #endif
   19.57  
   19.58    ;; Start the boot sequence.   See the comments in int19_relocated 
    20.1 --- a/tools/ioemu/hw/xen_console.c	Thu Jun 19 12:48:04 2008 +0900
    20.2 +++ b/tools/ioemu/hw/xen_console.c	Wed Jul 02 11:30:37 2008 +0900
    20.3 @@ -160,16 +160,18 @@ int xs_gather(struct xs_handle *xs, cons
    20.4  
    20.5  static int domain_create_ring(struct domain *dom)
    20.6  {
    20.7 -	int err, remote_port, ring_ref, rc;
    20.8 +	int err, remote_port, ring_ref, limit, rc;
    20.9  
   20.10  	err = xs_gather(dom->xsh, dom->serialpath,
   20.11  			"ring-ref", "%u", &ring_ref,
   20.12  			"port", "%i", &remote_port,
   20.13 +			"limit", "%i", &limit,
   20.14  			NULL);
   20.15  	if (err) {
   20.16  		err = xs_gather(dom->xsh, dom->conspath,
   20.17  				"ring-ref", "%u", &ring_ref,
   20.18  				"port", "%i", &remote_port,
   20.19 +				"limit", "%i", &limit,
   20.20  				NULL);
   20.21  		if (err) {
   20.22  			fprintf(stderr, "Console: failed to find ring-ref/port yet\n");
   20.23 @@ -178,7 +180,9 @@ static int domain_create_ring(struct dom
   20.24  		dom->use_consolepath = 1;
   20.25  	} else
   20.26  		dom->use_consolepath = 0;
   20.27 -	fprintf(stderr, "Console: got ring-ref %d port %d\n", ring_ref, remote_port);
   20.28 +	dom->buffer.max_capacity = limit;
   20.29 +	fprintf(stderr, "Console: got ring-ref %d port %d limit %d\n", 
   20.30 +		ring_ref, remote_port, limit);
   20.31  
   20.32  	if ((ring_ref == dom->ring_ref) && (remote_port == dom->remote_port))
   20.33  		goto out;
    21.1 --- a/tools/ioemu/target-i386-dm/exec-dm.c	Thu Jun 19 12:48:04 2008 +0900
    21.2 +++ b/tools/ioemu/target-i386-dm/exec-dm.c	Wed Jul 02 11:30:37 2008 +0900
    21.3 @@ -483,9 +483,11 @@ static void memcpy_words(void *dst, void
    21.4  }
    21.5  #endif
    21.6  
    21.7 -void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf, 
    21.8 -                            int len, int is_write)
    21.9 +void cpu_physical_memory_rw(target_phys_addr_t _addr, uint8_t *buf, 
   21.10 +                            int _len, int is_write)
   21.11  {
   21.12 +    target_phys_addr_t addr = _addr;
   21.13 +    int len = _len;
   21.14      int l, io_index;
   21.15      uint8_t *ptr;
   21.16      uint32_t val;
   21.17 @@ -520,6 +522,7 @@ void cpu_physical_memory_rw(target_phys_
   21.18              } else if ((ptr = phys_ram_addr(addr)) != NULL) {
   21.19                  /* Writing to RAM */
   21.20                  memcpy_words(ptr, buf, l);
   21.21 +#ifndef CONFIG_STUBDOM
   21.22                  if (logdirty_bitmap != NULL) {
   21.23                      /* Record that we have dirtied this frame */
   21.24                      unsigned long pfn = addr >> TARGET_PAGE_BITS;
   21.25 @@ -531,6 +534,7 @@ void cpu_physical_memory_rw(target_phys_
   21.26                              |= 1UL << pfn % HOST_LONG_BITS;
   21.27                      }
   21.28                  }
   21.29 +#endif
   21.30  #ifdef __ia64__
   21.31                  sync_icache(ptr, l);
   21.32  #endif 
   21.33 @@ -566,6 +570,13 @@ void cpu_physical_memory_rw(target_phys_
   21.34          addr += l;
   21.35      }
   21.36  
   21.37 +#ifdef CONFIG_STUBDOM
   21.38 +    if (logdirty_bitmap != NULL)
   21.39 +        xc_hvm_modified_memory(xc_handle, domid, _addr >> TARGET_PAGE_BITS,
   21.40 +                (_addr + _len + TARGET_PAGE_SIZE - 1) >> TARGET_PAGE_BITS
   21.41 +                    - _addr >> TARGET_PAGE_BITS);
   21.42 +#endif
   21.43 +
   21.44      mapcache_unlock();
   21.45  }
   21.46  #endif
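
For the stubdom build the dirty-page bookkeeping moves into the hypervisor: after the
copy, the byte range [_addr, _addr + _len) touched by cpu_physical_memory_rw() is reported
to xc_hvm_modified_memory() as a first pfn plus a page count. A small sketch of that range
arithmetic, with the whole end address rounded up and shifted before the first pfn is
subtracted:

    #include <stdint.h>
    #include <stdio.h>

    #define TARGET_PAGE_BITS 12
    #define TARGET_PAGE_SIZE (1ULL << TARGET_PAGE_BITS)

    int main(void)
    {
        uint64_t addr = 0x10ff8, len = 0x20;   /* write straddling two pages */
        uint64_t first_pfn = addr >> TARGET_PAGE_BITS;
        uint64_t nr = ((addr + len + TARGET_PAGE_SIZE - 1) >> TARGET_PAGE_BITS)
                      - first_pfn;

        /* What would be handed to the hypercall wrapper:
         *   xc_hvm_modified_memory(xc_handle, domid, first_pfn, nr);  */
        printf("first_pfn=%#llx nr=%llu\n",
               (unsigned long long)first_pfn, (unsigned long long)nr);
        return 0;
    }
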
    22.1 --- a/tools/ioemu/xenstore.c	Thu Jun 19 12:48:04 2008 +0900
    22.2 +++ b/tools/ioemu/xenstore.c	Wed Jul 02 11:30:37 2008 +0900
    22.3 @@ -260,8 +260,6 @@ void xenstore_parse_domain_config(int hv
    22.4  		    /* autoguess qcow vs qcow2 */
    22.5  		} else if (!strcmp(drv,"file") || !strcmp(drv,"phy")) {
    22.6  		    format = &bdrv_raw;
    22.7 -		} else if (!strcmp(drv,"phy")) {
    22.8 -		    format = &bdrv_raw;
    22.9  		} else {
   22.10  		    format = bdrv_find_format(drv);
   22.11  		    if (!format) {
   22.12 @@ -404,6 +402,10 @@ void xenstore_process_logdirty_event(voi
   22.13              /* No key yet: wait for the next watch */
   22.14              return;
   22.15  
   22.16 +#ifdef CONFIG_STUBDOM
   22.17 +        /* We pass the writes to hypervisor */
   22.18 +        seg = (void*)1;
   22.19 +#else
   22.20          strncpy(key_terminated, key_ascii, 16);
   22.21          free(key_ascii);
   22.22          key = (key_t) strtoull(key_terminated, NULL, 16);
   22.23 @@ -419,11 +421,6 @@ void xenstore_process_logdirty_event(voi
   22.24          fprintf(logfile, "%s: key=%16.16llx size=%lu\n", __FUNCTION__,
   22.25                  (unsigned long long)key, logdirty_bitmap_size);
   22.26  
   22.27 -#ifdef CONFIG_STUBDOM
   22.28 -        /* XXX we just can't use shm. */
   22.29 -        fprintf(logfile, "Log dirty is not implemented in stub domains!\n");
   22.30 -        return;
   22.31 -#else
   22.32          shmid = shmget(key, 2 * logdirty_bitmap_size, S_IRUSR|S_IWUSR);
   22.33          if (shmid == -1) {
   22.34              fprintf(logfile, "Log-dirty: shmget failed: segment %16.16llx "
    23.1 --- a/tools/libxc/ia64/xc_ia64_hvm_build.c	Thu Jun 19 12:48:04 2008 +0900
    23.2 +++ b/tools/libxc/ia64/xc_ia64_hvm_build.c	Wed Jul 02 11:30:37 2008 +0900
    23.3 @@ -1052,7 +1052,8 @@ error_out:
    23.4  int
    23.5  xc_hvm_build(int xc_handle, uint32_t domid, int memsize, const char *image_name)
    23.6  {
    23.7 -    vcpu_guest_context_t st_ctxt, *ctxt = &st_ctxt;
    23.8 +    vcpu_guest_context_any_t st_ctxt_any;
    23.9 +    vcpu_guest_context_t *ctxt = &st_ctxt_any.c;
   23.10      char *image = NULL;
   23.11      unsigned long image_size;
   23.12      unsigned long nr_pages;
   23.13 @@ -1079,14 +1080,14 @@ xc_hvm_build(int xc_handle, uint32_t dom
   23.14  
   23.15      free(image);
   23.16  
   23.17 -    memset(ctxt, 0, sizeof(*ctxt));
   23.18 +    memset(&st_ctxt_any, 0, sizeof(st_ctxt_any));
   23.19      ctxt->regs.ip = 0x80000000ffffffb0UL;
   23.20      ctxt->regs.ar.fpsr = xc_ia64_fpsr_default();
   23.21      ctxt->regs.cr.itir = 14 << 2;
   23.22      ctxt->regs.psr = IA64_PSR_AC | IA64_PSR_BN;
   23.23      ctxt->regs.cr.dcr = 0;
   23.24      ctxt->regs.cr.pta = 15 << 2;
   23.25 -    return xc_vcpu_setcontext(xc_handle, domid, 0, ctxt);
   23.26 +    return xc_vcpu_setcontext(xc_handle, domid, 0, &st_ctxt_any);
   23.27  
   23.28  error_out:
   23.29      free(image);
    24.1 --- a/tools/libxc/ia64/xc_ia64_linux_restore.c	Thu Jun 19 12:48:04 2008 +0900
    24.2 +++ b/tools/libxc/ia64/xc_ia64_linux_restore.c	Wed Jul 02 11:30:37 2008 +0900
    24.3 @@ -117,8 +117,9 @@ xc_ia64_recv_unallocated_list(int xc_han
    24.4  
    24.5  static int
    24.6  xc_ia64_recv_vcpu_context(int xc_handle, int io_fd, uint32_t dom,
    24.7 -                          uint32_t vcpu, vcpu_guest_context_t *ctxt)
    24.8 +                          uint32_t vcpu, vcpu_guest_context_any_t *ctxt_any)
    24.9  {
   24.10 +    vcpu_guest_context_t *ctxt = &ctxt_any->c;
   24.11      if (read_exact(io_fd, ctxt, sizeof(*ctxt))) {
   24.12          ERROR("Error when reading ctxt");
   24.13          return -1;
   24.14 @@ -128,14 +129,14 @@ xc_ia64_recv_vcpu_context(int xc_handle,
   24.15  
   24.16      /* Initialize and set registers.  */
   24.17      ctxt->flags = VGCF_EXTRA_REGS | VGCF_SET_CR_IRR | VGCF_online;
   24.18 -    if (xc_vcpu_setcontext(xc_handle, dom, vcpu, ctxt) != 0) {
   24.19 +    if (xc_vcpu_setcontext(xc_handle, dom, vcpu, ctxt_any) != 0) {
   24.20          ERROR("Couldn't set vcpu context");
   24.21          return -1;
   24.22      }
   24.23  
   24.24      /* Just a check.  */
   24.25      ctxt->flags = 0;
   24.26 -    if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt)) {
   24.27 +    if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt_any)) {
   24.28          ERROR("Could not get vcpu context");
   24.29          return -1;
   24.30      }
   24.31 @@ -226,19 +227,20 @@ xc_ia64_pv_recv_vcpu_context(int xc_hand
   24.32      int rc = -1;
   24.33  
   24.34      /* A copy of the CPU context of the guest. */
   24.35 -    vcpu_guest_context_t ctxt;
   24.36 -    
   24.37 -    if (lock_pages(&ctxt, sizeof(ctxt))) {
   24.38 +    vcpu_guest_context_any_t ctxt_any;
   24.39 +    vcpu_guest_context_t *ctxt = &ctxt_any.c;
   24.40 +
   24.41 +    if (lock_pages(&ctxt_any, sizeof(ctxt_any))) {
   24.42          /* needed for build domctl, but might as well do early */
   24.43          ERROR("Unable to lock_pages ctxt");
   24.44          return -1;
   24.45      }
   24.46  
   24.47 -    if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, vcpu, &ctxt))
   24.48 +    if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, vcpu, &ctxt_any))
   24.49          goto out;
   24.50  
   24.51      /* Then get privreg page.  */
   24.52 -    if (read_page(xc_handle, io_fd, dom, ctxt.privregs_pfn) < 0) {
   24.53 +    if (read_page(xc_handle, io_fd, dom, ctxt->privregs_pfn) < 0) {
   24.54          ERROR("Could not read vcpu privregs");
   24.55          goto out;
   24.56      }
   24.57 @@ -441,12 +443,12 @@ xc_ia64_hvm_recv_context(int xc_handle, 
   24.58      /* vcpu context */
   24.59      for (i = 0; i <= info.max_vcpu_id; i++) {
   24.60          /* A copy of the CPU context of the guest. */
   24.61 -        vcpu_guest_context_t ctxt;
   24.62 +        vcpu_guest_context_any_t ctxt_any;
   24.63  
   24.64          if (!__test_bit(i, vcpumap))
   24.65              continue;
   24.66  
   24.67 -        if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, i, &ctxt))
   24.68 +        if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, i, &ctxt_any))
   24.69              goto out;
   24.70  
   24.71          /* system context of vcpu is recieved as hvm context. */
    25.1 --- a/tools/libxc/ia64/xc_ia64_linux_save.c	Thu Jun 19 12:48:04 2008 +0900
    25.2 +++ b/tools/libxc/ia64/xc_ia64_linux_save.c	Wed Jul 02 11:30:37 2008 +0900
    25.3 @@ -180,9 +180,10 @@ xc_ia64_send_unallocated_list(int xc_han
    25.4  
    25.5  static int
    25.6  xc_ia64_send_vcpu_context(int xc_handle, int io_fd, uint32_t dom,
    25.7 -                          uint32_t vcpu, vcpu_guest_context_t *ctxt)
    25.8 +                          uint32_t vcpu, vcpu_guest_context_any_t *ctxt_any)
    25.9  {
   25.10 -    if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt)) {
   25.11 +    vcpu_guest_context_t *ctxt = &ctxt_any->c;
   25.12 +    if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt_any)) {
   25.13          ERROR("Could not get vcpu context");
   25.14          return -1;
   25.15      }
   25.16 @@ -269,17 +270,19 @@ xc_ia64_pv_send_context(int xc_handle, i
   25.17      /* vcpu context */
   25.18      for (i = 0; i <= info->max_vcpu_id; i++) {
   25.19          /* A copy of the CPU context of the guest. */
   25.20 -        vcpu_guest_context_t ctxt;
   25.21 +        vcpu_guest_context_any_t ctxt_any;
   25.22 +        vcpu_guest_context_t *ctxt = &ctxt_any.c;
   25.23 +
   25.24          char *mem;
   25.25  
   25.26          if (!__test_bit(i, vcpumap))
   25.27              continue;
   25.28  
   25.29 -        if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt))
   25.30 +        if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt_any))
   25.31              goto out;
   25.32  
   25.33          mem = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
   25.34 -                                   PROT_READ|PROT_WRITE, ctxt.privregs_pfn);
   25.35 +                                   PROT_READ|PROT_WRITE, ctxt->privregs_pfn);
   25.36          if (mem == NULL) {
   25.37              ERROR("cannot map privreg page");
   25.38              goto out;
   25.39 @@ -337,12 +340,12 @@ xc_ia64_hvm_send_context(int xc_handle, 
   25.40      /* vcpu context */
   25.41      for (i = 0; i <= info->max_vcpu_id; i++) {
   25.42          /* A copy of the CPU context of the guest. */
   25.43 -        vcpu_guest_context_t ctxt;
   25.44 +        vcpu_guest_context_any_t ctxt_any;
   25.45  
   25.46          if (!__test_bit(i, vcpumap))
   25.47              continue;
   25.48  
   25.49 -        if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt))
   25.50 +        if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt_any))
   25.51              goto out;
   25.52  
   25.53          /* system context of vcpu is sent as hvm context. */
    26.1 --- a/tools/libxc/xc_core.c	Thu Jun 19 12:48:04 2008 +0900
    26.2 +++ b/tools/libxc/xc_core.c	Wed Jul 02 11:30:37 2008 +0900
    26.3 @@ -407,7 +407,7 @@ xc_domain_dumpcore_via_callback(int xc_h
    26.4  
    26.5      int nr_vcpus = 0;
    26.6      char *dump_mem, *dump_mem_start = NULL;
    26.7 -    vcpu_guest_context_t  ctxt[MAX_VIRT_CPUS];
    26.8 +    vcpu_guest_context_any_t  ctxt[MAX_VIRT_CPUS];
    26.9      struct xc_core_arch_context arch_ctxt;
   26.10      char dummy[PAGE_SIZE];
   26.11      int dummy_len;
   26.12 @@ -581,10 +581,10 @@ xc_domain_dumpcore_via_callback(int xc_h
   26.13          PERROR("Could not get section header for .xen_prstatus");
   26.14          goto out;
   26.15      }
   26.16 -    filesz = sizeof(ctxt[0]) * nr_vcpus;
   26.17 +    filesz = sizeof(ctxt[0].c) * nr_vcpus;
   26.18      sts = xc_core_shdr_set(shdr, strtab, XEN_DUMPCORE_SEC_PRSTATUS,
   26.19                             SHT_PROGBITS, offset, filesz,
   26.20 -                           __alignof__(ctxt[0]), sizeof(ctxt[0]));
   26.21 +                           __alignof__(ctxt[0].c), sizeof(ctxt[0].c));
   26.22      if ( sts != 0 )
   26.23          goto out;
   26.24      offset += filesz;
   26.25 @@ -707,7 +707,7 @@ xc_domain_dumpcore_via_callback(int xc_h
   26.26          goto out;
   26.27  
   26.28      /* prstatus: .xen_prstatus */
   26.29 -    sts = dump_rtn(args, (char *)&ctxt, sizeof(ctxt[0]) * nr_vcpus);
   26.30 +    sts = dump_rtn(args, (char *)&ctxt[0].c, sizeof(ctxt[0].c) * nr_vcpus);
   26.31      if ( sts != 0 )
   26.32          goto out;
   26.33  
    27.1 --- a/tools/libxc/xc_core_ia64.c	Thu Jun 19 12:48:04 2008 +0900
    27.2 +++ b/tools/libxc/xc_core_ia64.c	Wed Jul 02 11:30:37 2008 +0900
    27.3 @@ -308,9 +308,10 @@ xc_core_arch_context_free(struct xc_core
    27.4  
    27.5  int
    27.6  xc_core_arch_context_get(struct xc_core_arch_context* arch_ctxt,
    27.7 -                         vcpu_guest_context_t* ctxt,
    27.8 +                         vcpu_guest_context_any_t* ctxt_any,
    27.9                           int xc_handle, uint32_t domid)
   27.10  {
   27.11 +    vcpu_guest_context_t *ctxt = &ctxt_any->c;
   27.12      mapped_regs_t* mapped_regs;
   27.13  
   27.14      if ( ctxt->privregs_pfn == VGC_PRIVREGS_HVM )
    28.1 --- a/tools/libxc/xc_core_ia64.h	Thu Jun 19 12:48:04 2008 +0900
    28.2 +++ b/tools/libxc/xc_core_ia64.h	Wed Jul 02 11:30:37 2008 +0900
    28.3 @@ -40,7 +40,7 @@ void
    28.4  xc_core_arch_context_free(struct xc_core_arch_context* arch_ctxt);
    28.5  int
    28.6  xc_core_arch_context_get(struct xc_core_arch_context* arch_ctxt,
    28.7 -                         vcpu_guest_context_t* ctxt,
    28.8 +                         vcpu_guest_context_any_t* ctxt,
    28.9                           int xc_handle, uint32_t domid);
   28.10  int
   28.11  xc_core_arch_context_get_shdr(struct xc_core_arch_context* arch_ctxt, 
    29.1 --- a/tools/libxc/xc_domain.c	Thu Jun 19 12:48:04 2008 +0900
    29.2 +++ b/tools/libxc/xc_domain.c	Wed Jul 02 11:30:37 2008 +0900
    29.3 @@ -298,30 +298,21 @@ int xc_domain_hvm_setcontext(int xc_hand
    29.4  int xc_vcpu_getcontext(int xc_handle,
    29.5                         uint32_t domid,
    29.6                         uint32_t vcpu,
    29.7 -                       vcpu_guest_context_t *ctxt)
    29.8 +                       vcpu_guest_context_any_t *ctxt)
    29.9  {
   29.10      int rc;
   29.11      DECLARE_DOMCTL;
   29.12 -    size_t sz = sizeof(vcpu_guest_context_either_t);
   29.13 +    size_t sz = sizeof(vcpu_guest_context_any_t);
   29.14  
   29.15      domctl.cmd = XEN_DOMCTL_getvcpucontext;
   29.16      domctl.domain = (domid_t)domid;
   29.17      domctl.u.vcpucontext.vcpu   = (uint16_t)vcpu;
   29.18 -    set_xen_guest_handle(domctl.u.vcpucontext.ctxt, ctxt);
   29.19 +    set_xen_guest_handle(domctl.u.vcpucontext.ctxt, &ctxt->c);
   29.20  
   29.21 -    /*
   29.22 -     * We may be asked to lock either a 32-bit or a 64-bit context. Lock the
   29.23 -     * larger of the two if possible, otherwise fall back to native size.
   29.24 -     */
   29.25 +    
   29.26      if ( (rc = lock_pages(ctxt, sz)) != 0 )
   29.27 -    {
   29.28 -        sz = sizeof(*ctxt);
   29.29 -        if ( (rc = lock_pages(ctxt, sz)) != 0 )
   29.30 -            return rc;
   29.31 -    }
   29.32 -
   29.33 +        return rc;
   29.34      rc = do_domctl(xc_handle, &domctl);
   29.35 -
   29.36      unlock_pages(ctxt, sz);
   29.37  
   29.38      return rc;
   29.39 @@ -626,32 +617,28 @@ int xc_availheap(int xc_handle,
   29.40  int xc_vcpu_setcontext(int xc_handle,
   29.41                         uint32_t domid,
   29.42                         uint32_t vcpu,
   29.43 -                       vcpu_guest_context_t *ctxt)
   29.44 +                       vcpu_guest_context_any_t *ctxt)
   29.45  {
   29.46      DECLARE_DOMCTL;
   29.47      int rc;
   29.48 -    size_t sz = sizeof(vcpu_guest_context_either_t);
   29.49 +    size_t sz = sizeof(vcpu_guest_context_any_t);
   29.50 +
   29.51 +    if (ctxt == NULL)
   29.52 +    {
   29.53 +        errno = EINVAL;
   29.54 +        return -1;
   29.55 +    }
   29.56  
   29.57      domctl.cmd = XEN_DOMCTL_setvcpucontext;
   29.58      domctl.domain = domid;
   29.59      domctl.u.vcpucontext.vcpu = vcpu;
   29.60 -    set_xen_guest_handle(domctl.u.vcpucontext.ctxt, ctxt);
   29.61 +    set_xen_guest_handle(domctl.u.vcpucontext.ctxt, &ctxt->c);
   29.62  
   29.63 -    /*
   29.64 -     * We may be asked to lock either a 32-bit or a 64-bit context. Lock the
   29.65 -     * larger of the two if possible, otherwise fall back to native size.
   29.66 -     */
   29.67 -    if ( (ctxt != NULL) && (rc = lock_pages(ctxt, sz)) != 0 )
   29.68 -    {
   29.69 -        sz = sizeof(*ctxt);
   29.70 -        if ( (rc = lock_pages(ctxt, sz)) != 0 )
   29.71 -            return rc;
   29.72 -    }
   29.73 -
   29.74 +    if ( (rc = lock_pages(ctxt, sz)) != 0 )
   29.75 +        return rc;
   29.76      rc = do_domctl(xc_handle, &domctl);
   29.77 -
   29.78 -    if ( ctxt != NULL )
   29.79 -        unlock_pages(ctxt, sz);
   29.80 +    
   29.81 +    unlock_pages(ctxt, sz);
   29.82  
   29.83      return rc;
   29.84  }
    30.1 --- a/tools/libxc/xc_domain_restore.c	Thu Jun 19 12:48:04 2008 +0900
    30.2 +++ b/tools/libxc/xc_domain_restore.c	Wed Jul 02 11:30:37 2008 +0900
    30.3 @@ -153,7 +153,7 @@ static xen_pfn_t *load_p2m_frame_list(
    30.4      int io_fd, int *pae_extended_cr3, int *ext_vcpucontext)
    30.5  {
    30.6      xen_pfn_t *p2m_frame_list;
    30.7 -    vcpu_guest_context_either_t ctxt;
    30.8 +    vcpu_guest_context_any_t ctxt;
    30.9      xen_pfn_t p2m_fl_zero;
   30.10  
   30.11      /* Read first entry of P2M list, or extended-info signature (~0UL). */
   30.12 @@ -284,12 +284,12 @@ int xc_domain_restore(int xc_handle, int
   30.13      /* The new domain's shared-info frame number. */
   30.14      unsigned long shared_info_frame;
   30.15      unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
   30.16 -    shared_info_either_t *old_shared_info = 
   30.17 -        (shared_info_either_t *)shared_info_page;
   30.18 -    shared_info_either_t *new_shared_info;
   30.19 +    shared_info_any_t *old_shared_info = 
   30.20 +        (shared_info_any_t *)shared_info_page;
   30.21 +    shared_info_any_t *new_shared_info;
   30.22  
   30.23      /* A copy of the CPU context of the guest. */
   30.24 -    vcpu_guest_context_either_t ctxt;
   30.25 +    vcpu_guest_context_any_t ctxt;
   30.26  
   30.27      /* A table containing the type of each PFN (/not/ MFN!). */
   30.28      unsigned long *pfn_type = NULL;
   30.29 @@ -304,7 +304,7 @@ int xc_domain_restore(int xc_handle, int
   30.30      xen_pfn_t *p2m_frame_list = NULL;
   30.31      
   30.32      /* A temporary mapping of the guest's start_info page. */
   30.33 -    start_info_either_t *start_info;
   30.34 +    start_info_any_t *start_info;
   30.35  
   30.36      /* Our mapping of the current region (batch) */
   30.37      char *region_base;
    31.1 --- a/tools/libxc/xc_domain_save.c	Thu Jun 19 12:48:04 2008 +0900
    31.2 +++ b/tools/libxc/xc_domain_save.c	Wed Jul 02 11:30:37 2008 +0900
    31.3 @@ -412,7 +412,7 @@ static int suspend_and_state(int (*suspe
    31.4  ** it to update the MFN to a reasonable value.
    31.5  */
    31.6  static void *map_frame_list_list(int xc_handle, uint32_t dom,
    31.7 -                                 shared_info_either_t *shinfo)
    31.8 +                                 shared_info_any_t *shinfo)
    31.9  {
   31.10      int count = 100;
   31.11      void *p;
   31.12 @@ -628,9 +628,9 @@ static xen_pfn_t *map_and_save_p2m_table
   31.13                                           int io_fd, 
   31.14                                           uint32_t dom,
   31.15                                           unsigned long p2m_size,
   31.16 -                                         shared_info_either_t *live_shinfo)
   31.17 +                                         shared_info_any_t *live_shinfo)
   31.18  {
   31.19 -    vcpu_guest_context_either_t ctxt;
   31.20 +    vcpu_guest_context_any_t ctxt;
   31.21  
   31.22      /* Double and single indirect references to the live P2M table */
   31.23      void *live_p2m_frame_list_list = NULL;
   31.24 @@ -735,7 +735,7 @@ static xen_pfn_t *map_and_save_p2m_table
   31.25          p2m_frame_list[i/FPP] = mfn_to_pfn(p2m_frame_list[i/FPP]);
   31.26      }
   31.27  
   31.28 -    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt.c) )
   31.29 +    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
   31.30      {
   31.31          ERROR("Could not get vcpu context");
   31.32          goto out;
   31.33 @@ -814,7 +814,7 @@ int xc_domain_save(int xc_handle, int io
   31.34      unsigned long shared_info_frame;
   31.35  
   31.36      /* A copy of the CPU context of the guest. */
   31.37 -    vcpu_guest_context_either_t ctxt;
   31.38 +    vcpu_guest_context_any_t ctxt;
   31.39  
   31.40      /* A table containing the type of each PFN (/not/ MFN!). */
   31.41      unsigned long *pfn_type = NULL;
   31.42 @@ -824,7 +824,7 @@ int xc_domain_save(int xc_handle, int io
   31.43      char page[PAGE_SIZE];
   31.44  
   31.45      /* Live mapping of shared info structure */
   31.46 -    shared_info_either_t *live_shinfo = NULL;
   31.47 +    shared_info_any_t *live_shinfo = NULL;
   31.48  
   31.49      /* base of the region in which domain memory is mapped */
   31.50      unsigned char *region_base = NULL;
   31.51 @@ -1536,7 +1536,7 @@ int xc_domain_save(int xc_handle, int io
   31.52          }
   31.53      }
   31.54  
   31.55 -    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt.c) )
   31.56 +    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
   31.57      {
   31.58          ERROR("Could not get vcpu context");
   31.59          goto out;
   31.60 @@ -1556,7 +1556,7 @@ int xc_domain_save(int xc_handle, int io
   31.61          if ( !(vcpumap & (1ULL << i)) )
   31.62              continue;
   31.63  
   31.64 -        if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt.c) )
   31.65 +        if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
   31.66          {
   31.67              ERROR("No context for VCPU%d", i);
   31.68              goto out;
   31.69 @@ -1624,7 +1624,7 @@ int xc_domain_save(int xc_handle, int io
   31.70       * Reset the MFN to be a known-invalid value. See map_frame_list_list().
   31.71       */
   31.72      memcpy(page, live_shinfo, PAGE_SIZE);
   31.73 -    SET_FIELD(((shared_info_either_t *)page), 
   31.74 +    SET_FIELD(((shared_info_any_t *)page), 
   31.75                arch.pfn_to_mfn_frame_list_list, 0);
   31.76      if ( write_exact(io_fd, page, PAGE_SIZE) )
   31.77      {
    32.1 --- a/tools/libxc/xc_misc.c	Thu Jun 19 12:48:04 2008 +0900
    32.2 +++ b/tools/libxc/xc_misc.c	Wed Jul 02 11:30:37 2008 +0900
    32.3 @@ -267,6 +267,34 @@ int xc_hvm_track_dirty_vram(
    32.4      return rc;
    32.5  }
    32.6  
    32.7 +int xc_hvm_modified_memory(
    32.8 +    int xc_handle, domid_t dom, uint64_t first_pfn, uint64_t nr)
    32.9 +{
   32.10 +    DECLARE_HYPERCALL;
   32.11 +    struct xen_hvm_modified_memory arg;
   32.12 +    int rc;
   32.13 +
   32.14 +    hypercall.op     = __HYPERVISOR_hvm_op;
   32.15 +    hypercall.arg[0] = HVMOP_modified_memory;
   32.16 +    hypercall.arg[1] = (unsigned long)&arg;
   32.17 +
   32.18 +    arg.domid     = dom;
   32.19 +    arg.first_pfn = first_pfn;
   32.20 +    arg.nr        = nr;
   32.21 +
   32.22 +    if ( (rc = lock_pages(&arg, sizeof(arg))) != 0 )
   32.23 +    {
   32.24 +        PERROR("Could not lock memory");
   32.25 +        return rc;
   32.26 +    }
   32.27 +
   32.28 +    rc = do_xen_hypercall(xc_handle, &hypercall);
   32.29 +
   32.30 +    unlock_pages(&arg, sizeof(arg));
   32.31 +
   32.32 +    return rc;
   32.33 +}
   32.34 +
   32.35  void *xc_map_foreign_pages(int xc_handle, uint32_t dom, int prot,
   32.36                             const xen_pfn_t *arr, int num)
   32.37  {
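
xc_hvm_modified_memory() follows the usual libxc shape: fill in the hypercall argument,
lock it, issue HVMOP_modified_memory, unlock. A hedged usage sketch from a device model's
side, assuming the caller already holds an open control interface handle and the target
domid:

    #include <stdint.h>
    #include <stdio.h>
    #include <xenctrl.h>

    /* Tell Xen that `nr` guest frames starting at `first_pfn` were written
     * by the device model, so log-dirty tracking picks them up. */
    static void mark_guest_pages_dirty(int xc_handle, domid_t domid,
                                       uint64_t first_pfn, uint64_t nr)
    {
        if (xc_hvm_modified_memory(xc_handle, domid, first_pfn, nr))
            fprintf(stderr, "xc_hvm_modified_memory(%llu, %llu) failed\n",
                    (unsigned long long)first_pfn, (unsigned long long)nr);
    }
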
    33.1 --- a/tools/libxc/xc_pagetab.c	Thu Jun 19 12:48:04 2008 +0900
    33.2 +++ b/tools/libxc/xc_pagetab.c	Wed Jul 02 11:30:37 2008 +0900
    33.3 @@ -48,7 +48,7 @@
    33.4  unsigned long xc_translate_foreign_address(int xc_handle, uint32_t dom,
    33.5                                             int vcpu, unsigned long long virt )
    33.6  {
    33.7 -    vcpu_guest_context_t ctx;
    33.8 +    vcpu_guest_context_any_t ctx;
    33.9      unsigned long long cr3;
   33.10      void *pd, *pt, *pdppage = NULL, *pdp, *pml = NULL;
   33.11      unsigned long long pde, pte, pdpe, pmle;
   33.12 @@ -78,7 +78,7 @@ unsigned long xc_translate_foreign_addre
   33.13          DPRINTF("failed to retreive vcpu context\n");
   33.14          goto out;
   33.15      }
   33.16 -    cr3 = ((unsigned long long)xen_cr3_to_pfn(ctx.ctrlreg[3])) << PAGE_SHIFT;
   33.17 +    cr3 = ((unsigned long long)xen_cr3_to_pfn(ctx.c.ctrlreg[3])) << PAGE_SHIFT;
   33.18  
   33.19      /* Page Map Level 4 */
   33.20  
    34.1 --- a/tools/libxc/xc_private.h	Thu Jun 19 12:48:04 2008 +0900
    34.2 +++ b/tools/libxc/xc_private.h	Wed Jul 02 11:30:37 2008 +0900
    34.3 @@ -188,9 +188,9 @@ int xc_map_foreign_ranges(int xc_handle,
    34.4                            privcmd_mmap_entry_t *entries, int nr);
    34.5  
    34.6  void *map_domain_va_core(unsigned long domfd, int cpu, void *guest_va,
    34.7 -                         vcpu_guest_context_t *ctxt);
    34.8 +                         vcpu_guest_context_any_t *ctxt);
    34.9  int xc_waitdomain_core(int xc_handle, int domain, int *status,
   34.10 -    int options, vcpu_guest_context_t *ctxt);
   34.11 +    int options, vcpu_guest_context_any_t *ctxt);
   34.12  
   34.13  void bitmap_64_to_byte(uint8_t *bp, const uint64_t *lp, int nbits);
   34.14  void bitmap_byte_to_64(uint64_t *lp, const uint8_t *bp, int nbits);
    35.1 --- a/tools/libxc/xc_ptrace.c	Thu Jun 19 12:48:04 2008 +0900
    35.2 +++ b/tools/libxc/xc_ptrace.c	Wed Jul 02 11:30:37 2008 +0900
    35.3 @@ -40,9 +40,9 @@ static int current_domid = -1;
    35.4  static int current_isfile;
    35.5  static int current_is_hvm;
    35.6  
    35.7 -static uint64_t                 online_cpumap;
    35.8 -static uint64_t                 regs_valid;
    35.9 -static vcpu_guest_context_t     ctxt[MAX_VIRT_CPUS];
   35.10 +static uint64_t                         online_cpumap;
   35.11 +static uint64_t                         regs_valid;
   35.12 +static vcpu_guest_context_any_t      ctxt[MAX_VIRT_CPUS];
   35.13  
   35.14  extern int ffsll(long long int);
   35.15  #define FOREACH_CPU(cpumap, i)  for ( cpumap = online_cpumap; (i = ffsll(cpumap)); cpumap &= ~(1 << (index - 1)) )
   35.16 @@ -96,9 +96,9 @@ xc_register_event_handler(thr_ev_handler
   35.17  }
   35.18  
   35.19  static inline int
   35.20 -paging_enabled(vcpu_guest_context_t *v)
   35.21 +paging_enabled(vcpu_guest_context_any_t *v)
   35.22  {
   35.23 -    unsigned long cr0 = v->ctrlreg[0];
   35.24 +    unsigned long cr0 = v->c.ctrlreg[0];
   35.25      return (cr0 & X86_CR0_PE) && (cr0 & X86_CR0_PG);
   35.26  }
   35.27  
   35.28 @@ -174,7 +174,7 @@ map_domain_va_32(
   35.29  
   35.30      l2 = xc_map_foreign_range(
   35.31           xc_handle, current_domid, PAGE_SIZE, PROT_READ,
   35.32 -         xen_cr3_to_pfn(ctxt[cpu].ctrlreg[3]));
   35.33 +         xen_cr3_to_pfn(ctxt[cpu].c.ctrlreg[3]));
   35.34      if ( l2 == NULL )
   35.35          return NULL;
   35.36  
   35.37 @@ -216,7 +216,7 @@ map_domain_va_pae(
   35.38  
   35.39      l3 = xc_map_foreign_range(
   35.40          xc_handle, current_domid, PAGE_SIZE, PROT_READ,
   35.41 -        xen_cr3_to_pfn(ctxt[cpu].ctrlreg[3]));
   35.42 +        xen_cr3_to_pfn(ctxt[cpu].c.ctrlreg[3]));
   35.43      if ( l3 == NULL )
   35.44          return NULL;
   35.45  
   35.46 @@ -264,12 +264,12 @@ map_domain_va_64(
   35.47      uint64_t *l4, *l3, *l2, *l1;
   35.48      static void *v[MAX_VIRT_CPUS];
   35.49  
   35.50 -    if ((ctxt[cpu].ctrlreg[4] & 0x20) == 0 ) /* legacy ia32 mode */
   35.51 +    if ((ctxt[cpu].c.ctrlreg[4] & 0x20) == 0 ) /* legacy ia32 mode */
   35.52          return map_domain_va_32(xc_handle, cpu, guest_va, perm);
   35.53  
   35.54      l4 = xc_map_foreign_range(
   35.55          xc_handle, current_domid, PAGE_SIZE, PROT_READ,
   35.56 -        xen_cr3_to_pfn(ctxt[cpu].ctrlreg[3]));
   35.57 +        xen_cr3_to_pfn(ctxt[cpu].c.ctrlreg[3]));
   35.58      if ( l4 == NULL )
   35.59          return NULL;
   35.60  
   35.61 @@ -494,26 +494,26 @@ xc_ptrace(
   35.62      case PTRACE_GETREGS:
   35.63          if (!current_isfile && fetch_regs(xc_handle, cpu, NULL))
   35.64              goto out_error;
   35.65 -        SET_PT_REGS(pt, ctxt[cpu].user_regs);
   35.66 +        SET_PT_REGS(pt, ctxt[cpu].c.user_regs);
   35.67          memcpy(data, &pt, sizeof(struct gdb_regs));
   35.68          break;
   35.69  
   35.70      case PTRACE_GETFPREGS:
   35.71          if (!current_isfile && fetch_regs(xc_handle, cpu, NULL)) 
   35.72                  goto out_error;
   35.73 -        memcpy(data, &ctxt[cpu].fpu_ctxt, sizeof (elf_fpregset_t));
   35.74 +        memcpy(data, &ctxt[cpu].c.fpu_ctxt, sizeof (elf_fpregset_t));
   35.75          break;
   35.76  
   35.77      case PTRACE_GETFPXREGS:
   35.78          if (!current_isfile && fetch_regs(xc_handle, cpu, NULL))
   35.79                  goto out_error;
   35.80 -        memcpy(data, &ctxt[cpu].fpu_ctxt, sizeof(ctxt[cpu].fpu_ctxt));
   35.81 +        memcpy(data, &ctxt[cpu].c.fpu_ctxt, sizeof(ctxt[cpu].c.fpu_ctxt));
   35.82          break;
   35.83  
   35.84      case PTRACE_SETREGS:
   35.85          if (current_isfile)
   35.86                  goto out_unsupported; /* XXX not yet supported */
   35.87 -        SET_XC_REGS(((struct gdb_regs *)data), ctxt[cpu].user_regs);
   35.88 +        SET_XC_REGS(((struct gdb_regs *)data), ctxt[cpu].c.user_regs);
   35.89          if ((retval = xc_vcpu_setcontext(xc_handle, current_domid, cpu,
   35.90                                  &ctxt[cpu])))
   35.91              goto out_error_domctl;
   35.92 @@ -525,7 +525,7 @@ xc_ptrace(
   35.93          /*  XXX we can still have problems if the user switches threads
   35.94           *  during single-stepping - but that just seems retarded
   35.95           */
   35.96 -        ctxt[cpu].user_regs.eflags |= PSL_T;
   35.97 +        ctxt[cpu].c.user_regs.eflags |= PSL_T;
   35.98          if ((retval = xc_vcpu_setcontext(xc_handle, current_domid, cpu,
   35.99                                  &ctxt[cpu])))
  35.100              goto out_error_domctl;
  35.101 @@ -542,9 +542,9 @@ xc_ptrace(
  35.102                  if (fetch_regs(xc_handle, cpu, NULL))
  35.103                      goto out_error;
  35.104                  /* Clear trace flag */
  35.105 -                if ( ctxt[cpu].user_regs.eflags & PSL_T )
  35.106 +                if ( ctxt[cpu].c.user_regs.eflags & PSL_T )
  35.107                  {
  35.108 -                    ctxt[cpu].user_regs.eflags &= ~PSL_T;
  35.109 +                    ctxt[cpu].c.user_regs.eflags &= ~PSL_T;
  35.110                      if ((retval = xc_vcpu_setcontext(xc_handle, current_domid,
  35.111                                                  cpu, &ctxt[cpu])))
  35.112                          goto out_error_domctl;
    36.1 --- a/tools/libxc/xc_ptrace_core.c	Thu Jun 19 12:48:04 2008 +0900
    36.2 +++ b/tools/libxc/xc_ptrace_core.c	Wed Jul 02 11:30:37 2008 +0900
    36.3 @@ -641,24 +641,24 @@ static const struct xc_core_format_type*
    36.4  
    36.5  void *
    36.6  map_domain_va_core(unsigned long domfd, int cpu, void *guest_va,
    36.7 -                   vcpu_guest_context_t *ctxt)
    36.8 +                   vcpu_guest_context_any_t *ctxt)
    36.9  {
   36.10      if (current_format_type == NULL)
   36.11          return NULL;
   36.12      return (current_format_type->map_domain_va_core)(domfd, cpu, guest_va,
   36.13 -                                                     ctxt);
   36.14 +                                                     &ctxt->c);
   36.15  }
   36.16  
   36.17  int
   36.18  xc_waitdomain_core(int xc_handle, int domfd, int *status, int options,
   36.19 -                   vcpu_guest_context_t *ctxt)
   36.20 +                   vcpu_guest_context_any_t *ctxt)
   36.21  {
   36.22      int ret;
   36.23      int i;
   36.24  
   36.25      for (i = 0; i < NR_FORMAT_TYPE; i++) {
   36.26          ret = (format_type[i].waitdomain_core)(xc_handle, domfd, status,
   36.27 -                                               options, ctxt);
   36.28 +                                               options, &ctxt->c);
   36.29          if (ret == 0) {
   36.30              current_format_type = &format_type[i];
   36.31              break;
    37.1 --- a/tools/libxc/xc_resume.c	Thu Jun 19 12:48:04 2008 +0900
    37.2 +++ b/tools/libxc/xc_resume.c	Wed Jul 02 11:30:37 2008 +0900
    37.3 @@ -13,7 +13,7 @@
    37.4  
    37.5  static int modify_returncode(int xc_handle, uint32_t domid)
    37.6  {
    37.7 -    vcpu_guest_context_either_t ctxt;
    37.8 +    vcpu_guest_context_any_t ctxt;
    37.9      xc_dominfo_t info;
   37.10      xen_capabilities_info_t caps;
   37.11      int rc;
   37.12 @@ -39,7 +39,7 @@ static int modify_returncode(int xc_hand
   37.13          return -1;
   37.14      }
   37.15  
   37.16 -    if ( (rc = xc_vcpu_getcontext(xc_handle, domid, 0, &ctxt.c)) != 0 )
   37.17 +    if ( (rc = xc_vcpu_getcontext(xc_handle, domid, 0, &ctxt)) != 0 )
   37.18          return rc;
   37.19  
   37.20      if ( !info.hvm )
   37.21 @@ -49,7 +49,7 @@ static int modify_returncode(int xc_hand
   37.22      else
   37.23          ctxt.x32.user_regs.eax = 1;
   37.24  
   37.25 -    if ( (rc = xc_vcpu_setcontext(xc_handle, domid, 0, &ctxt.c)) != 0 )
   37.26 +    if ( (rc = xc_vcpu_setcontext(xc_handle, domid, 0, &ctxt)) != 0 )
   37.27          return rc;
   37.28  
   37.29      return 0;
   37.30 @@ -89,7 +89,7 @@ static int xc_domain_resume_any(int xc_h
   37.31      int i, rc = -1;
   37.32  #if defined(__i386__) || defined(__x86_64__)
   37.33      unsigned long mfn, p2m_size = 0;
   37.34 -    vcpu_guest_context_t ctxt;
   37.35 +    vcpu_guest_context_any_t ctxt;
   37.36      start_info_t *start_info;
   37.37      shared_info_t *shinfo = NULL;
   37.38      xen_pfn_t *p2m_frame_list_list = NULL;
   37.39 @@ -167,7 +167,7 @@ static int xc_domain_resume_any(int xc_h
   37.40          goto out;
   37.41      }
   37.42  
   37.43 -    mfn = ctxt.user_regs.edx;
   37.44 +    mfn = ctxt.c.user_regs.edx;
   37.45  
   37.46      start_info = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
   37.47                                        PROT_READ | PROT_WRITE, mfn);
    38.1 --- a/tools/libxc/xenctrl.h	Thu Jun 19 12:48:04 2008 +0900
    38.2 +++ b/tools/libxc/xenctrl.h	Wed Jul 02 11:30:37 2008 +0900
    38.3 @@ -31,6 +31,11 @@
    38.4  #include <xen/xsm/acm_ops.h>
    38.5  #include <xen/xsm/flask_op.h>
    38.6  
    38.7 +#if defined(__i386__) || defined(__x86_64__)
    38.8 +#include <xen/foreign/x86_32.h>
    38.9 +#include <xen/foreign/x86_64.h>
   38.10 +#endif
   38.11 +
   38.12  #ifdef __ia64__
   38.13  #define XC_PAGE_SHIFT           14
   38.14  #else
   38.15 @@ -162,6 +167,35 @@ typedef struct xc_dominfo {
   38.16  } xc_dominfo_t;
   38.17  
   38.18  typedef xen_domctl_getdomaininfo_t xc_domaininfo_t;
   38.19 +
   38.20 +typedef union 
   38.21 +{
   38.22 +#if defined(__i386__) || defined(__x86_64__)
   38.23 +    vcpu_guest_context_x86_64_t x64;
   38.24 +    vcpu_guest_context_x86_32_t x32;   
   38.25 +#endif
   38.26 +    vcpu_guest_context_t c;
   38.27 +} vcpu_guest_context_any_t;
   38.28 +
   38.29 +typedef union
   38.30 +{
   38.31 +#if defined(__i386__) || defined(__x86_64__)
   38.32 +    shared_info_x86_64_t x64;
   38.33 +    shared_info_x86_32_t x32;
   38.34 +#endif
   38.35 +    shared_info_t s;
   38.36 +} shared_info_any_t;
   38.37 +
   38.38 +typedef union
   38.39 +{
   38.40 +#if defined(__i386__) || defined(__x86_64__)
   38.41 +    start_info_x86_64_t x64;
   38.42 +    start_info_x86_32_t x32;
   38.43 +#endif
   38.44 +    start_info_t s;
   38.45 +} start_info_any_t;
   38.46 +
   38.47 +
   38.48  int xc_domain_create(int xc_handle,
   38.49                       uint32_t ssidref,
   38.50                       xen_domain_handle_t handle,
   38.51 @@ -307,7 +341,7 @@ int xc_domain_getinfo(int xc_handle,
   38.52  int xc_vcpu_setcontext(int xc_handle,
   38.53                         uint32_t domid,
   38.54                         uint32_t vcpu,
   38.55 -                       vcpu_guest_context_t *ctxt);
   38.56 +                       vcpu_guest_context_any_t *ctxt);
   38.57  /**
   38.58   * This function will return information about one or more domains, using a
   38.59   * single hypercall.  The domain information will be stored into the supplied
   38.60 @@ -368,7 +402,7 @@ int xc_domain_hvm_setcontext(int xc_hand
   38.61  int xc_vcpu_getcontext(int xc_handle,
   38.62                         uint32_t domid,
   38.63                         uint32_t vcpu,
   38.64 -                       vcpu_guest_context_t *ctxt);
   38.65 +                       vcpu_guest_context_any_t *ctxt);
   38.66  
   38.67  typedef xen_domctl_getvcpuinfo_t xc_vcpuinfo_t;
   38.68  int xc_vcpu_getinfo(int xc_handle,
   38.69 @@ -895,6 +929,12 @@ int xc_hvm_track_dirty_vram(
   38.70      uint64_t first_pfn, uint64_t nr,
   38.71      unsigned long *bitmap);
   38.72  
   38.73 +/*
   38.74 + * Notify that some pages got modified by the Device Model
   38.75 + */
   38.76 +int xc_hvm_modified_memory(
   38.77 +    int xc_handle, domid_t dom, uint64_t first_pfn, uint64_t nr);
   38.78 +
   38.79  typedef enum {
   38.80    XC_ERROR_NONE = 0,
   38.81    XC_INTERNAL_ERROR = 1,
    39.1 --- a/tools/libxc/xg_save_restore.h	Thu Jun 19 12:48:04 2008 +0900
    39.2 +++ b/tools/libxc/xg_save_restore.h	Wed Jul 02 11:30:37 2008 +0900
    39.3 @@ -112,28 +112,6 @@ static inline int get_platform_info(int 
    39.4  #define is_mapped(pfn_type) (!((pfn_type) & 0x80000000UL))
    39.5  
    39.6  
    39.7 -/* 32-on-64 support: saving 32bit guests from 64bit tools and vice versa */
    39.8 -typedef union 
    39.9 -{
   39.10 -    vcpu_guest_context_x86_64_t x64;
   39.11 -    vcpu_guest_context_x86_32_t x32;   
   39.12 -    vcpu_guest_context_t c;
   39.13 -} vcpu_guest_context_either_t;
   39.14 -
   39.15 -typedef union 
   39.16 -{
   39.17 -    shared_info_x86_64_t x64;
   39.18 -    shared_info_x86_32_t x32;   
   39.19 -    shared_info_t s;
   39.20 -} shared_info_either_t;
   39.21 -
   39.22 -typedef union 
   39.23 -{
   39.24 -    start_info_x86_64_t x64;
   39.25 -    start_info_x86_32_t x32;   
   39.26 -    start_info_t s;
   39.27 -} start_info_either_t;
   39.28 -
   39.29  #define GET_FIELD(_p, _f) ((guest_width==8) ? ((_p)->x64._f) : ((_p)->x32._f))
   39.30  
   39.31  #define SET_FIELD(_p, _f, _v) do {              \
    40.1 --- a/tools/python/xen/util/blkif.py	Thu Jun 19 12:48:04 2008 +0900
    40.2 +++ b/tools/python/xen/util/blkif.py	Wed Jul 02 11:30:37 2008 +0900
    40.3 @@ -16,8 +16,11 @@ def blkdev_name_to_number(name):
    40.4  
    40.5      n = expand_dev_name(name)
    40.6  
    40.7 +    devname = 'virtual-device'
    40.8 +    devnum = None
    40.9 +
   40.10      try:
   40.11 -        return os.stat(n).st_rdev
   40.12 +        return (devname, os.stat(n).st_rdev)
   40.13      except Exception, ex:
   40.14          pass
   40.15  
   40.16 @@ -25,28 +28,30 @@ def blkdev_name_to_number(name):
   40.17      if re.match( '/dev/sd[a-z]([1-9]|1[0-5])?$', n):
   40.18          major = scsi_major[(ord(n[7:8]) - ord('a')) / 16]
   40.19          minor = ((ord(n[7:8]) - ord('a')) % 16) * 16 + int(n[8:] or 0)
   40.20 -        return major * 256 + minor
   40.21 -    if re.match( '/dev/sd[a-i][a-z]([1-9]|1[0-5])?$', n):
   40.22 +        devnum = major * 256 + minor
   40.23 +    elif re.match( '/dev/sd[a-i][a-z]([1-9]|1[0-5])?$', n):
   40.24          major = scsi_major[((ord(n[7:8]) - ord('a') + 1) * 26 + (ord(n[8:9]) - ord('a'))) / 16 ]
   40.25          minor = (((ord(n[7:8]) - ord('a') + 1 ) * 26 + (ord(n[8:9]) - ord('a'))) % 16) * 16 + int(n[9:] or 0)
   40.26 -        return major * 256 + minor
   40.27 -
   40.28 -    if re.match( '/dev/hd[a-t]([1-9]|[1-5][0-9]|6[0-3])?', n):
   40.29 +        devnum = major * 256 + minor
   40.30 +    elif re.match( '/dev/hd[a-t]([1-9]|[1-5][0-9]|6[0-3])?', n):
   40.31          ide_majors = [ 3, 22, 33, 34, 56, 57, 88, 89, 90, 91 ]
   40.32          major = ide_majors[(ord(n[7:8]) - ord('a')) / 2]
   40.33          minor = ((ord(n[7:8]) - ord('a')) % 2) * 64 + int(n[8:] or 0)
   40.34 -        return major * 256 + minor
   40.35 -
   40.36 -    if re.match( '/dev/xvd[a-p]([1-9]|1[0-5])?', n):
   40.37 -        return 202 * 256 + 16 * (ord(n[8:9]) - ord('a')) + int(n[9:] or 0)
   40.38 +        devnum = major * 256 + minor
   40.39 +    elif re.match( '/dev/xvd[a-p]([1-9]|1[0-5])?$', n):
   40.40 +        devnum = (202 << 8) + ((ord(n[8:9]) - ord('a')) << 4) + int(n[9:] or 0)
   40.41 +    elif re.match('/dev/xvd[q-z]([1-9]|1[0-5])?$', n):
   40.42 +        devname = 'virtual-device-ext'
   40.43 +        devnum = (1 << 28) + ((ord(n[8:9]) - ord('a')) << 8) + int(n[9:] or 0)
   40.44 +    elif re.match('/dev/xvd[a-i][a-z]([1-9]|1[0-5])?$', n):
   40.45 +        devname = 'virtual-device-ext'
   40.46 +        devnum = (1 << 28) + (((ord(n[8:9]) - ord('a') + 1) * 26 + (ord(n[9:10]) - ord('a'))) << 8) + int(n[10:] or 0)
   40.47 +    elif re.match( '^(0x)[0-9a-fA-F]+$', name ):
   40.48 +        devnum = string.atoi(name, 16)
   40.49 +    elif re.match('^[0-9]+$', name):
   40.50 +        devnum = string.atoi(name, 10)
   40.51  
   40.52 -    if re.match( '^(0x)[0-9a-fA-F]+$', name ):
   40.53 -        return string.atoi(name,16)
   40.54 -
   40.55 -    if re.match('^[0-9]+$', name):
   40.56 -        return string.atoi(name, 10)
   40.57 -
   40.58 -    return None
   40.59 +    return (devname, devnum)
   40.60  
   40.61  def blkdev_segment(name):
   40.62      """Take the given block-device name (e.g. '/dev/sda1', 'hda')
   40.63 @@ -58,7 +63,7 @@ def blkdev_segment(name):
   40.64          type:         'Disk' or identifying name for partition type
   40.65      """
   40.66      val = None
   40.67 -    n = blkdev_name_to_number(name)
   40.68 +    (name, n) = blkdev_name_to_number(name)
   40.69      if not n is None:
   40.70          val = { 'device'       : n,
   40.71                  'start_sector' : long(0),
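
A minimal Python sketch (illustrative only; the helper name below is not part of
the patch) of the two device-number encodings built by the rewritten
blkdev_name_to_number() above: the legacy xvd layout packs the disk index into
bits 4-7 under major 202 and is published as 'virtual-device', while the
extended layout sets bit 28, moves the disk index to bits 8 and up, and leaves
the low 8 bits free for up to 256 partitions, published as 'virtual-device-ext'.

    def xvd_devnum(disk_index, partition, extended=False):
        # Hypothetical helper mirroring the arithmetic in blkdev_name_to_number().
        if extended:
            # extended layout: bit 28 set, disk index in bits 8+, partition in bits 0-7
            return (1 << 28) | (disk_index << 8) | partition
        # legacy layout: major 202, disk index in bits 4-7, partition in bits 0-3
        return (202 << 8) | (disk_index << 4) | partition

    # /dev/xvdb3 uses the legacy encoding, /dev/xvdq1 the extended one
    assert xvd_devnum(1, 3) == 202 * 256 + 16 * 1 + 3
    assert xvd_devnum(ord('q') - ord('a'), 1, extended=True) == (1 << 28) + (16 << 8) + 1
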
    41.1 --- a/tools/python/xen/xend/XendConfig.py	Thu Jun 19 12:48:04 2008 +0900
    41.2 +++ b/tools/python/xen/xend/XendConfig.py	Wed Jul 02 11:30:37 2008 +0900
    41.3 @@ -1123,7 +1123,7 @@ class XendConfig(dict):
    41.4              try:
    41.5                  devid = int(dev2)
    41.6              except ValueError:
    41.7 -                devid = blkdev_name_to_number(dev2)
    41.8 +                (xenbus, devid) = blkdev_name_to_number(dev2)
    41.9                  if devid == None:
   41.10                      log.debug("The device %s is not device name", dev2)
   41.11                      return None
    42.1 --- a/tools/python/xen/xend/XendOptions.py	Thu Jun 19 12:48:04 2008 +0900
    42.2 +++ b/tools/python/xen/xend/XendOptions.py	Wed Jul 02 11:30:37 2008 +0900
    42.3 @@ -132,6 +132,9 @@ class XendOptions:
    42.4      """Default script to configure a backend network interface"""
    42.5      vif_script = osdep.vif_script
    42.6  
    42.7 +    """Default rotation count of qemu-dm log file."""
    42.8 +    qemu_dm_logrotate_count = 10
    42.9 +
   42.10      def __init__(self):
   42.11          self.configure()
   42.12  
   42.13 @@ -351,6 +354,10 @@ class XendOptions:
   42.14      def get_vnc_x509_verify(self):
   42.15          return self.get_config_string('vnc-x509-verify', self.xend_vnc_x509_verify)
   42.16  
   42.17 +    def get_qemu_dm_logrotate_count(self):
   42.18 +        return self.get_config_int("qemu-dm-logrotate-count",
   42.19 +                                   self.qemu_dm_logrotate_count)
   42.20 +
   42.21  
   42.22  class XendOptionsFile(XendOptions):
   42.23  
    43.1 --- a/tools/python/xen/xend/image.py	Thu Jun 19 12:48:04 2008 +0900
    43.2 +++ b/tools/python/xen/xend/image.py	Wed Jul 02 11:30:37 2008 +0900
    43.3 @@ -378,13 +378,23 @@ class ImageHandler:
    43.4          # keep track of pid and spawned options to kill it later
    43.5  
    43.6          self.logfile = "/var/log/xen/qemu-dm-%s.log" %  str(self.vm.info['name_label'])
    43.7 -        if os.path.exists(self.logfile):
    43.8 -            if os.path.exists(self.logfile + ".1"):
    43.9 -                os.unlink(self.logfile + ".1")
   43.10 -            os.rename(self.logfile, self.logfile + ".1")
   43.11 +
   43.12 +        # rotate log
   43.13 +        logfile_mode = os.O_WRONLY|os.O_CREAT|os.O_APPEND
   43.14 +        logrotate_count = XendOptions.instance().get_qemu_dm_logrotate_count()
   43.15 +        if logrotate_count > 0:
   43.16 +            logfile_mode |= os.O_TRUNC
   43.17 +            if os.path.exists("%s.%d" % (self.logfile, logrotate_count)):
   43.18 +                os.unlink("%s.%d" % (self.logfile, logrotate_count))
   43.19 +            for n in range(logrotate_count - 1, 0, -1):
   43.20 +                if os.path.exists("%s.%d" % (self.logfile, n)):
   43.21 +                    os.rename("%s.%d" % (self.logfile, n),
   43.22 +                              "%s.%d" % (self.logfile, (n + 1)))
   43.23 +            if os.path.exists(self.logfile):
   43.24 +                os.rename(self.logfile, self.logfile + ".1")
   43.25  
   43.26          null = os.open("/dev/null", os.O_RDONLY)
   43.27 -        logfd = os.open(self.logfile, os.O_WRONLY|os.O_CREAT|os.O_TRUNC|os.O_APPEND)
   43.28 +        logfd = os.open(self.logfile, logfile_mode)
   43.29          
   43.30          sys.stderr.flush()
   43.31          pid = os.fork()
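
A rough standalone sketch in Python (the function name is illustrative, not
taken from the patch) of the log rotation introduced above: with the xend
option qemu-dm-logrotate-count set to N (default 10), old logs are kept as
qemu-dm-<name>.log.1 ... .log.N with the oldest dropped first, while a count
of 0 or less disables rotation and the current log is simply appended to.

    import os

    def rotate_qemu_log(logfile, count):
        # Illustrative restatement of the rotation performed in ImageHandler above.
        if count <= 0:
            return                                # rotation disabled; keep appending
        oldest = "%s.%d" % (logfile, count)
        if os.path.exists(oldest):
            os.unlink(oldest)                     # drop the oldest rotated copy
        for n in range(count - 1, 0, -1):         # shift .n -> .(n+1)
            if os.path.exists("%s.%d" % (logfile, n)):
                os.rename("%s.%d" % (logfile, n), "%s.%d" % (logfile, n + 1))
        if os.path.exists(logfile):
            os.rename(logfile, logfile + ".1")    # the current log becomes .1
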
    44.1 --- a/tools/python/xen/xend/server/blkif.py	Thu Jun 19 12:48:04 2008 +0900
    44.2 +++ b/tools/python/xen/xend/server/blkif.py	Wed Jul 02 11:30:37 2008 +0900
    44.3 @@ -81,11 +81,11 @@ class BlkifController(DevController):
    44.4          if security.on() == xsconstants.XS_POLICY_ACM:
    44.5              self.do_access_control(config, uname)
    44.6  
    44.7 -        devid = blkif.blkdev_name_to_number(dev)
    44.8 +        (device_path, devid) = blkif.blkdev_name_to_number(dev)
    44.9          if devid is None:
   44.10              raise VmError('Unable to find number for device (%s)' % (dev))
   44.11  
   44.12 -        front = { 'virtual-device' : "%i" % devid,
   44.13 +        front = { device_path : "%i" % devid,
   44.14                    'device-type' : dev_type
   44.15                  }
   44.16  
   44.17 @@ -204,5 +204,5 @@ class BlkifController(DevController):
   44.18                  dev = devid.split('/')[-1]
   44.19                  dev = int(dev)
   44.20              except ValueError:
   44.21 -                dev = blkif.blkdev_name_to_number(dev)
   44.22 +                (device_path, dev) = blkif.blkdev_name_to_number(dev)
   44.23          return dev
    45.1 --- a/tools/python/xen/xm/main.py	Thu Jun 19 12:48:04 2008 +0900
    45.2 +++ b/tools/python/xen/xm/main.py	Wed Jul 02 11:30:37 2008 +0900
    45.3 @@ -2022,8 +2022,7 @@ def xm_block_list(args):
    45.4              map(server.xenapi.VBD.get_runtime_properties, vbd_refs)
    45.5          vbd_devs = \
    45.6              map(server.xenapi.VBD.get_device, vbd_refs)
    45.7 -        vbd_devids = \
    45.8 -            map(blkdev_name_to_number, vbd_devs)
    45.9 +        vbd_devids = [blkdev_name_to_number(x)[1] for x in vbd_devs]
   45.10          devs = map(lambda (devid, prop): [devid, map2sxp(prop)],
   45.11                     zip(vbd_devids, vbd_properties))
   45.12      else:
    46.1 --- a/tools/tests/test_x86_emulator.c	Thu Jun 19 12:48:04 2008 +0900
    46.2 +++ b/tools/tests/test_x86_emulator.c	Wed Jul 02 11:30:37 2008 +0900
    46.3 @@ -22,23 +22,22 @@
    46.4  static int read(
    46.5      unsigned int seg,
    46.6      unsigned long offset,
    46.7 -    unsigned long *val,
    46.8 +    void *p_data,
    46.9      unsigned int bytes,
   46.10      struct x86_emulate_ctxt *ctxt)
   46.11  {
   46.12 -    *val = 0;
   46.13 -    memcpy(val, (void *)offset, bytes);
   46.14 +    memcpy(p_data, (void *)offset, bytes);
   46.15      return X86EMUL_OKAY;
   46.16  }
   46.17  
   46.18  static int write(
   46.19      unsigned int seg,
   46.20      unsigned long offset,
   46.21 -    unsigned long val,
   46.22 +    void *p_data,
   46.23      unsigned int bytes,
   46.24      struct x86_emulate_ctxt *ctxt)
   46.25  {
   46.26 -    memcpy((void *)offset, &val, bytes);
   46.27 +    memcpy((void *)offset, p_data, bytes);
   46.28      return X86EMUL_OKAY;
   46.29  }
   46.30  
    47.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    47.2 +++ b/tools/xenballoon/xenballoon-monitor	Wed Jul 02 11:30:37 2008 +0900
    47.3 @@ -0,0 +1,43 @@
    47.4 +#!/bin/bash
    47.5 +#
    47.6 +# xenballoon-monitor - monitor certain stats from xenballoond
    47.7 +#   (run in dom0 with "watch -d xenballoon-monitor" for xentop-like output)
    47.8 +#
    47.9 +# Copyright (C) 2009 Oracle Corporation and/or its affiliates.
   47.10 +# All rights reserved
   47.11 +# Written by: Dan Magenheimer <dan.magenheimer@oracle.com>
   47.12 +#
   47.13 +# Hint: Use "xm sched-credit -d 0 -w 2000" to watch on heavily loaded machines
   47.14 +#
   47.15 +echo "id   mem-kb  tgt-kb  commit   swapin  swapout      pgin     pgout active(sec)"
   47.16 +for i in `xenstore-list /local/domain`; do
   47.17 + if [ "$i" -ne 0 ]; then
   47.18 + tot=0; tgt=0; sin=0; sout=0; pgin=0; pgout=0; cmt=0; up=0; idle=0; act=0;
   47.19 + if xenstore-exists /local/domain/$i/memory/meminfo; then
   47.20 +  tot=`xenstore-read /local/domain/$i/memory/meminfo | grep MemTotal \
   47.21 +   | sed 's/[^1-9]*\([1-9][0-9]*\).*/\1/'`
   47.22 +  cmt=`xenstore-read /local/domain/$i/memory/meminfo | grep Committed_AS \
   47.23 +   | sed 's/[^1-9]*\([1-9][0-9]*\).*/\1/'`
   47.24 + fi
   47.25 + if xenstore-exists /local/domain/$i/memory/selftarget; then
   47.26 +  tgt=`xenstore-read /local/domain/$i/memory/selftarget`
   47.27 + fi
   47.28 + if xenstore-exists /local/domain/$i/memory/vmstat; then
   47.29 +  sin=`xenstore-read /local/domain/$i/memory/vmstat | grep pswpin \
   47.30 + 	| cut -d" " -f2`
   47.31 +  sout=`xenstore-read /local/domain/$i/memory/vmstat | grep pswpout \
   47.32 + 	| cut -d" " -f2`
   47.33 +  pgin=`xenstore-read /local/domain/$i/memory/vmstat | grep pgpgin \
   47.34 + 	| cut -d" " -f2`
   47.35 +  pgout=`xenstore-read /local/domain/$i/memory/vmstat | grep pgout \
   47.36 +  	| cut -d" " -f2`
   47.37 + fi
   47.38 + if xenstore-exists /local/domain/$i/memory/uptime; then
   47.39 +  up=`xenstore-read /local/domain/$i/memory/uptime | cut -d" " -f1`
   47.40 +  idle=`xenstore-read /local/domain/$i/memory/uptime | cut -d" " -f2`
   47.41 +  act=`echo $up - $idle | bc -iq`
   47.42 + fi
   47.43 + printf "%2d %8d%8d%8d%9d%9d%10d%10d%10.2f\n" $i $tot $tgt $cmt $sin $sout $pgin $pgout $act
   47.44 + fi
   47.45 +done
   47.46 +echo Free memory: `xm info | grep free | sed 's/[^1-9]*\([1-9][0-9]*\).*/\1/'` MB
    48.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    48.2 +++ b/tools/xenballoon/xenballoon.conf	Wed Jul 02 11:30:37 2008 +0900
    48.3 @@ -0,0 +1,91 @@
    48.4 +## Path: System/xen
    48.5 +## Description: xen domain start/stop on boot
    48.6 +## Type: string
    48.7 +## Default: 
    48.8 +
    48.9 +# NOTE: "xenbus is enabled" means not only that /proc/xen/xenbus exists
   48.10 +# but also that /usr/bin/xenstore-* tools are installed.
   48.11 +
   48.12 +## Type: boolean
   48.13 +## Default: false
   48.14 +#
   48.15 +# If XENBALLOON_SELF is true, selfballooning will occur, meaning the
   48.16 +# balloon driver will grow and shrink according to available memory.
   48.17 +# If xenbus is enabled, may be overridden by {memory/selfballoon}==0
   48.18 +# If false but xenballoond is able to communicate with domain0 via
   48.19 +# xenbus, balloon targets will be set by domain0
   48.20 +# 
   48.21 +XENBALLOON_SELF=false
   48.22 +
   48.23 +## Type: integer (must be > 0)
   48.24 +## Default: 1
   48.25 +#
   48.26 +# If self-ballooning, number of seconds between checks/adjustments.
   48.27 +# If xenbus is enabled, may be overridden by {memory/interval}
   48.28 +XENBALLOON_SELF_INTERVAL=1
   48.29 +
   48.30 +## Type: integer (must be > 0)
   48.31 +## Default: 1
   48.32 +#
   48.33 +# If NOT self-ballooning but xenbus is enabled, number of seconds between
   48.34 +# checks/adjustments. May be overridden by {memory/interval}
   48.35 +XENBALLOON_INTERVAL=1
   48.36 +
   48.37 +## Type: integer (must be > 0)
   48.38 +## Default: 10
   48.39 +#
   48.40 +# When current > target, reduces rate at which target memory is ballooned
   48.41 +# out.  For a value of n, 1/n of the difference will be ballooned.
   48.42 +# This value applies both to selfballooning and directed ballooning.
   48.43 +# May be overridden by {memory/downhysteresis}
   48.44 +XENBALLOON_AUTO_DOWNHYSTERESIS=10
   48.45 +
   48.46 +## Type: integer (must be > 0)
   48.47 +## Default: 1
   48.48 +#
   48.49 +# When current < target, reduces rate at which target memory is reclaimed
   48.50 +# (if available).  For a value of n, 1/n of the difference will be ballooned.
   48.51 +# This value applies both to selfballooning and directed ballooning.
   48.52 +# May be overridden by {memory/uphysteresis}
   48.53 +XENBALLOON_AUTO_UPHYSTERESIS=1
   48.54 +
   48.55 +## Type: integer (must be >= 0)
   48.56 +## Default: 0
   48.57 +#
   48.58 +# In order to avoid ballooning so much memory that a guest experiences
   48.59 +# out-of-memory errors (OOMs), memory will not be ballooned out below
    48.60 +# a minimum target, in MB.  If this value is 0 (default), a heuristic
   48.61 +# based on the maximum amount of memory will be used.  (The heuristic
   48.62 +# provides the same minimum as recent versions of the balloon driver but
   48.63 +# early versions of the balloon driver did not enforce a minimum.)
   48.64 +XENBALLOON_MINMEM=0
   48.65 +
   48.66 +## Type: string
   48.67 +## Default: "/var/run/xenballoon-maxmem"
   48.68 +#
   48.69 +# Location where memory high-water mark is stored; if a guest supports
    48.70 +# hot-add memory, maxmem might increase over time and the minimum
   48.71 +# target heuristic is based on max memory. NOTE: Reboot after changing
   48.72 +# this variable, else overballooning may occur.
   48.73 +XENBALLOON_MAXMEMFILE=/var/run/xenballoon-maxmem
   48.74 +
   48.75 +## Type: integer (0 or 1)
   48.76 +## Default: 1
   48.77 +#
   48.78 +# If xenbus is enabled, whether selfballooning or directed ballooning,
    48.79 +# place the result of "cat /proc/meminfo" on xenbus at memory/meminfo
   48.80 +XENBALLOON_SEND_MEMINFO=1
   48.81 +
   48.82 +## Type: integer (0 or 1)
   48.83 +## Default: 1
   48.84 +#
   48.85 +# If xenbus is enabled, whether selfballooning or directed ballooning,
    48.86 +# place the result of "cat /proc/vmstat" on xenbus at memory/vmstat
   48.87 +XENBALLOON_SEND_VMSTAT=1
   48.88 +
   48.89 +## Type: integer (0 or 1)
   48.90 +## Default: 1
   48.91 +#
   48.92 +# If xenbus is enabled, whether selfballooning or directed ballooning,
    48.93 +# place the result of "cat /proc/uptime" on xenbus at memory/uptime
   48.94 +XENBALLOON_SEND_UPTIME=1
    49.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    49.2 +++ b/tools/xenballoon/xenballoond	Wed Jul 02 11:30:37 2008 +0900
    49.3 @@ -0,0 +1,205 @@
    49.4 +#!/bin/bash
    49.5 +#
    49.6 +# Copyright (C) 2008 Oracle Corporation and/or its affiliates.
    49.7 +# All rights reserved.
    49.8 +# Written by: Dan Magenheimer <dan.magenheimer@oracle.com>
    49.9 +#
   49.10 +# xenballoond - In-guest engine for Xen memory ballooning
   49.11 +# Version: 080630
   49.12 +#
   49.13 +# Two "policies" are implemented:
   49.14 +# - Selfballooning: Adjust memory periodically, with no (or little) input
   49.15 +#     from domain0.  Target memory is determined solely by the
   49.16 +#     Committed_AS line in /proc/meminfo, but parameters may adjust
   49.17 +#     the rate at which the target is achieved.
   49.18 +# - Directed ballooning: Adjust memory solely as directed by domain0
   49.19 +#
   49.20 +# Under some circumstances, "output" may also be generated; the contents
   49.21 +# of /proc/meminfo and /proc/vmstat may be periodically placed on xenbus.
   49.22 +#
   49.23 +# If xenbus is running and the /usr/bin/xenstore-* tools are installed,
   49.24 +# "xenbus is enabled".
   49.25 +#
   49.26 +# Parameters are documented in /etc/sysconfig/xenballoon.conf. Although 
   49.27 +# some are not used with directed ballooning, all must be set properly.
   49.28 +# If xenbus is enabled, some of these parameters may be overridden by values
   49.29 +# set by domain0 via xenbus.
   49.30 +
   49.31 +minmb() {
   49.32 +	RETVAL=$XENBALLOON_MINMEM
   49.33 +	if [ $RETVAL -ne 0 ]; then
   49.34 +		return $RETVAL
   49.35 +	fi
   49.36 +	kb=`cat $XENBALLOON_MAXMEMFILE`
   49.37 +	let "mb=$kb/1024"
   49.38 +	let "pages=$kb/4"
   49.39 +	# this algorithm from drivers/xen/balloon/balloon.c:minimum_target()
   49.40 +	# which was added to balloon.c in 2008 to avoid ballooning too small
    49.41 +	# it is unnecessary here except to accommodate pre-2008 balloon drivers
   49.42 +	# note that ranges are adjusted because a VM with "memory=1024"
   49.43 +	# gets somewhat less than 1024MB
   49.44 +	if [ $mb -lt 125 ]; then
   49.45 +		let RETVAL="$(( 8 + ($pages >> 9) ))"
   49.46 +	elif [ $mb -lt 500 ]; then
   49.47 +		let RETVAL="$(( 40 + ($pages >> 10) ))"
   49.48 +	elif [ $mb -lt 2000 ]; then
   49.49 +		let RETVAL="$(( 104 + ($pages >> 11) ))"
   49.50 +	else
   49.51 +		let RETVAL="$(( 296 + ($pages >> 13) ))"
   49.52 +	fi
   49.53 +	return	# value returned in RETVAL in mB
   49.54 +}
   49.55 +
   49.56 +curkb() {
   49.57 +	kb=`grep MemTotal /proc/meminfo | sed 's/  */ /' | \
   49.58 +		cut -f2 -d' '`
   49.59 +	RETVAL=$kb
   49.60 +	return  # value returned in RETVAL in kB
   49.61 +}
   49.62 +
   49.63 +downhysteresis() {
   49.64 +	RETVAL=$XENBALLOON_AUTO_DOWNHYSTERESIS
   49.65 +	if [ $xenstore_enabled = "true" ]; then
   49.66 +		if xenstore-exists memory/downhysteresis ; then
   49.67 +			RETVAL=`xenstore-read memory/downhysteresis`
   49.68 +		fi
   49.69 +	fi
   49.70 +	return
   49.71 +}
   49.72 +
   49.73 +uphysteresis() {
   49.74 +	RETVAL=$XENBALLOON_AUTO_UPHYSTERESIS
   49.75 +	if [ $xenstore_enabled = "true" ]; then
   49.76 +		if xenstore-exists memory/uphysteresis ; then
   49.77 +			RETVAL=`xenstore-read memory/uphysteresis`
   49.78 +		fi
   49.79 +	fi
   49.80 +	return
   49.81 +}
   49.82 +
   49.83 +selfballoon_eval() {
   49.84 +	if [ $xenstore_enabled = "true" ]; then
   49.85 +		if xenstore-exists memory/selfballoon; then
   49.86 +			RETVAL=`xenstore-read memory/selfballoon`
   49.87 +			if [ $RETVAL -eq 1 ]; then
   49.88 +				selfballoon_enabled=true
   49.89 +				return
   49.90 +			fi
   49.91 +		fi
   49.92 +	fi
   49.93 +	selfballoon_enabled=$XENBALLOON_SELF
   49.94 +	return
   49.95 +}
   49.96 +
   49.97 +selftarget() {
   49.98 +	tgtkb=`grep Committed_AS /proc/meminfo | sed 's/  */ /' | cut -f2 -d' '`
   49.99 +	minmb
  49.100 +	let "minbytes=$RETVAL*1024*1024"
  49.101 +	let "tgtbytes=$tgtkb*1024"
  49.102 +	if [ $tgtbytes -lt $minbytes ]; then
  49.103 +		let "tgtbytes=$minbytes"
  49.104 +	fi
  49.105 +	RETVAL=$tgtbytes  # value returned in RETVAL in bytes
  49.106 +	return
  49.107 +}
  49.108 +
  49.109 +# $1 == 1 means use selftarget, else target in kB
  49.110 +balloon_to_target() {
  49.111 +	if [ "$1" -eq 1 ]; then
  49.112 +		selftarget
  49.113 +		tgtbytes=$RETVAL
  49.114 +	else
  49.115 +		let "tgtbytes=$(( $1 * 1024 ))"
  49.116 +	fi
  49.117 +	curkb
  49.118 +	let "curbytes=$RETVAL*1024"
  49.119 +	if [ $curbytes -gt $tgtbytes ]; then
  49.120 +		downhysteresis
  49.121 +		downhys=$RETVAL
  49.122 +		if [ $downhys -ne 0 ]; then
  49.123 +			let "tgtbytes=$(( $curbytes - \
  49.124 +				( ( $curbytes - $tgtbytes ) / $downhys ) ))"
  49.125 +		fi
  49.126 +	else if [ $curbytes -lt $tgtbytes ]; then
  49.127 +		uphysteresis
  49.128 +		uphys=$RETVAL
  49.129 +		let "tgtbytes=$(( $curbytes + \
  49.130 +				( ( $tgtbytes - $curbytes ) / $uphys ) ))"
  49.131 +		fi
  49.132 +	fi
  49.133 +	echo $tgtbytes > /proc/xen/balloon
  49.134 +	if [ $xenstore_enabled = "true" ]; then
  49.135 +		let "tgtkb=$(( $tgtbytes/1024 ))"
  49.136 +		xenstore-write memory/selftarget $tgtkb
  49.137 +	fi
  49.138 +}
  49.139 +
  49.140 +send_memory_stats() {
  49.141 +	if [ ! $xenstore_enabled = "true" ]; then
  49.142 +		return
  49.143 +	fi
   49.144 +	if [ "$XENBALLOON_SEND_MEMINFO" -eq 1 ]; then
  49.145 +		xenstore-write memory/meminfo "`cat /proc/meminfo`"
  49.146 +	fi
   49.147 +	if [ "$XENBALLOON_SEND_VMSTAT" -eq 1 ]; then
  49.148 +		xenstore-write memory/vmstat "`cat /proc/vmstat`"
  49.149 +	fi
   49.150 +	if [ "$XENBALLOON_SEND_UPTIME" -eq 1 ]; then
  49.151 +		xenstore-write memory/uptime "`cat /proc/uptime`"
  49.152 +	fi
  49.153 +}
  49.154 +
  49.155 +if [ ! -f /proc/xen/balloon ]; then
  49.156 +	echo "$0: no balloon driver installed"
  49.157 +	exit 0
  49.158 +fi
  49.159 +if [ ! -f /proc/meminfo ]; then
  49.160 +	echo "$0: can't read /proc/meminfo"
  49.161 +	exit 0
  49.162 +fi
  49.163 +xenstore_enabled=true
  49.164 +if [ -f /usr/bin/xenstore-exists -a -f /usr/bin/xenstore-read -a \
  49.165 +     -f /usr/bin/xenstore-write ]; then
  49.166 +	xenstore_enabled=true
  49.167 +else
  49.168 +	echo "$0: missing /usr/bin/xenstore-* tools, disabling directed ballooning"
  49.169 +	xenstore_enabled=false
  49.170 +fi
  49.171 +
  49.172 +. /etc/sysconfig/xenballoon.conf
  49.173 +
  49.174 +while true;
  49.175 +do
  49.176 +	# handle special case for PV domains with hot-add memory
  49.177 +	if [ ! -f $XENBALLOON_MAXMEMFILE ]; then
  49.178 +		maxkb=0
  49.179 +	else
  49.180 +		maxkb=`cat $XENBALLOON_MAXMEMFILE`
  49.181 +	fi
  49.182 +	curkb=`grep MemTotal /proc/meminfo | sed 's/  */ /' | cut -f2 -d' '`
  49.183 +	if [ $curkb -gt $maxkb ]; then
  49.184 +		echo $curkb > $XENBALLOON_MAXMEMFILE
  49.185 +	fi
  49.186 +	interval=$XENBALLOON_INTERVAL
  49.187 +	# do self-ballooning
  49.188 +	selfballoon_eval
  49.189 +	if [ $selfballoon_enabled = "true" ]; then
  49.190 +		balloon_to_target 1
  49.191 +		interval=$XENBALLOON_SELF_INTERVAL
  49.192 +	# or do directed ballooning
  49.193 +	elif [ $xenstore_enabled = "true" ]; then
  49.194 +		if xenstore-exists memory/target ; then
  49.195 +			tgtkb=`xenstore-read memory/target`
  49.196 +			balloon_to_target $tgtkb
  49.197 +		fi
  49.198 +		interval=$XENBALLOON_INTERVAL
  49.199 +	fi
  49.200 +	send_memory_stats
  49.201 +	if [ $xenstore_enabled = "true" ]; then
  49.202 +		if xenstore-exists memory/interval ; then
  49.203 +			interval=`xenstore-read memory/interval`
  49.204 +		fi
  49.205 +	fi
  49.206 +	sleep $interval
  49.207 +done &
  49.208 +
    50.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    50.2 +++ b/tools/xenballoon/xenballoond.README	Wed Jul 02 11:30:37 2008 +0900
    50.3 @@ -0,0 +1,82 @@
    50.4 +Xenballoond.README
    50.5 +Preliminary version 0.1, 2008/06/30
    50.6 +
    50.7 +Copyright (C) 2008 Oracle Corporation and/or its affiliates.
    50.8 +All rights reserved.
    50.9 +Written by Dan Magenheimer <dan.magenheimer@oracle.com>
   50.10 +
   50.11 +INTRODUCTION
   50.12 +
   50.13 +Xenballoond runs in guest domains and both implements selfballooning and
   50.14 +provides metrics to dom0 for (future) directed ballooning.  Both capabilities
   50.15 +provide a foundation for basic "memory overcommit" functionality.
   50.16 +
   50.17 +With selfballooning enabled, xenballoond uses the Committed_AS value found
   50.18 +in /proc/meminfo as a first approximation of how much memory is required
   50.19 +by the guest and feeds this statistic back to the balloon driver to inflate
   50.20 +or deflate the balloon as required to achieve the target guest memory size.
   50.21 +Hysteresis parameters may be adjusted to rate-limit balloon inflation
   50.22 +and deflation.
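
A small Python sketch (names are illustrative, not taken from the scripts) of
how the hysteresis parameters damp each adjustment, using the defaults from
xenballoon.conf (downhysteresis 10, uphysteresis 1): when the guest holds more
memory than the Committed_AS-derived target, only 1/10 of the excess is
ballooned out per interval, whereas reclaiming moves all the way to the target
in one step.

    def next_balloon_target(current_kb, target_kb, downhys=10, uphys=1):
        # Damped step toward target, as in xenballoond's balloon_to_target().
        if current_kb > target_kb and downhys != 0:
            return current_kb - (current_kb - target_kb) // downhys
        if current_kb < target_kb and uphys != 0:
            return current_kb + (target_kb - current_kb) // uphys
        return target_kb

    # Example: guest at 1048576 kB with a self-computed target of 524288 kB;
    # next_balloon_target(1048576, 524288) == 996148, i.e. roughly 51 MB is
    # released this interval, and with downhysteresis=1 it would drop straight
    # to the target.
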
   50.23 +
   50.24 +If configured, certain selfballooning parameters -- including notably
   50.25 +enabling/disabling of self-ballooning -- can be controlled from domain0.
   50.26 +(These are fully documented in xenballoon.conf.)
   50.27 +
   50.28 +If configured, the following guest statistics are sent back to domain0:
   50.29 +- /proc/meminfo
   50.30 +- /proc/vmstat
   50.31 +- /proc/uptime
   50.32 +In a future release, some of these values will be used by a policy module
   50.33 +in domain0 to control guest balloon size and provide memory balancing
   50.34 +across all guests on a given system.
   50.35 +
   50.36 +Note that no page sharing (content-based or otherwise) is implemented
   50.37 +and no VMM-based swapping is necessary.
   50.38 +
   50.39 +For more information, see:
   50.40 +http://www.xen.org/files/xensummitboston08/MemoryOvercommit-XenSummit2008.pdf
   50.41 +http://wiki.xensource.com/xenwiki/Open_Topics_For_Discussion?action=AttachFile&do=get&target=Memory+Overcommit.pdf
   50.42 +
   50.43 +INSTALLATION AND DEPLOYMENT
   50.44 +
   50.45 +In this preliminary release:
   50.46 +- directed ballooning is not implemented, though a monitor is provided
   50.47 +- only Redhat-based guests are supported
   50.48 +
   50.49 +Guest prerequisites to use xenballoond:
   50.50 +- each guest must be configured with adequate[1] swap space
   50.51 +- each guest must have the balloon driver installed (/proc/xen/balloon exists) 
   50.52 +- if directed ballooning (or monitoring) is desired, xenstore tools must be
   50.53 +  installed in each guest in /usr/bin [2]
   50.54 +
   50.55 +[1] for best results, for a guest that is configured with maxmem=N and
   50.56 +    requires Z MB of swap space without xenballoond, available swap should
   50.57 +    be increased to N+Z MB when xenballoond is running
   50.58 +[2] specifically xenstore-read, xenstore-exists, and xenstore-write must
   50.59 +    be installed.  Binaries can be obtained, for example, by building
   50.60 +    xen-vvv.gz/tools in a guest-binary-compatible development tree
   50.61 +
   50.62 +Instructions to install/deploy xenballoond (in Redhat-based system):
   50.63 +- in each guest:
   50.64 +  - ensure pre-requisites are met (see above)
   50.65 +  - place xenballoon.conf in /etc/sysconfig
   50.66 +  - place xenballoond in /usr/sbin
   50.67 +  - copy xenballoond.init to /etc/rc.d/init.d/xenballoond (note file rename)
    50.68 +  - edit /etc/sysconfig/xenballoon.conf as desired (especially note that
    50.69 +    selfballooning defaults to off)
   50.70 +  - start xenballoond with "service xenballoond start", and/or configure
   50.71 +    xenballoond to start at init (e.g. "chkconfig xenballoond on")
   50.72 +- in domain0:
   50.73 +  - if monitoring is desired, xenballoon-monitor may be installed in /usr/sbin
    50.74 +- note that certain xenballoon.conf variables may be overridden by domain0
   50.75 +  if xenstore is running in the guest; these are fully documented in
   50.76 +  xenballoond.conf
   50.77 +
   50.78 +TODO:
   50.79 +080630 modifications to support SUSE-based and debian-based guests
   50.80 +080630 domain0 ballooning policy module
   50.81 +080630 experiment with more aggressive (optionally) memory minimum targets
   50.82 +080630 BUG: xenballoond doesn't properly record the fact that it's running;
   50.83 +       e.g. flipping between run levels 5 and 3 launches additional daemons
    50.84 +080630 BUG: reports of possible incompatibilities between ballooning and
   50.85 +       save/restore/migrate have not been duplicated
    51.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    51.2 +++ b/tools/xenballoon/xenballoond.init	Wed Jul 02 11:30:37 2008 +0900
    51.3 @@ -0,0 +1,91 @@
    51.4 +#!/bin/bash
    51.5 +#
    51.6 +# xenballoond	Script to start and stop Xen ballooning daemon.
    51.7 +#
    51.8 +# Copyright (C) 2008 Oracle Corporation and/or its affiliates.
    51.9 +# All rights reserved.
   51.10 +# Written by: Dan Magenheimer <dan.magenheimer@oracle.com>
   51.11 +#
   51.12 +# chkconfig: 2345 98 01
    51.13 +# description: Starts and stops the Xen ballooning daemon.
   51.14 +### BEGIN INIT INFO
   51.15 +# Provides:          xenballoond
   51.16 +# Required-Start:    $syslog $remote_fs
   51.17 +# Should-Start:
   51.18 +# Required-Stop:     $syslog $remote_fs
   51.19 +# Should-Stop:
   51.20 +# Default-Start:     3 4 5
   51.21 +# Default-Stop:      0 1 2 6
   51.22 +# Default-Enabled:   yes
    51.23 +# Short-Description: Start/stop xenballoond
   51.24 +# Description:       Starts and stops the Xen ballooning daemon.
   51.25 +### END INIT INFO
   51.26 +
   51.27 +# Source function library
   51.28 +. /etc/init.d/functions
   51.29 +
   51.30 +#don't use in domain0
   51.31 +[ -f /proc/xen/capabilities ] && \
   51.32 +	grep -q "control_d" /proc/xen/capabilities && exit 0
   51.33 +
   51.34 +if [ -f /etc/sysconfig/xenballoon.conf ]; then
   51.35 +	. /etc/sysconfig/xenballoon.conf
   51.36 +fi
   51.37 +
   51.38 +# Check that balloon driver is present
   51.39 +[ ! -f /proc/xen/balloon ] && exit 0
   51.40 +
   51.41 +# Record original memory (in kB)
   51.42 +[ -z "$XENBALLOON_MAXMEMFILE" ] && exit 0
   51.43 +let maxmem=`grep MemTotal /proc/meminfo | sed 's/  */ /' | cut -f2 -d' '`
   51.44 +if [ -f "$XENBALLOON_MAXMEMFILE" ]; then
   51.45 +	let oldmax=`cat $XENBALLOON_MAXMEMFILE`
   51.46 +	if [ $oldmax -gt $maxmem ]; then
   51.47 +		let maxmem=oldmax
   51.48 +	fi
   51.49 +fi
   51.50 +echo $maxmem > $XENBALLOON_MAXMEMFILE
   51.51 +
   51.52 +RETVAL=0
   51.53 +prog="xenballoond"
   51.54 +
   51.55 +start() {
   51.56 +        # Start daemons.
   51.57 +        echo -n $"Starting $prog: "
   51.58 +        daemon xenballoond $OPTIONS
   51.59 +	RETVAL=$?
   51.60 +        echo
   51.61 +	return $RETVAL
   51.62 +}
   51.63 +
   51.64 +stop() {
   51.65 +        echo -n $"Shutting down $prog: "
   51.66 +	killproc xenballoond
   51.67 +	RETVAL=$?
   51.68 +        echo
   51.69 +	return $RETVAL
   51.70 +}
   51.71 +
   51.72 +# See how we were called.
   51.73 +case "$1" in
   51.74 +  start)
   51.75 +	start
   51.76 +        ;;
   51.77 +  stop)
   51.78 +	stop
   51.79 +        ;;
   51.80 +  status)
   51.81 +	status xenballoond
   51.82 +	RETVAL=$?
   51.83 +	;;
   51.84 +  restart|reload)
   51.85 +	stop
   51.86 +	start
   51.87 +	RETVAL=$?
   51.88 +	;;
   51.89 +  *)
   51.90 +        echo $"Usage: $0 {start|stop|restart|status}"
   51.91 +        exit 1
   51.92 +esac
   51.93 +
   51.94 +exit $RETVAL
    52.1 --- a/tools/xentrace/xenctx.c	Thu Jun 19 12:48:04 2008 +0900
    52.2 +++ b/tools/xentrace/xenctx.c	Wed Jul 02 11:30:37 2008 +0900
    52.3 @@ -702,7 +702,7 @@ void print_stack(vcpu_guest_context_t *c
    52.4  void dump_ctx(int vcpu)
    52.5  {
    52.6      int ret;
    52.7 -    vcpu_guest_context_t ctx;
    52.8 +    vcpu_guest_context_any_t ctx;
    52.9      xc_dominfo_t dominfo;
   52.10  
   52.11      xc_handle = xc_interface_open(); /* for accessing control interface */
   52.12 @@ -727,10 +727,10 @@ void dump_ctx(int vcpu)
   52.13          exit(-1);
   52.14      }
   52.15  
   52.16 -    print_ctx(&ctx);
   52.17 +    print_ctx(&ctx.c);
   52.18  #ifndef NO_TRANSLATION
   52.19 -    if (is_kernel_text(INSTR_POINTER((&ctx.user_regs))))
   52.20 -        print_stack(&ctx, vcpu);
   52.21 +    if (is_kernel_text(INSTR_POINTER((&ctx.c.user_regs))))
   52.22 +        print_stack(&ctx.c, vcpu);
   52.23  #endif
   52.24  
   52.25      if (!dominfo.paused) {
    53.1 --- a/tools/xm-test/lib/XmTestLib/block_utils.py	Thu Jun 19 12:48:04 2008 +0900
    53.2 +++ b/tools/xm-test/lib/XmTestLib/block_utils.py	Wed Jul 02 11:30:37 2008 +0900
    53.3 @@ -15,7 +15,7 @@ import xen.util.blkif
    53.4  
    53.5  
    53.6  def get_state(domain, devname):
    53.7 -    number = xen.util.blkif.blkdev_name_to_number(devname)
    53.8 +    (path, number) = xen.util.blkif.blkdev_name_to_number(devname)
    53.9      s, o = traceCommand("xm block-list %s | awk '/^%d/ {print $4}'" %
   53.10                          (domain.getName(), number))
   53.11      if s != 0:
    54.1 --- a/xen/arch/ia64/vmx/vmx_hypercall.c	Thu Jun 19 12:48:04 2008 +0900
    54.2 +++ b/xen/arch/ia64/vmx/vmx_hypercall.c	Wed Jul 02 11:30:37 2008 +0900
    54.3 @@ -204,6 +204,53 @@ do_hvm_op(unsigned long op, XEN_GUEST_HA
    54.4          rc = -ENOSYS;
    54.5          break;
    54.6  
    54.7 +    case HVMOP_modified_memory:
    54.8 +    {
    54.9 +        struct xen_hvm_modified_memory a;
   54.10 +        struct domain *d;
   54.11 +        unsigned long pfn;
   54.12 +
   54.13 +        if ( copy_from_guest(&a, arg, 1) )
   54.14 +            return -EFAULT;
   54.15 +
   54.16 +        if ( a.domid == DOMID_SELF )
   54.17 +        {
   54.18 +            d = rcu_lock_current_domain();
   54.19 +        }
   54.20 +        else
   54.21 +        {
   54.22 +            if ( (d = rcu_lock_domain_by_id(a.domid)) == NULL )
   54.23 +                return -ESRCH;
   54.24 +            if ( !IS_PRIV_FOR(current->domain, d) )
   54.25 +            {
   54.26 +                rc = -EPERM;
   54.27 +                goto param_fail3;
   54.28 +            }
   54.29 +        }
   54.30 +
   54.31 +        rc = -EINVAL;
   54.32 +        if ( !is_hvm_domain(d) )
   54.33 +            goto param_fail3;
   54.34 +
   54.35 +        rc = -EINVAL;
   54.36 +        if ( a.first_pfn > domain_get_maximum_gpfn(d)
   54.37 +                || a.first_pfn + a.nr - 1 < a.first_pfn
   54.38 +                || a.first_pfn + a.nr - 1 > domain_get_maximum_gpfn(d))
   54.39 +            goto param_fail3;
   54.40 +
   54.41 +        rc = 0;
   54.42 +        if ( !d->arch.shadow_bitmap )
   54.43 +            goto param_fail3;
   54.44 +
   54.45 +        for (pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++)
   54.46 +            if (pfn < d->arch.shadow_bitmap_size)
   54.47 +                set_bit(pfn, d->arch.shadow_bitmap);
   54.48 +
   54.49 +    param_fail3:
   54.50 +        rcu_unlock_domain(d);
   54.51 +        break;
   54.52 +    }
   54.53 +
   54.54      default:
   54.55          gdprintk(XENLOG_INFO, "Bad HVM op %ld.\n", op);
   54.56          rc = -ENOSYS;
    55.1 --- a/xen/arch/ia64/xen/mm.c	Thu Jun 19 12:48:04 2008 +0900
    55.2 +++ b/xen/arch/ia64/xen/mm.c	Wed Jul 02 11:30:37 2008 +0900
    55.3 @@ -207,7 +207,7 @@ alloc_dom_xen_and_dom_io(void)
    55.4       * Any Xen-heap pages that we will allow to be mapped will have
    55.5       * their domain field set to dom_xen.
    55.6       */
    55.7 -    dom_xen = alloc_domain(DOMID_XEN);
    55.8 +    dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
    55.9      BUG_ON(dom_xen == NULL);
   55.10  
   55.11      /*
   55.12 @@ -215,7 +215,7 @@ alloc_dom_xen_and_dom_io(void)
   55.13       * This domain owns I/O pages that are within the range of the page_info
   55.14       * array. Mappings occur at the priv of the caller.
   55.15       */
   55.16 -    dom_io = alloc_domain(DOMID_IO);
   55.17 +    dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
   55.18      BUG_ON(dom_io == NULL);
   55.19  }
   55.20  
   55.21 @@ -1553,7 +1553,7 @@ expose_p2m_init(void)
   55.22       * Initialise our DOMID_P2M domain.
   55.23       * This domain owns m2p table pages.
   55.24       */
   55.25 -    dom_p2m = alloc_domain(DOMID_P2M);
   55.26 +    dom_p2m = domain_create(DOMID_P2M, DOMCRF_dummy, 0);
   55.27      BUG_ON(dom_p2m == NULL);
   55.28      dom_p2m->max_pages = ~0U;
   55.29  
    56.1 --- a/xen/arch/x86/acpi/cpufreq/Makefile	Thu Jun 19 12:48:04 2008 +0900
    56.2 +++ b/xen/arch/x86/acpi/cpufreq/Makefile	Wed Jul 02 11:30:37 2008 +0900
    56.3 @@ -1,3 +1,4 @@
    56.4  obj-y += cpufreq.o
    56.5  obj-y += utility.o
    56.6  obj-y += cpufreq_ondemand.o
    56.7 +obj-y += powernow.o
    57.1 --- a/xen/arch/x86/acpi/cpufreq/cpufreq.c	Thu Jun 19 12:48:04 2008 +0900
    57.2 +++ b/xen/arch/x86/acpi/cpufreq/cpufreq.c	Wed Jul 02 11:30:37 2008 +0900
    57.3 @@ -47,6 +47,10 @@
    57.4  struct processor_pminfo processor_pminfo[NR_CPUS];
    57.5  struct cpufreq_policy xen_px_policy[NR_CPUS];
    57.6  
    57.7 +static cpumask_t *cpufreq_dom_pt;
    57.8 +static cpumask_t cpufreq_dom_mask;
    57.9 +static unsigned int cpufreq_dom_max;
   57.10 +
   57.11  enum {
   57.12      UNDEFINED_CAPABLE = 0,
   57.13      SYSTEM_INTEL_MSR_CAPABLE,
   57.14 @@ -60,7 +64,6 @@ struct acpi_cpufreq_data {
   57.15      struct processor_performance *acpi_data;
   57.16      struct cpufreq_frequency_table *freq_table;
   57.17      unsigned int max_freq;
   57.18 -    unsigned int resume;
   57.19      unsigned int cpu_feature;
   57.20  };
   57.21  
   57.22 @@ -328,14 +331,16 @@ static int acpi_cpufreq_target(struct cp
   57.23  
   57.24      next_perf_state = data->freq_table[next_state].index;
   57.25      if (perf->state == next_perf_state) {
   57.26 -        if (unlikely(data->resume)) {
   57.27 -            printk("xen_pminfo: @acpi_cpufreq_target, "
   57.28 -                "Called after resume, resetting to P%d\n", 
   57.29 +        if (unlikely(policy->resume)) {
   57.30 +            printk(KERN_INFO "Called after resume, resetting to P%d\n", 
   57.31                  next_perf_state);
   57.32 -            data->resume = 0;
   57.33 +            policy->resume = 0;
   57.34          }
   57.35 -        else
   57.36 +        else {
   57.37 +            printk(KERN_INFO "Already at target state (P%d)\n", 
   57.38 +                next_perf_state);
   57.39              return 0;
   57.40 +        }
   57.41      }
   57.42  
   57.43      switch (data->cpu_feature) {
   57.44 @@ -531,7 +536,7 @@ acpi_cpufreq_cpu_init(struct cpufreq_pol
   57.45       * the first call to ->target() should result in us actually
   57.46       * writing something to the appropriate registers.
   57.47       */
   57.48 -    data->resume = 1;
   57.49 +    policy->resume = 1;
   57.50  
   57.51      return result;
   57.52  
   57.53 @@ -549,61 +554,101 @@ static struct cpufreq_driver acpi_cpufre
   57.54      .init   = acpi_cpufreq_cpu_init,
   57.55  };
   57.56  
   57.57 -int acpi_cpufreq_init(void)
   57.58 +void cpufreq_dom_exit(void)
   57.59  {
   57.60 -    unsigned int i, ret = 0;
   57.61 -    unsigned int dom, max_dom = 0;
   57.62 -    cpumask_t *pt, dom_mask;
   57.63 +    cpufreq_dom_max = 0;
   57.64 +    cpus_clear(cpufreq_dom_mask);
   57.65 +    if (cpufreq_dom_pt)
   57.66 +        xfree(cpufreq_dom_pt);
   57.67 +}
   57.68  
   57.69 -    cpus_clear(dom_mask);
   57.70 +int cpufreq_dom_init(void)
   57.71 +{
   57.72 +    unsigned int i;
   57.73 +
   57.74 +    cpufreq_dom_max = 0;
   57.75 +    cpus_clear(cpufreq_dom_mask);
   57.76  
   57.77      for_each_online_cpu(i) {
   57.78 -        cpu_set(processor_pminfo[i].perf.domain_info.domain, dom_mask);
   57.79 -        if (max_dom < processor_pminfo[i].perf.domain_info.domain)
   57.80 -            max_dom = processor_pminfo[i].perf.domain_info.domain;
   57.81 +        cpu_set(processor_pminfo[i].perf.domain_info.domain, cpufreq_dom_mask);
   57.82 +        if (cpufreq_dom_max < processor_pminfo[i].perf.domain_info.domain)
   57.83 +            cpufreq_dom_max = processor_pminfo[i].perf.domain_info.domain;
   57.84      }
   57.85 -    max_dom++;
   57.86 +    cpufreq_dom_max++;
   57.87  
   57.88 -    pt = xmalloc_array(cpumask_t, max_dom);
   57.89 -    if (!pt)
   57.90 +    cpufreq_dom_pt = xmalloc_array(cpumask_t, cpufreq_dom_max);
   57.91 +    if (!cpufreq_dom_pt)
   57.92          return -ENOMEM;
   57.93 -    memset(pt, 0, max_dom * sizeof(cpumask_t));
   57.94 -
   57.95 -    /* get cpumask of each psd domain */
   57.96 -    for_each_online_cpu(i)
   57.97 -        cpu_set(i, pt[processor_pminfo[i].perf.domain_info.domain]);
   57.98 +    memset(cpufreq_dom_pt, 0, cpufreq_dom_max * sizeof(cpumask_t));
   57.99  
  57.100      for_each_online_cpu(i)
  57.101 -        processor_pminfo[i].perf.shared_cpu_map = 
  57.102 -            pt[processor_pminfo[i].perf.domain_info.domain];
  57.103 +        cpu_set(i, cpufreq_dom_pt[processor_pminfo[i].perf.domain_info.domain]);
  57.104 +
  57.105 +    for_each_online_cpu(i)
  57.106 +        processor_pminfo[i].perf.shared_cpu_map =
  57.107 +            cpufreq_dom_pt[processor_pminfo[i].perf.domain_info.domain];
  57.108  
  57.109 -    cpufreq_driver = &acpi_cpufreq_driver;
  57.110 +    return 0;
  57.111 +}
  57.112  
  57.113 -    /* setup cpufreq infrastructure */
  57.114 +static int cpufreq_cpu_init(void)
  57.115 +{
  57.116 +    int i, ret = 0;
  57.117 +
  57.118      for_each_online_cpu(i) {
  57.119          xen_px_policy[i].cpu = i;
  57.120  
  57.121          ret = px_statistic_init(i);
  57.122          if (ret)
  57.123 -            goto out;
  57.124 +            return ret;
  57.125  
  57.126          ret = acpi_cpufreq_cpu_init(&xen_px_policy[i]);
  57.127          if (ret)
  57.128 -            goto out;
  57.129 +            return ret;
  57.130      }
  57.131 +    return ret;
  57.132 +}
  57.133  
  57.134 -    /* setup ondemand cpufreq */
  57.135 -    for (dom=0; dom<max_dom; dom++) {
  57.136 -        if (!cpu_isset(dom, dom_mask))
  57.137 +int cpufreq_dom_dbs(unsigned int event)
  57.138 +{
  57.139 +    int cpu, dom, ret = 0;
  57.140 +
  57.141 +    for (dom=0; dom<cpufreq_dom_max; dom++) {
  57.142 +        if (!cpu_isset(dom, cpufreq_dom_mask))
  57.143              continue;
  57.144 -        i = first_cpu(pt[dom]);
  57.145 -        ret = cpufreq_governor_dbs(&xen_px_policy[i], CPUFREQ_GOV_START);
  57.146 +        cpu = first_cpu(cpufreq_dom_pt[dom]);
  57.147 +        ret = cpufreq_governor_dbs(&xen_px_policy[cpu], event);
  57.148          if (ret)
  57.149 -            goto out;
  57.150 +            return ret;
  57.151      }
  57.152 -
  57.153 -out:
  57.154 -    xfree(pt);
  57.155 -   
  57.156      return ret;
  57.157  }
  57.158 +
  57.159 +int acpi_cpufreq_init(void)
  57.160 +{
  57.161 +    int ret = 0;
  57.162 +    
  57.163 +    /* setup cpumask of psd dom and shared cpu map of cpu */
  57.164 +    ret = cpufreq_dom_init();
  57.165 +    if (ret)
  57.166 +        goto err;
  57.167 +
  57.168 +    /* setup cpufreq driver */
  57.169 +    cpufreq_driver = &acpi_cpufreq_driver;
  57.170 +
  57.171 +    /* setup cpufreq infrastructure */
  57.172 +    ret = cpufreq_cpu_init();
  57.173 +    if (ret)
  57.174 +        goto err;
  57.175 +
   57.176 +    /* setup cpufreq dbs according to dom coordination */
  57.177 +    ret = cpufreq_dom_dbs(CPUFREQ_GOV_START);
  57.178 +    if (ret)
  57.179 +        goto err;
  57.180 +
  57.181 +    return ret;
  57.182 +
  57.183 +err:
  57.184 +    cpufreq_dom_exit();
  57.185 +    return ret;
  57.186 +}
    58.1 --- a/xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c	Thu Jun 19 12:48:04 2008 +0900
    58.2 +++ b/xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c	Wed Jul 02 11:30:37 2008 +0900
    58.3 @@ -52,7 +52,7 @@ static struct dbs_tuners {
    58.4  
    58.5  static struct timer dbs_timer[NR_CPUS];
    58.6  
    58.7 -static inline uint64_t get_cpu_idle_time(unsigned int cpu)
    58.8 +inline uint64_t get_cpu_idle_time(unsigned int cpu)
    58.9  {
   58.10      uint64_t idle_ns;
   58.11      struct vcpu *v;
   58.12 @@ -79,6 +79,12 @@ static void dbs_check_cpu(struct cpu_dbs
   58.13          return;
   58.14  
   58.15      policy = this_dbs_info->cur_policy;
   58.16 +
   58.17 +    if (unlikely(policy->resume)) {
   58.18 +        __cpufreq_driver_target(policy, policy->max,CPUFREQ_RELATION_H);
   58.19 +        return;
   58.20 +    }
   58.21 +
   58.22      cur_ns = NOW();
   58.23      total_ns = cur_ns - this_dbs_info->prev_cpu_wall;
   58.24      this_dbs_info->prev_cpu_wall = NOW();
   58.25 @@ -217,8 +223,7 @@ int cpufreq_governor_dbs(struct cpufreq_
   58.26          break;
   58.27  
   58.28      case CPUFREQ_GOV_STOP:
   58.29 -        if (this_dbs_info->enable)
   58.30 -            dbs_timer_exit(this_dbs_info);
   58.31 +        dbs_timer_exit(this_dbs_info);
   58.32          dbs_enable--;
   58.33  
   58.34          break;
   58.35 @@ -233,5 +238,4 @@ int cpufreq_governor_dbs(struct cpufreq_
   58.36          break;
   58.37      }
   58.38      return 0;
   58.39 -}
   58.40 -             
   58.41 +} 
    59.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    59.2 +++ b/xen/arch/x86/acpi/cpufreq/powernow.c	Wed Jul 02 11:30:37 2008 +0900
    59.3 @@ -0,0 +1,305 @@
    59.4 +/*
    59.5 + *  powernow - AMD Architectural P-state Driver ($Revision: 1.4 $)
    59.6 + *
    59.7 + *  Copyright (C) 2008 Mark Langsdorf <mark.langsdorf@amd.com>
    59.8 + *
    59.9 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   59.10 + *
   59.11 + *  This program is free software; you can redistribute it and/or modify
   59.12 + *  it under the terms of the GNU General Public License as published by
   59.13 + *  the Free Software Foundation; either version 2 of the License, or (at
   59.14 + *  your option) any later version.
   59.15 + *
   59.16 + *  This program is distributed in the hope that it will be useful, but
   59.17 + *  WITHOUT ANY WARRANTY; without even the implied warranty of
   59.18 + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   59.19 + *  General Public License for more details.
   59.20 + *
   59.21 + *  You should have received a copy of the GNU General Public License along
   59.22 + *  with this program; if not, write to the Free Software Foundation, Inc.,
   59.23 + *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
   59.24 + *
   59.25 + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   59.26 + */
   59.27 +
   59.28 +#include <xen/types.h>
   59.29 +#include <xen/errno.h>
   59.30 +#include <xen/delay.h>
   59.31 +#include <xen/cpumask.h>
   59.32 +#include <xen/timer.h>
   59.33 +#include <xen/xmalloc.h>
   59.34 +#include <asm/bug.h>
   59.35 +#include <asm/msr.h>
   59.36 +#include <asm/io.h>
   59.37 +#include <asm/config.h>
   59.38 +#include <asm/processor.h>
   59.39 +#include <asm/percpu.h>
   59.40 +#include <asm/cpufeature.h>
   59.41 +#include <acpi/acpi.h>
   59.42 +#include <acpi/cpufreq/cpufreq.h>
   59.43 +
   59.44 +#define CPUID_FREQ_VOLT_CAPABILITIES    0x80000007
   59.45 +#define USE_HW_PSTATE           0x00000080
   59.46 +#define HW_PSTATE_MASK          0x00000007
   59.47 +#define HW_PSTATE_VALID_MASK    0x80000000
   59.48 +#define HW_PSTATE_MAX_MASK      0x000000f0
   59.49 +#define HW_PSTATE_MAX_SHIFT     4
   59.50 +#define MSR_PSTATE_DEF_BASE     0xc0010064 /* base of Pstate MSRs */
   59.51 +#define MSR_PSTATE_STATUS       0xc0010063 /* Pstate Status MSR */
   59.52 +#define MSR_PSTATE_CTRL         0xc0010062 /* Pstate control MSR */
   59.53 +#define MSR_PSTATE_CUR_LIMIT    0xc0010061 /* pstate current limit MSR */
   59.54 +
   59.55 +extern struct processor_pminfo processor_pminfo[NR_CPUS];
   59.56 +extern struct cpufreq_policy xen_px_policy[NR_CPUS];
   59.57 +
   59.58 +struct powernow_cpufreq_data {
   59.59 +    struct processor_performance *acpi_data;
   59.60 +    struct cpufreq_frequency_table *freq_table;
   59.61 +    unsigned int max_freq;
   59.62 +    unsigned int resume;
   59.63 +    unsigned int cpu_feature;
   59.64 +};
   59.65 +
   59.66 +static struct powernow_cpufreq_data *drv_data[NR_CPUS];
   59.67 +
   59.68 +struct drv_cmd {
   59.69 +    unsigned int type;
   59.70 +    cpumask_t mask;
   59.71 +    u64 addr;
   59.72 +    u32 val;
   59.73 +};
   59.74 +
   59.75 +static void transition_pstate(void *drvcmd)
   59.76 +{
   59.77 +    struct drv_cmd *cmd;
   59.78 +    cmd = (struct drv_cmd *) drvcmd;
   59.79 +
   59.80 +    wrmsr(MSR_PSTATE_CTRL, cmd->val, 0);
   59.81 +}
   59.82 +
   59.83 +static int powernow_cpufreq_target(struct cpufreq_policy *policy,
   59.84 +                               unsigned int target_freq, unsigned int relation)
   59.85 +{
   59.86 +    struct powernow_cpufreq_data *data = drv_data[policy->cpu];
   59.87 +    struct processor_performance *perf;
   59.88 +    struct cpufreq_freqs freqs;
   59.89 +    cpumask_t online_policy_cpus;
   59.90 +    struct drv_cmd cmd;
   59.91 +    unsigned int next_state = 0; /* Index into freq_table */
   59.92 +    unsigned int next_perf_state = 0; /* Index into perf table */
   59.93 +    int result = 0;
   59.94 +
   59.95 +    if (unlikely(data == NULL ||
   59.96 +        data->acpi_data == NULL || data->freq_table == NULL)) {
   59.97 +        return -ENODEV;
   59.98 +    }
   59.99 +
  59.100 +    perf = data->acpi_data;
  59.101 +    result = cpufreq_frequency_table_target(policy,
  59.102 +                                            data->freq_table,
  59.103 +                                            target_freq,
  59.104 +                                            relation, &next_state);
  59.105 +    if (unlikely(result))
  59.106 +        return -ENODEV;
  59.107 +
  59.108 +    online_policy_cpus = policy->cpus;
  59.109 +
  59.110 +    next_perf_state = data->freq_table[next_state].index;
  59.111 +    if (perf->state == next_perf_state) {
  59.112 +        if (unlikely(data->resume)) 
  59.113 +            data->resume = 0;
  59.114 +        else
  59.115 +            return 0;
  59.116 +    }
  59.117 +
  59.118 +    cpus_clear(cmd.mask);
  59.119 +
  59.120 +    if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
  59.121 +        cmd.mask = online_policy_cpus;
  59.122 +    else
  59.123 +        cpu_set(policy->cpu, cmd.mask);
  59.124 +
  59.125 +    freqs.old = perf->states[perf->state].core_frequency * 1000;
  59.126 +    freqs.new = data->freq_table[next_state].frequency;
  59.127 +
  59.128 +    cmd.val = next_perf_state;
  59.129 +
  59.130 +    on_selected_cpus( cmd.mask, transition_pstate, (void *) &cmd, 0, 0);
  59.131 +
  59.132 +    perf->state = next_perf_state;
  59.133 +    policy->cur = freqs.new;
  59.134 +
  59.135 +    return result;
  59.136 +}
  59.137 +
  59.138 +static int powernow_cpufreq_cpu_init(struct cpufreq_policy *policy)
  59.139 +{
  59.140 +    unsigned int i;
  59.141 +    unsigned int valid_states = 0;
  59.142 +    unsigned int cpu = policy->cpu;
  59.143 +    struct powernow_cpufreq_data *data;
  59.144 +    unsigned int result = 0;
  59.145 +    struct processor_performance *perf;
  59.146 +    u32 max_hw_pstate, hi = 0, lo = 0;
  59.147 +
  59.148 +    data = xmalloc(struct powernow_cpufreq_data);
  59.149 +    if (!data)
  59.150 +        return -ENOMEM;
  59.151 +    memset(data, 0, sizeof(struct powernow_cpufreq_data));
  59.152 +
  59.153 +    drv_data[cpu] = data;
  59.154 +
  59.155 +    data->acpi_data = &processor_pminfo[cpu].perf;
  59.156 +
  59.157 +    perf = data->acpi_data;
  59.158 +    policy->shared_type = perf->shared_type;
  59.159 +
  59.160 +    /*
  59.161 +     * Will let policy->cpus know about dependency only when software
  59.162 +     * coordination is required.
  59.163 +     */
  59.164 +    if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
  59.165 +        policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
  59.166 +        policy->cpus = perf->shared_cpu_map;
  59.167 +    } else {
  59.168 +        policy->cpus = cpumask_of_cpu(cpu);    
  59.169 +    }
  59.170 +
  59.171 +    /* capability check */
  59.172 +    if (perf->state_count <= 1) {
  59.173 +        printk("No P-States\n");
  59.174 +        result = -ENODEV;
  59.175 +        goto err_unreg;
  59.176 +    }
  59.177 +    rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo);
  59.178 +    max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
  59.179 +
  59.180 +    if (perf->control_register.space_id != perf->status_register.space_id) {
  59.181 +        result = -ENODEV;
  59.182 +        goto err_unreg;
  59.183 +    }
  59.184 +
  59.185 +    data->freq_table = xmalloc_array(struct cpufreq_frequency_table, 
  59.186 +                                    (perf->state_count+1));
  59.187 +    if (!data->freq_table) {
  59.188 +        result = -ENOMEM;
  59.189 +        goto err_unreg;
  59.190 +    }
  59.191 +
  59.192 +    /* detect transition latency */
  59.193 +    policy->cpuinfo.transition_latency = 0;
  59.194 +    for (i=0; i<perf->state_count; i++) {
  59.195 +        if ((perf->states[i].transition_latency * 1000) >
  59.196 +            policy->cpuinfo.transition_latency)
  59.197 +            policy->cpuinfo.transition_latency =
  59.198 +                perf->states[i].transition_latency * 1000;
  59.199 +    }
  59.200 +
  59.201 +    data->max_freq = perf->states[0].core_frequency * 1000;
  59.202 +    /* table init */
  59.203 +    for (i=0; i<perf->state_count && i<max_hw_pstate; i++) {
  59.204 +        if (i>0 && perf->states[i].core_frequency >=
  59.205 +            data->freq_table[valid_states-1].frequency / 1000)
  59.206 +            continue;
  59.207 +
  59.208 +        data->freq_table[valid_states].index = perf->states[i].control & HW_PSTATE_MASK;
  59.209 +        data->freq_table[valid_states].frequency =
  59.210 +            perf->states[i].core_frequency * 1000;
  59.211 +        valid_states++;
  59.212 +    }
  59.213 +    data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
  59.214 +    perf->state = 0;
  59.215 +
  59.216 +    result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
  59.217 +    if (result)
  59.218 +        goto err_freqfree;
  59.219 +
  59.220 +    /*
  59.221 +     * the first call to ->target() should result in us actually
  59.222 +     * writing something to the appropriate registers.
  59.223 +     */
  59.224 +    data->resume = 1;
  59.225 +
  59.226 +    policy->cur = data->freq_table[i].frequency;
  59.227 +    return result;
  59.228 +
  59.229 +err_freqfree:
  59.230 +    xfree(data->freq_table);
  59.231 +err_unreg:
  59.232 +    xfree(data);
  59.233 +    drv_data[cpu] = NULL;
  59.234 +
  59.235 +    return result;
  59.236 +}
  59.237 +
  59.238 +static struct cpufreq_driver powernow_cpufreq_driver = {
  59.239 +    .target = powernow_cpufreq_target,
  59.240 +    .init   = powernow_cpufreq_cpu_init,
  59.241 +};
  59.242 +
  59.243 +int powernow_cpufreq_init(void)
  59.244 +{
  59.245 +    unsigned int i, ret = 0;
  59.246 +    unsigned int dom, max_dom = 0;
  59.247 +    cpumask_t *pt, dom_mask;
  59.248 +
  59.249 +    cpus_clear(dom_mask);
  59.250 +
  59.251 +    for_each_online_cpu(i) {
  59.252 +        struct cpuinfo_x86 *c = &cpu_data[i];
  59.253 +        if (c->x86_vendor != X86_VENDOR_AMD)
  59.254 +            ret = -ENODEV;
  59.255 +        else 
  59.256 +        {
  59.257 +            u32 eax, ebx, ecx, edx;
  59.258 +            cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
  59.259 +            if ((edx & USE_HW_PSTATE) != USE_HW_PSTATE)
  59.260 +                ret = -ENODEV;
  59.261 +        }
  59.262 +        if (ret)
  59.263 +            return ret;
  59.264 +        cpu_set(processor_pminfo[i].perf.domain_info.domain, dom_mask);
  59.265 +        if (max_dom < processor_pminfo[i].perf.domain_info.domain)
  59.266 +            max_dom = processor_pminfo[i].perf.domain_info.domain;
  59.267 +    }
  59.268 +    max_dom++;
  59.269 +
  59.270 +    pt = xmalloc_array(cpumask_t, max_dom);
  59.271 +    if (!pt)
  59.272 +        return -ENOMEM;
  59.273 +    memset(pt, 0, max_dom * sizeof(cpumask_t));
  59.274 +
  59.275 +    /* get cpumask of each psd domain */
  59.276 +    for_each_online_cpu(i)
  59.277 +        cpu_set(i, pt[processor_pminfo[i].perf.domain_info.domain]);
  59.278 +
  59.279 +    for_each_online_cpu(i)
  59.280 +        processor_pminfo[i].perf.shared_cpu_map = 
  59.281 +            pt[processor_pminfo[i].perf.domain_info.domain];
  59.282 +
  59.283 +    cpufreq_driver = &powernow_cpufreq_driver;
  59.284 +
  59.285 +    /* setup cpufreq infrastructure */
  59.286 +    for_each_online_cpu(i) {
  59.287 +        xen_px_policy[i].cpu = i;
  59.288 +
  59.289 +        ret = powernow_cpufreq_cpu_init(&xen_px_policy[i]);
  59.290 +        if (ret)
  59.291 +            goto cpufreq_init_out;
  59.292 +    }
  59.293 +
  59.294 +    /* setup ondemand cpufreq */
  59.295 +    for (dom=0; dom<max_dom; dom++) {
  59.296 +        if (!cpu_isset(dom, dom_mask))
  59.297 +            continue;
  59.298 +        i = first_cpu(pt[dom]);
  59.299 +        ret = cpufreq_governor_dbs(&xen_px_policy[i], CPUFREQ_GOV_START);
  59.300 +        if (ret)
  59.301 +            goto cpufreq_init_out;
  59.302 +    }
  59.303 +
  59.304 +cpufreq_init_out:
  59.305 +    xfree(pt);
  59.306 +   
  59.307 +    return ret;
  59.308 +}
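
The new PowerNow! driver above builds its cpufreq frequency table from the ACPI P-state data (dropping entries that repeat a frequency), and each ->target() call resolves the requested frequency to a table entry whose hardware index is then written to MSR_PSTATE_CTRL on every CPU in cmd.mask via transition_pstate(). Below is a minimal standalone sketch of the table-lookup step only; freq_entry, TABLE_END and pick_state are illustrative names rather than the Xen API, and the frequencies are made up.

    #include <stdio.h>

    #define TABLE_END 0xffffffffu            /* stands in for CPUFREQ_TABLE_END */

    struct freq_entry { unsigned int index; unsigned int freq_khz; };

    /* Pick the lowest frequency that still satisfies target_khz, roughly what
     * cpufreq_frequency_table_target() does for an "at least this fast" request. */
    static int pick_state(const struct freq_entry *t, unsigned int target_khz)
    {
        int best = -1;
        for (int i = 0; t[i].freq_khz != TABLE_END; i++)
            if (t[i].freq_khz >= target_khz &&
                (best < 0 || t[i].freq_khz < t[best].freq_khz))
                best = i;
        return best;
    }

    int main(void)
    {
        /* P-states in descending frequency order, as ACPI _PSS exports them. */
        struct freq_entry table[] = {
            { 0, 2600000 }, { 1, 1900000 }, { 2, 1400000 }, { 3, 800000 },
            { 0, TABLE_END },
        };
        int s = pick_state(table, 1500000);

        if (s >= 0)
            printf("target 1500000 kHz -> hw index %u (%u kHz)\n",
                   table[s].index, table[s].freq_khz);
        return 0;
    }
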
    60.1 --- a/xen/arch/x86/acpi/cpufreq/utility.c	Thu Jun 19 12:48:04 2008 +0900
    60.2 +++ b/xen/arch/x86/acpi/cpufreq/utility.c	Wed Jul 02 11:30:37 2008 +0900
    60.3 @@ -37,6 +37,41 @@ struct cpufreq_driver *cpufreq_driver;
    60.4   *                    Px STATISTIC INFO                              *
    60.5   *********************************************************************/
    60.6  
    60.7 +void px_statistic_suspend(void)
    60.8 +{
    60.9 +    int cpu;
   60.10 +    uint64_t now;
   60.11 +
   60.12 +    now = NOW();
   60.13 +
   60.14 +    for_each_online_cpu(cpu) {
   60.15 +        struct pm_px *pxpt = &px_statistic_data[cpu];
   60.16 +        uint64_t total_idle_ns;
   60.17 +        uint64_t tmp_idle_ns;
   60.18 +
   60.19 +        total_idle_ns = get_cpu_idle_time(cpu);
   60.20 +        tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall;
   60.21 +
   60.22 +        pxpt->u.pt[pxpt->u.cur].residency +=
   60.23 +                    now - pxpt->prev_state_wall;
   60.24 +        pxpt->u.pt[pxpt->u.cur].residency -= tmp_idle_ns;
   60.25 +    }
   60.26 +}
   60.27 +
   60.28 +void px_statistic_resume(void)
   60.29 +{
   60.30 +    int cpu;
   60.31 +    uint64_t now;
   60.32 +
   60.33 +    now = NOW();
   60.34 +
   60.35 +    for_each_online_cpu(cpu) {
   60.36 +        struct pm_px *pxpt = &px_statistic_data[cpu];
   60.37 +        pxpt->prev_state_wall = now;
   60.38 +        pxpt->prev_idle_wall = get_cpu_idle_time(cpu);
   60.39 +    }
   60.40 +}
   60.41 +
   60.42  void px_statistic_update(cpumask_t cpumask, uint8_t from, uint8_t to)
   60.43  {
   60.44      uint32_t i;
   60.45 @@ -47,15 +82,22 @@ void px_statistic_update(cpumask_t cpuma
   60.46      for_each_cpu_mask(i, cpumask) {
   60.47          struct pm_px *pxpt = &px_statistic_data[i];
   60.48          uint32_t statnum = processor_pminfo[i].perf.state_count;
   60.49 +        uint64_t total_idle_ns;
   60.50 +        uint64_t tmp_idle_ns;
   60.51 +
   60.52 +        total_idle_ns = get_cpu_idle_time(i);
   60.53 +        tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall;
   60.54  
   60.55          pxpt->u.last = from;
   60.56          pxpt->u.cur = to;
   60.57          pxpt->u.pt[to].count++;
   60.58          pxpt->u.pt[from].residency += now - pxpt->prev_state_wall;
   60.59 +        pxpt->u.pt[from].residency -= tmp_idle_ns;
   60.60  
   60.61          (*(pxpt->u.trans_pt + from*statnum + to))++;
   60.62  
   60.63          pxpt->prev_state_wall = now;
   60.64 +        pxpt->prev_idle_wall = total_idle_ns;
   60.65      }
   60.66  }
   60.67  
   60.68 @@ -87,6 +129,7 @@ int px_statistic_init(int cpuid)
   60.69          pxpt->u.pt[i].freq = pmpt->perf.states[i].core_frequency;
   60.70  
   60.71      pxpt->prev_state_wall = NOW();
   60.72 +    pxpt->prev_idle_wall = get_cpu_idle_time(cpuid);
   60.73  
   60.74      return 0;
   60.75  }
   60.76 @@ -107,6 +150,7 @@ void px_statistic_reset(int cpuid)
   60.77      }
   60.78  
   60.79      pxpt->prev_state_wall = NOW();
   60.80 +    pxpt->prev_idle_wall = get_cpu_idle_time(cpuid);
   60.81  }
   60.82  
   60.83  
   60.84 @@ -242,3 +286,62 @@ int __cpufreq_driver_getavg(struct cpufr
   60.85  
   60.86      return ret;
   60.87  }
   60.88 +
   60.89 +
   60.90 +/*********************************************************************
   60.91 + *               CPUFREQ SUSPEND/RESUME                              *
   60.92 + *********************************************************************/
   60.93 +
   60.94 +void cpufreq_suspend(void)
   60.95 +{
   60.96 +    int cpu;
   60.97 +
   60.98 +    /* to protect the case when Px was controlled by dom0-kernel */
   60.99 +    /* or when CPU_FREQ not set in which case ACPI Px objects not parsed */
  60.100 +    for_each_online_cpu(cpu) {
  60.101 +        struct processor_performance *perf = &processor_pminfo[cpu].perf;
  60.102 +
  60.103 +        if (!perf->init)
  60.104 +            return;
  60.105 +    }
  60.106 +
  60.107 +    cpufreq_dom_dbs(CPUFREQ_GOV_STOP);
  60.108 +
  60.109 +    cpufreq_dom_exit();
  60.110 +
  60.111 +    px_statistic_suspend();
  60.112 +}
  60.113 +
  60.114 +int cpufreq_resume(void)
  60.115 +{
  60.116 +    int cpu, ret = 0;
  60.117 +
  60.118 +    /* 1. to protect the case when Px was controlled by dom0-kernel */
  60.119 +    /* or when CPU_FREQ not set in which case ACPI Px objects not parsed */
  60.120 +    /* 2. set state and resume flag to sync cpu to right state and freq */
  60.121 +    for_each_online_cpu(cpu) {
  60.122 +        struct processor_performance *perf = &processor_pminfo[cpu].perf;
  60.123 +        struct cpufreq_policy *policy = &xen_px_policy[cpu];
  60.124 +
  60.125 +        if (!perf->init)
  60.126 +            goto err;
  60.127 +        perf->state = 0;
  60.128 +        policy->resume = 1;
  60.129 +    }
  60.130 +
  60.131 +    px_statistic_resume();
  60.132 +
  60.133 +    ret = cpufreq_dom_init();
  60.134 +    if (ret)
  60.135 +        goto err;
  60.136 +
  60.137 +    ret = cpufreq_dom_dbs(CPUFREQ_GOV_START);
  60.138 +    if (ret)
  60.139 +        goto err;
  60.140 +
  60.141 +    return ret;
  60.142 +
  60.143 +err:
  60.144 +    cpufreq_dom_exit();
  60.145 +    return ret;
  60.146 +}
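
With the hunks above, P-state residency is charged as busy time only: on every transition (and at suspend), the wall-clock time since the last transition minus the idle time accumulated in the same window is added to the outgoing state, and prev_state_wall/prev_idle_wall are re-baselined; pmstat.c below applies the same formula when dom0 queries the statistics. A standalone sketch of that bookkeeping, with made-up numbers and illustrative names (px_account, account_transition):

    #include <stdint.h>
    #include <stdio.h>

    struct px_account {
        uint64_t prev_state_wall;   /* wall-clock time at last transition (ns) */
        uint64_t prev_idle_wall;    /* cumulative idle time at last transition (ns) */
        uint64_t residency[4];      /* busy residency per P-state (ns) */
    };

    /* Charge the busy portion of the elapsed window to the outgoing state. */
    static void account_transition(struct px_account *a, unsigned int from,
                                   uint64_t now, uint64_t total_idle)
    {
        uint64_t wall = now - a->prev_state_wall;
        uint64_t idle = total_idle - a->prev_idle_wall;

        a->residency[from] += wall - idle;
        a->prev_state_wall = now;
        a->prev_idle_wall  = total_idle;
    }

    int main(void)
    {
        struct px_account a = { .prev_state_wall = 1000, .prev_idle_wall = 200 };

        /* 9000 ns elapsed, of which 3000 ns were idle: P0 was busy for 6000 ns. */
        account_transition(&a, 0, 10000, 3200);
        printf("P0 residency: %llu ns\n", (unsigned long long)a.residency[0]);
        return 0;
    }
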
    61.1 --- a/xen/arch/x86/acpi/pmstat.c	Thu Jun 19 12:48:04 2008 +0900
    61.2 +++ b/xen/arch/x86/acpi/pmstat.c	Wed Jul 02 11:30:37 2008 +0900
    61.3 @@ -71,11 +71,18 @@ int do_get_pm_info(struct xen_sysctl_get
    61.4      case PMSTAT_get_pxstat:
    61.5      {
    61.6          uint64_t now, ct;
    61.7 +        uint64_t total_idle_ns;
    61.8 +        uint64_t tmp_idle_ns;
    61.9 +
   61.10 +        total_idle_ns = get_cpu_idle_time(op->cpuid);
   61.11 +        tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall;
   61.12  
   61.13          now = NOW();
   61.14          pxpt->u.usable = pmpt->perf.state_count - pmpt->perf.ppc;
   61.15          pxpt->u.pt[pxpt->u.cur].residency += now - pxpt->prev_state_wall;
   61.16 +        pxpt->u.pt[pxpt->u.cur].residency -= tmp_idle_ns;
   61.17          pxpt->prev_state_wall = now;
   61.18 +        pxpt->prev_idle_wall = total_idle_ns;
   61.19  
   61.20          ct = pmpt->perf.state_count;
   61.21          if ( copy_to_guest(op->u.getpx.trans_pt, pxpt->u.trans_pt, ct*ct) )
    62.1 --- a/xen/arch/x86/acpi/power.c	Thu Jun 19 12:48:04 2008 +0900
    62.2 +++ b/xen/arch/x86/acpi/power.c	Wed Jul 02 11:30:37 2008 +0900
    62.3 @@ -27,7 +27,7 @@
    62.4  #include <public/platform.h>
    62.5  #include <asm/tboot.h>
    62.6  
    62.7 -#define pmprintk(_l, _f, _a...) printk(_l "<PM> " _f "\n", ## _a )
    62.8 +#include <acpi/cpufreq/cpufreq.h>
    62.9  
   62.10  static char opt_acpi_sleep[20];
   62.11  string_param("acpi_sleep", opt_acpi_sleep);
   62.12 @@ -124,10 +124,12 @@ static int enter_state(u32 state)
   62.13      if ( !spin_trylock(&pm_lock) )
   62.14          return -EBUSY;
   62.15  
   62.16 -    pmprintk(XENLOG_INFO, "Preparing system for ACPI S%d state.", state);
   62.17 +    printk(XENLOG_INFO "Preparing system for ACPI S%d state.", state);
   62.18  
   62.19      freeze_domains();
   62.20  
   62.21 +    cpufreq_suspend();
   62.22 +
   62.23      disable_nonboot_cpus();
   62.24      if ( num_online_cpus() != 1 )
   62.25      {
   62.26 @@ -139,11 +141,14 @@ static int enter_state(u32 state)
   62.27  
   62.28      acpi_sleep_prepare(state);
   62.29  
   62.30 +    console_start_sync();
   62.31 +    printk("Entering ACPI S%d state.\n", state);
   62.32 +
   62.33      local_irq_save(flags);
   62.34  
   62.35      if ( (error = device_power_down()) )
   62.36      {
   62.37 -        pmprintk(XENLOG_ERR, "Some devices failed to power down.");
   62.38 +        printk(XENLOG_ERR "Some devices failed to power down.");
   62.39          goto done;
   62.40      }
   62.41  
   62.42 @@ -162,8 +167,6 @@ static int enter_state(u32 state)
   62.43          break;
   62.44      }
   62.45  
   62.46 -    pmprintk(XENLOG_DEBUG, "Back to C.");
   62.47 -
   62.48      /* Restore CR4 and EFER from cached values. */
   62.49      write_cr4(read_cr4());
   62.50      if ( cpu_has_efer )
   62.51 @@ -171,16 +174,18 @@ static int enter_state(u32 state)
   62.52  
   62.53      device_power_up();
   62.54  
   62.55 -    pmprintk(XENLOG_INFO, "Finishing wakeup from ACPI S%d state.", state);
   62.56 +    printk(XENLOG_INFO "Finishing wakeup from ACPI S%d state.", state);
   62.57  
   62.58   done:
   62.59      local_irq_restore(flags);
   62.60 +    console_end_sync();
   62.61      acpi_sleep_post(state);
   62.62      if ( !hvm_cpu_up() )
   62.63          BUG();
   62.64  
   62.65   enable_cpu:
   62.66      enable_nonboot_cpus();
   62.67 +    cpufreq_resume();
   62.68      thaw_domains();
   62.69      spin_unlock(&pm_lock);
   62.70      return error;
   62.71 @@ -206,7 +211,7 @@ int acpi_enter_sleep(struct xenpf_enter_
   62.72           ((sleep->pm1a_cnt_val ^ sleep->pm1b_cnt_val) &
   62.73            ACPI_BITMASK_SLEEP_ENABLE) )
   62.74      {
   62.75 -        pmprintk(XENLOG_ERR, "Mismatched pm1a/pm1b setting.");
   62.76 +        gdprintk(XENLOG_ERR, "Mismatched pm1a/pm1b setting.");
   62.77          return -EINVAL;
   62.78      }
   62.79  
   62.80 @@ -278,7 +283,7 @@ acpi_status asmlinkage acpi_enter_sleep_
   62.81      if ( tboot_in_measured_env() )
   62.82      {
   62.83          tboot_sleep(sleep_state);
   62.84 -        pmprintk(XENLOG_ERR, "TBOOT failed entering s3 state\n");
   62.85 +        printk(XENLOG_ERR "TBOOT failed entering s3 state\n");
   62.86          return_ACPI_STATUS(AE_ERROR);
   62.87      }
   62.88  
   62.89 @@ -320,7 +325,7 @@ static int __init acpi_sleep_init(void)
   62.90              p += strspn(p, ", \t");
   62.91      }
   62.92  
   62.93 -    printk(XENLOG_INFO "<PM> ACPI (supports");
   62.94 +    printk(XENLOG_INFO "ACPI sleep modes:");
   62.95      for ( i = 0; i < ACPI_S_STATE_COUNT; i++ )
   62.96      {
   62.97          if ( i == ACPI_STATE_S3 )
   62.98 @@ -331,7 +336,7 @@ static int __init acpi_sleep_init(void)
   62.99          else
  62.100              sleep_states[i] = 0;
  62.101      }
  62.102 -    printk(")\n");
  62.103 +    printk("\n");
  62.104  
  62.105      return 0;
  62.106  }
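
enter_state() now quiesces cpufreq before taking secondary CPUs offline and re-initialises it only after they are back online, with the console switched to synchronous mode around the actual sleep. A stub-only sketch of the resulting call order (the functions below merely print placeholders; this is not the Xen implementation):

    #include <stdio.h>

    /* Stubs only: each function prints what the real one is responsible for. */
    static void freeze_domains(void)       { puts("freeze domains"); }
    static void cpufreq_suspend(void)      { puts("stop governor, close Px accounting window"); }
    static void disable_nonboot_cpus(void) { puts("take secondary CPUs offline"); }
    static void enter_sleep_state(void)    { puts("console sync on, enter/leave Sx, console sync off"); }
    static void enable_nonboot_cpus(void)  { puts("bring secondary CPUs back online"); }
    static void cpufreq_resume(void)       { puts("re-init cpufreq domains, restart governor"); }
    static void thaw_domains(void)         { puts("thaw domains"); }

    int main(void)
    {
        freeze_domains();
        cpufreq_suspend();
        disable_nonboot_cpus();
        enter_sleep_state();
        enable_nonboot_cpus();
        cpufreq_resume();
        thaw_domains();
        return 0;
    }
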
    63.1 --- a/xen/arch/x86/hvm/emulate.c	Thu Jun 19 12:48:04 2008 +0900
    63.2 +++ b/xen/arch/x86/hvm/emulate.c	Wed Jul 02 11:30:37 2008 +0900
    63.3 @@ -21,15 +21,33 @@
    63.4  
    63.5  static int hvmemul_do_io(
    63.6      int is_mmio, paddr_t addr, unsigned long *reps, int size,
    63.7 -    paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val)
    63.8 +    paddr_t ram_gpa, int dir, int df, void *p_data)
    63.9  {
   63.10 +    paddr_t value = ram_gpa;
   63.11 +    int value_is_ptr = (p_data == NULL);
   63.12      struct vcpu *curr = current;
   63.13      vcpu_iodata_t *vio = get_ioreq(curr);
   63.14      ioreq_t *p = &vio->vp_ioreq;
   63.15      int rc;
   63.16  
   63.17 -    /* Only retrieve the value from singleton (non-REP) reads. */
   63.18 -    ASSERT((val == NULL) || ((dir == IOREQ_READ) && !value_is_ptr));
   63.19 +    /*
   63.20 +     * Weird-sized accesses have undefined behaviour: we discard writes
   63.21 +     * and read all-ones.
   63.22 +     */
   63.23 +    if ( unlikely((size > sizeof(long)) || (size & (size - 1))) )
   63.24 +    {
   63.25 +        gdprintk(XENLOG_WARNING, "bad mmio size %d\n", size);
   63.26 +        ASSERT(p_data != NULL); /* cannot happen with a REP prefix */
   63.27 +        if ( dir == IOREQ_READ )
   63.28 +            memset(p_data, ~0, size);
   63.29 +        return X86EMUL_UNHANDLEABLE;
   63.30 +    }
   63.31 +
   63.32 +    if ( (p_data != NULL) && (dir == IOREQ_WRITE) )
   63.33 +    {
   63.34 +        memcpy(&value, p_data, size);
   63.35 +        p_data = NULL;
   63.36 +    }
   63.37  
   63.38      if ( is_mmio && !value_is_ptr )
   63.39      {
   63.40 @@ -47,8 +65,7 @@ static int hvmemul_do_io(
   63.41              unsigned int bytes = curr->arch.hvm_vcpu.mmio_large_read_bytes;
   63.42              if ( (addr >= pa) && ((addr + size) <= (pa + bytes)) )
   63.43              {
   63.44 -                *val = 0;
   63.45 -                memcpy(val, &curr->arch.hvm_vcpu.mmio_large_read[addr - pa],
   63.46 +                memcpy(p_data, &curr->arch.hvm_vcpu.mmio_large_read[addr - pa],
   63.47                         size);
   63.48                  return X86EMUL_OKAY;
   63.49              }
   63.50 @@ -61,7 +78,7 @@ static int hvmemul_do_io(
   63.51          break;
   63.52      case HVMIO_completed:
   63.53          curr->arch.hvm_vcpu.io_state = HVMIO_none;
   63.54 -        if ( val == NULL )
   63.55 +        if ( p_data == NULL )
   63.56              return X86EMUL_UNHANDLEABLE;
   63.57          goto finish_access;
   63.58      case HVMIO_dispatched:
   63.59 @@ -82,7 +99,7 @@ static int hvmemul_do_io(
   63.60      }
   63.61  
   63.62      curr->arch.hvm_vcpu.io_state =
   63.63 -        (val == NULL) ? HVMIO_dispatched : HVMIO_awaiting_completion;
   63.64 +        (p_data == NULL) ? HVMIO_dispatched : HVMIO_awaiting_completion;
   63.65  
   63.66      p->dir = dir;
   63.67      p->data_is_ptr = value_is_ptr;
   63.68 @@ -116,7 +133,7 @@ static int hvmemul_do_io(
   63.69          break;
   63.70      case X86EMUL_UNHANDLEABLE:
   63.71          hvm_send_assist_req(curr);
   63.72 -        rc = (val != NULL) ? X86EMUL_RETRY : X86EMUL_OKAY;
   63.73 +        rc = (p_data != NULL) ? X86EMUL_RETRY : X86EMUL_OKAY;
   63.74          break;
   63.75      default:
   63.76          BUG();
   63.77 @@ -126,8 +143,8 @@ static int hvmemul_do_io(
   63.78          return rc;
   63.79  
   63.80   finish_access:
   63.81 -    if ( val != NULL )
   63.82 -        *val = curr->arch.hvm_vcpu.io_data;
   63.83 +    if ( p_data != NULL )
   63.84 +        memcpy(p_data, &curr->arch.hvm_vcpu.io_data, size);
   63.85  
   63.86      if ( is_mmio && !value_is_ptr )
   63.87      {
   63.88 @@ -152,7 +169,7 @@ static int hvmemul_do_io(
   63.89                    sizeof(curr->arch.hvm_vcpu.mmio_large_read)) )
   63.90              {
   63.91                  memcpy(&curr->arch.hvm_vcpu.mmio_large_read[addr - pa],
   63.92 -                       val, size);
   63.93 +                       p_data, size);
   63.94                  curr->arch.hvm_vcpu.mmio_large_read_bytes += size;
   63.95              }
   63.96          }
   63.97 @@ -163,18 +180,16 @@ static int hvmemul_do_io(
   63.98  
   63.99  static int hvmemul_do_pio(
  63.100      unsigned long port, unsigned long *reps, int size,
  63.101 -    paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val)
  63.102 +    paddr_t ram_gpa, int dir, int df, void *p_data)
  63.103  {
  63.104 -    return hvmemul_do_io(0, port, reps, size, value,
  63.105 -                         dir, df, value_is_ptr, val);
  63.106 +    return hvmemul_do_io(0, port, reps, size, ram_gpa, dir, df, p_data);
  63.107  }
  63.108  
  63.109  static int hvmemul_do_mmio(
  63.110      paddr_t gpa, unsigned long *reps, int size,
  63.111 -    paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val)
  63.112 +    paddr_t ram_gpa, int dir, int df, void *p_data)
  63.113  {
  63.114 -    return hvmemul_do_io(1, gpa, reps, size, value,
  63.115 -                         dir, df, value_is_ptr, val);
  63.116 +    return hvmemul_do_io(1, gpa, reps, size, ram_gpa, dir, df, p_data);
  63.117  }
  63.118  
  63.119  /*
  63.120 @@ -287,7 +302,7 @@ static int hvmemul_virtual_to_linear(
  63.121  static int __hvmemul_read(
  63.122      enum x86_segment seg,
  63.123      unsigned long offset,
  63.124 -    unsigned long *val,
  63.125 +    void *p_data,
  63.126      unsigned int bytes,
  63.127      enum hvm_access_type access_type,
  63.128      struct hvm_emulate_ctxt *hvmemul_ctxt)
  63.129 @@ -303,8 +318,6 @@ static int __hvmemul_read(
  63.130      if ( rc != X86EMUL_OKAY )
  63.131          return rc;
  63.132  
  63.133 -    *val = 0;
  63.134 -
  63.135      if ( unlikely(curr->arch.hvm_vcpu.mmio_gva == (addr & PAGE_MASK)) &&
  63.136           curr->arch.hvm_vcpu.mmio_gva )
  63.137      {
  63.138 @@ -314,7 +327,7 @@ static int __hvmemul_read(
  63.139          gpa = (((paddr_t)curr->arch.hvm_vcpu.mmio_gpfn << PAGE_SHIFT) | off);
  63.140          if ( (off + bytes) <= PAGE_SIZE )
  63.141              return hvmemul_do_mmio(gpa, &reps, bytes, 0,
  63.142 -                                   IOREQ_READ, 0, 0, val);
  63.143 +                                   IOREQ_READ, 0, p_data);
  63.144      }
  63.145  
  63.146      if ( (seg != x86_seg_none) &&
  63.147 @@ -322,15 +335,13 @@ static int __hvmemul_read(
  63.148          pfec |= PFEC_user_mode;
  63.149  
  63.150      rc = ((access_type == hvm_access_insn_fetch) ?
  63.151 -          hvm_fetch_from_guest_virt(val, addr, bytes, pfec) :
  63.152 -          hvm_copy_from_guest_virt(val, addr, bytes, pfec));
  63.153 +          hvm_fetch_from_guest_virt(p_data, addr, bytes, pfec) :
  63.154 +          hvm_copy_from_guest_virt(p_data, addr, bytes, pfec));
  63.155      if ( rc == HVMCOPY_bad_gva_to_gfn )
  63.156          return X86EMUL_EXCEPTION;
  63.157  
  63.158      if ( rc == HVMCOPY_bad_gfn_to_mfn )
  63.159      {
  63.160 -        unsigned long reps = 1;
  63.161 -
  63.162          if ( access_type == hvm_access_insn_fetch )
  63.163              return X86EMUL_UNHANDLEABLE;
  63.164  
  63.165 @@ -339,7 +350,7 @@ static int __hvmemul_read(
  63.166          if ( rc != X86EMUL_OKAY )
  63.167              return rc;
  63.168  
  63.169 -        return hvmemul_do_mmio(gpa, &reps, bytes, 0, IOREQ_READ, 0, 0, val);
  63.170 +        return hvmemul_do_mmio(gpa, &reps, bytes, 0, IOREQ_READ, 0, p_data);
  63.171      }
  63.172  
  63.173      return X86EMUL_OKAY;
  63.174 @@ -348,19 +359,19 @@ static int __hvmemul_read(
  63.175  static int hvmemul_read(
  63.176      enum x86_segment seg,
  63.177      unsigned long offset,
  63.178 -    unsigned long *val,
  63.179 +    void *p_data,
  63.180      unsigned int bytes,
  63.181      struct x86_emulate_ctxt *ctxt)
  63.182  {
  63.183      return __hvmemul_read(
  63.184 -        seg, offset, val, bytes, hvm_access_read,
  63.185 +        seg, offset, p_data, bytes, hvm_access_read,
  63.186          container_of(ctxt, struct hvm_emulate_ctxt, ctxt));
  63.187  }
  63.188  
  63.189  static int hvmemul_insn_fetch(
  63.190      enum x86_segment seg,
  63.191      unsigned long offset,
  63.192 -    unsigned long *val,
  63.193 +    void *p_data,
  63.194      unsigned int bytes,
  63.195      struct x86_emulate_ctxt *ctxt)
  63.196  {
  63.197 @@ -371,19 +382,18 @@ static int hvmemul_insn_fetch(
  63.198      /* Fall back if requested bytes are not in the prefetch cache. */
  63.199      if ( unlikely((insn_off + bytes) > hvmemul_ctxt->insn_buf_bytes) )
  63.200          return __hvmemul_read(
  63.201 -            seg, offset, val, bytes,
  63.202 +            seg, offset, p_data, bytes,
  63.203              hvm_access_insn_fetch, hvmemul_ctxt);
  63.204  
  63.205      /* Hit the cache. Simple memcpy. */
  63.206 -    *val = 0;
  63.207 -    memcpy(val, &hvmemul_ctxt->insn_buf[insn_off], bytes);
  63.208 +    memcpy(p_data, &hvmemul_ctxt->insn_buf[insn_off], bytes);
  63.209      return X86EMUL_OKAY;
  63.210  }
  63.211  
  63.212  static int hvmemul_write(
  63.213      enum x86_segment seg,
  63.214      unsigned long offset,
  63.215 -    unsigned long val,
  63.216 +    void *p_data,
  63.217      unsigned int bytes,
  63.218      struct x86_emulate_ctxt *ctxt)
  63.219  {
  63.220 @@ -406,29 +416,27 @@ static int hvmemul_write(
  63.221          unsigned int off = addr & (PAGE_SIZE - 1);
  63.222          gpa = (((paddr_t)curr->arch.hvm_vcpu.mmio_gpfn << PAGE_SHIFT) | off);
  63.223          if ( (off + bytes) <= PAGE_SIZE )
  63.224 -            return hvmemul_do_mmio(gpa, &reps, bytes, val,
  63.225 -                                   IOREQ_WRITE, 0, 0, NULL);
  63.226 +            return hvmemul_do_mmio(gpa, &reps, bytes, 0,
  63.227 +                                   IOREQ_WRITE, 0, p_data);
  63.228      }
  63.229  
  63.230      if ( (seg != x86_seg_none) &&
  63.231           (hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3) )
  63.232          pfec |= PFEC_user_mode;
  63.233  
  63.234 -    rc = hvm_copy_to_guest_virt(addr, &val, bytes, pfec);
  63.235 +    rc = hvm_copy_to_guest_virt(addr, p_data, bytes, pfec);
  63.236      if ( rc == HVMCOPY_bad_gva_to_gfn )
  63.237          return X86EMUL_EXCEPTION;
  63.238  
  63.239      if ( rc == HVMCOPY_bad_gfn_to_mfn )
  63.240      {
  63.241 -        unsigned long reps = 1;
  63.242 -
  63.243          rc = hvmemul_linear_to_phys(
  63.244              addr, &gpa, bytes, &reps, pfec, hvmemul_ctxt);
  63.245          if ( rc != X86EMUL_OKAY )
  63.246              return rc;
  63.247  
  63.248 -        return hvmemul_do_mmio(gpa, &reps, bytes, val,
  63.249 -                               IOREQ_WRITE, 0, 0, NULL);
  63.250 +        return hvmemul_do_mmio(gpa, &reps, bytes, 0,
  63.251 +                               IOREQ_WRITE, 0, p_data);
  63.252      }
  63.253  
  63.254      return X86EMUL_OKAY;
  63.255 @@ -442,12 +450,8 @@ static int hvmemul_cmpxchg(
  63.256      unsigned int bytes,
  63.257      struct x86_emulate_ctxt *ctxt)
  63.258  {
  63.259 -    unsigned long new = 0;
  63.260 -    if ( bytes > sizeof(new) )
  63.261 -        return X86EMUL_UNHANDLEABLE;
  63.262 -    memcpy(&new, p_new, bytes);
  63.263      /* Fix this in case the guest is really relying on r-m-w atomicity. */
  63.264 -    return hvmemul_write(seg, offset, new, bytes, ctxt);
  63.265 +    return hvmemul_write(seg, offset, p_new, bytes, ctxt);
  63.266  }
  63.267  
  63.268  static int hvmemul_rep_ins(
  63.269 @@ -480,7 +484,7 @@ static int hvmemul_rep_ins(
  63.270          return rc;
  63.271  
  63.272      return hvmemul_do_pio(src_port, reps, bytes_per_rep, gpa, IOREQ_READ,
  63.273 -                          !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL);
  63.274 +                          !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
  63.275  }
  63.276  
  63.277  static int hvmemul_rep_outs(
  63.278 @@ -513,7 +517,7 @@ static int hvmemul_rep_outs(
  63.279          return rc;
  63.280  
  63.281      return hvmemul_do_pio(dst_port, reps, bytes_per_rep, gpa, IOREQ_WRITE,
  63.282 -                          !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL);
  63.283 +                          !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
  63.284  }
  63.285  
  63.286  static int hvmemul_rep_movs(
  63.287 @@ -563,14 +567,14 @@ static int hvmemul_rep_movs(
  63.288      if ( !p2m_is_ram(p2mt) )
  63.289          return hvmemul_do_mmio(
  63.290              sgpa, reps, bytes_per_rep, dgpa, IOREQ_READ,
  63.291 -            !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL);
  63.292 +            !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
  63.293  
  63.294      (void)gfn_to_mfn_current(dgpa >> PAGE_SHIFT, &p2mt);
  63.295      if ( p2m_is_ram(p2mt) )
  63.296          return X86EMUL_UNHANDLEABLE;
  63.297      return hvmemul_do_mmio(
  63.298          dgpa, reps, bytes_per_rep, sgpa, IOREQ_WRITE,
  63.299 -        !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL);
  63.300 +        !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
  63.301  }
  63.302  
  63.303  static int hvmemul_read_segment(
  63.304 @@ -607,7 +611,8 @@ static int hvmemul_read_io(
  63.305      struct x86_emulate_ctxt *ctxt)
  63.306  {
  63.307      unsigned long reps = 1;
  63.308 -    return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_READ, 0, 0, val);
  63.309 +    *val = 0;
  63.310 +    return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_READ, 0, val);
  63.311  }
  63.312  
  63.313  static int hvmemul_write_io(
  63.314 @@ -617,7 +622,7 @@ static int hvmemul_write_io(
  63.315      struct x86_emulate_ctxt *ctxt)
  63.316  {
  63.317      unsigned long reps = 1;
  63.318 -    return hvmemul_do_pio(port, &reps, bytes, val, IOREQ_WRITE, 0, 0, NULL);
  63.319 +    return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_WRITE, 0, &val);
  63.320  }
  63.321  
  63.322  static int hvmemul_read_cr(
    64.1 --- a/xen/arch/x86/hvm/hvm.c	Thu Jun 19 12:48:04 2008 +0900
    64.2 +++ b/xen/arch/x86/hvm/hvm.c	Wed Jul 02 11:30:37 2008 +0900
    64.3 @@ -2529,6 +2529,66 @@ long do_hvm_op(unsigned long op, XEN_GUE
    64.4          break;
    64.5      }
    64.6  
    64.7 +    case HVMOP_modified_memory:
    64.8 +    {
    64.9 +        struct xen_hvm_modified_memory a;
   64.10 +        struct domain *d;
   64.11 +        unsigned long pfn;
   64.12 +
   64.13 +        if ( copy_from_guest(&a, arg, 1) )
   64.14 +            return -EFAULT;
   64.15 +
   64.16 +        if ( a.domid == DOMID_SELF )
   64.17 +        {
   64.18 +            d = rcu_lock_current_domain();
   64.19 +        }
   64.20 +        else
   64.21 +        {
   64.22 +            if ( (d = rcu_lock_domain_by_id(a.domid)) == NULL )
   64.23 +                return -ESRCH;
   64.24 +            if ( !IS_PRIV_FOR(current->domain, d) )
   64.25 +            {
   64.26 +                rc = -EPERM;
   64.27 +                goto param_fail3;
   64.28 +            }
   64.29 +        }
   64.30 +
   64.31 +        rc = -EINVAL;
   64.32 +        if ( !is_hvm_domain(d) )
   64.33 +            goto param_fail3;
   64.34 +
   64.35 +        rc = xsm_hvm_param(d, op);
   64.36 +        if ( rc )
   64.37 +            goto param_fail3;
   64.38 +
   64.39 +        rc = -EINVAL;
   64.40 +        if ( (a.first_pfn > domain_get_maximum_gpfn(d)) ||
   64.41 +             ((a.first_pfn + a.nr - 1) < a.first_pfn) ||
   64.42 +             ((a.first_pfn + a.nr - 1) > domain_get_maximum_gpfn(d)) )
   64.43 +            goto param_fail3;
   64.44 +
   64.45 +        rc = 0;
   64.46 +        if ( !paging_mode_log_dirty(d) )
   64.47 +            goto param_fail3;
   64.48 +
   64.49 +        for ( pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++ )
   64.50 +        {
   64.51 +            p2m_type_t t;
   64.52 +            mfn_t mfn = gfn_to_mfn(d, pfn, &t);
   64.53 +            if ( mfn_x(mfn) != INVALID_MFN )
   64.54 +            {
   64.55 +                paging_mark_dirty(d, mfn_x(mfn));
   64.56 +                /* These are most probably not page tables any more */
   64.57 +                /* don't take a long time and don't die either */
   64.58 +                sh_remove_shadows(d->vcpu[0], mfn, 1, 0);
   64.59 +            }
   64.60 +        }
   64.61 +
   64.62 +    param_fail3:
   64.63 +        rcu_unlock_domain(d);
   64.64 +        break;
   64.65 +    }
   64.66 +
   64.67      default:
   64.68      {
   64.69          gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op);
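
HVMOP_modified_memory lets a privileged domain report guest frames that were written outside the guest's control (for example by a device model) so they are marked dirty for log-dirty migration. Before walking the range, the handler checks that it neither wraps nor extends past the guest's maximum gpfn; a standalone sketch of that validation (range_ok is an illustrative name, and the numbers are made up):

    #include <stdint.h>
    #include <stdio.h>

    /* The range [first_pfn, first_pfn + nr - 1] must start and end within the
     * guest and must not wrap around the pfn space. */
    static int range_ok(uint64_t first_pfn, uint64_t nr, uint64_t max_gpfn)
    {
        uint64_t last_pfn = first_pfn + nr - 1;

        return (first_pfn <= max_gpfn) &&
               (last_pfn >= first_pfn) &&
               (last_pfn <= max_gpfn);
    }

    int main(void)
    {
        printf("%d\n", range_ok(0x100, 0x10, 0x1ffff));        /* 1: fits */
        printf("%d\n", range_ok(0x1fff8, 0x10, 0x1ffff));      /* 0: runs past the end */
        printf("%d\n", range_ok(0x100, UINT64_MAX, 0x1ffff));  /* 0: nr wraps the pfn space */
        return 0;
    }
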
    65.1 --- a/xen/arch/x86/hvm/vmx/vmcs.c	Thu Jun 19 12:48:04 2008 +0900
    65.2 +++ b/xen/arch/x86/hvm/vmx/vmcs.c	Wed Jul 02 11:30:37 2008 +0900
    65.3 @@ -677,10 +677,11 @@ static int construct_vmcs(struct vcpu *v
    65.4      return 0;
    65.5  }
    65.6  
    65.7 -int vmx_read_guest_msr(struct vcpu *v, u32 msr, u64 *val)
    65.8 +int vmx_read_guest_msr(u32 msr, u64 *val)
    65.9  {
   65.10 -    unsigned int i, msr_count = v->arch.hvm_vmx.msr_count;
   65.11 -    const struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area;
   65.12 +    struct vcpu *curr = current;
   65.13 +    unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count;
   65.14 +    const struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
   65.15  
   65.16      for ( i = 0; i < msr_count; i++ )
   65.17      {
   65.18 @@ -694,10 +695,11 @@ int vmx_read_guest_msr(struct vcpu *v, u
   65.19      return -ESRCH;
   65.20  }
   65.21  
   65.22 -int vmx_write_guest_msr(struct vcpu *v, u32 msr, u64 val)
   65.23 +int vmx_write_guest_msr(u32 msr, u64 val)
   65.24  {
   65.25 -    unsigned int i, msr_count = v->arch.hvm_vmx.msr_count;
   65.26 -    struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area;
   65.27 +    struct vcpu *curr = current;
   65.28 +    unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count;
   65.29 +    struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
   65.30  
   65.31      for ( i = 0; i < msr_count; i++ )
   65.32      {
   65.33 @@ -711,10 +713,20 @@ int vmx_write_guest_msr(struct vcpu *v, 
   65.34      return -ESRCH;
   65.35  }
   65.36  
   65.37 -int vmx_add_guest_msr(struct vcpu *v, u32 msr)
   65.38 +int vmx_add_guest_msr(u32 msr)
   65.39  {
   65.40 -    unsigned int i, msr_count = v->arch.hvm_vmx.msr_count;
   65.41 -    struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area;
   65.42 +    struct vcpu *curr = current;
   65.43 +    unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count;
   65.44 +    struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
   65.45 +
   65.46 +    if ( msr_area == NULL )
   65.47 +    {
   65.48 +        if ( (msr_area = alloc_xenheap_page()) == NULL )
   65.49 +            return -ENOMEM;
   65.50 +        curr->arch.hvm_vmx.msr_area = msr_area;
   65.51 +        __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(msr_area));
   65.52 +        __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
   65.53 +    }
   65.54  
   65.55      for ( i = 0; i < msr_count; i++ )
   65.56          if ( msr_area[i].index == msr )
   65.57 @@ -723,29 +735,29 @@ int vmx_add_guest_msr(struct vcpu *v, u3
   65.58      if ( msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
   65.59          return -ENOSPC;
   65.60  
   65.61 -    if ( msr_area == NULL )
   65.62 -    {
   65.63 -        if ( (msr_area = alloc_xenheap_page()) == NULL )
   65.64 -            return -ENOMEM;
   65.65 -        v->arch.hvm_vmx.msr_area = msr_area;
   65.66 -        __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(msr_area));
   65.67 -        __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
   65.68 -    }
   65.69 -
   65.70      msr_area[msr_count].index = msr;
   65.71      msr_area[msr_count].mbz   = 0;
   65.72      msr_area[msr_count].data  = 0;
   65.73 -    v->arch.hvm_vmx.msr_count = ++msr_count;
   65.74 +    curr->arch.hvm_vmx.msr_count = ++msr_count;
   65.75      __vmwrite(VM_EXIT_MSR_STORE_COUNT, msr_count);
   65.76      __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, msr_count);
   65.77  
   65.78      return 0;
   65.79  }
   65.80  
   65.81 -int vmx_add_host_load_msr(struct vcpu *v, u32 msr)
   65.82 +int vmx_add_host_load_msr(u32 msr)
   65.83  {
   65.84 -    unsigned int i, msr_count = v->arch.hvm_vmx.host_msr_count;
   65.85 -    struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.host_msr_area;
   65.86 +    struct vcpu *curr = current;
   65.87 +    unsigned int i, msr_count = curr->arch.hvm_vmx.host_msr_count;
   65.88 +    struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.host_msr_area;
   65.89 +
   65.90 +    if ( msr_area == NULL )
   65.91 +    {
   65.92 +        if ( (msr_area = alloc_xenheap_page()) == NULL )
   65.93 +            return -ENOMEM;
   65.94 +        curr->arch.hvm_vmx.host_msr_area = msr_area;
   65.95 +        __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
   65.96 +    }
   65.97  
   65.98      for ( i = 0; i < msr_count; i++ )
   65.99          if ( msr_area[i].index == msr )
  65.100 @@ -754,18 +766,10 @@ int vmx_add_host_load_msr(struct vcpu *v
  65.101      if ( msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
  65.102          return -ENOSPC;
  65.103  
  65.104 -    if ( msr_area == NULL )
  65.105 -    {
  65.106 -        if ( (msr_area = alloc_xenheap_page()) == NULL )
  65.107 -            return -ENOMEM;
  65.108 -        v->arch.hvm_vmx.host_msr_area = msr_area;
  65.109 -        __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
  65.110 -    }
  65.111 -
  65.112      msr_area[msr_count].index = msr;
  65.113      msr_area[msr_count].mbz   = 0;
  65.114      rdmsrl(msr, msr_area[msr_count].data);
  65.115 -    v->arch.hvm_vmx.host_msr_count = ++msr_count;
  65.116 +    curr->arch.hvm_vmx.host_msr_count = ++msr_count;
  65.117      __vmwrite(VM_EXIT_MSR_LOAD_COUNT, msr_count);
  65.118  
  65.119      return 0;
  65.120 @@ -776,21 +780,17 @@ int vmx_create_vmcs(struct vcpu *v)
  65.121      struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;
  65.122      int rc;
  65.123  
  65.124 -    if ( arch_vmx->vmcs == NULL )
  65.125 -    {
  65.126 -        if ( (arch_vmx->vmcs = vmx_alloc_vmcs()) == NULL )
  65.127 -            return -ENOMEM;
  65.128 +    if ( (arch_vmx->vmcs = vmx_alloc_vmcs()) == NULL )
  65.129 +        return -ENOMEM;
  65.130  
  65.131 -        INIT_LIST_HEAD(&arch_vmx->active_list);
  65.132 -        __vmpclear(virt_to_maddr(arch_vmx->vmcs));
  65.133 -        arch_vmx->active_cpu = -1;
  65.134 -        arch_vmx->launched   = 0;
  65.135 -    }
  65.136 +    INIT_LIST_HEAD(&arch_vmx->active_list);
  65.137 +    __vmpclear(virt_to_maddr(arch_vmx->vmcs));
  65.138 +    arch_vmx->active_cpu = -1;
  65.139 +    arch_vmx->launched   = 0;
  65.140  
  65.141      if ( (rc = construct_vmcs(v)) != 0 )
  65.142      {
  65.143          vmx_free_vmcs(arch_vmx->vmcs);
  65.144 -        arch_vmx->vmcs = NULL;
  65.145          return rc;
  65.146      }
  65.147  
  65.148 @@ -801,13 +801,13 @@ void vmx_destroy_vmcs(struct vcpu *v)
  65.149  {
  65.150      struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;
  65.151  
  65.152 -    if ( arch_vmx->vmcs == NULL )
  65.153 -        return;
  65.154 -
  65.155      vmx_clear_vmcs(v);
  65.156  
  65.157      vmx_free_vmcs(arch_vmx->vmcs);
  65.158 -    arch_vmx->vmcs = NULL;
  65.159 +
  65.160 +    free_xenheap_page(v->arch.hvm_vmx.host_msr_area);
  65.161 +    free_xenheap_page(v->arch.hvm_vmx.msr_area);
  65.162 +    free_xenheap_page(v->arch.hvm_vmx.msr_bitmap);
  65.163  }
  65.164  
  65.165  void vm_launch_fail(void)
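
The MSR save/load helpers above now act on current and allocate the page-sized MSR area before the duplicate and capacity checks, and vmx_destroy_vmcs() frees the MSR and bitmap pages it owns. A standalone sketch of the "allocate lazily, add each MSR at most once, never grow past one page" pattern (add_msr, area and count are illustrative names; the return values mimic -ENOMEM/-ENOSPC):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SIZE 4096

    /* One VMX MSR-area entry: MSR index, must-be-zero pad, value. */
    struct msr_entry { uint32_t index; uint32_t mbz; uint64_t data; };

    static struct msr_entry *area;
    static unsigned int count;

    static int add_msr(uint32_t msr)
    {
        if (area == NULL && (area = calloc(1, PAGE_SIZE)) == NULL)
            return -1;                              /* -ENOMEM in the real code */

        for (unsigned int i = 0; i < count; i++)
            if (area[i].index == msr)
                return 0;                           /* already tracked: no-op */

        if (count == PAGE_SIZE / sizeof(struct msr_entry))
            return -2;                              /* -ENOSPC in the real code */

        area[count].index = msr;
        area[count].mbz = 0;
        area[count].data = 0;
        count++;
        return 0;
    }

    int main(void)
    {
        int first = add_msr(0x1d9);                 /* e.g. a debug-control MSR */
        int again = add_msr(0x1d9);                 /* duplicate add is harmless */

        printf("first=%d again=%d entries=%u\n", first, again, count);
        free(area);
        return 0;
    }
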
    66.1 --- a/xen/arch/x86/hvm/vmx/vmx.c	Thu Jun 19 12:48:04 2008 +0900
    66.2 +++ b/xen/arch/x86/hvm/vmx/vmx.c	Wed Jul 02 11:30:37 2008 +0900
    66.3 @@ -1523,7 +1523,8 @@ static int vmx_cr_access(unsigned long e
    66.4          break;
    66.5      case VMX_CONTROL_REG_ACCESS_TYPE_LMSW:
    66.6          value = v->arch.hvm_vcpu.guest_cr[0];
    66.7 -        value = (value & ~0xFFFF) | ((exit_qualification >> 16) & 0xFFFF);
    66.8 +        /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. */
    66.9 +        value = (value & ~0xe) | ((exit_qualification >> 16) & 0xf);
   66.10          HVMTRACE_LONG_1D(LMSW, current, value);
   66.11          return !hvm_set_cr0(value);
   66.12      default:
   66.13 @@ -1655,7 +1656,7 @@ static int vmx_msr_read_intercept(struct
   66.14                  goto done;
   66.15          }
   66.16  
   66.17 -        if ( vmx_read_guest_msr(v, ecx, &msr_content) == 0 )
   66.18 +        if ( vmx_read_guest_msr(ecx, &msr_content) == 0 )
   66.19              break;
   66.20  
   66.21          if ( is_last_branch_msr(ecx) )
   66.22 @@ -1817,12 +1818,12 @@ static int vmx_msr_write_intercept(struc
   66.23  
   66.24              for ( ; (rc == 0) && lbr->count; lbr++ )
   66.25                  for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
   66.26 -                    if ( (rc = vmx_add_guest_msr(v, lbr->base + i)) == 0 )
   66.27 +                    if ( (rc = vmx_add_guest_msr(lbr->base + i)) == 0 )
   66.28                          vmx_disable_intercept_for_msr(v, lbr->base + i);
   66.29          }
   66.30  
   66.31          if ( (rc < 0) ||
   66.32 -             (vmx_add_host_load_msr(v, ecx) < 0) )
   66.33 +             (vmx_add_host_load_msr(ecx) < 0) )
   66.34              vmx_inject_hw_exception(v, TRAP_machine_check, 0);
   66.35          else
   66.36          {
   66.37 @@ -1842,7 +1843,7 @@ static int vmx_msr_write_intercept(struc
   66.38          switch ( long_mode_do_msr_write(regs) )
   66.39          {
   66.40              case HNDL_unhandled:
   66.41 -                if ( (vmx_write_guest_msr(v, ecx, msr_content) != 0) &&
   66.42 +                if ( (vmx_write_guest_msr(ecx, msr_content) != 0) &&
   66.43                       !is_last_branch_msr(ecx) )
   66.44                      wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
   66.45                  break;
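
The LMSW fix above tightens vmx_cr_access(): LMSW may set CR0 bits 0-3 but may clear only bits 1-3, so it can never clear PE, and the remaining CR0 bits are left untouched instead of being overwritten with the low 16 bits of the operand. A standalone sketch of the new masking (apply_lmsw is an illustrative name):

    #include <stdio.h>

    /* LMSW may set CR0 bits 0-3 and clear bits 1-3, but it cannot clear PE
     * (bit 0) and must not touch any higher CR0 bits. */
    static unsigned long apply_lmsw(unsigned long cr0, unsigned int operand)
    {
        return (cr0 & ~0xeUL) | (operand & 0xf);
    }

    int main(void)
    {
        /* Trying to clear PE leaves it set; MP/EM/TS (bits 1-3) are cleared. */
        printf("%#lx\n", apply_lmsw(0x8000003bUL, 0x0));  /* 0x80000031 */
        /* Setting the low four bits works and the upper bits are preserved. */
        printf("%#lx\n", apply_lmsw(0x80000031UL, 0xf));  /* 0x8000003f */
        return 0;
    }
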
    67.1 --- a/xen/arch/x86/hvm/vmx/vpmu_core2.c	Thu Jun 19 12:48:04 2008 +0900
    67.2 +++ b/xen/arch/x86/hvm/vmx/vpmu_core2.c	Wed Jul 02 11:30:37 2008 +0900
    67.3 @@ -219,12 +219,12 @@ static int core2_vpmu_alloc_resource(str
    67.4          return 0;
    67.5  
    67.6      wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
    67.7 -    if ( vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) )
    67.8 +    if ( vmx_add_host_load_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
    67.9          return 0;
   67.10  
   67.11 -    if ( vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) )
   67.12 +    if ( vmx_add_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
   67.13          return 0;
   67.14 -    vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, -1ULL);
   67.15 +    vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, -1ULL);
   67.16  
   67.17      pmu_enable = xmalloc_bytes(sizeof(struct core2_pmu_enable) +
   67.18                   (core2_get_pmc_count()-1)*sizeof(char));
   67.19 @@ -347,7 +347,7 @@ static int core2_vpmu_do_wrmsr(struct cp
   67.20          break;
   67.21      case MSR_CORE_PERF_FIXED_CTR_CTRL:
   67.22          non_global_ctrl = msr_content;
   67.23 -        vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl);
   67.24 +        vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl);
   67.25          global_ctrl >>= 32;
   67.26          for ( i = 0; i < 3; i++ )
   67.27          {
   67.28 @@ -359,7 +359,7 @@ static int core2_vpmu_do_wrmsr(struct cp
   67.29          break;
   67.30      default:
   67.31          tmp = ecx - MSR_P6_EVNTSEL0;
   67.32 -        vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl);
   67.33 +        vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl);
   67.34          if ( tmp >= 0 && tmp < core2_get_pmc_count() )
   67.35              core2_vpmu_cxt->pmu_enable->arch_pmc_enable[tmp] =
   67.36                  (global_ctrl >> tmp) & (msr_content >> 22) & 1;
   67.37 @@ -385,7 +385,7 @@ static int core2_vpmu_do_wrmsr(struct cp
   67.38      if ( type != MSR_TYPE_GLOBAL )
   67.39          wrmsrl(ecx, msr_content);
   67.40      else
   67.41 -        vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
   67.42 +        vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
   67.43  
   67.44      return 1;
   67.45  }
   67.46 @@ -410,7 +410,7 @@ static int core2_vpmu_do_rdmsr(struct cp
   67.47          msr_content = core2_vpmu_cxt->global_ovf_status;
   67.48          break;
   67.49      case MSR_CORE_PERF_GLOBAL_CTRL:
   67.50 -        vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, &msr_content);
   67.51 +        vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &msr_content);
   67.52          break;
   67.53      default:
   67.54          rdmsrl(regs->ecx, msr_content);
    68.1 --- a/xen/arch/x86/mm.c	Thu Jun 19 12:48:04 2008 +0900
    68.2 +++ b/xen/arch/x86/mm.c	Wed Jul 02 11:30:37 2008 +0900
    68.3 @@ -219,7 +219,7 @@ void __init arch_init_memory(void)
    68.4       * Any Xen-heap pages that we will allow to be mapped will have
    68.5       * their domain field set to dom_xen.
    68.6       */
    68.7 -    dom_xen = alloc_domain(DOMID_XEN);
    68.8 +    dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
    68.9      BUG_ON(dom_xen == NULL);
   68.10  
   68.11      /*
   68.12 @@ -227,7 +227,7 @@ void __init arch_init_memory(void)
   68.13       * This domain owns I/O pages that are within the range of the page_info
   68.14       * array. Mappings occur at the priv of the caller.
   68.15       */
   68.16 -    dom_io = alloc_domain(DOMID_IO);
   68.17 +    dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
   68.18      BUG_ON(dom_io == NULL);
   68.19  
   68.20      /* First 1MB of RAM is historically marked as I/O. */
   68.21 @@ -1933,9 +1933,15 @@ int get_page_type(struct page_info *page
   68.22          {
   68.23              struct domain *d = page_get_owner(page);
   68.24  
   68.25 -            /* Never allow a shadowed frame to go from type count 0 to 1 */
   68.26 -            if ( d && shadow_mode_enabled(d) )
   68.27 -                shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
   68.28 +            /* Normally we should never let a page go from type count 0
   68.29 +             * to type count 1 when it is shadowed. One exception:
   68.30 +             * out-of-sync shadowed pages are allowed to become
   68.31 +             * writeable. */
   68.32 +            if ( d && shadow_mode_enabled(d)
   68.33 +                 && (page->count_info & PGC_page_table)
   68.34 +                 && !((page->shadow_flags & (1u<<29))
   68.35 +                      && type == PGT_writable_page) )
   68.36 +               shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
   68.37  
   68.38              ASSERT(!(x & PGT_pae_xen_l2));
   68.39              if ( (x & PGT_type_mask) != type )
   68.40 @@ -3533,15 +3539,14 @@ struct ptwr_emulate_ctxt {
   68.41  static int ptwr_emulated_read(
   68.42      enum x86_segment seg,
   68.43      unsigned long offset,
   68.44 -    unsigned long *val,
   68.45 +    void *p_data,
   68.46      unsigned int bytes,
   68.47      struct x86_emulate_ctxt *ctxt)
   68.48  {
   68.49      unsigned int rc;
   68.50      unsigned long addr = offset;
   68.51  
   68.52 -    *val = 0;
   68.53 -    if ( (rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0 )
   68.54 +    if ( (rc = copy_from_user(p_data, (void *)addr, bytes)) != 0 )
   68.55      {
   68.56          propagate_page_fault(addr + bytes - rc, 0); /* read fault */
   68.57          return X86EMUL_EXCEPTION;
   68.58 @@ -3568,7 +3573,7 @@ static int ptwr_emulated_update(
   68.59      /* Only allow naturally-aligned stores within the original %cr2 page. */
   68.60      if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
   68.61      {
   68.62 -        MEM_LOG("Bad ptwr access (cr2=%lx, addr=%lx, bytes=%u)",
   68.63 +        MEM_LOG("ptwr_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)",
   68.64                  ptwr_ctxt->cr2, addr, bytes);
   68.65          return X86EMUL_UNHANDLEABLE;
   68.66      }
   68.67 @@ -3676,10 +3681,21 @@ static int ptwr_emulated_update(
   68.68  static int ptwr_emulated_write(
   68.69      enum x86_segment seg,
   68.70      unsigned long offset,
   68.71 -    unsigned long val,
   68.72 +    void *p_data,
   68.73      unsigned int bytes,
   68.74      struct x86_emulate_ctxt *ctxt)
   68.75  {
   68.76 +    paddr_t val = 0;
   68.77 +
   68.78 +    if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
   68.79 +    {
   68.80 +        MEM_LOG("ptwr_emulate: bad write size (addr=%lx, bytes=%u)",
   68.81 +                offset, bytes);
   68.82 +        return X86EMUL_UNHANDLEABLE;
   68.83 +    }
   68.84 +
   68.85 +    memcpy(&val, p_data, bytes);
   68.86 +
   68.87      return ptwr_emulated_update(
   68.88          offset, 0, val, bytes, 0,
   68.89          container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
   68.90 @@ -3694,10 +3710,17 @@ static int ptwr_emulated_cmpxchg(
   68.91      struct x86_emulate_ctxt *ctxt)
   68.92  {
   68.93      paddr_t old = 0, new = 0;
   68.94 -    if ( bytes > sizeof(paddr_t) )
   68.95 +
   68.96 +    if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
   68.97 +    {
   68.98 +        MEM_LOG("ptwr_emulate: bad cmpxchg size (addr=%lx, bytes=%u)",
   68.99 +                offset, bytes);
  68.100          return X86EMUL_UNHANDLEABLE;
  68.101 +    }
  68.102 +
  68.103      memcpy(&old, p_old, bytes);
  68.104      memcpy(&new, p_new, bytes);
  68.105 +
  68.106      return ptwr_emulated_update(
  68.107          offset, old, new, bytes, 1,
  68.108          container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
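
In get_page_type() above, shadows used to be removed whenever a shadowed frame went from type count 0 to 1; with out-of-sync shadows, a guest pagetable that is flagged out of sync may now gain a writable type without losing its shadows, which is what lets the guest write it directly. A standalone sketch of the new promotion rule (must_remove_shadows is an illustrative name, and SHF_out_of_sync is an assumed name for the (1u<<29) flag used in the patch):

    #include <stdio.h>

    #define SHF_out_of_sync (1u << 29)   /* assumed name for the (1u<<29) flag */

    /* Shadows must be removed on the 0->1 type transition unless the frame is
     * a pagetable currently allowed to be out of sync and the new type is a
     * plain writable page. */
    static int must_remove_shadows(int shadow_mode, int is_page_table,
                                   unsigned int shadow_flags, int becomes_writable)
    {
        return shadow_mode && is_page_table &&
               !((shadow_flags & SHF_out_of_sync) && becomes_writable);
    }

    int main(void)
    {
        printf("%d\n", must_remove_shadows(1, 1, 0, 1));               /* 1: in-sync PT */
        printf("%d\n", must_remove_shadows(1, 1, SHF_out_of_sync, 1)); /* 0: OOS PT may become writable */
        return 0;
    }
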
    69.1 --- a/xen/arch/x86/mm/shadow/common.c	Thu Jun 19 12:48:04 2008 +0900
    69.2 +++ b/xen/arch/x86/mm/shadow/common.c	Wed Jul 02 11:30:37 2008 +0900
    69.3 @@ -54,6 +54,10 @@ void shadow_domain_init(struct domain *d
    69.4      /* Use shadow pagetables for log-dirty support */
    69.5      paging_log_dirty_init(d, shadow_enable_log_dirty, 
    69.6                            shadow_disable_log_dirty, shadow_clean_dirty_bitmap);
    69.7 +
    69.8 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
    69.9 +    d->arch.paging.shadow.oos_active = 0;
   69.10 +#endif
   69.11  }
   69.12  
   69.13  /* Setup the shadow-specfic parts of a vcpu struct. Note: The most important
   69.14 @@ -64,6 +68,16 @@ void shadow_domain_init(struct domain *d
   69.15   */
   69.16  void shadow_vcpu_init(struct vcpu *v)
   69.17  {
   69.18 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
   69.19 +    int i;
   69.20 +
   69.21 +    for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
   69.22 +    {
   69.23 +        v->arch.paging.shadow.oos[i] = _mfn(INVALID_MFN);
   69.24 +        v->arch.paging.shadow.oos_snapshot[i] = _mfn(INVALID_MFN);
   69.25 +    }
   69.26 +#endif
   69.27 +
   69.28      v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
   69.29  }
   69.30  
   69.31 @@ -131,7 +145,7 @@ static int hvm_translate_linear_addr(
   69.32  static int
   69.33  hvm_read(enum x86_segment seg,
   69.34           unsigned long offset,
   69.35 -         unsigned long *val,
   69.36 +         void *p_data,
   69.37           unsigned int bytes,
   69.38           enum hvm_access_type access_type,
   69.39           struct sh_emulate_ctxt *sh_ctxt)
   69.40 @@ -144,12 +158,10 @@ hvm_read(enum x86_segment seg,
   69.41      if ( rc )
   69.42          return rc;
   69.43  
   69.44 -    *val = 0;
   69.45 -
   69.46      if ( access_type == hvm_access_insn_fetch )
   69.47 -        rc = hvm_fetch_from_guest_virt(val, addr, bytes, 0);
   69.48 +        rc = hvm_fetch_from_guest_virt(p_data, addr, bytes, 0);
   69.49      else
   69.50 -        rc = hvm_copy_from_guest_virt(val, addr, bytes, 0);
   69.51 +        rc = hvm_copy_from_guest_virt(p_data, addr, bytes, 0);
   69.52  
   69.53      switch ( rc )
   69.54      {
   69.55 @@ -167,20 +179,20 @@ hvm_read(enum x86_segment seg,
   69.56  static int
   69.57  hvm_emulate_read(enum x86_segment seg,
   69.58                   unsigned long offset,
   69.59 -                 unsigned long *val,
   69.60 +                 void *p_data,
   69.61                   unsigned int bytes,
   69.62                   struct x86_emulate_ctxt *ctxt)
   69.63  {
   69.64      if ( !is_x86_user_segment(seg) )
   69.65          return X86EMUL_UNHANDLEABLE;
   69.66 -    return hvm_read(seg, offset, val, bytes, hvm_access_read,
   69.67 +    return hvm_read(seg, offset, p_data, bytes, hvm_access_read,
   69.68                      container_of(ctxt, struct sh_emulate_ctxt, ctxt));
   69.69  }
   69.70  
   69.71  static int
   69.72  hvm_emulate_insn_fetch(enum x86_segment seg,
   69.73                         unsigned long offset,
   69.74 -                       unsigned long *val,
   69.75 +                       void *p_data,
   69.76                         unsigned int bytes,
   69.77                         struct x86_emulate_ctxt *ctxt)
   69.78  {
   69.79 @@ -192,19 +204,18 @@ hvm_emulate_insn_fetch(enum x86_segment 
   69.80  
   69.81      /* Fall back if requested bytes are not in the prefetch cache. */
   69.82      if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) )
   69.83 -        return hvm_read(seg, offset, val, bytes,
   69.84 +        return hvm_read(seg, offset, p_data, bytes,
   69.85                          hvm_access_insn_fetch, sh_ctxt);
   69.86  
   69.87      /* Hit the cache. Simple memcpy. */
   69.88 -    *val = 0;
   69.89 -    memcpy(val, &sh_ctxt->insn_buf[insn_off], bytes);
   69.90 +    memcpy(p_data, &sh_ctxt->insn_buf[insn_off], bytes);
   69.91      return X86EMUL_OKAY;
   69.92  }
   69.93  
   69.94  static int
   69.95  hvm_emulate_write(enum x86_segment seg,
   69.96                    unsigned long offset,
   69.97 -                  unsigned long val,
   69.98 +                  void *p_data,
   69.99                    unsigned int bytes,
  69.100                    struct x86_emulate_ctxt *ctxt)
  69.101  {
  69.102 @@ -227,7 +238,7 @@ hvm_emulate_write(enum x86_segment seg,
  69.103          return rc;
  69.104  
  69.105      return v->arch.paging.mode->shadow.x86_emulate_write(
  69.106 -        v, addr, &val, bytes, sh_ctxt);
  69.107 +        v, addr, p_data, bytes, sh_ctxt);
  69.108  }
  69.109  
  69.110  static int 
  69.111 @@ -279,7 +290,7 @@ static struct x86_emulate_ops hvm_shadow
  69.112  static int
  69.113  pv_emulate_read(enum x86_segment seg,
  69.114                  unsigned long offset,
  69.115 -                unsigned long *val,
  69.116 +                void *p_data,
  69.117                  unsigned int bytes,
  69.118                  struct x86_emulate_ctxt *ctxt)
  69.119  {
  69.120 @@ -288,8 +299,7 @@ pv_emulate_read(enum x86_segment seg,
  69.121      if ( !is_x86_user_segment(seg) )
  69.122          return X86EMUL_UNHANDLEABLE;
  69.123  
  69.124 -    *val = 0;
  69.125 -    if ( (rc = copy_from_user((void *)val, (void *)offset, bytes)) != 0 )
  69.126 +    if ( (rc = copy_from_user(p_data, (void *)offset, bytes)) != 0 )
  69.127      {
  69.128          propagate_page_fault(offset + bytes - rc, 0); /* read fault */
  69.129          return X86EMUL_EXCEPTION;
  69.130 @@ -301,7 +311,7 @@ pv_emulate_read(enum x86_segment seg,
  69.131  static int
  69.132  pv_emulate_write(enum x86_segment seg,
  69.133                   unsigned long offset,
  69.134 -                 unsigned long val,
  69.135 +                 void *p_data,
  69.136                   unsigned int bytes,
  69.137                   struct x86_emulate_ctxt *ctxt)
  69.138  {
  69.139 @@ -311,7 +321,7 @@ pv_emulate_write(enum x86_segment seg,
  69.140      if ( !is_x86_user_segment(seg) )
  69.141          return X86EMUL_UNHANDLEABLE;
  69.142      return v->arch.paging.mode->shadow.x86_emulate_write(
  69.143 -        v, offset, &val, bytes, sh_ctxt);
  69.144 +        v, offset, p_data, bytes, sh_ctxt);
  69.145  }
  69.146  
  69.147  static int 
  69.148 @@ -427,6 +437,585 @@ void shadow_continue_emulation(struct sh
  69.149          }
  69.150      }
  69.151  }
  69.152 + 
  69.153 +
  69.154 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
  69.155 +/**************************************************************************/
  69.156 +/* Out-of-sync shadows. */ 
  69.157 +
  69.158 +/* From time to time, we let a shadowed pagetable page go out of sync 
  69.159 + * with its shadow: the guest is allowed to write directly to the page, 
  69.160 + * and those writes are not synchronously reflected in the shadow.
  69.161 + * This lets us avoid many emulations if the guest is writing a lot to a 
  69.162 + * pagetable, but it relaxes a pretty important invariant in the shadow 
  69.163 + * pagetable design.  Therefore, some rules:
  69.164 + *
  69.165 + * 1. Only L1 pagetables may go out of sync: any page that is shadowed
   69.166 + *    at a higher level must be synchronously updated.  This makes
  69.167 + *    using linear shadow pagetables much less dangerous.
  69.168 + *    That means that: (a) unsyncing code needs to check for higher-level
  69.169 + *    shadows, and (b) promotion code needs to resync.
  69.170 + * 
  69.171 + * 2. All shadow operations on a guest page require the page to be brought
  69.172 + *    back into sync before proceeding.  This must be done under the
  69.173 + *    shadow lock so that the page is guaranteed to remain synced until
  69.174 + *    the operation completes.
  69.175 + *
  69.176 + *    Exceptions to this rule: the pagefault and invlpg handlers may 
  69.177 + *    update only one entry on an out-of-sync page without resyncing it. 
  69.178 + *
  69.179 + * 3. Operations on shadows that do not start from a guest page need to
  69.180 + *    be aware that they may be handling an out-of-sync shadow.
  69.181 + *
  69.182 + * 4. Operations that do not normally take the shadow lock (fast-path 
  69.183 + *    #PF handler, INVLPG) must fall back to a locking, syncing version 
  69.184 + *    if they see an out-of-sync table. 
  69.185 + *
  69.186 + * 5. Operations corresponding to guest TLB flushes (MOV CR3, INVLPG)
  69.187 + *    must explicitly resync all relevant pages or update their
  69.188 + *    shadows.
  69.189 + *
  69.190 + * Currently out-of-sync pages are listed in a simple open-addressed
  69.191 + * hash table with a second chance (must resist temptation to radically
  69.192 + * over-engineer hash tables...)  The virtual address of the access
  69.193 + * which caused us to unsync the page is also kept in the hash table, as
  69.194 + * a hint for finding the writable mappings later.
  69.195 + *
  69.196 + * We keep a hash per vcpu, because we want as much as possible to do
   69.197 + * the re-sync on the same vcpu we did the unsync on, so the VA hint
  69.198 + * will be valid.
  69.199 + */
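The hash described above probes at most two slots per page: the home slot is
mfn % SHADOW_OOS_PAGES, and the "second chance" is the slot after it.  A
minimal sketch of that lookup, restating the pattern used by sh_resync(),
oos_hash_remove() and oos_snapshot_lookup() below (illustrative only, not part
of this changeset):

    /* Illustrative: two-slot open-addressed probe with a second chance. */
    static int oos_hash_probe(struct vcpu *v, mfn_t gmfn)
    {
        mfn_t *oos = v->arch.paging.shadow.oos;
        int idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;      /* home slot */

        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
            idx = (idx + 1) % SHADOW_OOS_PAGES;        /* second chance */

        return (mfn_x(oos[idx]) == mfn_x(gmfn)) ? idx : -1;  /* -1: not OOS */
    }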
  69.200 +
  69.201 +
  69.202 +#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
  69.203 +static void sh_oos_audit(struct domain *d) 
  69.204 +{
  69.205 +    int idx, expected_idx, expected_idx_alt;
  69.206 +    struct page_info *pg;
  69.207 +    struct vcpu *v;
  69.208 +    
  69.209 +    for_each_vcpu(d, v) 
  69.210 +    {
  69.211 +        for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
  69.212 +        {
  69.213 +            mfn_t *oos = v->arch.paging.shadow.oos;
  69.214 +            if ( !mfn_valid(oos[idx]) )
  69.215 +                continue;
  69.216 +            
  69.217 +            expected_idx = mfn_x(oos[idx]) % SHADOW_OOS_PAGES;
  69.218 +            expected_idx_alt = ((expected_idx + 1) % SHADOW_OOS_PAGES);
  69.219 +            if ( idx != expected_idx && idx != expected_idx_alt )
  69.220 +            {
  69.221 +                printk("%s: idx %d contains gmfn %lx, expected at %d or %d.\n",
  69.222 +                       __func__, idx, mfn_x(oos[idx]), 
  69.223 +                       expected_idx, expected_idx_alt);
  69.224 +                BUG();
  69.225 +            }
  69.226 +            pg = mfn_to_page(oos[idx]);
  69.227 +            if ( !(pg->count_info & PGC_page_table) )
  69.228 +            {
  69.229 +                printk("%s: idx %x gmfn %lx not a pt (count %"PRIx32")\n",
  69.230 +                       __func__, idx, mfn_x(oos[idx]), pg->count_info);
  69.231 +                BUG();
  69.232 +            }
  69.233 +            if ( !(pg->shadow_flags & SHF_out_of_sync) )
  69.234 +            {
  69.235 +                printk("%s: idx %x gmfn %lx not marked oos (flags %lx)\n",
  69.236 +                       __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
  69.237 +                BUG();
  69.238 +            }
  69.239 +            if ( (pg->shadow_flags & SHF_page_type_mask & ~SHF_L1_ANY) )
  69.240 +            {
  69.241 +                printk("%s: idx %x gmfn %lx shadowed as non-l1 (flags %lx)\n",
  69.242 +                       __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
  69.243 +                BUG();
  69.244 +            }
  69.245 +        }
  69.246 +    }
  69.247 +}
  69.248 +#endif
  69.249 +
  69.250 +#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
  69.251 +void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn) 
  69.252 +{
  69.253 +    int idx;
  69.254 +    struct vcpu *v;
  69.255 +    mfn_t *oos;
  69.256 +
  69.257 +    ASSERT(mfn_is_out_of_sync(gmfn));
  69.258 +    
  69.259 +    for_each_vcpu(d, v) 
  69.260 +    {
  69.261 +        oos = v->arch.paging.shadow.oos;
  69.262 +        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
  69.263 +        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
  69.264 +            idx = (idx + 1) % SHADOW_OOS_PAGES;
  69.265 +        
  69.266 +        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
  69.267 +            return;
  69.268 +    }
  69.269 +
  69.270 +    SHADOW_ERROR("gmfn %lx marked OOS but not in hash table\n", mfn_x(gmfn));
  69.271 +    BUG();
  69.272 +}
  69.273 +#endif
  69.274 +
  69.275 +/* Update the shadow, but keep the page out of sync. */
  69.276 +static inline void _sh_resync_l1(struct vcpu *v, mfn_t gmfn, mfn_t snpmfn)
  69.277 +{
  69.278 +    struct page_info *pg = mfn_to_page(gmfn);
  69.279 +
  69.280 +    ASSERT(mfn_valid(gmfn));
  69.281 +    ASSERT(page_is_out_of_sync(pg));
  69.282 +
  69.283 +    /* Call out to the appropriate per-mode resyncing function */
  69.284 +    if ( pg->shadow_flags & SHF_L1_32 )
  69.285 +        SHADOW_INTERNAL_NAME(sh_resync_l1, 2)(v, gmfn, snpmfn);
  69.286 +    else if ( pg->shadow_flags & SHF_L1_PAE )
  69.287 +        SHADOW_INTERNAL_NAME(sh_resync_l1, 3)(v, gmfn, snpmfn);
  69.288 +#if CONFIG_PAGING_LEVELS >= 4
  69.289 +    else if ( pg->shadow_flags & SHF_L1_64 )
  69.290 +        SHADOW_INTERNAL_NAME(sh_resync_l1, 4)(v, gmfn, snpmfn);
  69.291 +#endif
  69.292 +}
  69.293 +
  69.294 +#define _FIXUP_IDX(_b, _i) ((_b) * SHADOW_OOS_FT_HASH + (_i))
  69.295 +
  69.296 +void oos_fixup_add(struct vcpu *v, mfn_t gmfn,
  69.297 +                   mfn_t smfn, unsigned long off)
  69.298 +{
  69.299 +    int idx, i, free = 0, free_slot = 0;
  69.300 +    struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups;
  69.301 +
  69.302 +    idx = mfn_x(gmfn) % SHADOW_OOS_FT_HASH;
  69.303 +    for ( i = 0; i < SHADOW_OOS_FT_ENTRIES; i++ )
  69.304 +    {
  69.305 +        if ( !mfn_valid(fixups[_FIXUP_IDX(idx, i)].gmfn)
  69.306 +             || !mfn_is_out_of_sync(fixups[_FIXUP_IDX(idx, i)].gmfn) )
  69.307 +        {
  69.308 +            free = 1;
  69.309 +            free_slot = _FIXUP_IDX(idx, i);
  69.310 +        }
  69.311 +        else if ( (mfn_x(fixups[_FIXUP_IDX(idx, i)].gmfn) == mfn_x(gmfn))
  69.312 +                  && (mfn_x(fixups[_FIXUP_IDX(idx, i)].smfn) == mfn_x(smfn))
  69.313 +                  && (fixups[_FIXUP_IDX(idx, i)].off == off) )
  69.314 +        {
  69.315 +            perfc_incr(shadow_oos_fixup_no_add);
  69.316 +            return;
  69.317 +        }
  69.318 +    }
  69.319 +
  69.320 +    if ( free )
  69.321 +    {
  69.322 +        if ( !v->arch.paging.shadow.oos_fixup_used )
  69.323 +            v->arch.paging.shadow.oos_fixup_used = 1;
  69.324 +        fixups[free_slot].gmfn = gmfn;
  69.325 +        fixups[free_slot].smfn = smfn;
  69.326 +        fixups[free_slot].off = off;
  69.327 +        perfc_incr(shadow_oos_fixup_add_ok);
  69.328 +        return;
  69.329 +    }
  69.330 +
  69.331 +
  69.332 +    perfc_incr(shadow_oos_fixup_add_fail);
  69.333 +}
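oos_fixup_add() is called from shadow_set_l1e() (see the multi.c hunk later in
this changeset) whenever a writable shadow l1e is installed for a page that is
allowed to be out of sync, so the writable mapping can be shot down cheaply at
resync time.  The table is a flat array of
SHADOW_OOS_FT_HASH * SHADOW_OOS_FT_ENTRIES entries; _FIXUP_IDX(bucket, slot)
computes the flat index, with the bucket always being
mfn_x(gmfn) % SHADOW_OOS_FT_HASH.  A minimal bucket scan in the same style as
the loops above (illustrative only):

    /* Illustrative: visit every fixup slot in gmfn's bucket. */
    int bucket = mfn_x(gmfn) % SHADOW_OOS_FT_HASH;
    int slot;
    for ( slot = 0; slot < SHADOW_OOS_FT_ENTRIES; slot++ )
    {
        struct oos_fixup *f = &fixups[_FIXUP_IDX(bucket, slot)];
        /* ... inspect or update f->gmfn, f->smfn, f->off here ... */
    }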
  69.334 +
  69.335 +void oos_fixup_remove(struct vcpu *v, mfn_t gmfn)
  69.336 +{
  69.337 +    int idx, i;
  69.338 +    struct domain *d = v->domain;
  69.339 +
  69.340 +    perfc_incr(shadow_oos_fixup_remove);
  69.341 +
  69.342 +    /* If the domain is dying we might get called when deallocating
  69.343 +     * the shadows. Fixup tables are already freed so exit now. */
  69.344 +    if ( d->is_dying )
  69.345 +        return;
  69.346 +
  69.347 +    idx = mfn_x(gmfn) % SHADOW_OOS_FT_HASH;
  69.348 +    for_each_vcpu(d, v)
  69.349 +    {
  69.350 +        struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups;
  69.351 +        for ( i = 0; i < SHADOW_OOS_FT_ENTRIES; i++ )
  69.352 +            if ( mfn_x(fixups[_FIXUP_IDX(idx, i)].gmfn) == mfn_x(gmfn) )
  69.353 +                fixups[_FIXUP_IDX(idx, i)].gmfn = _mfn(INVALID_MFN);
  69.354 +    }
  69.355 +}
  69.356 +
  69.357 +int oos_fixup_flush(struct vcpu *v)
  69.358 +{
  69.359 +    int i, rc = 0;
  69.360 +    struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups;
  69.361 +
  69.362 +    perfc_incr(shadow_oos_fixup_flush);
  69.363 +
  69.364 +    if ( !v->arch.paging.shadow.oos_fixup_used )
  69.365 +        return 0;
  69.366 +
  69.367 +    for ( i = 0; i < SHADOW_OOS_FT_HASH * SHADOW_OOS_FT_ENTRIES; i++ )
  69.368 +    {
  69.369 +        if ( mfn_valid(fixups[i].gmfn) )
  69.370 +        {
  69.371 +            if ( mfn_is_out_of_sync(fixups[i].gmfn) )
  69.372 +                rc |= sh_remove_write_access_from_sl1p(v, fixups[i].gmfn,
  69.373 +                                                       fixups[i].smfn,
  69.374 +                                                       fixups[i].off);
  69.375 +            fixups[i].gmfn = _mfn(INVALID_MFN);
  69.376 +        }
  69.377 +    }
  69.378 +
  69.379 +    v->arch.paging.shadow.oos_fixup_used = 0;
  69.380 +
  69.381 +    return rc;
  69.382 +}
  69.383 +
  69.384 +int oos_fixup_flush_gmfn(struct vcpu *v, mfn_t gmfn)
  69.385 +{
  69.386 +    int idx, i, rc = 0;
  69.387 +    struct domain *d = v->domain;
  69.388 +
  69.389 +    perfc_incr(shadow_oos_fixup_flush_gmfn);
  69.390 +
  69.391 +    idx = mfn_x(gmfn) % SHADOW_OOS_FT_HASH;
  69.392 +    for_each_vcpu(d, v)
  69.393 +    {
  69.394 +        struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups;
  69.395 +
  69.396 +        for ( i = 0; i < SHADOW_OOS_FT_ENTRIES; i++ )
  69.397 +        {
  69.398 +            if ( mfn_x(fixups[_FIXUP_IDX(idx, i)].gmfn) != mfn_x(gmfn) )
  69.399 +                continue;
  69.400 +
  69.401 +            rc |= sh_remove_write_access_from_sl1p(v, 
  69.402 +                                                   fixups[_FIXUP_IDX(idx,i)].gmfn,
  69.403 +                                                   fixups[_FIXUP_IDX(idx,i)].smfn,
  69.404 +                                                   fixups[_FIXUP_IDX(idx,i)].off);
  69.405 +
  69.406 +            fixups[_FIXUP_IDX(idx,i)].gmfn = _mfn(INVALID_MFN);
  69.407 +        }
  69.408 +    }
  69.409 +
  69.410 +    return rc;
  69.411 +}
  69.412 +
  69.413 +static int oos_remove_write_access(struct vcpu *v, mfn_t gmfn, unsigned long va)
  69.414 +{
  69.415 +    int ftlb = 0;
  69.416 +
  69.417 +    ftlb |= oos_fixup_flush_gmfn(v, gmfn);
  69.418 +
  69.419 +    switch ( sh_remove_write_access(v, gmfn, 0, va) )
  69.420 +    {
  69.421 +    default:
  69.422 +    case 0:
  69.423 +        break;
  69.424 +
  69.425 +    case 1:
  69.426 +        ftlb |= 1;
  69.427 +        break;
  69.428 +
  69.429 +    case -1:
  69.430 +        /* An unfindable writeable typecount has appeared, probably via a
  69.431 +         * grant table entry: can't shoot the mapping, so try to unshadow 
  69.432 +         * the page.  If that doesn't work either, the guest is granting
   69.433 +         * away its pagetables and must be killed after all.
  69.434 +         * This will flush the tlb, so we can return with no worries. */
  69.435 +        sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */);
  69.436 +        return 1;
  69.437 +    }
  69.438 +
  69.439 +    if ( ftlb )
  69.440 +        flush_tlb_mask(v->domain->domain_dirty_cpumask);
  69.441 +
  69.442 +    return 0;
  69.443 +}
  69.444 +
  69.445 +
  69.446 +/* Pull all the entries on an out-of-sync page back into sync. */
  69.447 +static void _sh_resync(struct vcpu *v, mfn_t gmfn, unsigned long va, mfn_t snp)
  69.448 +{
  69.449 +    struct page_info *pg = mfn_to_page(gmfn);
  69.450 +
  69.451 +    ASSERT(shadow_locked_by_me(v->domain));
  69.452 +    ASSERT(mfn_is_out_of_sync(gmfn));
  69.453 +    /* Guest page must be shadowed *only* as L1 when out of sync. */
  69.454 +    ASSERT(!(mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask 
  69.455 +             & ~SHF_L1_ANY));
  69.456 +    ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn)));
  69.457 +
  69.458 +    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, va=%lx\n",
  69.459 +                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
  69.460 +
  69.461 +    /* Need to pull write access so the page *stays* in sync. */
  69.462 +    if ( oos_remove_write_access(v, gmfn, va) )
  69.463 +    {
  69.464 +        /* Page has been unshadowed. */
  69.465 +        return;
  69.466 +    }
  69.467 +
  69.468 +    /* No more writable mappings of this page, please */
  69.469 +    pg->shadow_flags &= ~SHF_oos_may_write;
  69.470 +
  69.471 +    /* Update the shadows with current guest entries. */
  69.472 +    _sh_resync_l1(v, gmfn, snp);
  69.473 +
  69.474 +    /* Now we know all the entries are synced, and will stay that way */
  69.475 +    pg->shadow_flags &= ~SHF_out_of_sync;
  69.476 +    perfc_incr(shadow_resync);
  69.477 +}
  69.478 +
  69.479 +
  69.480 +/* Add an MFN to the list of out-of-sync guest pagetables */
  69.481 +static void oos_hash_add(struct vcpu *v, mfn_t gmfn, unsigned long va)
  69.482 +{
  69.483 +    int idx, oidx, swap = 0;
  69.484 +    void *gptr, *gsnpptr;
  69.485 +    mfn_t *oos = v->arch.paging.shadow.oos;
  69.486 +    unsigned long *oos_va = v->arch.paging.shadow.oos_va;
  69.487 +    mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
  69.488 +
  69.489 +    idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
  69.490 +    oidx = idx;
  69.491 +
  69.492 +    if ( mfn_valid(oos[idx]) 
  69.493 +         && (mfn_x(oos[idx]) % SHADOW_OOS_PAGES) == idx )
  69.494 +    {
  69.495 +        /* Punt the current occupant into the next slot */
  69.496 +        SWAP(oos[idx], gmfn);
  69.497 +        SWAP(oos_va[idx], va);
  69.498 +        swap = 1;
  69.499 +        idx = (idx + 1) % SHADOW_OOS_PAGES;
  69.500 +    }
  69.501 +    if ( mfn_valid(oos[idx]) )
   69.502 +    {
  69.503 +        /* Crush the current occupant. */
  69.504 +        _sh_resync(v, oos[idx], oos_va[idx], oos_snapshot[idx]);
  69.505 +        perfc_incr(shadow_unsync_evict);
  69.506 +    }
  69.507 +    oos[idx] = gmfn;
  69.508 +    oos_va[idx] = va;
  69.509 +
  69.510 +    if ( swap )
  69.511 +        SWAP(oos_snapshot[idx], oos_snapshot[oidx]);
  69.512 +
  69.513 +    gptr = sh_map_domain_page(oos[oidx]);
  69.514 +    gsnpptr = sh_map_domain_page(oos_snapshot[oidx]);
  69.515 +    memcpy(gsnpptr, gptr, PAGE_SIZE);
  69.516 +    sh_unmap_domain_page(gptr);
  69.517 +    sh_unmap_domain_page(gsnpptr);
  69.518 +}
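To make the second-chance insertion above concrete, here is a hypothetical
trace (assuming SHADOW_OOS_PAGES == 3; the real value is not shown in this
hunk).  Pages A, B and C all hash to home slot 1:

    1. A is unsynced: slot 1 is free, so oos[1] = A.
    2. B is unsynced: A sits in its home slot, so it is punted to slot 2
       (oos[1] = B, oos[2] = A) and the two snapshot pages are swapped so
       each slot's snapshot still belongs to the page it holds.
    3. C is unsynced: B is punted towards slot 2, but A already lives there,
       so A is first resynced and evicted via _sh_resync(); the end state is
       oos[1] = C, oos[2] = B.

Every out-of-sync page therefore stays within one slot of its home index,
which is why the lookup paths only ever probe idx and (idx + 1).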
  69.519 +
  69.520 +/* Remove an MFN from the list of out-of-sync guest pagetables */
  69.521 +static void oos_hash_remove(struct vcpu *v, mfn_t gmfn)
  69.522 +{
  69.523 +    int idx;
  69.524 +    mfn_t *oos;
  69.525 +    struct domain *d = v->domain;
  69.526 +
  69.527 +    SHADOW_PRINTK("D%dV%d gmfn %lx\n",
  69.528 +                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn)); 
  69.529 +
  69.530 +    for_each_vcpu(d, v) 
  69.531 +    {
  69.532 +        oos = v->arch.paging.shadow.oos;
  69.533 +        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
  69.534 +        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
  69.535 +            idx = (idx + 1) % SHADOW_OOS_PAGES;
  69.536 +        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
  69.537 +        {
  69.538 +            oos[idx] = _mfn(INVALID_MFN);
  69.539 +            return;
  69.540 +        }
  69.541 +    }
  69.542 +
  69.543 +    SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
  69.544 +    BUG();
  69.545 +}
  69.546 +
  69.547 +mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn)
  69.548 +{
  69.549 +    int idx;
  69.550 +    mfn_t *oos;
  69.551 +    mfn_t *oos_snapshot;
  69.552 +    struct domain *d = v->domain;
  69.553 +    
  69.554 +    for_each_vcpu(d, v) 
  69.555 +    {
  69.556 +        oos = v->arch.paging.shadow.oos;
  69.557 +        oos_snapshot = v->arch.paging.shadow.oos_snapshot;
  69.558 +        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
  69.559 +        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
  69.560 +            idx = (idx + 1) % SHADOW_OOS_PAGES;
  69.561 +        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
  69.562 +        {
  69.563 +            return oos_snapshot[idx];
  69.564 +        }
  69.565 +    }
  69.566 +
  69.567 +    SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
  69.568 +    BUG();
  69.569 +    return _mfn(INVALID_MFN);
  69.570 +}
  69.571 +
  69.572 +/* Pull a single guest page back into sync */
  69.573 +void sh_resync(struct vcpu *v, mfn_t gmfn)
  69.574 +{
  69.575 +    int idx;
  69.576 +    mfn_t *oos;
  69.577 +    unsigned long *oos_va;
  69.578 +    mfn_t *oos_snapshot;
  69.579 +    struct domain *d = v->domain;
  69.580 +
  69.581 +    for_each_vcpu(d, v) 
  69.582 +    {
  69.583 +        oos = v->arch.paging.shadow.oos;
  69.584 +        oos_va = v->arch.paging.shadow.oos_va;
  69.585 +        oos_snapshot = v->arch.paging.shadow.oos_snapshot;
  69.586 +        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
  69.587 +        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
  69.588 +            idx = (idx + 1) % SHADOW_OOS_PAGES;
  69.589 +        
  69.590 +        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
  69.591 +        {
  69.592 +            _sh_resync(v, gmfn, oos_va[idx], oos_snapshot[idx]);
  69.593 +            oos[idx] = _mfn(INVALID_MFN);
  69.594 +            return;
  69.595 +        }
  69.596 +    }
  69.597 +
  69.598 +    SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
  69.599 +    BUG();
  69.600 +}
  69.601 +
  69.602 +/* Figure out whether it's definitely safe not to sync this l1 table,
  69.603 + * by making a call out to the mode in which that shadow was made. */
  69.604 +static int sh_skip_sync(struct vcpu *v, mfn_t gl1mfn)
  69.605 +{
  69.606 +    struct page_info *pg = mfn_to_page(gl1mfn);
  69.607 +    if ( pg->shadow_flags & SHF_L1_32 )
  69.608 +        return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 2)(v, gl1mfn);
  69.609 +    else if ( pg->shadow_flags & SHF_L1_PAE )
  69.610 +        return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 3)(v, gl1mfn);
  69.611 +#if CONFIG_PAGING_LEVELS >= 4
  69.612 +    else if ( pg->shadow_flags & SHF_L1_64 )
  69.613 +        return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 4)(v, gl1mfn);
  69.614 +#endif
  69.615 +    SHADOW_ERROR("gmfn 0x%lx was OOS but not shadowed as an l1.\n", 
  69.616 +                 mfn_x(gl1mfn));
  69.617 +    BUG();
  69.618 +    return 0; /* BUG() is no longer __attribute__((noreturn)). */
  69.619 +}
  69.620 +
  69.621 +
  69.622 +/* Pull all out-of-sync pages back into sync.  Pages brought out of sync
  69.623 + * on other vcpus are allowed to remain out of sync, but their contents
  69.624 + * will be made safe (TLB flush semantics); pages unsynced by this vcpu
  69.625 + * are brought back into sync and write-protected.  If skip != 0, we try
  69.626 + * to avoid resyncing at all if we think we can get away with it. */
  69.627 +void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking)
  69.628 +{
  69.629 +    int idx;
  69.630 +    struct vcpu *other;
  69.631 +    mfn_t *oos = v->arch.paging.shadow.oos;
  69.632 +    unsigned long *oos_va = v->arch.paging.shadow.oos_va;
  69.633 +    mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
  69.634 +
  69.635 +    SHADOW_PRINTK("d=%d, v=%d\n", v->domain->domain_id, v->vcpu_id);
  69.636 +
  69.637 +    ASSERT(do_locking || shadow_locked_by_me(v->domain));
  69.638 +
  69.639 +    if ( !this )
  69.640 +        goto resync_others;
  69.641 +
  69.642 +    if ( do_locking )
  69.643 +        shadow_lock(v->domain);
  69.644 +
  69.645 +    if ( oos_fixup_flush(v) )
  69.646 +        flush_tlb_mask(v->domain->domain_dirty_cpumask);    
  69.647 +
  69.648 +    /* First: resync all of this vcpu's oos pages */
  69.649 +    for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) 
  69.650 +        if ( mfn_valid(oos[idx]) )
  69.651 +        {
  69.652 +            /* Write-protect and sync contents */
  69.653 +            _sh_resync(v, oos[idx], oos_va[idx], oos_snapshot[idx]);
  69.654 +            oos[idx] = _mfn(INVALID_MFN);
  69.655 +        }
  69.656 +
  69.657 +    if ( do_locking )
  69.658 +        shadow_unlock(v->domain);
  69.659 +
  69.660 + resync_others:
  69.661 +    if ( !others )
  69.662 +        return;
  69.663 +
  69.664 +    /* Second: make all *other* vcpus' oos pages safe. */
  69.665 +    for_each_vcpu(v->domain, other)
  69.666 +    {
  69.667 +        if ( v == other ) 
  69.668 +            continue;
  69.669 +
  69.670 +        if ( do_locking )
  69.671 +            shadow_lock(v->domain);
  69.672 +
  69.673 +        oos = other->arch.paging.shadow.oos;
  69.674 +        oos_va = other->arch.paging.shadow.oos_va;
  69.675 +        oos_snapshot = other->arch.paging.shadow.oos_snapshot;
  69.676 +        for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) 
  69.677 +        {
  69.678 +            if ( !mfn_valid(oos[idx]) )
  69.679 +                continue;
  69.680 +
  69.681 +            if ( skip )
  69.682 +            {
  69.683 +                /* Update the shadows and leave the page OOS. */
  69.684 +                if ( sh_skip_sync(v, oos[idx]) )
  69.685 +                    continue;
  69.686 +                _sh_resync_l1(other, oos[idx], oos_snapshot[idx]);
  69.687 +            }
  69.688 +            else
  69.689 +            {
  69.690 +                /* Write-protect and sync contents */
  69.691 +                _sh_resync(other, oos[idx], oos_va[idx], oos_snapshot[idx]);
  69.692 +                oos[idx] = _mfn(INVALID_MFN);
  69.693 +            }
  69.694 +        }
  69.695 +        
  69.696 +        if ( do_locking )
  69.697 +            shadow_unlock(v->domain);
  69.698 +    }
  69.699 +}
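The four flags let callers express both directions of the comment above.  The
calls below only illustrate the parameter meanings; they are not actual Xen
call sites (the shadow_resync_all() wrapper used elsewhere in this changeset
is not reproduced here):

    /* Illustrative parameter combinations only. */
    sh_resync_all(v, 0 /* !skip */, 1 /* this */, 1 /* others */, 1 /* lock */);
        /* Resync and write-protect everything, e.g. for a guest TLB flush. */
    sh_resync_all(v, 1 /* skip */, 0 /* !this */, 1 /* others */, 1 /* lock */);
        /* Only make other vcpus' OOS pages safe, skipping where provably safe. */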
  69.700 +
  69.701 +/* Allow a shadowed page to go out of sync */
  69.702 +int sh_unsync(struct vcpu *v, mfn_t gmfn, unsigned long va)
  69.703 +{
  69.704 +    struct page_info *pg;
  69.705 +    
  69.706 +    ASSERT(shadow_locked_by_me(v->domain));
  69.707 +
  69.708 +    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx va %lx\n",
  69.709 +                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
  69.710 +
  69.711 +    pg = mfn_to_page(gmfn);
  69.712 + 
  69.713 +    /* Guest page must be shadowed *only* as L1 and *only* once when out
   69.714 +     * of sync.  Also, get out now if it's already out of sync.
   69.715 +     * Finally, we can't safely unsync if some vcpus have paging disabled. */
  69.716 +    if ( pg->shadow_flags & 
  69.717 +         ((SHF_page_type_mask & ~SHF_L1_ANY) | SHF_out_of_sync) 
  69.718 +         || sh_page_has_multiple_shadows(pg)
  69.719 +         || !is_hvm_domain(v->domain)
  69.720 +         || !v->domain->arch.paging.shadow.oos_active )
  69.721 +        return 0;
  69.722 +
  69.723 +    pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write;
  69.724 +    oos_hash_add(v, gmfn, va);
  69.725 +    perfc_incr(shadow_unsync);
  69.726 +    return 1;
  69.727 +}
  69.728 +
  69.729 +#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
  69.730 +
  69.731  
  69.732  /**************************************************************************/
  69.733  /* Code for "promoting" a guest page to the point where the shadow code is
  69.734 @@ -440,6 +1029,12 @@ void shadow_promote(struct vcpu *v, mfn_
  69.735  
  69.736      ASSERT(mfn_valid(gmfn));
  69.737  
  69.738 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
  69.739 +    /* Is the page already shadowed and out of sync? */
  69.740 +    if ( page_is_out_of_sync(page) ) 
  69.741 +        sh_resync(v, gmfn);
  69.742 +#endif
  69.743 +
  69.744      /* We should never try to promote a gmfn that has writeable mappings */
  69.745      ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page
  69.746             || (page->u.inuse.type_info & PGT_count_mask) == 0
  69.747 @@ -463,7 +1058,17 @@ void shadow_demote(struct vcpu *v, mfn_t
  69.748      clear_bit(type, &page->shadow_flags);
  69.749  
  69.750      if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
  69.751 +    {
  69.752 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
  69.753 +        /* Was the page out of sync? */
  69.754 +        if ( page_is_out_of_sync(page) ) 
  69.755 +        {
  69.756 +            oos_hash_remove(v, gmfn);
  69.757 +            oos_fixup_remove(v, gmfn);
  69.758 +        }
  69.759 +#endif 
  69.760          clear_bit(_PGC_page_table, &page->count_info);
  69.761 +    }
  69.762  }
  69.763  
  69.764  /**************************************************************************/
  69.765 @@ -674,7 +1279,8 @@ shadow_order(unsigned int shadow_type)
  69.766          0, /* SH_type_l3_64_shadow   */
  69.767          0, /* SH_type_l4_64_shadow   */
  69.768          2, /* SH_type_p2m_table      */
  69.769 -        0  /* SH_type_monitor_table  */
  69.770 +        0, /* SH_type_monitor_table  */
  69.771 +        0  /* SH_type_oos_snapshot   */
  69.772          };
  69.773      ASSERT(shadow_type < SH_type_unused);
  69.774      return type_to_order[shadow_type];
  69.775 @@ -1220,6 +1826,14 @@ static unsigned int sh_set_allocation(st
  69.776              sp = list_entry(d->arch.paging.shadow.freelists[order].next,
  69.777                              struct shadow_page_info, list);
  69.778              list_del(&sp->list);
  69.779 +#if defined(__x86_64__)
  69.780 +            /*
  69.781 +             * Re-instate lock field which we overwrite with shadow_page_info.
  69.782 +             * This was safe, since the lock is only used on guest pages.
  69.783 +             */
  69.784 +            for ( j = 0; j < 1U << order; j++ )
  69.785 +                spin_lock_init(&((struct page_info *)sp)[j].lock);
  69.786 +#endif
  69.787              d->arch.paging.shadow.free_pages -= 1 << order;
  69.788              d->arch.paging.shadow.total_pages -= 1 << order;
  69.789              free_domheap_pages((struct page_info *)sp, order);
  69.790 @@ -1297,6 +1911,27 @@ static void sh_hash_audit_bucket(struct 
  69.791              /* Bad shadow flags on guest page? */
  69.792              BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
  69.793              /* Bad type count on guest page? */
  69.794 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
  69.795 +            if ( sp->type == SH_type_l1_32_shadow
  69.796 +                 || sp->type == SH_type_l1_pae_shadow
  69.797 +                 || sp->type == SH_type_l1_64_shadow )
  69.798 +            {
  69.799 +                if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
  69.800 +                     && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
  69.801 +                {
  69.802 +                    if ( !page_is_out_of_sync(gpg) )
  69.803 +                    {
  69.804 +                        SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
  69.805 +                                     " and not OOS but has typecount %#lx\n",
  69.806 +                                     sp->backpointer, 
  69.807 +                                     mfn_x(shadow_page_to_mfn(sp)), 
  69.808 +                                     gpg->u.inuse.type_info);
  69.809 +                        BUG();
  69.810 +                    }
  69.811 +                }
  69.812 +            }
  69.813 +            else /* Not an l1 */
  69.814 +#endif
  69.815              if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page 
  69.816                   && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
  69.817              {
  69.818 @@ -1608,7 +2243,8 @@ void sh_destroy_shadow(struct vcpu *v, m
  69.819  /* Remove all writeable mappings of a guest frame from the shadow tables 
  69.820   * Returns non-zero if we need to flush TLBs. 
   69.821  * level and fault_addr describe how we found this to be a pagetable;
  69.822 - * level==0 means we have some other reason for revoking write access.*/
  69.823 + * level==0 means we have some other reason for revoking write access.
  69.824 + * If level==0 we are allowed to fail, returning -1. */
  69.825  
  69.826  int sh_remove_write_access(struct vcpu *v, mfn_t gmfn, 
  69.827                             unsigned int level,
  69.828 @@ -1659,7 +2295,12 @@ int sh_remove_write_access(struct vcpu *
  69.829          return 0;
  69.830  
  69.831      /* Early exit if it's already a pagetable, or otherwise not writeable */
  69.832 -    if ( sh_mfn_is_a_page_table(gmfn) 
  69.833 +    if ( (sh_mfn_is_a_page_table(gmfn)
  69.834 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
  69.835 +         /* Unless they've been allowed to go out of sync with their shadows */
  69.836 +           && !mfn_oos_may_write(gmfn)
  69.837 +#endif
  69.838 +         )
  69.839           || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
  69.840          return 0;
  69.841  
  69.842 @@ -1676,7 +2317,7 @@ int sh_remove_write_access(struct vcpu *
  69.843      }
  69.844  
  69.845  #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
  69.846 -    if ( v == current && level != 0 )
  69.847 +    if ( v == current )
  69.848      {
  69.849          unsigned long gfn;
  69.850          /* Heuristic: there is likely to be only one writeable mapping,
  69.851 @@ -1690,6 +2331,8 @@ int sh_remove_write_access(struct vcpu *
  69.852                  return 1;                                                 \
  69.853          } while (0)
  69.854  
  69.855 +        if ( level == 0 && fault_addr )
  69.856 +            GUESS(fault_addr, 6);
  69.857          
  69.858          if ( v->arch.paging.mode->guest_levels == 2 )
  69.859          {
  69.860 @@ -1773,13 +2416,19 @@ int sh_remove_write_access(struct vcpu *
  69.861  #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
  69.862      
  69.863      /* Brute-force search of all the shadows, by walking the hash */
  69.864 -    perfc_incr(shadow_writeable_bf);
  69.865 +    if ( level == 0 )
  69.866 +        perfc_incr(shadow_writeable_bf_1);
  69.867 +    else
  69.868 +        perfc_incr(shadow_writeable_bf);
  69.869      hash_foreach(v, callback_mask, callbacks, gmfn);
  69.870  
  69.871      /* If that didn't catch the mapping, then there's some non-pagetable
  69.872       * mapping -- ioreq page, grant mapping, &c. */
  69.873      if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
  69.874      {
  69.875 +        if ( level == 0 )
  69.876 +            return -1;
  69.877 +
  69.878          SHADOW_ERROR("can't remove write access to mfn %lx: guest has "
  69.879                        "%lu special-use mappings of it\n", mfn_x(gmfn),
  69.880                        (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
  69.881 @@ -1790,7 +2439,34 @@ int sh_remove_write_access(struct vcpu *
  69.882      return 1;
  69.883  }
  69.884  
  69.885 -
  69.886 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
  69.887 +int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
  69.888 +                                     mfn_t smfn, unsigned long off)
  69.889 +{
  69.890 +    struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
  69.891 +    
  69.892 +    ASSERT(mfn_valid(smfn));
  69.893 +    ASSERT(mfn_valid(gmfn));
  69.894 +    
  69.895 +    if ( sp->type == SH_type_l1_32_shadow )
  69.896 +    {
  69.897 +        return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,2)
  69.898 +            (v, gmfn, smfn, off);
  69.899 +    }
  69.900 +#if CONFIG_PAGING_LEVELS >= 3
  69.901 +    else if ( sp->type == SH_type_l1_pae_shadow )
  69.902 +        return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,3)
  69.903 +            (v, gmfn, smfn, off);
  69.904 +#if CONFIG_PAGING_LEVELS >= 4
  69.905 +    else if ( sp->type == SH_type_l1_64_shadow )
  69.906 +        return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,4)
  69.907 +            (v, gmfn, smfn, off);
  69.908 +#endif
  69.909 +#endif
  69.910 +    
  69.911 +    return 0;
  69.912 +}
  69.913 +#endif 
  69.914  
  69.915  /**************************************************************************/
  69.916  /* Remove all mappings of a guest frame from the shadow tables.
  69.917 @@ -2127,6 +2803,36 @@ static void sh_update_paging_modes(struc
  69.918      }
  69.919  #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
  69.920  
  69.921 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
  69.922 +    if ( v->arch.paging.shadow.oos_fixups == NULL )
  69.923 +    {
  69.924 +        int i;
  69.925 +        v->arch.paging.shadow.oos_fixups =
  69.926 +            alloc_xenheap_pages(SHADOW_OOS_FT_ORDER);
  69.927 +        if ( v->arch.paging.shadow.oos_fixups == NULL )
  69.928 +        {
  69.929 +            SHADOW_ERROR("Could not allocate OOS fixup table"
  69.930 +                         " for dom %u vcpu %u\n",
  69.931 +                         v->domain->domain_id, v->vcpu_id);
  69.932 +            domain_crash(v->domain);
  69.933 +            return;
  69.934 +        }
  69.935 +        for ( i = 0; i < SHADOW_OOS_FT_HASH * SHADOW_OOS_FT_ENTRIES; i++ )
  69.936 +            v->arch.paging.shadow.oos_fixups[i].gmfn = _mfn(INVALID_MFN);
  69.937 +    }
  69.938 +     
  69.939 +    if ( mfn_x(v->arch.paging.shadow.oos_snapshot[0]) == INVALID_MFN )
  69.940 +    {
  69.941 +        int i;
  69.942 +        for(i = 0; i < SHADOW_OOS_PAGES; i++)
  69.943 +        {
  69.944 +            shadow_prealloc(d, SH_type_oos_snapshot, 1);
  69.945 +            v->arch.paging.shadow.oos_snapshot[i] =
  69.946 +                shadow_alloc(d, SH_type_oos_snapshot, 0);
  69.947 +        }
  69.948 +    }
  69.949 +#endif /* OOS */
  69.950 +
  69.951      // Valid transitions handled by this function:
  69.952      // - For PV guests:
  69.953      //     - after a shadow mode has been changed
  69.954 @@ -2159,6 +2865,13 @@ static void sh_update_paging_modes(struc
  69.955          ASSERT(shadow_mode_translate(d));
  69.956          ASSERT(shadow_mode_external(d));
  69.957  
  69.958 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
  69.959 +        /* Need to resync all our pages now, because if a page goes out
  69.960 +         * of sync with paging enabled and is resynced with paging
  69.961 +         * disabled, the resync will go wrong. */
  69.962 +        shadow_resync_all(v, 0);
  69.963 +#endif /* OOS */
  69.964 +
  69.965          if ( !hvm_paging_enabled(v) )
  69.966          {
  69.967              /* When the guest has CR0.PG clear, we provide a 32-bit, non-PAE
  69.968 @@ -2254,6 +2967,27 @@ static void sh_update_paging_modes(struc
  69.969          //        This *does* happen, at least for CR4.PGE...
  69.970      }
  69.971  
  69.972 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
  69.973 +    /* We need to check that all the vcpus have paging enabled to
  69.974 +     * unsync PTs. */
  69.975 +    if ( is_hvm_domain(d) )
  69.976 +    {
  69.977 +        int pe = 1;
  69.978 +        struct vcpu *vptr;
  69.979 +
  69.980 +        for_each_vcpu(d, vptr)
  69.981 +        {
  69.982 +            if ( !hvm_paging_enabled(vptr) )
  69.983 +            {
  69.984 +                pe = 0;
  69.985 +                break;
  69.986 +            }
  69.987 +        }
  69.988 +
  69.989 +        d->arch.paging.shadow.oos_active = pe;
  69.990 +    }
  69.991 +#endif /* OOS */
  69.992 +
  69.993      v->arch.paging.mode->update_cr3(v, 0);
  69.994  }
  69.995  
  69.996 @@ -2426,17 +3160,36 @@ void shadow_teardown(struct domain *d)
  69.997          }
  69.998      }
  69.999  
 69.1000 -#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) 
 69.1001 +#if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC))
 69.1002      /* Free the virtual-TLB array attached to each vcpu */
 69.1003      for_each_vcpu(d, v)
 69.1004      {
 69.1005 +#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
 69.1006          if ( v->arch.paging.vtlb )
 69.1007          {
 69.1008              xfree(v->arch.paging.vtlb);
 69.1009              v->arch.paging.vtlb = NULL;
 69.1010          }
 69.1011 +#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
 69.1012 +
 69.1013 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
 69.1014 +        if ( v->arch.paging.shadow.oos_fixups )
 69.1015 +        {
 69.1016 +            free_xenheap_pages(v->arch.paging.shadow.oos_fixups,
 69.1017 +                               SHADOW_OOS_FT_ORDER);
 69.1018 +            v->arch.paging.shadow.oos_fixups = NULL;
 69.1019 +        }
 69.1020 +
 69.1021 +        {
 69.1022 +            int i;
 69.1023 +            mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
 69.1024 +            for(i = 0; i < SHADOW_OOS_PAGES; i++)
 69.1025 +                if ( mfn_valid(oos_snapshot[i]) )
 69.1026 +                    shadow_free(d, oos_snapshot[i]);
 69.1027 +        }
 69.1028 +#endif /* OOS */
 69.1029      }
 69.1030 -#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
 69.1031 +#endif /* (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) */
 69.1032  
 69.1033      list_for_each_safe(entry, n, &d->arch.paging.shadow.p2m_freelist)
 69.1034      {
 69.1035 @@ -3044,7 +3797,11 @@ void shadow_audit_tables(struct vcpu *v)
 69.1036  
 69.1037      if ( !(SHADOW_AUDIT_ENABLE) )
 69.1038          return;
 69.1039 -    
 69.1040 +
 69.1041 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
 69.1042 +    sh_oos_audit(v->domain);
 69.1043 +#endif
 69.1044 +
 69.1045      if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
 69.1046          mask = ~1; /* Audit every table in the system */
 69.1047      else 
    70.1 --- a/xen/arch/x86/mm/shadow/multi.c	Thu Jun 19 12:48:04 2008 +0900
    70.2 +++ b/xen/arch/x86/mm/shadow/multi.c	Wed Jul 02 11:30:37 2008 +0900
    70.3 @@ -305,22 +305,54 @@ shadow_check_gwalk(struct vcpu *v, unsig
    70.4  }
    70.5  
    70.6  /* Remove write access permissions from a gwalk_t in a batch, and
    70.7 - * return OR-ed result for TLB flush hint
     70.8 + * return an OR-ed result indicating whether a TLB flush is needed and
     70.9 + * whether the guest pages must be re-walked.
   70.10 + *
    70.11 + * Syncing a page will remove write access to that page; but it may
   70.12 + * also give write access to other pages in the path. If we resync any
   70.13 + * pages, re-walk from the beginning.
   70.14   */
   70.15 +#define GW_RMWR_FLUSHTLB 1
   70.16 +#define GW_RMWR_REWALK   2
   70.17 +
   70.18  static inline uint32_t
   70.19  gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
   70.20  {
   70.21 -    int rc = 0;
   70.22 +    uint32_t rc = 0;
   70.23  
   70.24  #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
   70.25  #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
   70.26 -    rc = sh_remove_write_access(v, gw->l3mfn, 3, va);
   70.27 -#endif
   70.28 -    rc |= sh_remove_write_access(v, gw->l2mfn, 2, va);
   70.29 -#endif
   70.30 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
   70.31 +    if ( mfn_is_out_of_sync(gw->l3mfn) )
   70.32 +    {
   70.33 +        sh_resync(v, gw->l3mfn);
   70.34 +        rc = GW_RMWR_REWALK;
   70.35 +    }
   70.36 +    else
   70.37 +#endif /* OOS */
   70.38 +     if ( sh_remove_write_access(v, gw->l3mfn, 3, va) )
   70.39 +         rc = GW_RMWR_FLUSHTLB;
   70.40 +#endif /* GUEST_PAGING_LEVELS >= 4 */
   70.41 +
   70.42 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
   70.43 +    if ( mfn_is_out_of_sync(gw->l2mfn) )
   70.44 +    {
   70.45 +        sh_resync(v, gw->l2mfn);
   70.46 +        rc |= GW_RMWR_REWALK;
   70.47 +    }
   70.48 +    else
   70.49 +#endif /* OOS */
   70.50 +    if ( sh_remove_write_access(v, gw->l2mfn, 2, va) )
   70.51 +        rc |= GW_RMWR_FLUSHTLB;
   70.52 +#endif /* GUEST_PAGING_LEVELS >= 3 */
   70.53 +
   70.54      if ( !(guest_supports_superpages(v) &&
   70.55 -           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
   70.56 -        rc |= sh_remove_write_access(v, gw->l1mfn, 1, va);
   70.57 +           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE))
   70.58 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
   70.59 +         && !mfn_is_out_of_sync(gw->l1mfn)
   70.60 +#endif /* OOS */
   70.61 +         && sh_remove_write_access(v, gw->l1mfn, 1, va) )
   70.62 +        rc |= GW_RMWR_FLUSHTLB;
   70.63  
   70.64      return rc;
   70.65  }
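Callers are expected to test the two result bits independently; the page-fault
handler later in this changeset does exactly that, flushing the TLB for
GW_RMWR_FLUSHTLB and restarting the guest walk for GW_RMWR_REWALK.  A
condensed sketch of that pattern (the real code also drops the shadow lock
before re-walking):

    rc = gw_remove_write_accesses(v, va, &gw);
    if ( rc & GW_RMWR_FLUSHTLB )    /* write access was removed somewhere */
        flush_tlb_mask(d->domain_dirty_cpumask);
    if ( rc & GW_RMWR_REWALK )      /* a resync may have changed the walk */
        goto rewalk;                /* redo guest_walk_tables() */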
   70.66 @@ -882,7 +914,12 @@ static always_inline void
   70.67      
   70.68      // protect guest page tables
   70.69      //
   70.70 -    if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) )
   70.71 +    if ( unlikely((level == 1) 
   70.72 +                  && sh_mfn_is_a_page_table(target_mfn)
   70.73 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
   70.74 +                  && !mfn_oos_may_write(target_mfn)
   70.75 +#endif /* OOS */
   70.76 +                  ) )
   70.77      {
   70.78          if ( shadow_mode_trap_reads(d) )
   70.79          {
   70.80 @@ -1125,6 +1162,9 @@ static int shadow_set_l4e(struct vcpu *v
   70.81              domain_crash(v->domain);
   70.82              return SHADOW_SET_ERROR;
   70.83          }
   70.84 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
   70.85 +        shadow_resync_all(v, 0);
   70.86 +#endif
   70.87      }
   70.88  
   70.89      /* Write the new entry */
   70.90 @@ -1163,12 +1203,17 @@ static int shadow_set_l3e(struct vcpu *v
   70.91               | (((unsigned long)sl3e) & ~PAGE_MASK));
   70.92      
   70.93      if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
   70.94 +    {
   70.95          /* About to install a new reference */        
   70.96          if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
   70.97          {
   70.98              domain_crash(v->domain);
   70.99              return SHADOW_SET_ERROR;
  70.100 -        } 
  70.101 +        }
  70.102 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
  70.103 +        shadow_resync_all(v, 0);
  70.104 +#endif
  70.105 +    }
  70.106  
  70.107      /* Write the new entry */
  70.108      shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
  70.109 @@ -1219,12 +1264,29 @@ static int shadow_set_l2e(struct vcpu *v
  70.110               | (((unsigned long)sl2e) & ~PAGE_MASK));
  70.111  
  70.112      if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT ) 
  70.113 +    {
  70.114 +        mfn_t sl1mfn = shadow_l2e_get_mfn(new_sl2e);
  70.115 +
  70.116          /* About to install a new reference */
  70.117 -        if ( !sh_get_ref(v, shadow_l2e_get_mfn(new_sl2e), paddr) )
  70.118 +        if ( !sh_get_ref(v, sl1mfn, paddr) )
  70.119          {
  70.120              domain_crash(v->domain);
  70.121              return SHADOW_SET_ERROR;
  70.122 -        } 
  70.123 +        }
  70.124 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
  70.125 +        {
  70.126 +            struct shadow_page_info *sp = mfn_to_shadow_page(sl1mfn);
  70.127 +            mfn_t gl1mfn = _mfn(sp->backpointer);
  70.128 +
  70.129 +            /* If the shadow is a fl1 then the backpointer contains
  70.130 +               the GFN instead of the GMFN, and it's definitely not
  70.131 +               OOS. */
  70.132 +            if ( (sp->type != SH_type_fl1_shadow) && mfn_valid(gl1mfn)
  70.133 +                 && mfn_is_out_of_sync(gl1mfn) )
  70.134 +                sh_resync(v, gl1mfn);
  70.135 +        }
  70.136 +#endif
  70.137 +    }
  70.138  
  70.139      /* Write the new entry */
  70.140  #if GUEST_PAGING_LEVELS == 2
  70.141 @@ -1347,6 +1409,9 @@ static int shadow_set_l1e(struct vcpu *v
  70.142      int flags = 0;
  70.143      struct domain *d = v->domain;
  70.144      shadow_l1e_t old_sl1e;
  70.145 +#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
  70.146 +    mfn_t new_gmfn = shadow_l1e_get_mfn(new_sl1e);
  70.147 +#endif
  70.148      ASSERT(sl1e != NULL);
  70.149      
  70.150      old_sl1e = *sl1e;
  70.151 @@ -1363,8 +1428,18 @@ static int shadow_set_l1e(struct vcpu *v
  70.152                  /* Doesn't look like a pagetable. */
  70.153                  flags |= SHADOW_SET_ERROR;
  70.154                  new_sl1e = shadow_l1e_empty();
  70.155 -            } else {
  70.156 +            }
  70.157 +            else
  70.158 +            {
  70.159                  shadow_vram_get_l1e(new_sl1e, sl1e, sl1mfn, d);
  70.160 +#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
  70.161 +                if ( mfn_valid(new_gmfn) && mfn_oos_may_write(new_gmfn)
  70.162 +                     && (shadow_l1e_get_flags(new_sl1e) & _PAGE_RW) )
  70.163 +                {
  70.164 +                    oos_fixup_add(v, new_gmfn, sl1mfn, pgentry_ptr_to_slot(sl1e));
  70.165 +                }
  70.166 +#endif
  70.167 +
  70.168              }
  70.169          }
  70.170      } 
  70.171 @@ -2532,6 +2607,9 @@ static int validate_gl1e(struct vcpu *v,
  70.172      mfn_t gmfn;
  70.173      p2m_type_t p2mt;
  70.174      int result = 0;
  70.175 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
  70.176 +    mfn_t gl1mfn;
  70.177 +#endif /* OOS */
  70.178  
  70.179      perfc_incr(shadow_validate_gl1e_calls);
  70.180  
  70.181 @@ -2539,11 +2617,139 @@ static int validate_gl1e(struct vcpu *v,
  70.182      gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
  70.183  
  70.184      l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt);
  70.185 -    
  70.186      result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
  70.187 +
  70.188 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
  70.189 +    gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
  70.190 +    if ( mfn_valid(gl1mfn) 
  70.191 +         && mfn_is_out_of_sync(gl1mfn) )
  70.192 +    {
  70.193 +        /* Update the OOS snapshot. */
  70.194 +        mfn_t snpmfn = oos_snapshot_lookup(v, gl1mfn);
  70.195 +        guest_l1e_t *snp;
  70.196 +
  70.197 +        ASSERT(mfn_valid(snpmfn));
  70.198 +
  70.199 +        snp = sh_map_domain_page(snpmfn);
  70.200 +        snp[guest_index(new_ge)] = new_gl1e;
  70.201 +        sh_unmap_domain_page(snp);
  70.202 +    }
  70.203 +#endif /* OOS */
  70.204 +
  70.205      return result;
  70.206  }
  70.207  
  70.208 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
  70.209 +/**************************************************************************/
  70.210 +/* Special validation function for re-syncing out-of-sync shadows. 
  70.211 + * Walks the *shadow* page, and for every entry that it finds,
  70.212 + * revalidates the guest entry that corresponds to it.
  70.213 + * N.B. This function is called with the vcpu that unsynced the page,
  70.214 + *      *not* the one that is causing it to be resynced. */
  70.215 +void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn)
  70.216 +{
  70.217 +    mfn_t sl1mfn;
  70.218 +    shadow_l1e_t *sl1p;
  70.219 +    guest_l1e_t *gl1p, *gp, *snp;
  70.220 +    int rc = 0;
  70.221 +
  70.222 +    ASSERT(mfn_valid(snpmfn));
  70.223 +
  70.224 +    sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
  70.225 +    ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */
  70.226 +
  70.227 +    snp = sh_map_domain_page(snpmfn);
  70.228 +    gp = sh_map_domain_page(gl1mfn);
  70.229 +    gl1p = gp;
  70.230 +
   70.231 +    SHADOW_FOREACH_L1E(sl1mfn, sl1p, &gl1p, 0, {
  70.232 +        guest_l1e_t gl1e = *gl1p;
  70.233 +        guest_l1e_t *snpl1p = (guest_l1e_t *)snp + guest_index(gl1p);
  70.234 +
  70.235 +        if ( memcmp(snpl1p, &gl1e, sizeof(gl1e)) )
  70.236 +        {
  70.237 +            gfn_t gfn;
  70.238 +            mfn_t gmfn;
  70.239 +            p2m_type_t p2mt;
  70.240 +            shadow_l1e_t nsl1e;
  70.241 +
  70.242 +            gfn = guest_l1e_get_gfn(gl1e);
  70.243 +            gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
  70.244 +            l1e_propagate_from_guest(v, gl1e, gmfn, &nsl1e, ft_prefetch, p2mt);
  70.245 +            rc |= shadow_set_l1e(v, sl1p, nsl1e, sl1mfn);
  70.246 +            
  70.247 +            *snpl1p = gl1e;
  70.248 +        }
  70.249 +    });
  70.250 +
  70.251 +    sh_unmap_domain_page(gp);
  70.252 +    sh_unmap_domain_page(snp);
  70.253 +
  70.254 +    /* Setting shadow L1 entries should never need us to flush the TLB */
  70.255 +    ASSERT(!(rc & SHADOW_SET_FLUSH));
  70.256 +}
  70.257 +
  70.258 +/* Figure out whether it's definitely safe not to sync this l1 table. 
  70.259 + * That is: if we can tell that it's only used once, and that the 
  70.260 + * toplevel shadow responsible is not one of ours. 
  70.261 + * N.B. This function is called with the vcpu that required the resync, 
  70.262 + *      *not* the one that originally unsynced the page, but it is
  70.263 + *      called in the *mode* of the vcpu that unsynced it.  Clear?  Good. */
  70.264 +int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
  70.265 +{
  70.266 +    struct shadow_page_info *sp;
  70.267 +    mfn_t smfn;
  70.268 +
  70.269 +    smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
  70.270 +    ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
  70.271 +    
  70.272 +    /* Up to l2 */
  70.273 +    sp = mfn_to_shadow_page(smfn);
  70.274 +    if ( sp->count != 1 || !sp->up )
  70.275 +        return 0;
  70.276 +    smfn = _mfn(sp->up >> PAGE_SHIFT);
  70.277 +    ASSERT(mfn_valid(smfn));
  70.278 +
  70.279 +#if (SHADOW_PAGING_LEVELS == 4) 
  70.280 +    /* up to l3 */
  70.281 +    sp = mfn_to_shadow_page(smfn);
  70.282 +    if ( sp->count != 1 || !sp->up )
  70.283 +        return 0;
  70.284 +    smfn = _mfn(sp->up >> PAGE_SHIFT);
  70.285 +    ASSERT(mfn_valid(smfn));
  70.286 +
  70.287 +    /* up to l4 */
  70.288 +    sp = mfn_to_shadow_page(smfn);
  70.289 +    if ( sp->count != 1 
  70.290 +         || sh_type_is_pinnable(v, SH_type_l3_64_shadow) || !sp->up )
  70.291 +        return 0;
  70.292 +    smfn = _mfn(sp->up >> PAGE_SHIFT);
  70.293 +    ASSERT(mfn_valid(smfn));
  70.294 +
  70.295 +#if (GUEST_PAGING_LEVELS == 2)
  70.296 +    /* In 2-on-3 shadow mode the up pointer contains the link to the
  70.297 +     * shadow page, but the shadow_table contains only the first of the
   70.298 +     * four pages that make up the PAE top shadow tables. */
  70.299 +    smfn = _mfn(mfn_x(smfn) & ~0x3UL);
  70.300 +#endif
  70.301 +
  70.302 +#endif
  70.303 +
  70.304 +    if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn)
  70.305 +#if (SHADOW_PAGING_LEVELS == 3) 
  70.306 +         || pagetable_get_pfn(v->arch.shadow_table[1]) == mfn_x(smfn)
  70.307 +         || pagetable_get_pfn(v->arch.shadow_table[2]) == mfn_x(smfn)
  70.308 +         || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn) 
  70.309 +#endif
  70.310 +        )
  70.311 +        return 0;
  70.312 +    
  70.313 +    /* Only in use in one toplevel shadow, and it's not the one we're 
  70.314 +     * running on */
  70.315 +    return 1;
  70.316 +}
  70.317 +#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
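In other words, sh_safe_not_to_sync() climbs the single chain of up pointers
from the l1 shadow to the top-level shadow and then checks that top level
against the vcpu's own shadow_table[] slots.  Schematically, for a 4-level
guest (every intermediate count must be exactly 1, as required above):

    /* Illustrative chain only:
     *   sl1 (count==1) --up--> sl2 (count==1) --up--> sl3 (count==1)
     *       --up--> sl4
     * Safe to skip the sync iff that sl4 is none of v->arch.shadow_table[*].
     */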
  70.318 +
  70.319  
  70.320  /**************************************************************************/
  70.321  /* Functions which translate and install the shadows of arbitrary guest 
  70.322 @@ -2725,6 +2931,10 @@ static void sh_prefetch(struct vcpu *v, 
  70.323      shadow_l1e_t sl1e;
  70.324      u32 gflags;
  70.325      p2m_type_t p2mt;
  70.326 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
  70.327 +    guest_l1e_t *snpl1p = NULL;
  70.328 +#endif /* OOS */
  70.329 +
  70.330  
  70.331      /* Prefetch no further than the end of the _shadow_ l1 MFN */
  70.332      dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
  70.333 @@ -2737,6 +2947,17 @@ static void sh_prefetch(struct vcpu *v, 
  70.334          /* Normal guest page; grab the next guest entry */
  70.335          gl1p = sh_map_domain_page(gw->l1mfn);
  70.336          gl1p += guest_l1_table_offset(gw->va);
  70.337 +
  70.338 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
  70.339 +        if ( mfn_is_out_of_sync(gw->l1mfn) )
  70.340 +        {
  70.341 +            mfn_t snpmfn = oos_snapshot_lookup(v, gw->l1mfn);
  70.342 +
  70.343 +            ASSERT(mfn_valid(snpmfn));
  70.344 +            snpl1p = sh_map_domain_page(snpmfn);
  70.345 +            snpl1p += guest_l1_table_offset(gw->va);
  70.346 +        }
  70.347 +#endif /* OOS */
  70.348      }
  70.349  
  70.350      for ( i = 1; i < dist ; i++ ) 
  70.351 @@ -2774,9 +2995,18 @@ static void sh_prefetch(struct vcpu *v, 
  70.352          /* Propagate the entry.  */
  70.353          l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt);
  70.354          (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
  70.355 +
  70.356 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
  70.357 +        if ( snpl1p != NULL )
  70.358 +            snpl1p[i] = gl1e;
  70.359 +#endif /* OOS */
  70.360      }
  70.361      if ( gl1p != NULL )
  70.362          sh_unmap_domain_page(gl1p);
  70.363 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
  70.364 +    if ( snpl1p != NULL )
  70.365 +        sh_unmap_domain_page(snpl1p);
  70.366 +#endif /* OOS */
  70.367  }
  70.368  
  70.369  #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
  70.370 @@ -2805,6 +3035,7 @@ static int sh_page_fault(struct vcpu *v,
  70.371      int r;
  70.372      fetch_type_t ft = 0;
  70.373      p2m_type_t p2mt;
  70.374 +    uint32_t rc;
  70.375  #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
  70.376      int fast_emul = 0;
  70.377  #endif
  70.378 @@ -2830,6 +3061,17 @@ static int sh_page_fault(struct vcpu *v,
  70.379          {
  70.380              fast_emul = 1;
  70.381              gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn);
  70.382 +
  70.383 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
  70.384 +            /* Fall back to the slow path if we're trying to emulate
  70.385 +               writes to an out of sync page. */
  70.386 +            if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) )
  70.387 +            {
  70.388 +                v->arch.paging.last_write_emul_ok = 0;
  70.389 +                goto page_fault_slow_path;
  70.390 +            }
  70.391 +#endif /* OOS */
  70.392 +
  70.393              perfc_incr(shadow_fault_fast_emulate);
  70.394              goto early_emulation;
  70.395          }
  70.396 @@ -2855,6 +3097,31 @@ static int sh_page_fault(struct vcpu *v,
  70.397                                        sizeof(sl1e)) == 0)
  70.398                      && sh_l1e_is_magic(sl1e)) )
  70.399          {
  70.400 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
  70.401 +             /* First, need to check that this isn't an out-of-sync
  70.402 +              * shadow l1e.  If it is, we fall back to the slow path, which
  70.403 +              * will sync it up again. */
  70.404 +            {
  70.405 +                shadow_l2e_t sl2e;
  70.406 +                mfn_t gl1mfn;
  70.407 +               if ( (__copy_from_user(&sl2e,
  70.408 +                                       (sh_linear_l2_table(v)
  70.409 +                                        + shadow_l2_linear_offset(va)),
  70.410 +                                       sizeof(sl2e)) != 0)
  70.411 +                     || !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT)
  70.412 +                     || !mfn_valid(gl1mfn = _mfn(mfn_to_shadow_page(
  70.413 +                                      shadow_l2e_get_mfn(sl2e))->backpointer))
  70.414 +                     || unlikely(mfn_is_out_of_sync(gl1mfn)) )
  70.415 +               {
  70.416 +                   /* Hit the slow path as if there had been no 
  70.417 +                    * shadow entry at all, and let it tidy up */
  70.418 +                   ASSERT(regs->error_code & PFEC_page_present);
  70.419 +                   regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
  70.420 +                   goto page_fault_slow_path;
  70.421 +               }
  70.422 +            }
  70.423 +#endif /* SHOPT_OUT_OF_SYNC */
  70.424 +
  70.425              if ( sh_l1e_is_gnp(sl1e) )
  70.426              {
  70.427                  /* Not-present in a guest PT: pass to the guest as
  70.428 @@ -2890,6 +3157,10 @@ static int sh_page_fault(struct vcpu *v,
  70.429              return EXCRET_fault_fixed;
  70.430          }
  70.431      }
  70.432 +
  70.433 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
  70.434 + page_fault_slow_path:
  70.435 +#endif
  70.436  #endif /* SHOPT_FAST_FAULT_PATH */
  70.437  
  70.438      /* Detect if this page fault happened while we were already in Xen
  70.439 @@ -2904,7 +3175,21 @@ static int sh_page_fault(struct vcpu *v,
  70.440          return 0;
  70.441      }
  70.442  
  70.443 -    if ( guest_walk_tables(v, va, &gw, regs->error_code) != 0 )
  70.444 + rewalk:
  70.445 +    rc = guest_walk_tables(v, va, &gw, regs->error_code);
  70.446 +
  70.447 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
  70.448 +    if ( !(rc & _PAGE_PRESENT) )
  70.449 +        regs->error_code |= PFEC_page_present;
  70.450 +    else if ( regs->error_code & PFEC_page_present )
  70.451 +    {
  70.452 +            SHADOW_ERROR("OOS paranoia: Something is wrong in guest TLB"
  70.453 +                         " flushing. Have fun debugging it.\n");
  70.454 +            regs->error_code &= ~PFEC_page_present;
  70.455 +    }
  70.456 +#endif
  70.457 +
  70.458 +    if ( rc != 0 )
  70.459      {
  70.460          perfc_incr(shadow_fault_bail_real_fault);
  70.461          SHADOW_PRINTK("not a shadow fault\n");
  70.462 @@ -2948,7 +3233,10 @@ static int sh_page_fault(struct vcpu *v,
  70.463  
  70.464      shadow_lock(d);
  70.465  
  70.466 -    if ( gw_remove_write_accesses(v, va, &gw) )
  70.467 +    rc = gw_remove_write_accesses(v, va, &gw);
  70.468 +
  70.469 +    /* First bit set: Removed write access to a page. */
  70.470 +    if ( rc & GW_RMWR_FLUSHTLB )
  70.471      {
  70.472          /* Write permission removal is also a hint that other gwalks
  70.473           * overlapping with this one may be inconsistent
  70.474 @@ -2958,11 +3246,20 @@ static int sh_page_fault(struct vcpu *v,
  70.475          flush_tlb_mask(d->domain_dirty_cpumask);
  70.476      }
  70.477  
  70.478 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
  70.479 +    /* Second bit set: Resynced a page. Re-walk needed. */
  70.480 +    if ( rc & GW_RMWR_REWALK )
  70.481 +    {
  70.482 +        shadow_unlock(d);
  70.483 +        goto rewalk;
  70.484 +    }
  70.485 +#endif /* OOS */
  70.486 +
  70.487      if ( !shadow_check_gwalk(v, va, &gw) )
  70.488      {
  70.489          perfc_incr(shadow_inconsistent_gwalk);
  70.490          shadow_unlock(d);
  70.491 -        return EXCRET_fault_fixed;
  70.492 +        goto rewalk;
  70.493      }
  70.494  
  70.495      shadow_audit_tables(v);
  70.496 @@ -2991,17 +3288,45 @@ static int sh_page_fault(struct vcpu *v,
  70.497          return 0;
  70.498      }
  70.499  
  70.500 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
  70.501 +    /* Always unsync when writing to L1 page tables. */
  70.502 +    if ( sh_mfn_is_a_page_table(gmfn)
  70.503 +         && ft == ft_demand_write )
  70.504 +        sh_unsync(v, gmfn, va);
  70.505 +#endif /* OOS */
  70.506 +
  70.507      /* Calculate the shadow entry and write it */
  70.508      l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt);
  70.509      r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
  70.510  
  70.511 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
  70.512 +    if ( mfn_valid(gw.l1mfn) 
  70.513 +         && mfn_is_out_of_sync(gw.l1mfn) )
  70.514 +    {
  70.515 +        /* Update the OOS snapshot. */
  70.516 +        mfn_t snpmfn = oos_snapshot_lookup(v, gw.l1mfn);
  70.517 +        guest_l1e_t *snp;
  70.518 +        
  70.519 +        ASSERT(mfn_valid(snpmfn));
  70.520 +        
  70.521 +        snp = sh_map_domain_page(snpmfn);
  70.522 +        snp[guest_l1_table_offset(va)] = gw.l1e;
  70.523 +        sh_unmap_domain_page(snp);
  70.524 +    }
  70.525 +#endif /* OOS */
  70.526 +
  70.527  #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
  70.528      /* Prefetch some more shadow entries */
  70.529      sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
  70.530  #endif
  70.531  
  70.532      /* Need to emulate accesses to page tables */
  70.533 -    if ( sh_mfn_is_a_page_table(gmfn) )
  70.534 +    if ( sh_mfn_is_a_page_table(gmfn)
  70.535 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
  70.536 +         /* Unless they've been allowed to go out of sync with their shadows */
  70.537 +         && !mfn_is_out_of_sync(gmfn)
  70.538 +#endif
  70.539 +         )
  70.540      {
  70.541          if ( ft == ft_demand_write )
  70.542          {
  70.543 @@ -3215,6 +3540,7 @@ sh_invlpg(struct vcpu *v, unsigned long 
  70.544   * instruction should be issued on the hardware, or 0 if it's safe not
  70.545   * to do so. */
  70.546  {
  70.547 +    mfn_t sl1mfn;
  70.548      shadow_l2e_t sl2e;
  70.549      
  70.550      perfc_incr(shadow_invlpg);
  70.551 @@ -3278,13 +3604,65 @@ sh_invlpg(struct vcpu *v, unsigned long 
  70.552      // If so, then we'll need to flush the entire TLB (because that's
  70.553      // easier than invalidating all of the individual 4K pages).
  70.554      //
  70.555 -    if ( mfn_to_shadow_page(shadow_l2e_get_mfn(sl2e))->type
  70.556 +    sl1mfn = shadow_l2e_get_mfn(sl2e);
  70.557 +    if ( mfn_to_shadow_page(sl1mfn)->type
  70.558           == SH_type_fl1_shadow )
  70.559      {
  70.560          flush_tlb_local();
  70.561          return 0;
  70.562      }
  70.563  
  70.564 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
  70.565 +    /* Check to see if the SL1 is out of sync. */
  70.566 +    {
  70.567 +        mfn_t gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
  70.568 +        struct page_info *pg = mfn_to_page(gl1mfn);
  70.569 +        if ( mfn_valid(gl1mfn) 
  70.570 +             && page_is_out_of_sync(pg) )
  70.571 +        {
  70.572 +            /* The test above may give false positives, since we don't
  70.573 +             * hold the shadow lock yet.  Check again with the lock held. */
  70.574 +            shadow_lock(v->domain);
  70.575 +
  70.576 +            /* This must still be a copy-from-user because we didn't
  70.577 +             * have the shadow lock last time we checked, and the
  70.578 +             * higher-level shadows might have disappeared under our
  70.579 +             * feet. */
  70.580 +            if ( __copy_from_user(&sl2e, 
  70.581 +                                  sh_linear_l2_table(v)
  70.582 +                                  + shadow_l2_linear_offset(va),
  70.583 +                                  sizeof (sl2e)) != 0 )
  70.584 +            {
  70.585 +                perfc_incr(shadow_invlpg_fault);
  70.586 +                shadow_unlock(v->domain);
  70.587 +                return 0;
  70.588 +            }
  70.589 +
  70.590 +            if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
  70.591 +            {
  70.592 +                shadow_unlock(v->domain);
  70.593 +                return 0;
  70.594 +            }
  70.595 +
  70.596 +            sl1mfn = shadow_l2e_get_mfn(sl2e);
  70.597 +            gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
  70.598 +            pg = mfn_to_page(gl1mfn);
  70.599 +            
  70.600 +            if ( likely(sh_mfn_is_a_page_table(gl1mfn)
  70.601 +                        && page_is_out_of_sync(pg) ) )
  70.602 +            {
  70.603 +                shadow_l1e_t *sl1;
  70.604 +                sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
  70.605 +                /* Remove the shadow entry that maps this VA */
  70.606 +                (void) shadow_set_l1e(v, sl1, shadow_l1e_empty(), sl1mfn);
  70.607 +            }
  70.608 +            shadow_unlock(v->domain);
   70.609 +            /* Need the invlpg to pick up the disappearance of the sl1e */
  70.610 +            return 1;
  70.611 +        }
  70.612 +    }
  70.613 +#endif
  70.614 +
  70.615      return 1;
  70.616  }
  70.617  
  70.618 @@ -3710,6 +4088,13 @@ sh_update_cr3(struct vcpu *v, int do_loc
  70.619          return;
  70.620      }
  70.621  
  70.622 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
  70.623 +    /* Need to resync all the shadow entries on a TLB flush.  Resync
   70.624 +     * current vcpu's OOS pages before switching to the new shadow
  70.625 +     * tables so that the VA hint is still valid.  */
  70.626 +    shadow_resync_current_vcpu(v, do_locking);
  70.627 +#endif
  70.628 +
  70.629      if ( do_locking ) shadow_lock(v->domain);
  70.630  
  70.631      ASSERT(shadow_locked_by_me(v->domain));
  70.632 @@ -3938,12 +4323,71 @@ sh_update_cr3(struct vcpu *v, int do_loc
  70.633  
  70.634      /* Release the lock, if we took it (otherwise it's the caller's problem) */
  70.635      if ( do_locking ) shadow_unlock(v->domain);
  70.636 +
  70.637 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
  70.638 +    /* Need to resync all the shadow entries on a TLB flush. We only
  70.639 +     * update the shadows, leaving the pages out of sync. Also, we try
  70.640 +     * to skip synchronization of shadows not mapped in the new
  70.641 +     * tables. */
  70.642 +    shadow_sync_other_vcpus(v, do_locking);
  70.643 +#endif
  70.644 +
  70.645  }
  70.646  
  70.647  
  70.648  /**************************************************************************/
  70.649  /* Functions to revoke guest rights */
  70.650  
  70.651 +#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
  70.652 +int sh_rm_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn, 
  70.653 +                                 mfn_t smfn, unsigned long off)
  70.654 +{
  70.655 +    int r;
  70.656 +    shadow_l1e_t *sl1p, sl1e;
  70.657 +    struct shadow_page_info *sp;
  70.658 +
  70.659 +    ASSERT(mfn_valid(gmfn));
  70.660 +    ASSERT(mfn_valid(smfn));
  70.661 +
  70.662 +    sp = mfn_to_shadow_page(smfn);
  70.663 +
  70.664 +    if ( sp->mbz != 0 ||
  70.665 +#if GUEST_PAGING_LEVELS == 4
  70.666 +         (sp->type != SH_type_l1_64_shadow)
  70.667 +#elif GUEST_PAGING_LEVELS == 3
  70.668 +         (sp->type != SH_type_l1_pae_shadow)
  70.669 +#elif GUEST_PAGING_LEVELS == 2
  70.670 +         (sp->type != SH_type_l1_32_shadow)
  70.671 +#endif
  70.672 +       )
  70.673 +        goto fail;
  70.674 +
  70.675 +    sl1p = sh_map_domain_page(smfn);
  70.676 +    sl1p += off;
  70.677 +    sl1e = *sl1p;
  70.678 +    if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
  70.679 +          != (_PAGE_PRESENT|_PAGE_RW))
  70.680 +         || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
  70.681 +    {
  70.682 +        sh_unmap_domain_page(sl1p);
  70.683 +        goto fail;
  70.684 +    }
  70.685 +
  70.686 +    /* Found it!  Need to remove its write permissions. */
  70.687 +    sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
  70.688 +    r = shadow_set_l1e(v, sl1p, sl1e, smfn);
  70.689 +    ASSERT( !(r & SHADOW_SET_ERROR) );
  70.690 +
  70.691 +    sh_unmap_domain_page(sl1p);
  70.692 +    perfc_incr(shadow_writeable_h_7);
  70.693 +    return 1;
  70.694 +
  70.695 + fail:
  70.696 +    perfc_incr(shadow_writeable_h_8);
  70.697 +    return 0;
  70.698 +}
  70.699 +#endif /* OOS */
  70.700 +
  70.701  #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
  70.702  static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
  70.703  /* Look up this vaddr in the current shadow and see if it's a writeable
  70.704 @@ -4437,23 +4881,35 @@ sh_x86_emulate_cmpxchg8b(struct vcpu *v,
  70.705  
  70.706  #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
  70.707  
  70.708 -#define AUDIT_FAIL(_level, _fmt, _a...) do {                               \
  70.709 -    printk("Shadow %u-on-%u audit failed at level %i, index %i\n"         \
  70.710 -           "gl" #_level "mfn = %" PRI_mfn                              \
  70.711 -           " sl" #_level "mfn = %" PRI_mfn                             \
  70.712 -           " &gl" #_level "e = %p &sl" #_level "e = %p"                    \
  70.713 -           " gl" #_level "e = %" SH_PRI_gpte                              \
  70.714 -           " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n",        \
  70.715 -           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS,                      \
  70.716 -           _level, guest_index(gl ## _level ## e),                         \
  70.717 -           mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),         \
  70.718 -           gl ## _level ## e, sl ## _level ## e,                           \
  70.719 -           gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
  70.720 -           ##_a);                                                          \
  70.721 -    BUG();                                                                 \
  70.722 -    done = 1;                                                              \
  70.723 +#define AUDIT_FAIL(_level, _fmt, _a...) do {                            \
  70.724 +    printk("Shadow %u-on-%u audit failed at level %i, index %i\n"       \
  70.725 +           "gl" #_level "mfn = %" PRI_mfn                               \
  70.726 +           " sl" #_level "mfn = %" PRI_mfn                              \
  70.727 +           " &gl" #_level "e = %p &sl" #_level "e = %p"                 \
  70.728 +           " gl" #_level "e = %" SH_PRI_gpte                            \
  70.729 +           " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n",      \
  70.730 +           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS,                   \
  70.731 +               _level, guest_index(gl ## _level ## e),                  \
  70.732 +               mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),  \
  70.733 +               gl ## _level ## e, sl ## _level ## e,                    \
  70.734 +               gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
  70.735 +               ##_a);                                                   \
  70.736 +        BUG();                                                          \
  70.737 +        done = 1;                                                       \
  70.738  } while (0)
  70.739  
  70.740 +#define AUDIT_FAIL_MIN(_level, _fmt, _a...) do {                        \
  70.741 +    printk("Shadow %u-on-%u audit failed at level %i\n"                 \
  70.742 +           "gl" #_level "mfn = %" PRI_mfn                               \
  70.743 +           " sl" #_level "mfn = %" PRI_mfn                              \
  70.744 +           " Error: " _fmt "\n",                                        \
  70.745 +           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS,                   \
  70.746 +           _level,                                                      \
  70.747 +           mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),      \
  70.748 +           ##_a);                                                       \
  70.749 +    BUG();                                                              \
  70.750 +    done = 1;                                                           \
  70.751 +} while (0)
  70.752  
  70.753  static char * sh_audit_flags(struct vcpu *v, int level,
  70.754                                int gflags, int sflags) 
  70.755 @@ -4494,6 +4950,16 @@ int sh_audit_l1_table(struct vcpu *v, mf
  70.756      
  70.757      /* Follow the backpointer */
  70.758      gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
  70.759 +
  70.760 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
  70.761 +    /* Out-of-sync l1 shadows can contain anything: just check the OOS hash */
  70.762 +    if ( page_is_out_of_sync(mfn_to_page(gl1mfn)) )
  70.763 +    {
  70.764 +        oos_audit_hash_is_present(v->domain, gl1mfn);
  70.765 +        return 0;
  70.766 +    }
  70.767 +#endif
  70.768 +
  70.769      gl1e = gp = sh_map_domain_page(gl1mfn);
  70.770      SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
  70.771  
  70.772 @@ -4574,6 +5040,13 @@ int sh_audit_l2_table(struct vcpu *v, mf
  70.773  
  70.774      /* Follow the backpointer */
  70.775      gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
  70.776 +
  70.777 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
   70.778 +    /* Only L1s may be out of sync. */
  70.779 +    if ( page_is_out_of_sync(mfn_to_page(gl2mfn)) )
  70.780 +        AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn));
  70.781 +#endif
  70.782 +
  70.783      gl2e = gp = sh_map_domain_page(gl2mfn);
  70.784      SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
  70.785  
  70.786 @@ -4616,6 +5089,13 @@ int sh_audit_l3_table(struct vcpu *v, mf
  70.787  
  70.788      /* Follow the backpointer */
  70.789      gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
  70.790 +
  70.791 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
   70.792 +    /* Only L1s may be out of sync. */
  70.793 +    if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) )
  70.794 +        AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn));
  70.795 +#endif
  70.796 +
  70.797      gl3e = gp = sh_map_domain_page(gl3mfn);
  70.798      SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
  70.799  
  70.800 @@ -4656,6 +5136,13 @@ int sh_audit_l4_table(struct vcpu *v, mf
  70.801  
  70.802      /* Follow the backpointer */
  70.803      gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
  70.804 +
  70.805 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
   70.806 +    /* Only L1s may be out of sync. */
  70.807 +    if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) )
  70.808 +        AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn));
  70.809 +#endif
  70.810 +
  70.811      gl4e = gp = sh_map_domain_page(gl4mfn);
  70.812      SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
  70.813      {
    71.1 --- a/xen/arch/x86/mm/shadow/multi.h	Thu Jun 19 12:48:04 2008 +0900
    71.2 +++ b/xen/arch/x86/mm/shadow/multi.h	Wed Jul 02 11:30:37 2008 +0900
    71.3 @@ -115,3 +115,17 @@ SHADOW_INTERNAL_NAME(sh_destroy_monitor_
    71.4  
    71.5  extern struct paging_mode 
    71.6  SHADOW_INTERNAL_NAME(sh_paging_mode, GUEST_LEVELS);
    71.7 +
    71.8 +#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
    71.9 +extern void 
   71.10 +SHADOW_INTERNAL_NAME(sh_resync_l1, GUEST_LEVELS)
   71.11 +     (struct vcpu *v, mfn_t gmfn, mfn_t snpmfn);
   71.12 +
   71.13 +extern int
   71.14 +SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, GUEST_LEVELS)
   71.15 +     (struct vcpu*v, mfn_t gmfn);
   71.16 +
   71.17 +extern int
   71.18 +SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p, GUEST_LEVELS)
   71.19 +     (struct vcpu *v, mfn_t gmfn, mfn_t smfn, unsigned long off);
   71.20 +#endif
    72.1 --- a/xen/arch/x86/mm/shadow/private.h	Thu Jun 19 12:48:04 2008 +0900
    72.2 +++ b/xen/arch/x86/mm/shadow/private.h	Wed Jul 02 11:30:37 2008 +0900
    72.3 @@ -63,8 +63,9 @@ extern int shadow_audit_enable;
    72.4  #define SHOPT_SKIP_VERIFY         0x20  /* Skip PTE v'fy when safe to do so */
    72.5  #define SHOPT_VIRTUAL_TLB         0x40  /* Cache guest v->p translations */
    72.6  #define SHOPT_FAST_EMULATION      0x80  /* Fast write emulation */
    72.7 +#define SHOPT_OUT_OF_SYNC        0x100  /* Allow guest writes to L1 PTs */
    72.8  
    72.9 -#define SHADOW_OPTIMIZATIONS      0xff
   72.10 +#define SHADOW_OPTIMIZATIONS     0x1ff
   72.11  
   72.12  
   72.13  /******************************************************************************
   72.14 @@ -195,9 +196,9 @@ struct shadow_page_info
   72.15          u32 tlbflush_timestamp;
   72.16      };
   72.17      struct {
   72.18 -        unsigned int type:4;      /* What kind of shadow is this? */
   72.19 +        unsigned int type:5;      /* What kind of shadow is this? */
   72.20          unsigned int pinned:1;    /* Is the shadow pinned? */
   72.21 -        unsigned int count:27;    /* Reference count */
   72.22 +        unsigned int count:26;    /* Reference count */
   72.23          u32 mbz;                  /* Must be zero: this is where the owner 
   72.24                                     * field lives in a non-shadow page */
   72.25      } __attribute__((packed));
   72.26 @@ -242,7 +243,8 @@ static inline void shadow_check_page_str
   72.27  #define SH_type_max_shadow    (13U)
   72.28  #define SH_type_p2m_table     (14U) /* in use as the p2m table */
   72.29  #define SH_type_monitor_table (15U) /* in use as a monitor table */
   72.30 -#define SH_type_unused        (16U)
   72.31 +#define SH_type_oos_snapshot  (16U) /* in use as OOS snapshot */
   72.32 +#define SH_type_unused        (17U)
   72.33  
   72.34  /* 
   72.35   * What counts as a pinnable shadow?
   72.36 @@ -301,6 +303,72 @@ static inline int sh_type_is_pinnable(st
   72.37  #define SHF_PAE (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE|SHF_L2H_PAE)
   72.38  #define SHF_64  (SHF_L1_64|SHF_FL1_64|SHF_L2_64|SHF_L2H_64|SHF_L3_64|SHF_L4_64)
   72.39  
   72.40 +#define SHF_L1_ANY  (SHF_L1_32|SHF_L1_PAE|SHF_L1_64)
   72.41 +
   72.42 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
   72.43 +/* Marks a guest L1 page table which is shadowed but not write-protected.
   72.44 + * If set, then *only* L1 shadows (SHF_L1_*) are allowed. 
   72.45 + *
   72.46 + * out_of_sync indicates that the shadow tables may not reflect the
   72.47 + * guest tables.  If it is clear, then the shadow tables *must* reflect
   72.48 + * the guest tables.
   72.49 + *
   72.50 + * oos_may_write indicates that a page may have writable mappings.
   72.51 + *
    72.52 + * Most of the time the flags are synonymous.  There is a short period
    72.53 + * during resync when oos_may_write is clear but out_of_sync is not.  If a
   72.54 + * codepath is called during that time and is sensitive to oos issues, it may 
   72.55 + * need to use the second flag.
   72.56 + */
   72.57 +#define SHF_out_of_sync (1u<<30)
   72.58 +#define SHF_oos_may_write (1u<<29)
   72.59 +
   72.60 +/* Fixup tables are a non-complete writable-mappings reverse map for
    72.61 +   OOS pages. This lets us quickly resync pages (avoiding brute-force
   72.62 +   search of the shadows) when the va hint is not sufficient (i.e.,
   72.63 +   the pagetable is mapped in multiple places and in multiple
    72.64 +   shadows). */
   72.65 +#define SHADOW_OOS_FT_ENTRIES                           \
   72.66 +    ((PAGE_SIZE << SHADOW_OOS_FT_ORDER)                 \
   72.67 +     / (SHADOW_OOS_FT_HASH * sizeof(struct oos_fixup)))
   72.68 +
   72.69 +#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
   72.70 +
   72.71 +static inline int sh_page_has_multiple_shadows(struct page_info *pg)
   72.72 +{
   72.73 +    u32 shadows;
   72.74 +    if ( !(pg->count_info & PGC_page_table) )
   72.75 +        return 0;
   72.76 +    shadows = pg->shadow_flags & SHF_page_type_mask;
   72.77 +    /* More than one type bit set in shadow-flags? */
   72.78 +    return ( (shadows & ~(1UL << find_first_set_bit(shadows))) != 0 );
   72.79 +}
   72.80 +
   72.81 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
   72.82 +/* The caller must verify this is reasonable to call; i.e., valid mfn,
   72.83 + * domain is translated, &c */
   72.84 +static inline int page_is_out_of_sync(struct page_info *p) 
   72.85 +{
   72.86 +    return (p->count_info & PGC_page_table)
   72.87 +        && (p->shadow_flags & SHF_out_of_sync);
   72.88 +}
   72.89 +
   72.90 +static inline int mfn_is_out_of_sync(mfn_t gmfn) 
   72.91 +{
   72.92 +    return page_is_out_of_sync(mfn_to_page(mfn_x(gmfn)));
   72.93 +}
   72.94 +
   72.95 +static inline int page_oos_may_write(struct page_info *p) 
   72.96 +{
   72.97 +    return (p->count_info & PGC_page_table)
   72.98 +        && (p->shadow_flags & SHF_oos_may_write);
   72.99 +}
  72.100 +
  72.101 +static inline int mfn_oos_may_write(mfn_t gmfn) 
  72.102 +{
  72.103 +    return page_oos_may_write(mfn_to_page(mfn_x(gmfn)));
  72.104 +}
  72.105 +#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
  72.106  
  72.107  /******************************************************************************
  72.108   * Various function declarations 
  72.109 @@ -351,7 +419,57 @@ int shadow_write_guest_entry(struct vcpu
  72.110  int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
  72.111                                 intpte_t *old, intpte_t new, mfn_t gmfn);
  72.112  
  72.113 +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
  72.114 +/* Allow a shadowed page to go out of sync */
  72.115 +int sh_unsync(struct vcpu *v, mfn_t gmfn, unsigned long va);
  72.116  
  72.117 +/* Pull an out-of-sync page back into sync. */
  72.118 +void sh_resync(struct vcpu *v, mfn_t gmfn);
  72.119 +
  72.120 +void oos_fixup_add(struct vcpu *v, mfn_t gmfn, mfn_t smfn, unsigned long off);
  72.121 +
  72.122 +int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
  72.123 +                                     mfn_t smfn, unsigned long offset);
  72.124 +
  72.125 +/* Pull all out-of-sync shadows back into sync.  If skip != 0, we try
  72.126 + * to avoid resyncing where we think we can get away with it. */
  72.127 +
  72.128 +void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking);
  72.129 +
  72.130 +static inline void
  72.131 +shadow_resync_all(struct vcpu *v, int do_locking)
  72.132 +{
  72.133 +    sh_resync_all(v,
  72.134 +                  0 /* skip */,
  72.135 +                  1 /* this */,
  72.136 +                  1 /* others */,
  72.137 +                  do_locking);
  72.138 +}
  72.139 +
  72.140 +static inline void
  72.141 +shadow_resync_current_vcpu(struct vcpu *v, int do_locking)
  72.142 +{
  72.143 +    sh_resync_all(v,
  72.144 +                  0 /* skip */,
  72.145 +                  1 /* this */, 
  72.146 +                  0 /* others */,
  72.147 +                  do_locking);
  72.148 +}
  72.149 +
  72.150 +static inline void
  72.151 +shadow_sync_other_vcpus(struct vcpu *v, int do_locking)
  72.152 +{
  72.153 +    sh_resync_all(v,
  72.154 +                  1 /* skip */, 
  72.155 +                  0 /* this */,
  72.156 +                  1 /* others */,
  72.157 +                  do_locking);
  72.158 +}
  72.159 +
  72.160 +void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn);
  72.161 +mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn);
  72.162 +
  72.163 +#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
  72.164  
  72.165  /******************************************************************************
  72.166   * Flags used in the return value of the shadow_set_lXe() functions...
    73.1 --- a/xen/arch/x86/mm/shadow/types.h	Thu Jun 19 12:48:04 2008 +0900
    73.2 +++ b/xen/arch/x86/mm/shadow/types.h	Wed Jul 02 11:30:37 2008 +0900
    73.3 @@ -438,6 +438,11 @@ struct shadow_walk_t
    73.4  #define sh_guess_wrmap             INTERNAL_NAME(sh_guess_wrmap)
    73.5  #define sh_clear_shadow_entry      INTERNAL_NAME(sh_clear_shadow_entry)
    73.6  
    73.7 +#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
    73.8 +#define sh_resync_l1               INTERNAL_NAME(sh_resync_l1)
    73.9 +#define sh_safe_not_to_sync        INTERNAL_NAME(sh_safe_not_to_sync)
   73.10 +#define sh_rm_write_access_from_sl1p INTERNAL_NAME(sh_rm_write_access_from_sl1p)
   73.11 +#endif
   73.12  
   73.13  /* The sh_guest_(map|get)_* functions depends on Xen's paging levels */
   73.14  #define sh_guest_map_l1e \
    74.1 --- a/xen/arch/x86/platform_hypercall.c	Thu Jun 19 12:48:04 2008 +0900
    74.2 +++ b/xen/arch/x86/platform_hypercall.c	Wed Jul 02 11:30:37 2008 +0900
    74.3 @@ -408,7 +408,12 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
    74.4                  cpu_count++;
    74.5              }
    74.6              if ( cpu_count == num_online_cpus() )
    74.7 -                ret = acpi_cpufreq_init();
    74.8 +            {
    74.9 +                if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD )
   74.10 +                    ret = powernow_cpufreq_init();
   74.11 +                else
   74.12 +                    ret = acpi_cpufreq_init();
   74.13 +            }
   74.14              break;
   74.15          }
   74.16   
    75.1 --- a/xen/arch/x86/x86_emulate/x86_emulate.c	Thu Jun 19 12:48:04 2008 +0900
    75.2 +++ b/xen/arch/x86/x86_emulate/x86_emulate.c	Wed Jul 02 11:30:37 2008 +0900
    75.3 @@ -142,12 +142,14 @@ static uint8_t opcode_table[256] = {
    75.4      ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
    75.5      ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
    75.6      /* 0xD0 - 0xD7 */
    75.7 -    ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM, 
    75.8 -    ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM, 
    75.9 +    ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM,
   75.10 +    ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM,
   75.11      ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
   75.12      /* 0xD8 - 0xDF */
   75.13 -    0, ImplicitOps|ModRM|Mov, 0, ImplicitOps|ModRM|Mov,
   75.14 -    0, ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov,
   75.15 +    ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov,
   75.16 +    ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov,
   75.17 +    ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov,
   75.18 +    ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov,
   75.19      /* 0xE0 - 0xE7 */
   75.20      ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
   75.21      ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
   75.22 @@ -216,7 +218,7 @@ static uint8_t twobyte_table[256] = {
   75.23      ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
   75.24      /* 0xA0 - 0xA7 */
   75.25      ImplicitOps, ImplicitOps, ImplicitOps, DstBitBase|SrcReg|ModRM,
   75.26 -    DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, 0, 
   75.27 +    DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, 0,
   75.28      /* 0xA8 - 0xAF */
   75.29      ImplicitOps, ImplicitOps, 0, DstBitBase|SrcReg|ModRM,
   75.30      DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, DstReg|SrcMem|ModRM,
   75.31 @@ -246,8 +248,20 @@ static uint8_t twobyte_table[256] = {
   75.32  /* Type, address-of, and value of an instruction's operand. */
   75.33  struct operand {
   75.34      enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
   75.35 -    unsigned int  bytes;
   75.36 -    unsigned long val, orig_val;
   75.37 +    unsigned int bytes;
   75.38 +
    75.39 +    /* Up to 128-bit operand value, addressable as ulong or uint32_t[]. */
   75.40 +    union {
   75.41 +        unsigned long val;
   75.42 +        uint32_t bigval[4];
   75.43 +    };
   75.44 +
    75.45 +    /* Up to 128-bit operand value, addressable as ulong or uint32_t[]. */
   75.46 +    union {
   75.47 +        unsigned long orig_val;
   75.48 +        uint32_t orig_bigval[4];
   75.49 +    };
   75.50 +
   75.51      union {
   75.52          /* OP_REG: Pointer to register field. */
   75.53          unsigned long *reg;
   75.54 @@ -466,7 +480,7 @@ do{ asm volatile (                      
   75.55  
   75.56  /* Fetch next part of the instruction being emulated. */
   75.57  #define insn_fetch_bytes(_size)                                         \
   75.58 -({ unsigned long _x, _eip = _regs.eip;                                  \
   75.59 +({ unsigned long _x = 0, _eip = _regs.eip;                              \
   75.60     if ( !mode_64bit() ) _eip = (uint32_t)_eip; /* ignore upper dword */ \
   75.61     _regs.eip += (_size); /* real hardware doesn't truncate */           \
   75.62     generate_exception_if((uint8_t)(_regs.eip - ctxt->regs->eip) > 15,   \
   75.63 @@ -594,6 +608,18 @@ do{ struct fpu_insn_ctxt fic;           
   75.64      put_fpu(&fic);                                      \
   75.65  } while (0)
   75.66  
   75.67 +#define emulate_fpu_insn_memsrc(_op, _arg)              \
   75.68 +do{ struct fpu_insn_ctxt fic;                           \
   75.69 +    get_fpu(X86EMUL_FPU_fpu, &fic);                     \
   75.70 +    asm volatile (                                      \
   75.71 +        "movb $2f-1f,%0 \n"                             \
   75.72 +        "1: " _op " %1  \n"                             \
   75.73 +        "2:             \n"                             \
   75.74 +        : "=m" (fic.insn_bytes)                         \
   75.75 +        : "m" (_arg) : "memory" );                      \
   75.76 +    put_fpu(&fic);                                      \
   75.77 +} while (0)
   75.78 +
   75.79  #define emulate_fpu_insn_stub(_bytes...)                                \
   75.80  do{ uint8_t stub[] = { _bytes, 0xc3 };                                  \
   75.81      struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 };        \
   75.82 @@ -655,6 +681,19 @@ static void __put_rep_prefix(
   75.83          __put_rep_prefix(&_regs, ctxt->regs, ad_bytes, reps_completed); \
   75.84  })
   75.85  
   75.86 +/* Compatibility function: read guest memory, zero-extend result to a ulong. */
   75.87 +static int read_ulong(
   75.88 +        enum x86_segment seg,
   75.89 +        unsigned long offset,
   75.90 +        unsigned long *val,
   75.91 +        unsigned int bytes,
   75.92 +        struct x86_emulate_ctxt *ctxt,
   75.93 +        struct x86_emulate_ops *ops)
   75.94 +{
   75.95 +    *val = 0;
   75.96 +    return ops->read(seg, offset, val, bytes, ctxt);
   75.97 +}
   75.98 +
   75.99  /*
  75.100   * Unsigned multiplication with double-word result.
  75.101   * IN:  Multiplicand=m[0], Multiplier=m[1]
  75.102 @@ -841,7 +880,8 @@ static int ioport_access_check(
  75.103           (tr.limit < 0x67) )
  75.104          goto raise_exception;
  75.105  
  75.106 -    if ( (rc = ops->read(x86_seg_none, tr.base + 0x66, &iobmp, 2, ctxt)) )
  75.107 +    if ( (rc = read_ulong(x86_seg_none, tr.base + 0x66,
  75.108 +                          &iobmp, 2, ctxt, ops)) )
  75.109          return rc;
  75.110  
  75.111      /* Ensure TSS includes two bytes including byte containing first port. */
  75.112 @@ -849,7 +889,8 @@ static int ioport_access_check(
  75.113      if ( tr.limit <= iobmp )
  75.114          goto raise_exception;
  75.115  
  75.116 -    if ( (rc = ops->read(x86_seg_none, tr.base + iobmp, &iobmp, 2, ctxt)) )
  75.117 +    if ( (rc = read_ulong(x86_seg_none, tr.base + iobmp,
  75.118 +                          &iobmp, 2, ctxt, ops)) )
  75.119          return rc;
  75.120      if ( (iobmp & (((1<<bytes)-1) << (first_port&7))) != 0 )
  75.121          goto raise_exception;
  75.122 @@ -941,12 +982,12 @@ protmode_load_seg(
  75.123          goto raise_exn;
  75.124  
  75.125      do {
  75.126 -        if ( (rc = ops->read(x86_seg_none, desctab.base + (sel & 0xfff8),
  75.127 -                             &val, 4, ctxt)) )
  75.128 +        if ( (rc = read_ulong(x86_seg_none, desctab.base + (sel & 0xfff8),
  75.129 +                              &val, 4, ctxt, ops)) )
  75.130              return rc;
  75.131          desc.a = val;
  75.132 -        if ( (rc = ops->read(x86_seg_none, desctab.base + (sel & 0xfff8) + 4,
  75.133 -                             &val, 4, ctxt)) )
  75.134 +        if ( (rc = read_ulong(x86_seg_none, desctab.base + (sel & 0xfff8) + 4,
  75.135 +                              &val, 4, ctxt, ops)) )
  75.136              return rc;
  75.137          desc.b = val;
  75.138  
  75.139 @@ -992,14 +1033,15 @@ protmode_load_seg(
  75.140              if ( (desc.b & (5u<<9)) == (4u<<9) )
  75.141                  goto raise_exn;
  75.142              /* Non-conforming segment: check DPL against RPL and CPL. */
  75.143 -            if ( ((desc.b & (6u<<9)) != (6u<<9)) && ((dpl < cpl) || (dpl < rpl)) )
  75.144 +            if ( ((desc.b & (6u<<9)) != (6u<<9)) &&
  75.145 +                 ((dpl < cpl) || (dpl < rpl)) )
  75.146                  goto raise_exn;
  75.147              break;
  75.148          }
  75.149  
  75.150          /* Ensure Accessed flag is set. */
  75.151          new_desc_b = desc.b | 0x100;
  75.152 -        rc = ((desc.b & 0x100) ? X86EMUL_OKAY : 
  75.153 +        rc = ((desc.b & 0x100) ? X86EMUL_OKAY :
  75.154                ops->cmpxchg(
  75.155                    x86_seg_none, desctab.base + (sel & 0xfff8) + 4,
  75.156                    &desc.b, &new_desc_b, 4, ctxt));
  75.157 @@ -1061,16 +1103,16 @@ decode_register(
  75.158      case  2: p = &regs->edx; break;
  75.159      case  3: p = &regs->ebx; break;
  75.160      case  4: p = (highbyte_regs ?
  75.161 -                  ((unsigned char *)&regs->eax + 1) : 
  75.162 +                  ((unsigned char *)&regs->eax + 1) :
  75.163                    (unsigned char *)&regs->esp); break;
  75.164      case  5: p = (highbyte_regs ?
  75.165 -                  ((unsigned char *)&regs->ecx + 1) : 
  75.166 +                  ((unsigned char *)&regs->ecx + 1) :
  75.167                    (unsigned char *)&regs->ebp); break;
  75.168      case  6: p = (highbyte_regs ?
  75.169 -                  ((unsigned char *)&regs->edx + 1) : 
  75.170 +                  ((unsigned char *)&regs->edx + 1) :
  75.171                    (unsigned char *)&regs->esi); break;
  75.172      case  7: p = (highbyte_regs ?
  75.173 -                  ((unsigned char *)&regs->ebx + 1) : 
  75.174 +                  ((unsigned char *)&regs->ebx + 1) :
  75.175                    (unsigned char *)&regs->edi); break;
  75.176  #if defined(__x86_64__)
  75.177      case  8: p = &regs->r8;  break;
  75.178 @@ -1402,8 +1444,8 @@ x86_emulate(
  75.179              case 8: src.val = *(uint64_t *)src.reg; break;
  75.180              }
  75.181          }
  75.182 -        else if ( (rc = ops->read(src.mem.seg, src.mem.off,
  75.183 -                                  &src.val, src.bytes, ctxt)) )
  75.184 +        else if ( (rc = read_ulong(src.mem.seg, src.mem.off,
  75.185 +                                   &src.val, src.bytes, ctxt, ops)) )
  75.186              goto done;
  75.187          break;
  75.188      case SrcImm:
  75.189 @@ -1494,8 +1536,8 @@ x86_emulate(
  75.190          }
  75.191          else if ( !(d & Mov) ) /* optimisation - avoid slow emulated read */
  75.192          {
  75.193 -            if ( (rc = ops->read(dst.mem.seg, dst.mem.off,
  75.194 -                                 &dst.val, dst.bytes, ctxt)) )
  75.195 +            if ( (rc = read_ulong(dst.mem.seg, dst.mem.off,
  75.196 +                                  &dst.val, dst.bytes, ctxt, ops)) )
  75.197                  goto done;
  75.198              dst.orig_val = dst.val;
  75.199          }
  75.200 @@ -1571,8 +1613,8 @@ x86_emulate(
  75.201          int lb, ub, idx;
  75.202          generate_exception_if(mode_64bit() || (src.type != OP_MEM),
  75.203                                EXC_UD, -1);
  75.204 -        if ( (rc = ops->read(src.mem.seg, src.mem.off + op_bytes,
  75.205 -                             &src_val2, op_bytes, ctxt)) )
  75.206 +        if ( (rc = read_ulong(src.mem.seg, src.mem.off + op_bytes,
  75.207 +                              &src_val2, op_bytes, ctxt, ops)) )
  75.208              goto done;
  75.209          ub  = (op_bytes == 2) ? (int16_t)src_val2 : (int32_t)src_val2;
  75.210          lb  = (op_bytes == 2) ? (int16_t)src.val  : (int32_t)src.val;
  75.211 @@ -1588,8 +1630,8 @@ x86_emulate(
  75.212              /* movsxd */
  75.213              if ( src.type == OP_REG )
  75.214                  src.val = *(int32_t *)src.reg;
  75.215 -            else if ( (rc = ops->read(src.mem.seg, src.mem.off,
  75.216 -                                      &src.val, 4, ctxt)) )
  75.217 +            else if ( (rc = read_ulong(src.mem.seg, src.mem.off,
  75.218 +                                       &src.val, 4, ctxt, ops)) )
  75.219                  goto done;
  75.220              dst.val = (int32_t)src.val;
  75.221          }
  75.222 @@ -1613,8 +1655,8 @@ x86_emulate(
  75.223          unsigned long src1; /* ModR/M source operand */
  75.224          if ( ea.type == OP_REG )
  75.225              src1 = *ea.reg;
  75.226 -        else if ( (rc = ops->read(ea.mem.seg, ea.mem.off,
  75.227 -                                  &src1, op_bytes, ctxt)) )
  75.228 +        else if ( (rc = read_ulong(ea.mem.seg, ea.mem.off,
  75.229 +                                   &src1, op_bytes, ctxt, ops)) )
  75.230              goto done;
  75.231          _regs.eflags &= ~(EFLG_OF|EFLG_CF);
  75.232          switch ( dst.bytes )
  75.233 @@ -1720,8 +1762,8 @@ x86_emulate(
  75.234          /* 64-bit mode: POP defaults to a 64-bit operand. */
  75.235          if ( mode_64bit() && (dst.bytes == 4) )
  75.236              dst.bytes = 8;
  75.237 -        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes),
  75.238 -                             &dst.val, dst.bytes, ctxt)) != 0 )
  75.239 +        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(dst.bytes),
  75.240 +                              &dst.val, dst.bytes, ctxt, ops)) != 0 )
  75.241              goto done;
  75.242          break;
  75.243  
  75.244 @@ -1773,8 +1815,8 @@ x86_emulate(
  75.245          dst.val = x86_seg_es;
  75.246      les: /* dst.val identifies the segment */
  75.247          generate_exception_if(src.type != OP_MEM, EXC_UD, -1);
  75.248 -        if ( (rc = ops->read(src.mem.seg, src.mem.off + src.bytes,
  75.249 -                             &sel, 2, ctxt)) != 0 )
  75.250 +        if ( (rc = read_ulong(src.mem.seg, src.mem.off + src.bytes,
  75.251 +                              &sel, 2, ctxt, ops)) != 0 )
  75.252              goto done;
  75.253          if ( (rc = load_seg(dst.val, (uint16_t)sel, ctxt, ops)) != 0 )
  75.254              goto done;
  75.255 @@ -2020,8 +2062,8 @@ x86_emulate(
  75.256                  dst.bytes = op_bytes = 8;
  75.257                  if ( dst.type == OP_REG )
  75.258                      dst.val = *dst.reg;
  75.259 -                else if ( (rc = ops->read(dst.mem.seg, dst.mem.off,
  75.260 -                                          &dst.val, 8, ctxt)) != 0 )
  75.261 +                else if ( (rc = read_ulong(dst.mem.seg, dst.mem.off,
  75.262 +                                           &dst.val, 8, ctxt, ops)) != 0 )
  75.263                      goto done;
  75.264              }
  75.265              src.val = _regs.eip;
  75.266 @@ -2036,8 +2078,8 @@ x86_emulate(
  75.267  
  75.268              generate_exception_if(dst.type != OP_MEM, EXC_UD, -1);
  75.269  
  75.270 -            if ( (rc = ops->read(dst.mem.seg, dst.mem.off+dst.bytes,
  75.271 -                                 &sel, 2, ctxt)) )
  75.272 +            if ( (rc = read_ulong(dst.mem.seg, dst.mem.off+dst.bytes,
  75.273 +                                  &sel, 2, ctxt, ops)) )
  75.274                  goto done;
  75.275  
  75.276              if ( (modrm_reg & 7) == 3 ) /* call */
  75.277 @@ -2046,9 +2088,9 @@ x86_emulate(
  75.278                  fail_if(ops->read_segment == NULL);
  75.279                  if ( (rc = ops->read_segment(x86_seg_cs, &reg, ctxt)) ||
  75.280                       (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
  75.281 -                                      reg.sel, op_bytes, ctxt)) ||
  75.282 +                                      &reg.sel, op_bytes, ctxt)) ||
  75.283                       (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
  75.284 -                                      _regs.eip, op_bytes, ctxt)) )
  75.285 +                                      &_regs.eip, op_bytes, ctxt)) )
  75.286                      goto done;
  75.287              }
  75.288  
  75.289 @@ -2066,12 +2108,12 @@ x86_emulate(
  75.290                  dst.bytes = 8;
  75.291                  if ( dst.type == OP_REG )
  75.292                      dst.val = *dst.reg;
  75.293 -                else if ( (rc = ops->read(dst.mem.seg, dst.mem.off,
  75.294 -                                          &dst.val, 8, ctxt)) != 0 )
  75.295 +                else if ( (rc = read_ulong(dst.mem.seg, dst.mem.off,
  75.296 +                                           &dst.val, 8, ctxt, ops)) != 0 )
  75.297                      goto done;
  75.298              }
  75.299              if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes),
  75.300 -                                  dst.val, dst.bytes, ctxt)) != 0 )
  75.301 +                                  &dst.val, dst.bytes, ctxt)) != 0 )
  75.302                  goto done;
  75.303              dst.type = OP_NONE;
  75.304              break;
  75.305 @@ -2106,7 +2148,7 @@ x86_emulate(
  75.306                  &dst.val, dst.bytes, ctxt);
  75.307          else
  75.308              rc = ops->write(
  75.309 -                dst.mem.seg, dst.mem.off, dst.val, dst.bytes, ctxt);
  75.310 +                dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt);
  75.311          if ( rc != 0 )
  75.312              goto done;
  75.313      default:
  75.314 @@ -2153,7 +2195,7 @@ x86_emulate(
  75.315          if ( mode_64bit() && (op_bytes == 4) )
  75.316              op_bytes = 8;
  75.317          if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
  75.318 -                              reg.sel, op_bytes, ctxt)) != 0 )
  75.319 +                              &reg.sel, op_bytes, ctxt)) != 0 )
  75.320              goto done;
  75.321          break;
  75.322      }
  75.323 @@ -2165,8 +2207,8 @@ x86_emulate(
  75.324          /* 64-bit mode: POP defaults to a 64-bit operand. */
  75.325          if ( mode_64bit() && (op_bytes == 4) )
  75.326              op_bytes = 8;
  75.327 -        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
  75.328 -                             &dst.val, op_bytes, ctxt)) != 0 )
  75.329 +        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
  75.330 +                              &dst.val, op_bytes, ctxt, ops)) != 0 )
  75.331              goto done;
  75.332          if ( (rc = load_seg(src.val, (uint16_t)dst.val, ctxt, ops)) != 0 )
  75.333              return rc;
  75.334 @@ -2275,8 +2317,8 @@ x86_emulate(
  75.335          dst.bytes = op_bytes;
  75.336          if ( mode_64bit() && (dst.bytes == 4) )
  75.337              dst.bytes = 8;
  75.338 -        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes),
  75.339 -                             &dst.val, dst.bytes, ctxt)) != 0 )
  75.340 +        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(dst.bytes),
  75.341 +                              &dst.val, dst.bytes, ctxt, ops)) != 0 )
  75.342              goto done;
  75.343          break;
  75.344  
  75.345 @@ -2288,7 +2330,7 @@ x86_emulate(
  75.346          generate_exception_if(mode_64bit(), EXC_UD, -1);
  75.347          for ( i = 0; i < 8; i++ )
  75.348              if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
  75.349 -                                  regs[i], op_bytes, ctxt)) != 0 )
  75.350 +                                  &regs[i], op_bytes, ctxt)) != 0 )
  75.351              goto done;
  75.352          break;
  75.353      }
  75.354 @@ -2303,8 +2345,8 @@ x86_emulate(
  75.355          generate_exception_if(mode_64bit(), EXC_UD, -1);
  75.356          for ( i = 0; i < 8; i++ )
  75.357          {
  75.358 -            if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
  75.359 -                                 &dst.val, op_bytes, ctxt)) != 0 )
  75.360 +            if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
  75.361 +                                  &dst.val, op_bytes, ctxt, ops)) != 0 )
  75.362                  goto done;
  75.363              switch ( op_bytes )
  75.364              {
  75.365 @@ -2382,8 +2424,8 @@ x86_emulate(
  75.366          }
  75.367          else
  75.368          {
  75.369 -            if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi),
  75.370 -                                 &dst.val, dst.bytes, ctxt)) != 0 )
  75.371 +            if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
  75.372 +                                  &dst.val, dst.bytes, ctxt, ops)) != 0 )
  75.373                  goto done;
  75.374              fail_if(ops->write_io == NULL);
  75.375              if ( (rc = ops->write_io(port, dst.bytes, dst.val, ctxt)) != 0 )
  75.376 @@ -2455,9 +2497,9 @@ x86_emulate(
  75.377  
  75.378          if ( (rc = ops->read_segment(x86_seg_cs, &reg, ctxt)) ||
  75.379               (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
  75.380 -                              reg.sel, op_bytes, ctxt)) ||
  75.381 +                              &reg.sel, op_bytes, ctxt)) ||
  75.382               (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
  75.383 -                              _regs.eip, op_bytes, ctxt)) )
  75.384 +                              &_regs.eip, op_bytes, ctxt)) )
  75.385              goto done;
  75.386  
  75.387          if ( (rc = load_seg(x86_seg_cs, sel, ctxt, ops)) != 0 )
  75.388 @@ -2483,8 +2525,8 @@ x86_emulate(
  75.389          /* 64-bit mode: POP defaults to a 64-bit operand. */
  75.390          if ( mode_64bit() && (op_bytes == 4) )
  75.391              op_bytes = 8;
  75.392 -        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
  75.393 -                             &dst.val, op_bytes, ctxt)) != 0 )
  75.394 +        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
  75.395 +                              &dst.val, op_bytes, ctxt, ops)) != 0 )
  75.396              goto done;
  75.397          if ( op_bytes == 2 )
  75.398              dst.val = (uint16_t)dst.val | (_regs.eflags & 0xffff0000u);
  75.399 @@ -2507,8 +2549,8 @@ x86_emulate(
  75.400          dst.type  = OP_REG;
  75.401          dst.reg   = (unsigned long *)&_regs.eax;
  75.402          dst.bytes = (d & ByteOp) ? 1 : op_bytes;
  75.403 -        if ( (rc = ops->read(ea.mem.seg, insn_fetch_bytes(ad_bytes),
  75.404 -                             &dst.val, dst.bytes, ctxt)) != 0 )
  75.405 +        if ( (rc = read_ulong(ea.mem.seg, insn_fetch_bytes(ad_bytes),
  75.406 +                              &dst.val, dst.bytes, ctxt, ops)) != 0 )
  75.407              goto done;
  75.408          break;
  75.409  
  75.410 @@ -2536,8 +2578,8 @@ x86_emulate(
  75.411          }
  75.412          else
  75.413          {
  75.414 -            if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi),
  75.415 -                                 &dst.val, dst.bytes, ctxt)) != 0 )
  75.416 +            if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
  75.417 +                                  &dst.val, dst.bytes, ctxt, ops)) != 0 )
  75.418                  goto done;
  75.419              dst.type = OP_MEM;
  75.420              nr_reps = 1;
  75.421 @@ -2556,10 +2598,10 @@ x86_emulate(
  75.422          unsigned long next_eip = _regs.eip;
  75.423          get_rep_prefix();
  75.424          src.bytes = dst.bytes = (d & ByteOp) ? 1 : op_bytes;
  75.425 -        if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi),
  75.426 -                             &dst.val, dst.bytes, ctxt)) ||
  75.427 -             (rc = ops->read(x86_seg_es, truncate_ea(_regs.edi),
  75.428 -                             &src.val, src.bytes, ctxt)) )
  75.429 +        if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
  75.430 +                              &dst.val, dst.bytes, ctxt, ops)) ||
  75.431 +             (rc = read_ulong(x86_seg_es, truncate_ea(_regs.edi),
  75.432 +                              &src.val, src.bytes, ctxt, ops)) )
  75.433              goto done;
  75.434          register_address_increment(
  75.435              _regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
  75.436 @@ -2592,8 +2634,8 @@ x86_emulate(
  75.437          dst.type  = OP_REG;
  75.438          dst.bytes = (d & ByteOp) ? 1 : op_bytes;
  75.439          dst.reg   = (unsigned long *)&_regs.eax;
  75.440 -        if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi),
  75.441 -                             &dst.val, dst.bytes, ctxt)) != 0 )
  75.442 +        if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
  75.443 +                              &dst.val, dst.bytes, ctxt, ops)) != 0 )
  75.444              goto done;
  75.445          register_address_increment(
  75.446              _regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
  75.447 @@ -2606,8 +2648,8 @@ x86_emulate(
  75.448          get_rep_prefix();
  75.449          src.bytes = dst.bytes = (d & ByteOp) ? 1 : op_bytes;
  75.450          dst.val = _regs.eax;
  75.451 -        if ( (rc = ops->read(x86_seg_es, truncate_ea(_regs.edi),
  75.452 -                             &src.val, src.bytes, ctxt)) != 0 )
  75.453 +        if ( (rc = read_ulong(x86_seg_es, truncate_ea(_regs.edi),
  75.454 +                              &src.val, src.bytes, ctxt, ops)) != 0 )
  75.455              goto done;
  75.456          register_address_increment(
  75.457              _regs.edi, (_regs.eflags & EFLG_DF) ? -src.bytes : src.bytes);
  75.458 @@ -2624,8 +2666,8 @@ x86_emulate(
  75.459      case 0xc3: /* ret (near) */ {
  75.460          int offset = (b == 0xc2) ? insn_fetch_type(uint16_t) : 0;
  75.461          op_bytes = mode_64bit() ? 8 : op_bytes;
  75.462 -        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes + offset),
  75.463 -                             &dst.val, op_bytes, ctxt)) != 0 )
  75.464 +        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes + offset),
  75.465 +                              &dst.val, op_bytes, ctxt, ops)) != 0 )
  75.466              goto done;
  75.467          _regs.eip = dst.val;
  75.468          break;
  75.469 @@ -2640,7 +2682,7 @@ x86_emulate(
  75.470          dst.bytes = (mode_64bit() && (op_bytes == 4)) ? 8 : op_bytes;
  75.471          dst.reg = (unsigned long *)&_regs.ebp;
  75.472          if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes),
  75.473 -                              _regs.ebp, dst.bytes, ctxt)) )
  75.474 +                              &_regs.ebp, dst.bytes, ctxt)) )
  75.475              goto done;
  75.476          dst.val = _regs.esp;
  75.477  
  75.478 @@ -2650,14 +2692,14 @@ x86_emulate(
  75.479              {
  75.480                  unsigned long ebp, temp_data;
  75.481                  ebp = truncate_word(_regs.ebp - i*dst.bytes, ctxt->sp_size/8);
  75.482 -                if ( (rc = ops->read(x86_seg_ss, ebp,
  75.483 -                                     &temp_data, dst.bytes, ctxt)) ||
  75.484 +                if ( (rc = read_ulong(x86_seg_ss, ebp,
  75.485 +                                      &temp_data, dst.bytes, ctxt, ops)) ||
  75.486                       (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes),
  75.487 -                                      temp_data, dst.bytes, ctxt)) )
  75.488 +                                      &temp_data, dst.bytes, ctxt)) )
  75.489                      goto done;
  75.490              }
  75.491              if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes),
  75.492 -                                  dst.val, dst.bytes, ctxt)) )
  75.493 +                                  &dst.val, dst.bytes, ctxt)) )
  75.494                  goto done;
  75.495          }
  75.496  
  75.497 @@ -2683,8 +2725,8 @@ x86_emulate(
  75.498  
  75.499          /* Second writeback, to %%ebp. */
  75.500          dst.reg = (unsigned long *)&_regs.ebp;
  75.501 -        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes),
  75.502 -                             &dst.val, dst.bytes, ctxt)) )
  75.503 +        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(dst.bytes),
  75.504 +                              &dst.val, dst.bytes, ctxt, ops)) )
  75.505              goto done;
  75.506          break;
  75.507  
  75.508 @@ -2692,10 +2734,10 @@ x86_emulate(
  75.509      case 0xcb: /* ret (far) */ {
  75.510          int offset = (b == 0xca) ? insn_fetch_type(uint16_t) : 0;
  75.511          op_bytes = mode_64bit() ? 8 : op_bytes;
  75.512 -        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
  75.513 -                             &dst.val, op_bytes, ctxt)) || 
  75.514 -             (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes + offset),
  75.515 -                             &src.val, op_bytes, ctxt)) ||
  75.516 +        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
  75.517 +                              &dst.val, op_bytes, ctxt, ops)) ||
  75.518 +             (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes + offset),
  75.519 +                              &src.val, op_bytes, ctxt, ops)) ||
  75.520               (rc = load_seg(x86_seg_cs, (uint16_t)src.val, ctxt, ops)) )
  75.521              goto done;
  75.522          _regs.eip = dst.val;
  75.523 @@ -2729,12 +2771,12 @@ x86_emulate(
  75.524          if ( !mode_iopl() )
  75.525              mask |= EFLG_IF;
  75.526          fail_if(!in_realmode(ctxt, ops));
  75.527 -        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
  75.528 -                             &eip, op_bytes, ctxt)) ||
  75.529 -             (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
  75.530 -                             &cs, op_bytes, ctxt)) ||
  75.531 -             (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
  75.532 -                             &eflags, op_bytes, ctxt)) )
  75.533 +        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
  75.534 +                              &eip, op_bytes, ctxt, ops)) ||
  75.535 +             (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
  75.536 +                              &cs, op_bytes, ctxt, ops)) ||
  75.537 +             (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
  75.538 +                              &eflags, op_bytes, ctxt, ops)) )
  75.539              goto done;
  75.540          if ( op_bytes == 2 )
  75.541              eflags = (uint16_t)eflags | (_regs.eflags & 0xffff0000u);
  75.542 @@ -2779,13 +2821,65 @@ x86_emulate(
  75.543  
  75.544      case 0xd7: /* xlat */ {
  75.545          unsigned long al = (uint8_t)_regs.eax;
  75.546 -        if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.ebx + al),
  75.547 -                             &al, 1, ctxt)) != 0 )
  75.548 +        if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.ebx + al),
  75.549 +                              &al, 1, ctxt, ops)) != 0 )
  75.550              goto done;
  75.551          *(uint8_t *)&_regs.eax = al;
  75.552          break;
  75.553      }
  75.554  
  75.555 +    case 0xd8: /* FPU 0xd8 */
  75.556 +        switch ( modrm )
  75.557 +        {
  75.558 +        case 0xc0 ... 0xc7: /* fadd %stN,%stN */
  75.559 +        case 0xc8 ... 0xcf: /* fmul %stN,%stN */
  75.560 +        case 0xd0 ... 0xd7: /* fcom %stN,%stN */
  75.561 +        case 0xd8 ... 0xdf: /* fcomp %stN,%stN */
  75.562 +        case 0xe0 ... 0xe7: /* fsub %stN,%stN */
  75.563 +        case 0xe8 ... 0xef: /* fsubr %stN,%stN */
  75.564 +        case 0xf0 ... 0xf7: /* fdiv %stN,%stN */
  75.565 +        case 0xf8 ... 0xff: /* fdivr %stN,%stN */
  75.566 +            emulate_fpu_insn_stub(0xd8, modrm);
  75.567 +            break;
  75.568 +        default:
  75.569 +            fail_if(modrm >= 0xc0);
  75.570 +            ea.bytes = 4;
  75.571 +            src = ea;
  75.572 +            if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
  75.573 +                                 src.bytes, ctxt)) != 0 )
  75.574 +                goto done;
  75.575 +            switch ( modrm_reg & 7 )
  75.576 +            {
  75.577 +            case 0: /* fadd */
  75.578 +                emulate_fpu_insn_memsrc("fadds", src.val);
  75.579 +                break;
  75.580 +            case 1: /* fmul */
  75.581 +                emulate_fpu_insn_memsrc("fmuls", src.val);
  75.582 +                break;
  75.583 +            case 2: /* fcom */
  75.584 +                emulate_fpu_insn_memsrc("fcoms", src.val);
  75.585 +                break;
  75.586 +            case 3: /* fcomp */
  75.587 +                emulate_fpu_insn_memsrc("fcomps", src.val);
  75.588 +                break;
  75.589 +            case 4: /* fsub */
  75.590 +                emulate_fpu_insn_memsrc("fsubs", src.val);
  75.591 +                break;
  75.592 +            case 5: /* fsubr */
  75.593 +                emulate_fpu_insn_memsrc("fsubrs", src.val);
  75.594 +                break;
  75.595 +            case 6: /* fdiv */
  75.596 +                emulate_fpu_insn_memsrc("fdivs", src.val);
  75.597 +                break;
  75.598 +            case 7: /* fdivr */
  75.599 +                emulate_fpu_insn_memsrc("fdivrs", src.val);
  75.600 +                break;
  75.601 +            default:
  75.602 +                goto cannot_emulate;
  75.603 +            }
  75.604 +        }
  75.605 +        break;
  75.606 +
  75.607      case 0xd9: /* FPU 0xd9 */
  75.608          switch ( modrm )
  75.609          {
  75.610 @@ -2822,28 +2916,269 @@ x86_emulate(
  75.611              emulate_fpu_insn_stub(0xd9, modrm);
  75.612              break;
  75.613          default:
  75.614 -            fail_if((modrm_reg & 7) != 7);
  75.615              fail_if(modrm >= 0xc0);
  75.616 -            /* fnstcw m2byte */
  75.617 -            ea.bytes = 2;
  75.618 -            dst = ea;
  75.619 -            emulate_fpu_insn_memdst("fnstcw", dst.val);
  75.620 +            switch ( modrm_reg & 7 )
  75.621 +            {
  75.622 +            case 0: /* fld m32fp */
  75.623 +                ea.bytes = 4;
  75.624 +                src = ea;
  75.625 +                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
  75.626 +                                     src.bytes, ctxt)) != 0 )
  75.627 +                    goto done;
  75.628 +                emulate_fpu_insn_memsrc("flds", src.val);
  75.629 +                break;
   75.630 +            case 2: /* fst m32fp */
  75.631 +                ea.bytes = 4;
  75.632 +                dst = ea;
  75.633 +                dst.type = OP_MEM;
  75.634 +                emulate_fpu_insn_memdst("fsts", dst.val);
  75.635 +                break;
  75.636 +            case 3: /* fstp m32fp */
  75.637 +                ea.bytes = 4;
  75.638 +                dst = ea;
  75.639 +                dst.type = OP_MEM;
  75.640 +                emulate_fpu_insn_memdst("fstps", dst.val);
  75.641 +                break;
  75.642 +                /* case 4: fldenv - TODO */
  75.643 +            case 5: /* fldcw m2byte */
  75.644 +                ea.bytes = 2;
  75.645 +                src = ea;
  75.646 +                if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
  75.647 +                                     src.bytes, ctxt)) != 0 )
  75.648 +                    goto done;
  75.649 +                emulate_fpu_insn_memsrc("fldcw", src.val);
  75.650 +                break;
  75.651 +                /* case 6: fstenv - TODO */
  75.652 +            case 7: /* fnstcw m2byte */
  75.653 +                ea.bytes = 2;
  75.654 +                dst = ea;
  75.655 +                dst.type = OP_MEM;
  75.656 +                emulate_fpu_insn_memdst("fnstcw", dst.val);
  75.657 +                break;
  75.658 +            default:
  75.659 +                goto cannot_emulate;
  75.660 +            }
  75.661 +        }
  75.662 +        break;
  75.663 +
  75.664 +    case 0xda: /* FPU 0xda */
  75.665 +        switch ( modrm )
  75.666 +        {
  75.667 +        case 0xc0 ... 0xc7: /* fcmovb %stN */
  75.668 +        case 0xc8 ... 0xcf: /* fcmove %stN */
  75.669 +        case 0xd0 ... 0xd7: /* fcmovbe %stN */
  75.670 +        case 0xd8 ... 0xdf: /* fcmovu %stN */
  75.671 +        case 0xe9:          /* fucompp */
  75.672 +            emulate_fpu_insn_stub(0xda, modrm);
  75.673 +            break;
  75.674 +        default:
  75.675 +            fail_if(modrm >= 0xc0);
  75.676 +            ea.bytes = 8;
  75.677 +            src = ea;
  75.678 +            if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
  75.679 +                                 src.bytes, ctxt)) != 0 )
  75.680 +                goto done;
  75.681 +            switch ( modrm_reg & 7 )
  75.682 +            {
  75.683 +            case 0: /* fiadd m64i */
  75.684 +                emulate_fpu_insn_memsrc("fiaddl", src.val);
  75.685 +                break;
  75.686 +            case 1: /* fimul m64i */
   75.687 +                emulate_fpu_insn_memsrc("fimull", src.val);
  75.688 +                break;
  75.689 +            case 2: /* ficom m64i */
  75.690 +                emulate_fpu_insn_memsrc("ficoml", src.val);
  75.691 +                break;
  75.692 +            case 3: /* ficomp m64i */
  75.693 +                emulate_fpu_insn_memsrc("ficompl", src.val);
  75.694 +                break;
  75.695 +            case 4: /* fisub m64i */
  75.696 +                emulate_fpu_insn_memsrc("fisubl", src.val);
  75.697 +                break;
  75.698 +            case 5: /* fisubr m64i */
  75.699 +                emulate_fpu_insn_memsrc("fisubrl", src.val);
  75.700 +                break;
  75.701 +            case 6: /* fidiv m64i */
  75.702 +                emulate_fpu_insn_memsrc("fidivl", src.val);
  75.703 +                break;
  75.704 +            case 7: /* fidivr m64i */
  75.705 +                emulate_fpu_insn_memsrc("fidivrl", src.val);
  75.706 +                break;
  75.707 +            default:
  75.708 +                goto cannot_emulate;
  75.709 +            }
  75.710          }
  75.711          break;
  75.712  
  75.713      case 0xdb: /* FPU 0xdb */
  75.714 -        fail_if(modrm != 0xe3);
  75.715 -        /* fninit */
  75.716 -        emulate_fpu_insn("fninit");
  75.717 +        switch ( modrm )
  75.718 +        {
  75.719 +        case 0xc0 ... 0xc7: /* fcmovnb %stN */
  75.720 +        case 0xc8 ... 0xcf: /* fcmovne %stN */
  75.721 +        case 0xd0 ... 0xd7: /* fcmovnbe %stN */
  75.722 +        case 0xd8 ... 0xdf: /* fcmovnu %stN */
  75.723 +            emulate_fpu_insn_stub(0xdb, modrm);
  75.724 +            break;
  75.725 +        case 0xe2: /* fnclex */
  75.726 +            emulate_fpu_insn("fnclex");
  75.727 +            break;
  75.728 +        case 0xe3: /* fninit */
  75.729 +            emulate_fpu_insn("fninit");
  75.730 +            break;
  75.731 +        case 0xe4: /* fsetpm - 287 only, ignored by 387 */
  75.732 +            break;
  75.733 +        case 0xe8 ... 0xef: /* fucomi %stN */
  75.734 +        case 0xf0 ... 0xf7: /* fcomi %stN */
  75.735 +            emulate_fpu_insn_stub(0xdb, modrm);
  75.736 +            break;
  75.737 +        default:
  75.738 +            fail_if(modrm >= 0xc0);
  75.739 +            switch ( modrm_reg & 7 )
  75.740 +            {
  75.741 +            case 0: /* fild m32i */
  75.742 +                ea.bytes = 4;
  75.743 +                src = ea;
  75.744 +                if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
  75.745 +                                     src.bytes, ctxt)) != 0 )
  75.746 +                    goto done;
  75.747 +                emulate_fpu_insn_memsrc("fildl", src.val);
  75.748 +                break;
  75.749 +            case 1: /* fisttp m32i */
  75.750 +                ea.bytes = 4;
  75.751 +                dst = ea;
  75.752 +                dst.type = OP_MEM;
  75.753 +                emulate_fpu_insn_memdst("fisttpl", dst.val);
  75.754 +                break;
  75.755 +            case 2: /* fist m32i */
  75.756 +                ea.bytes = 4;
  75.757 +                dst = ea;
  75.758 +                dst.type = OP_MEM;
  75.759 +                emulate_fpu_insn_memdst("fistl", dst.val);
  75.760 +                break;
  75.761 +            case 3: /* fistp m32i */
  75.762 +                ea.bytes = 4;
  75.763 +                dst = ea;
  75.764 +                dst.type = OP_MEM;
  75.765 +                emulate_fpu_insn_memdst("fistpl", dst.val);
  75.766 +                break;
  75.767 +            case 5: /* fld m80fp */
  75.768 +                ea.bytes = 10;
  75.769 +                src = ea;
  75.770 +                if ( (rc = ops->read(src.mem.seg, src.mem.off,
  75.771 +                                     &src.val, src.bytes, ctxt)) != 0 )
  75.772 +                    goto done;
  75.773 +                emulate_fpu_insn_memdst("fldt", src.val);
  75.774 +                break;
  75.775 +            case 7: /* fstp m80fp */
  75.776 +                ea.bytes = 10;
   75.777 +                dst = ea;
   75.778 +                dst.type = OP_MEM;
  75.779 +                emulate_fpu_insn_memdst("fstpt", dst.val);
  75.780 +                break;
  75.781 +            default:
  75.782 +                goto cannot_emulate;
  75.783 +            }
  75.784 +        }
  75.785 +        break;
  75.786 +
  75.787 +    case 0xdc: /* FPU 0xdc */
  75.788 +        switch ( modrm )
  75.789 +        {
  75.790 +        case 0xc0 ... 0xc7: /* fadd %stN */
  75.791 +        case 0xc8 ... 0xcf: /* fmul %stN */
  75.792 +        case 0xe0 ... 0xe7: /* fsubr %stN */
  75.793 +        case 0xe8 ... 0xef: /* fsub %stN */
  75.794 +        case 0xf0 ... 0xf7: /* fdivr %stN */
  75.795 +        case 0xf8 ... 0xff: /* fdiv %stN */
  75.796 +            emulate_fpu_insn_stub(0xdc, modrm);
  75.797 +            break;
  75.798 +        default:
  75.799 +            fail_if(modrm >= 0xc0);
  75.800 +            ea.bytes = 8;
  75.801 +            src = ea;
  75.802 +            if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
  75.803 +                                 src.bytes, ctxt)) != 0 )
  75.804 +                goto done;
  75.805 +            switch ( modrm_reg & 7 )
  75.806 +            {
  75.807 +            case 0: /* fadd m64fp */
  75.808 +                emulate_fpu_insn_memsrc("faddl", src.val);
  75.809 +                break;
  75.810 +            case 1: /* fmul m64fp */
  75.811 +                emulate_fpu_insn_memsrc("fmull", src.val);
  75.812 +                break;
  75.813 +            case 2: /* fcom m64fp */
  75.814 +                emulate_fpu_insn_memsrc("fcoml", src.val);
  75.815 +                break;
  75.816 +            case 3: /* fcomp m64fp */
  75.817 +                emulate_fpu_insn_memsrc("fcompl", src.val);
  75.818 +                break;
  75.819 +            case 4: /* fsub m64fp */
  75.820 +                emulate_fpu_insn_memsrc("fsubl", src.val);
  75.821 +                break;
  75.822 +            case 5: /* fsubr m64fp */
  75.823 +                emulate_fpu_insn_memsrc("fsubrl", src.val);
  75.824 +                break;
  75.825 +            case 6: /* fdiv m64fp */
  75.826 +                emulate_fpu_insn_memsrc("fdivl", src.val);
  75.827 +                break;
  75.828 +            case 7: /* fdivr m64fp */
  75.829 +                emulate_fpu_insn_memsrc("fdivrl", src.val);
  75.830 +                break;
  75.831 +            }
  75.832 +        }
  75.833          break;
  75.834  
  75.835      case 0xdd: /* FPU 0xdd */
  75.836 -        fail_if((modrm_reg & 7) != 7);
  75.837 -        fail_if(modrm >= 0xc0);
  75.838 -        /* fnstsw m2byte */
  75.839 -        ea.bytes = 2;
  75.840 -        dst = ea;
  75.841 -        emulate_fpu_insn_memdst("fnstsw", dst.val);
  75.842 +        switch ( modrm )
  75.843 +        {
  75.844 +        case 0xc0 ... 0xc7: /* ffree %stN */
  75.845 +        case 0xd0 ... 0xd7: /* fst %stN */
  75.846 +        case 0xd8 ... 0xdf: /* fstp %stN */
  75.847 +        case 0xe0 ... 0xe7: /* fucom %stN */
  75.848 +        case 0xe8 ... 0xef: /* fucomp %stN */
  75.849 +            emulate_fpu_insn_stub(0xdd, modrm);
  75.850 +            break;
  75.851 +        default:
  75.852 +            fail_if(modrm >= 0xc0);
  75.853 +            switch ( modrm_reg & 7 )
  75.854 +            {
   75.855 +            case 0: /* fld m64fp */
  75.856 +                ea.bytes = 8;
  75.857 +                src = ea;
  75.858 +                if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
  75.859 +                                     src.bytes, ctxt)) != 0 )
  75.860 +                    goto done;
  75.861 +                emulate_fpu_insn_memsrc("fldl", src.val);
  75.862 +                break;
  75.863 +            case 1: /* fisttp m64i */
  75.864 +                ea.bytes = 8;
  75.865 +                dst = ea;
  75.866 +                dst.type = OP_MEM;
  75.867 +                emulate_fpu_insn_memdst("fisttpll", dst.val);
  75.868 +                break;
  75.869 +            case 2: /* fst m64fp */
  75.870 +                ea.bytes = 8;
  75.871 +                dst = ea;
  75.872 +                dst.type = OP_MEM;
   75.873 +                emulate_fpu_insn_memdst("fstl", dst.val);
  75.874 +                break;
  75.875 +            case 3: /* fstp m64fp */
  75.876 +                ea.bytes = 8;
  75.877 +                dst = ea;
  75.878 +                dst.type = OP_MEM;
  75.879 +                emulate_fpu_insn_memdst("fstpl", dst.val);
  75.880 +                break;
  75.881 +            case 7: /* fnstsw m2byte */
  75.882 +                ea.bytes = 2;
  75.883 +                dst = ea;
  75.884 +                dst.type = OP_MEM;
  75.885 +                emulate_fpu_insn_memdst("fnstsw", dst.val);
  75.886 +                break;
  75.887 +            default:
  75.888 +                goto cannot_emulate;
  75.889 +            }
  75.890 +        }
  75.891          break;
  75.892  
  75.893      case 0xde: /* FPU 0xde */
  75.894 @@ -2859,17 +3194,120 @@ x86_emulate(
  75.895              emulate_fpu_insn_stub(0xde, modrm);
  75.896              break;
  75.897          default:
  75.898 -            goto cannot_emulate;
  75.899 +            fail_if(modrm >= 0xc0);
  75.900 +            ea.bytes = 2;
  75.901 +            src = ea;
  75.902 +            if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
  75.903 +                                 src.bytes, ctxt)) != 0 )
  75.904 +                goto done;
  75.905 +            switch ( modrm_reg & 7 )
  75.906 +            {
  75.907 +            case 0: /* fiadd m16i */
  75.908 +                emulate_fpu_insn_memsrc("fiadd", src.val);
  75.909 +                break;
  75.910 +            case 1: /* fimul m16i */
  75.911 +                emulate_fpu_insn_memsrc("fimul", src.val);
  75.912 +                break;
  75.913 +            case 2: /* ficom m16i */
  75.914 +                emulate_fpu_insn_memsrc("ficom", src.val);
  75.915 +                break;
  75.916 +            case 3: /* ficomp m16i */
  75.917 +                emulate_fpu_insn_memsrc("ficomp", src.val);
  75.918 +                break;
  75.919 +            case 4: /* fisub m16i */
  75.920 +                emulate_fpu_insn_memsrc("fisub", src.val);
  75.921 +                break;
  75.922 +            case 5: /* fisubr m16i */
  75.923 +                emulate_fpu_insn_memsrc("fisubr", src.val);
  75.924 +                break;
  75.925 +            case 6: /* fidiv m16i */
  75.926 +                emulate_fpu_insn_memsrc("fidiv", src.val);
  75.927 +                break;
  75.928 +            case 7: /* fidivr m16i */
  75.929 +                emulate_fpu_insn_memsrc("fidivr", src.val);
  75.930 +                break;
  75.931 +            default:
  75.932 +                goto cannot_emulate;
  75.933 +            }
  75.934          }
  75.935          break;
  75.936  
  75.937      case 0xdf: /* FPU 0xdf */
  75.938 -        fail_if(modrm != 0xe0);
  75.939 -        /* fnstsw %ax */
  75.940 -        dst.bytes = 2;
  75.941 -        dst.type = OP_REG;
  75.942 -        dst.reg = (unsigned long *)&_regs.eax;
  75.943 -        emulate_fpu_insn_memdst("fnstsw", dst.val);
  75.944 +        switch ( modrm )
  75.945 +        {
  75.946 +        case 0xe0:
  75.947 +            /* fnstsw %ax */
  75.948 +            dst.bytes = 2;
  75.949 +            dst.type = OP_REG;
  75.950 +            dst.reg = (unsigned long *)&_regs.eax;
  75.951 +            emulate_fpu_insn_memdst("fnstsw", dst.val);
  75.952 +            break;
  75.953 +        case 0xf0 ... 0xf7: /* fcomip %stN */
  75.954 +        case 0xf8 ... 0xff: /* fucomip %stN */
  75.955 +            emulate_fpu_insn_stub(0xdf, modrm);
  75.956 +            break;
  75.957 +        default:
  75.958 +            fail_if(modrm >= 0xc0);
  75.959 +            switch ( modrm_reg & 7 )
  75.960 +            {
  75.961 +            case 0: /* fild m16i */
  75.962 +                ea.bytes = 2;
  75.963 +                src = ea;
  75.964 +                if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
  75.965 +                                     src.bytes, ctxt)) != 0 )
  75.966 +                    goto done;
  75.967 +                emulate_fpu_insn_memsrc("fild", src.val);
  75.968 +                break;
  75.969 +            case 1: /* fisttp m16i */
  75.970 +                ea.bytes = 2;
  75.971 +                dst = ea;
  75.972 +                dst.type = OP_MEM;
  75.973 +                emulate_fpu_insn_memdst("fisttp", dst.val);
  75.974 +                break;
  75.975 +            case 2: /* fist m16i */
  75.976 +                ea.bytes = 2;
  75.977 +                dst = ea;
  75.978 +                dst.type = OP_MEM;
  75.979 +                emulate_fpu_insn_memdst("fist", dst.val);
  75.980 +                break;
  75.981 +            case 3: /* fistp m16i */
  75.982 +                ea.bytes = 2;
  75.983 +                dst = ea;
  75.984 +                dst.type = OP_MEM;
  75.985 +                emulate_fpu_insn_memdst("fistp", dst.val);
  75.986 +                break;
  75.987 +            case 4: /* fbld m80dec */
  75.988 +                ea.bytes = 10;
  75.989 +                dst = ea;
  75.990 +                if ( (rc = ops->read(src.mem.seg, src.mem.off,
  75.991 +                                     &src.val, src.bytes, ctxt)) != 0 )
  75.992 +                    goto done;
  75.993 +                emulate_fpu_insn_memdst("fbld", src.val);
  75.994 +                break;
  75.995 +            case 5: /* fild m64i */
  75.996 +                ea.bytes = 8;
  75.997 +                src = ea;
  75.998 +                if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
  75.999 +                                     src.bytes, ctxt)) != 0 )
 75.1000 +                    goto done;
 75.1001 +                emulate_fpu_insn_memsrc("fildll", src.val);
 75.1002 +                break;
 75.1003 +            case 6: /* fbstp packed bcd */
 75.1004 +                ea.bytes = 10;
 75.1005 +                dst = ea;
 75.1006 +                dst.type = OP_MEM;
 75.1007 +                emulate_fpu_insn_memdst("fbstp", dst.val);
 75.1008 +                break;
 75.1009 +            case 7: /* fistp m64i */
 75.1010 +                ea.bytes = 8;
 75.1011 +                dst = ea;
 75.1012 +                dst.type = OP_MEM;
 75.1013 +                emulate_fpu_insn_memdst("fistpll", dst.val);
 75.1014 +                break;
 75.1015 +            default:
 75.1016 +                goto cannot_emulate;
 75.1017 +            }
 75.1018 +        }
 75.1019          break;
 75.1020  
 75.1021      case 0xe0 ... 0xe2: /* loop{,z,nz} */ {
 75.1022 @@ -2924,7 +3362,6 @@ x86_emulate(
 75.1023              /* out */
 75.1024              fail_if(ops->write_io == NULL);
 75.1025              rc = ops->write_io(port, op_bytes, _regs.eax, ctxt);
 75.1026 -            
 75.1027          }
 75.1028          else
 75.1029          {
 75.1030 @@ -3242,9 +3679,9 @@ x86_emulate(
 75.1031              if ( op_bytes == 2 )
 75.1032                  reg.base &= 0xffffff;
 75.1033              if ( (rc = ops->write(ea.mem.seg, ea.mem.off+0,
 75.1034 -                                  reg.limit, 2, ctxt)) ||
 75.1035 +                                  &reg.limit, 2, ctxt)) ||
 75.1036                   (rc = ops->write(ea.mem.seg, ea.mem.off+2,
 75.1037 -                                  reg.base, mode_64bit() ? 8 : 4, ctxt)) )
 75.1038 +                                  &reg.base, mode_64bit() ? 8 : 4, ctxt)) )
 75.1039                  goto done;
 75.1040              break;
 75.1041          case 2: /* lgdt */
 75.1042 @@ -3252,10 +3689,10 @@ x86_emulate(
 75.1043              generate_exception_if(ea.type != OP_MEM, EXC_UD, -1);
 75.1044              fail_if(ops->write_segment == NULL);
 75.1045              memset(&reg, 0, sizeof(reg));
 75.1046 -            if ( (rc = ops->read(ea.mem.seg, ea.mem.off+0,
 75.1047 -                                 &limit, 2, ctxt)) ||
 75.1048 -                 (rc = ops->read(ea.mem.seg, ea.mem.off+2,
 75.1049 -                                 &base, mode_64bit() ? 8 : 4, ctxt)) )
 75.1050 +            if ( (rc = read_ulong(ea.mem.seg, ea.mem.off+0,
 75.1051 +                                  &limit, 2, ctxt, ops)) ||
 75.1052 +                 (rc = read_ulong(ea.mem.seg, ea.mem.off+2,
 75.1053 +                                  &base, mode_64bit() ? 8 : 4, ctxt, ops)) )
 75.1054                  goto done;
 75.1055              reg.base = base;
 75.1056              reg.limit = limit;
 75.1057 @@ -3267,7 +3704,8 @@ x86_emulate(
 75.1058                  goto done;
 75.1059              break;
 75.1060          case 4: /* smsw */
 75.1061 -            ea.bytes = 2;
 75.1062 +            if ( ea.type == OP_MEM )
 75.1063 +                ea.bytes = 2;
 75.1064              dst = ea;
 75.1065              fail_if(ops->read_cr == NULL);
 75.1066              if ( (rc = ops->read_cr(0, &dst.val, ctxt)) )
 75.1067 @@ -3281,11 +3719,11 @@ x86_emulate(
 75.1068                  goto done;
 75.1069              if ( ea.type == OP_REG )
 75.1070                  cr0w = *ea.reg;
 75.1071 -            else if ( (rc = ops->read(ea.mem.seg, ea.mem.off,
 75.1072 -                                      &cr0w, 2, ctxt)) )
 75.1073 +            else if ( (rc = read_ulong(ea.mem.seg, ea.mem.off,
 75.1074 +                                       &cr0w, 2, ctxt, ops)) )
 75.1075                  goto done;
 75.1076 -            cr0 &= 0xffff0000;
 75.1077 -            cr0 |= (uint16_t)cr0w;
 75.1078 +            /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. */
 75.1079 +            cr0 = (cr0 & ~0xe) | (cr0w & 0xf);
 75.1080              if ( (rc = ops->write_cr(0, cr0, ctxt)) )
 75.1081                  goto done;
 75.1082              break;
 75.1083 @@ -3404,8 +3842,10 @@ x86_emulate(
 75.1084          if ( ea.type == OP_MEM )
 75.1085          {
 75.1086              unsigned long lval, hval;
 75.1087 -            if ( (rc = ops->read(ea.mem.seg, ea.mem.off+0, &lval, 4, ctxt)) ||
 75.1088 -                 (rc = ops->read(ea.mem.seg, ea.mem.off+4, &hval, 4, ctxt)) )
 75.1089 +            if ( (rc = read_ulong(ea.mem.seg, ea.mem.off+0,
 75.1090 +                                  &lval, 4, ctxt, ops)) ||
 75.1091 +                 (rc = read_ulong(ea.mem.seg, ea.mem.off+4,
 75.1092 +                                  &hval, 4, ctxt, ops)) )
 75.1093                  goto done;
 75.1094              val = ((uint64_t)hval << 32) | (uint32_t)lval;
 75.1095              stub[2] = modrm & 0x38; /* movq (%eax),%mmN */
 75.1096 @@ -3428,8 +3868,8 @@ x86_emulate(
 75.1097          if ( ea.type == OP_MEM )
 75.1098          {
 75.1099              unsigned long lval = (uint32_t)val, hval = (uint32_t)(val >> 32);
 75.1100 -            if ( (rc = ops->write(ea.mem.seg, ea.mem.off+0, lval, 4, ctxt)) ||
 75.1101 -                 (rc = ops->write(ea.mem.seg, ea.mem.off+4, hval, 4, ctxt)) )
 75.1102 +            if ( (rc = ops->write(ea.mem.seg, ea.mem.off+0, &lval, 4, ctxt)) ||
 75.1103 +                 (rc = ops->write(ea.mem.seg, ea.mem.off+4, &hval, 4, ctxt)) )
 75.1104                  goto done;
 75.1105          }
 75.1106          break;
 75.1107 @@ -3481,8 +3921,8 @@ x86_emulate(
 75.1108  
 75.1109          /* Get actual old value. */
 75.1110          for ( i = 0; i < (op_bytes/sizeof(long)); i++ )
 75.1111 -            if ( (rc = ops->read(ea.mem.seg, ea.mem.off + i*sizeof(long),
 75.1112 -                                 &old[i], sizeof(long), ctxt)) != 0 )
 75.1113 +            if ( (rc = read_ulong(ea.mem.seg, ea.mem.off + i*sizeof(long),
 75.1114 +                                  &old[i], sizeof(long), ctxt, ops)) != 0 )
 75.1115                  goto done;
 75.1116  
 75.1117          /* Get expected and proposed values. */
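
A standalone sketch (not part of the changeset) of the LMSW masking introduced in the hunk above: LMSW may set any of CR0 bits 0-3 (PE/MP/EM/TS) and may clear bits 1-3, but it can never clear PE, which is what (cr0 & ~0xe) | (cr0w & 0xf) encodes.

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative only: apply LMSW semantics to a CR0 value. */
    static uint64_t lmsw(uint64_t cr0, uint16_t operand)
    {
        return (cr0 & ~0xeULL) | (operand & 0xfULL);
    }

    int main(void)
    {
        printf("%#llx\n", (unsigned long long)lmsw(0x11, 0x0)); /* 0x11: PE stays set */
        printf("%#llx\n", (unsigned long long)lmsw(0x11, 0xe)); /* 0x1f: MP/EM/TS set */
        printf("%#llx\n", (unsigned long long)lmsw(0x1f, 0x1)); /* 0x11: bits 1-3 cleared */
        return 0;
    }
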
    76.1 --- a/xen/arch/x86/x86_emulate/x86_emulate.h	Thu Jun 19 12:48:04 2008 +0900
    76.2 +++ b/xen/arch/x86/x86_emulate/x86_emulate.h	Wed Jul 02 11:30:37 2008 +0900
    76.3 @@ -102,7 +102,8 @@ enum x86_emulate_fpu_type {
    76.4  };
    76.5  
    76.6  /*
    76.7 - * These operations represent the instruction emulator's interface to memory.
    76.8 + * These operations represent the instruction emulator's interface to memory,
    76.9 + * I/O ports, privileged state... pretty much everything other than GPRs.
   76.10   * 
   76.11   * NOTES:
   76.12   *  1. If the access fails (cannot emulate, or a standard access faults) then
   76.13 @@ -110,8 +111,7 @@ enum x86_emulate_fpu_type {
   76.14   *     some out-of-band mechanism, unknown to the emulator. The memop signals
   76.15   *     failure by returning X86EMUL_EXCEPTION to the emulator, which will
   76.16   *     then immediately bail.
   76.17 - *  2. Valid access sizes are 1, 2, 4 and 8 (x86/64 only) bytes.
   76.18 - *  3. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
   76.19 + *  2. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
   76.20   */
   76.21  struct x86_emulate_ops
   76.22  {
   76.23 @@ -121,19 +121,25 @@ struct x86_emulate_ops
   76.24       * All memory-access functions:
   76.25       *  @seg:   [IN ] Segment being dereferenced (specified as x86_seg_??).
   76.26       *  @offset:[IN ] Offset within segment.
    76.27 +     *  @p_data:[IN ] Pointer to I/O data buffer (length is @bytes).
   76.28       * Read functions:
   76.29       *  @val:   [OUT] Value read, zero-extended to 'ulong'.
   76.30       * Write functions:
   76.31       *  @val:   [IN ] Value to write (low-order bytes used as req'd).
   76.32       * Variable-length access functions:
   76.33 -     *  @bytes: [IN ] Number of bytes to read or write.
   76.34 +     *  @bytes: [IN ] Number of bytes to read or write. Valid access sizes are
   76.35 +     *                1, 2, 4 and 8 (x86/64 only) bytes, unless otherwise
   76.36 +     *                stated.
   76.37       */
   76.38  
   76.39 -    /* read: Emulate a memory read. */
   76.40 +    /*
   76.41 +     * read: Emulate a memory read.
   76.42 +     *  @bytes: Access length (0 < @bytes < 4096).
   76.43 +     */
   76.44      int (*read)(
   76.45          enum x86_segment seg,
   76.46          unsigned long offset,
   76.47 -        unsigned long *val,
   76.48 +        void *p_data,
   76.49          unsigned int bytes,
   76.50          struct x86_emulate_ctxt *ctxt);
   76.51  
   76.52 @@ -144,15 +150,18 @@ struct x86_emulate_ops
   76.53      int (*insn_fetch)(
   76.54          enum x86_segment seg,
   76.55          unsigned long offset,
   76.56 -        unsigned long *val,
   76.57 +        void *p_data,
   76.58          unsigned int bytes,
   76.59          struct x86_emulate_ctxt *ctxt);
   76.60  
   76.61 -    /* write: Emulate a memory write. */
   76.62 +    /*
   76.63 +     * write: Emulate a memory write.
   76.64 +     *  @bytes: Access length (0 < @bytes < 4096).
   76.65 +     */
   76.66      int (*write)(
   76.67          enum x86_segment seg,
   76.68          unsigned long offset,
   76.69 -        unsigned long val,
   76.70 +        void *p_data,
   76.71          unsigned int bytes,
   76.72          struct x86_emulate_ctxt *ctxt);
   76.73  
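
The read()/insn_fetch()/write() hooks above now take a void *p_data buffer of near-arbitrary length, so emulator code that wants a zero-extended scalar goes through a read_ulong() wrapper instead (see the many converted call sites in x86_emulate.c). The wrapper itself is not part of this hunk; a minimal sketch, assuming the declarations from x86_emulate.h and a little-endian (x86) build, could look like:

    /* Sketch only: zero the destination, then let ops->read() fill its
     * low-order bytes; on a little-endian target this yields the value
     * zero-extended to unsigned long. */
    static int read_ulong(
        enum x86_segment seg, unsigned long offset, unsigned long *val,
        unsigned int bytes, struct x86_emulate_ctxt *ctxt,
        struct x86_emulate_ops *ops)
    {
        *val = 0;
        return ops->read(seg, offset, val, bytes, ctxt);
    }
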
    77.1 --- a/xen/common/domain.c	Thu Jun 19 12:48:04 2008 +0900
    77.2 +++ b/xen/common/domain.c	Wed Jul 02 11:30:37 2008 +0900
    77.3 @@ -73,36 +73,13 @@ int current_domain_id(void)
    77.4      return current->domain->domain_id;
    77.5  }
    77.6  
    77.7 -struct domain *alloc_domain(domid_t domid)
    77.8 +static struct domain *alloc_domain_struct(void)
    77.9  {
   77.10 -    struct domain *d;
   77.11 -
   77.12 -    if ( (d = xmalloc(struct domain)) == NULL )
   77.13 -        return NULL;
   77.14 -
   77.15 -    memset(d, 0, sizeof(*d));
   77.16 -    d->domain_id = domid;
   77.17 -
   77.18 -    if ( xsm_alloc_security_domain(d) != 0 )
   77.19 -    {
   77.20 -        free_domain(d);
   77.21 -        return NULL;
   77.22 -    }
   77.23 -
   77.24 -    atomic_set(&d->refcnt, 1);
   77.25 -    spin_lock_init(&d->domain_lock);
   77.26 -    spin_lock_init(&d->page_alloc_lock);
   77.27 -    spin_lock_init(&d->shutdown_lock);
   77.28 -    spin_lock_init(&d->hypercall_deadlock_mutex);
   77.29 -    INIT_LIST_HEAD(&d->page_list);
   77.30 -    INIT_LIST_HEAD(&d->xenpage_list);
   77.31 -
   77.32 -    return d;
   77.33 +    return xmalloc(struct domain);
   77.34  }
   77.35  
   77.36 -void free_domain(struct domain *d)
   77.37 +static void free_domain_struct(struct domain *d)
   77.38  {
   77.39 -    xsm_free_security_domain(d);
   77.40      xfree(d);
   77.41  }
   77.42  
   77.43 @@ -210,19 +187,39 @@ struct domain *domain_create(
   77.44      domid_t domid, unsigned int domcr_flags, ssidref_t ssidref)
   77.45  {
   77.46      struct domain *d, **pd;
   77.47 -    enum { INIT_evtchn = 1, INIT_gnttab = 2, INIT_arch = 8 }; 
   77.48 +    enum { INIT_xsm = 1u<<0, INIT_rangeset = 1u<<1, INIT_evtchn = 1u<<2,
   77.49 +           INIT_gnttab = 1u<<3, INIT_arch = 1u<<4 };
   77.50      int init_status = 0;
   77.51  
   77.52 -    if ( (d = alloc_domain(domid)) == NULL )
   77.53 +    if ( (d = alloc_domain_struct()) == NULL )
   77.54          return NULL;
   77.55  
   77.56 +    memset(d, 0, sizeof(*d));
   77.57 +    d->domain_id = domid;
   77.58 +
   77.59 +    if ( xsm_alloc_security_domain(d) != 0 )
   77.60 +        goto fail;
   77.61 +    init_status |= INIT_xsm;
   77.62 +
   77.63 +    atomic_set(&d->refcnt, 1);
   77.64 +    spin_lock_init(&d->domain_lock);
   77.65 +    spin_lock_init(&d->page_alloc_lock);
   77.66 +    spin_lock_init(&d->shutdown_lock);
   77.67 +    spin_lock_init(&d->hypercall_deadlock_mutex);
   77.68 +    INIT_LIST_HEAD(&d->page_list);
   77.69 +    INIT_LIST_HEAD(&d->xenpage_list);
   77.70 +
   77.71      if ( domcr_flags & DOMCRF_hvm )
   77.72          d->is_hvm = 1;
   77.73  
   77.74      if ( (domid == 0) && opt_dom0_vcpus_pin )
   77.75          d->is_pinned = 1;
   77.76  
   77.77 +    if ( domcr_flags & DOMCRF_dummy )
   77.78 +        return d;
   77.79 +
   77.80      rangeset_domain_initialise(d);
   77.81 +    init_status |= INIT_rangeset;
   77.82  
   77.83      if ( !is_idle_domain(d) )
   77.84      {
   77.85 @@ -278,8 +275,11 @@ struct domain *domain_create(
   77.86          grant_table_destroy(d);
   77.87      if ( init_status & INIT_evtchn )
   77.88          evtchn_destroy(d);
   77.89 -    rangeset_domain_destroy(d);
   77.90 -    free_domain(d);
   77.91 +    if ( init_status & INIT_rangeset )
   77.92 +        rangeset_domain_destroy(d);
   77.93 +    if ( init_status & INIT_xsm )
   77.94 +        xsm_free_security_domain(d);
   77.95 +    free_domain_struct(d);
   77.96      return NULL;
   77.97  }
   77.98  
   77.99 @@ -535,7 +535,8 @@ static void complete_domain_destroy(stru
  77.100      if ( d->target != NULL )
  77.101          put_domain(d->target);
  77.102  
  77.103 -    free_domain(d);
  77.104 +    xsm_free_security_domain(d);
  77.105 +    free_domain_struct(d);
  77.106  
  77.107      send_guest_global_virq(dom0, VIRQ_DOM_EXC);
  77.108  }
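
The domain_create() rework above replaces the old unconditional teardown with a bit-mask of completed init stages, so the failure path undoes only what actually ran. A generic, self-contained sketch of that pattern (not Xen code; names are illustrative):

    #include <stdio.h>

    enum { INIT_a = 1u << 0, INIT_b = 1u << 1 };

    static int  init_a(void) { return 0; }      /* succeeds */
    static int  init_b(void) { return -1; }     /* fails, to exercise rollback */
    static void undo_a(void) { puts("undo a"); }
    static void undo_b(void) { puts("undo b"); }

    static int setup(void)
    {
        unsigned int done = 0;

        if ( init_a() != 0 )
            goto fail;
        done |= INIT_a;

        if ( init_b() != 0 )
            goto fail;
        done |= INIT_b;

        return 0;

     fail:
        /* Tear down in reverse order, but only the stages that completed. */
        if ( done & INIT_b ) undo_b();
        if ( done & INIT_a ) undo_a();
        return -1;
    }

    int main(void) { return setup() ? 1 : 0; }
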
    78.1 --- a/xen/drivers/passthrough/vtd/dmar.c	Thu Jun 19 12:48:04 2008 +0900
    78.2 +++ b/xen/drivers/passthrough/vtd/dmar.c	Wed Jul 02 11:30:37 2008 +0900
    78.3 @@ -383,7 +383,8 @@ acpi_parse_one_drhd(struct acpi_dmar_ent
    78.4      dmaru->address = drhd->address;
    78.5      dmaru->include_all = drhd->flags & 1; /* BIT0: INCLUDE_ALL */
    78.6      INIT_LIST_HEAD(&dmaru->ioapic_list);
    78.7 -    dprintk(XENLOG_INFO VTDPREFIX, "dmaru->address = %lx\n", dmaru->address);
    78.8 +    dprintk(XENLOG_INFO VTDPREFIX, "dmaru->address = %"PRIx64"\n",
    78.9 +            dmaru->address);
   78.10  
   78.11      dev_scope_start = (void *)(drhd + 1);
   78.12      dev_scope_end   = ((void *)drhd) + header->length;
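
The widening of the address fields to u64 in dmar.h (below) is what forces the format-string change here: "%lx" only matches a 64-bit value where unsigned long is itself 64 bits, whereas PRIx64 expands to the correct conversion for uint64_t on both 32- and 64-bit builds. A tiny standalone example:

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t address = 0xfed90000ULL;   /* example DRHD register base */
        printf("dmaru->address = %" PRIx64 "\n", address);
        return 0;
    }
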
    79.1 --- a/xen/drivers/passthrough/vtd/dmar.h	Thu Jun 19 12:48:04 2008 +0900
    79.2 +++ b/xen/drivers/passthrough/vtd/dmar.h	Wed Jul 02 11:30:37 2008 +0900
    79.3 @@ -42,28 +42,28 @@ struct acpi_ioapic_unit {
    79.4  
    79.5  struct acpi_drhd_unit {
    79.6      struct list_head list;
    79.7 -    unsigned long    address; /* register base address of the unit */
    79.8 -    struct    pci_dev *devices; /* target devices */
    79.9 +    u64    address; /* register base address of the unit */
   79.10 +    struct pci_dev *devices; /* target devices */
   79.11      int    devices_cnt;
   79.12 -    u8    include_all:1;
   79.13 +    u8     include_all:1;
   79.14      struct iommu *iommu;
   79.15      struct list_head ioapic_list;
   79.16  };
   79.17  
   79.18  struct acpi_rmrr_unit {
   79.19      struct list_head list;
   79.20 -    unsigned long base_address;
   79.21 -    unsigned long end_address;
   79.22 +    u64    base_address;
   79.23 +    u64    end_address;
   79.24      struct pci_dev *devices; /* target devices */
   79.25      int    devices_cnt;
   79.26 -    u8    allow_all:1;
   79.27 +    u8     allow_all:1;
   79.28  };
   79.29  
   79.30  struct acpi_atsr_unit {
   79.31      struct list_head list;
   79.32 -    struct    pci_dev *devices; /* target devices */
   79.33 +    struct pci_dev *devices; /* target devices */
   79.34      int    devices_cnt;
   79.35 -    u8    all_ports:1;
   79.36 +    u8     all_ports:1;
   79.37  };
   79.38  
   79.39  #define for_each_iommu(domain, iommu) \
    80.1 --- a/xen/drivers/passthrough/vtd/intremap.c	Thu Jun 19 12:48:04 2008 +0900
    80.2 +++ b/xen/drivers/passthrough/vtd/intremap.c	Wed Jul 02 11:30:37 2008 +0900
    80.3 @@ -52,7 +52,7 @@ static void remap_entry_to_ioapic_rte(
    80.4      unsigned long flags;
    80.5      struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
    80.6  
    80.7 -    if ( ir_ctrl == NULL || ir_ctrl->iremap_index < 0 )
    80.8 +    if ( ir_ctrl == NULL )
    80.9      {
   80.10          dprintk(XENLOG_ERR VTDPREFIX,
   80.11                  "remap_entry_to_ioapic_rte: ir_ctl is not ready\n");
   80.12 @@ -153,6 +153,7 @@ static void ioapic_rte_to_remap_entry(st
   80.13      }
   80.14  
   80.15      memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
   80.16 +    iommu_flush_cache_entry(iremap_entry);
   80.17      iommu_flush_iec_index(iommu, 0, index);
   80.18      invalidate_sync(iommu);
   80.19  
   80.20 @@ -170,7 +171,8 @@ unsigned int io_apic_read_remap_rte(
   80.21      struct iommu *iommu = ioapic_to_iommu(mp_ioapics[apic].mpc_apicid);
   80.22      struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
   80.23  
   80.24 -    if ( !iommu || !ir_ctrl || ir_ctrl->iremap_maddr == 0 )
   80.25 +    if ( !iommu || !ir_ctrl || ir_ctrl->iremap_maddr == 0 ||
   80.26 +         ir_ctrl->iremap_index == -1 )
   80.27      {
   80.28          *IO_APIC_BASE(apic) = reg;
   80.29          return *(IO_APIC_BASE(apic)+4);
   80.30 @@ -377,6 +379,7 @@ static void msi_msg_to_remap_entry(
   80.31      remap_rte->data = 0;
   80.32  
   80.33      memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
   80.34 +    iommu_flush_cache_entry(iremap_entry);
   80.35      iommu_flush_iec_index(iommu, 0, index);
   80.36      invalidate_sync(iommu);
   80.37  
    81.1 --- a/xen/drivers/passthrough/vtd/iommu.c	Thu Jun 19 12:48:04 2008 +0900
    81.2 +++ b/xen/drivers/passthrough/vtd/iommu.c	Wed Jul 02 11:30:37 2008 +0900
    81.3 @@ -1269,7 +1269,6 @@ static int domain_context_mapping(
    81.4  }
    81.5  
    81.6  static int domain_context_unmap_one(
    81.7 -    struct domain *domain,
    81.8      struct iommu *iommu,
    81.9      u8 bus, u8 devfn)
   81.10  {
   81.11 @@ -1300,7 +1299,6 @@ static int domain_context_unmap_one(
   81.12  }
   81.13  
   81.14  static int domain_context_unmap(
   81.15 -    struct domain *domain,
   81.16      struct iommu *iommu,
   81.17      struct pci_dev *pdev)
   81.18  {
   81.19 @@ -1320,14 +1318,13 @@ static int domain_context_unmap(
   81.20              PCI_FUNC(pdev->devfn), PCI_SUBORDINATE_BUS);
   81.21          break;
   81.22      case DEV_TYPE_PCIe_ENDPOINT:
   81.23 -        ret = domain_context_unmap_one(domain, iommu,
   81.24 +        ret = domain_context_unmap_one(iommu,
   81.25                                         (u8)(pdev->bus), (u8)(pdev->devfn));
   81.26          break;
   81.27      case DEV_TYPE_PCI:
   81.28          if ( pdev->bus == 0 )
   81.29              ret = domain_context_unmap_one(
   81.30 -                domain, iommu,
   81.31 -                (u8)(pdev->bus), (u8)(pdev->devfn));
   81.32 +                iommu, (u8)(pdev->bus), (u8)(pdev->devfn));
   81.33          else
   81.34          {
   81.35              if ( bus2bridge[pdev->bus].bus != 0 )
   81.36 @@ -1335,7 +1332,7 @@ static int domain_context_unmap(
   81.37                           "domain_context_unmap:"
   81.38                           "bus2bridge[%d].bus != 0\n", pdev->bus);
   81.39  
   81.40 -            ret = domain_context_unmap_one(domain, iommu,
   81.41 +            ret = domain_context_unmap_one(iommu,
   81.42                                             (u8)(bus2bridge[pdev->bus].bus),
   81.43                                             (u8)(bus2bridge[pdev->bus].devfn));
   81.44  
   81.45 @@ -1345,8 +1342,7 @@ static int domain_context_unmap(
   81.46                  for ( func = 0; func < 8; func++ )
   81.47                  {
   81.48                      ret = domain_context_unmap_one(
   81.49 -                        domain, iommu,
   81.50 -                        pdev->bus, (u8)PCI_DEVFN(dev, func));
   81.51 +                        iommu, pdev->bus, (u8)PCI_DEVFN(dev, func));
   81.52                      if ( ret )
   81.53                          return ret;
   81.54                  }
   81.55 @@ -1389,7 +1385,7 @@ void reassign_device_ownership(
   81.56   found:
   81.57      drhd = acpi_find_matched_drhd_unit(pdev);
   81.58      iommu = drhd->iommu;
   81.59 -    domain_context_unmap(source, iommu, pdev);
   81.60 +    domain_context_unmap(iommu, pdev);
   81.61  
   81.62      /* Move pci device from the source domain to target domain. */
   81.63      spin_lock_irqsave(&source_hd->iommu_list_lock, flags);
   81.64 @@ -1589,7 +1585,7 @@ static int iommu_prepare_rmrr_dev(
   81.65      struct pci_dev *pdev)
   81.66  {
   81.67      struct acpi_drhd_unit *drhd;
   81.68 -    unsigned long size;
   81.69 +    u64 size;
   81.70      int ret;
   81.71  
   81.72      /* page table init */
    82.1 --- a/xen/drivers/passthrough/vtd/qinval.c	Thu Jun 19 12:48:04 2008 +0900
    82.2 +++ b/xen/drivers/passthrough/vtd/qinval.c	Wed Jul 02 11:30:37 2008 +0900
    82.3 @@ -222,7 +222,7 @@ int invalidate_sync(struct iommu *iommu)
    82.4      int ret = -1;
    82.5      struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu);
    82.6  
    82.7 -    if ( qi_ctrl->qinval_maddr == 0 )
    82.8 +    if ( qi_ctrl->qinval_maddr != 0 )
    82.9      {
   82.10          ret = queue_invalidate_wait(iommu,
   82.11              0, 1, 1, 1, &qi_ctrl->qinval_poll_status);
   82.12 @@ -416,7 +416,6 @@ static int flush_iotlb_qi(
   82.13  int qinval_setup(struct iommu *iommu)
   82.14  {
   82.15      s_time_t start_time;
   82.16 -    u32 status = 0;
   82.17      struct qi_ctrl *qi_ctrl;
   82.18      struct iommu_flush *flush;
   82.19  
   82.20 @@ -450,15 +449,12 @@ int qinval_setup(struct iommu *iommu)
   82.21  
   82.22      /* Make sure hardware complete it */
   82.23      start_time = NOW();
   82.24 -    for ( ; ; )
   82.25 +    while ( !(dmar_readl(iommu->reg, DMAR_GSTS_REG) & DMA_GSTS_QIES) )
   82.26      {
   82.27 -        status = dmar_readl(iommu->reg, DMAR_GSTS_REG);
   82.28 -        if ( status & DMA_GSTS_QIES )
   82.29 -            break;
   82.30          if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) )
   82.31              panic("Cannot set QIE field for queue invalidation\n");
   82.32          cpu_relax();
   82.33      }
   82.34 -    status = 0;
   82.35 -    return status;
   82.36 +
   82.37 +    return 0;
   82.38  }
    83.1 --- a/xen/drivers/passthrough/vtd/utils.c	Thu Jun 19 12:48:04 2008 +0900
    83.2 +++ b/xen/drivers/passthrough/vtd/utils.c	Wed Jul 02 11:30:37 2008 +0900
    83.3 @@ -166,7 +166,7 @@ void print_iommu_regs(struct acpi_drhd_u
    83.4      struct iommu *iommu = drhd->iommu;
    83.5  
    83.6      printk("---- print_iommu_regs ----\n");
    83.7 -    printk("print_iommu_regs: drhd->address = %lx\n", drhd->address);
    83.8 +    printk("print_iommu_regs: drhd->address = %"PRIx64"\n", drhd->address);
    83.9      printk("print_iommu_regs: DMAR_VER_REG = %x\n",
   83.10             dmar_readl(iommu->reg,DMAR_VER_REG));
   83.11      printk("print_iommu_regs: DMAR_CAP_REG = %"PRIx64"\n",
    84.1 --- a/xen/include/acpi/cpufreq/cpufreq.h	Thu Jun 19 12:48:04 2008 +0900
    84.2 +++ b/xen/include/acpi/cpufreq/cpufreq.h	Wed Jul 02 11:30:37 2008 +0900
    84.3 @@ -36,7 +36,10 @@ struct cpufreq_policy {
    84.4      unsigned int        max;    /* in kHz */
    84.5      unsigned int        cur;    /* in kHz, only needed if cpufreq
    84.6                                   * governors are used */
    84.7 +    unsigned int        resume; /* flag for cpufreq 1st run
    84.8 +                                 * S3 wakeup, hotplug cpu, etc */
    84.9  };
   84.10 +extern struct cpufreq_policy xen_px_policy[NR_CPUS];
   84.11  
   84.12  #define CPUFREQ_SHARED_TYPE_NONE (0) /* None */
   84.13  #define CPUFREQ_SHARED_TYPE_HW   (1) /* HW does needed coordination */
    85.1 --- a/xen/include/acpi/cpufreq/processor_perf.h	Thu Jun 19 12:48:04 2008 +0900
    85.2 +++ b/xen/include/acpi/cpufreq/processor_perf.h	Wed Jul 02 11:30:37 2008 +0900
    85.3 @@ -6,9 +6,21 @@
    85.4  
    85.5  int get_cpu_id(u8);
    85.6  int acpi_cpufreq_init(void);
    85.7 +int powernow_cpufreq_init(void);
    85.8 +
    85.9  void px_statistic_update(cpumask_t, uint8_t, uint8_t);
   85.10  int  px_statistic_init(int);
   85.11  void px_statistic_reset(int);
   85.12 +void px_statistic_suspend(void);
   85.13 +void px_statistic_resume(void);
   85.14 +
   85.15 +void cpufreq_dom_exit(void);
   85.16 +int  cpufreq_dom_init(void);
   85.17 +int  cpufreq_dom_dbs(unsigned int);
   85.18 +void cpufreq_suspend(void);
   85.19 +int  cpufreq_resume(void);
   85.20 +
   85.21 +inline uint64_t get_cpu_idle_time(unsigned int);
   85.22  
   85.23  struct processor_performance {
   85.24      uint32_t state;
   85.25 @@ -44,6 +56,7 @@ struct px_stat {
   85.26  struct pm_px {
   85.27      struct px_stat u;
   85.28      uint64_t prev_state_wall;
   85.29 +    uint64_t prev_idle_wall;
   85.30  };
   85.31  
   85.32  extern struct pm_px px_statistic_data[NR_CPUS];
    86.1 --- a/xen/include/asm-x86/domain.h	Thu Jun 19 12:48:04 2008 +0900
    86.2 +++ b/xen/include/asm-x86/domain.h	Wed Jul 02 11:30:37 2008 +0900
    86.3 @@ -103,6 +103,9 @@ struct shadow_domain {
    86.4       * emulation and remove write permission
    86.5       */
    86.6      atomic_t          gtable_dirty_version;
    86.7 +
    86.8 +    /* OOS */
    86.9 +    int oos_active;
   86.10  };
   86.11  
   86.12  struct shadow_vcpu {
   86.13 @@ -122,6 +125,17 @@ struct shadow_vcpu {
   86.14      unsigned long last_emulated_frame;
   86.15      /* Last MFN that we emulated a write successfully */
   86.16      unsigned long last_emulated_mfn;
   86.17 +
   86.18 +    /* Shadow out-of-sync: pages that this vcpu has let go out of sync */
   86.19 +    mfn_t oos[SHADOW_OOS_PAGES];
   86.20 +    unsigned long oos_va[SHADOW_OOS_PAGES];
   86.21 +    mfn_t oos_snapshot[SHADOW_OOS_PAGES];
   86.22 +    struct oos_fixup {
   86.23 +        mfn_t gmfn;
   86.24 +        mfn_t smfn;
   86.25 +        unsigned long off;
   86.26 +    } *oos_fixups;
   86.27 +    int oos_fixup_used;
   86.28  };
   86.29  
   86.30  /************************************************/
    87.1 --- a/xen/include/asm-x86/hvm/vmx/vmcs.h	Thu Jun 19 12:48:04 2008 +0900
    87.2 +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h	Wed Jul 02 11:30:37 2008 +0900
    87.3 @@ -333,10 +333,10 @@ enum vmcs_field {
    87.4  #define VMCS_VPID_WIDTH 16
    87.5  
    87.6  void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr);
    87.7 -int vmx_read_guest_msr(struct vcpu *v, u32 msr, u64 *val);
    87.8 -int vmx_write_guest_msr(struct vcpu *v, u32 msr, u64 val);
    87.9 -int vmx_add_guest_msr(struct vcpu *v, u32 msr);
   87.10 -int vmx_add_host_load_msr(struct vcpu *v, u32 msr);
   87.11 +int vmx_read_guest_msr(u32 msr, u64 *val);
   87.12 +int vmx_write_guest_msr(u32 msr, u64 val);
   87.13 +int vmx_add_guest_msr(u32 msr);
   87.14 +int vmx_add_host_load_msr(u32 msr);
   87.15  
   87.16  #endif /* ASM_X86_HVM_VMX_VMCS_H__ */
   87.17  
    88.1 --- a/xen/include/asm-x86/mm.h	Thu Jun 19 12:48:04 2008 +0900
    88.2 +++ b/xen/include/asm-x86/mm.h	Wed Jul 02 11:30:37 2008 +0900
    88.3 @@ -130,6 +130,14 @@ static inline u32 pickle_domptr(struct d
    88.4  /* The order of the largest allocation unit we use for shadow pages */
    88.5  #define SHADOW_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */
    88.6  
    88.7 +/* The number of out-of-sync shadows we allow per vcpu (prime, please) */
    88.8 +#define SHADOW_OOS_PAGES 3
    88.9 +
  88.10 +/* The order of OOS fixup tables per vcpu */
   88.11 +#define SHADOW_OOS_FT_ORDER 1
   88.12 +/* OOS fixup tables hash entries */
   88.13 +#define SHADOW_OOS_FT_HASH 13
   88.14 +
   88.15  #define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
   88.16  #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))
   88.17  
    89.1 --- a/xen/include/asm-x86/perfc_defn.h	Thu Jun 19 12:48:04 2008 +0900
    89.2 +++ b/xen/include/asm-x86/perfc_defn.h	Wed Jul 02 11:30:37 2008 +0900
    89.3 @@ -80,7 +80,11 @@ PERFCOUNTER(shadow_writeable_h_2,  "shad
    89.4  PERFCOUNTER(shadow_writeable_h_3,  "shadow writeable: 64b w2k3")
    89.5  PERFCOUNTER(shadow_writeable_h_4,  "shadow writeable: linux low/solaris")
    89.6  PERFCOUNTER(shadow_writeable_h_5,  "shadow writeable: linux high")
    89.7 +PERFCOUNTER(shadow_writeable_h_6,  "shadow writeable: unsync va")
    89.8 +PERFCOUNTER(shadow_writeable_h_7,  "shadow writeable: sl1p")
    89.9 +PERFCOUNTER(shadow_writeable_h_8,  "shadow writeable: sl1p failed")
   89.10  PERFCOUNTER(shadow_writeable_bf,   "shadow writeable brute-force")
   89.11 +PERFCOUNTER(shadow_writeable_bf_1, "shadow writeable resync bf")
   89.12  PERFCOUNTER(shadow_mappings,       "shadow removes all mappings")
   89.13  PERFCOUNTER(shadow_mappings_bf,    "shadow rm-mappings brute-force")
   89.14  PERFCOUNTER(shadow_early_unshadow, "shadow unshadows for fork/exit")
   89.15 @@ -101,4 +105,15 @@ PERFCOUNTER(shadow_em_ex_pt,       "shad
   89.16  PERFCOUNTER(shadow_em_ex_non_pt,   "shadow extra non-pt-write op")
   89.17  PERFCOUNTER(shadow_em_ex_fail,     "shadow extra emulation failed")
   89.18  
   89.19 +PERFCOUNTER(shadow_oos_fixup_add_ok,    "shadow OOS fixups adds")
   89.20 +PERFCOUNTER(shadow_oos_fixup_no_add,    "shadow OOS fixups no adds")
   89.21 +PERFCOUNTER(shadow_oos_fixup_add_fail,  "shadow OOS fixups adds failed")
   89.22 +PERFCOUNTER(shadow_oos_fixup_remove,    "shadow OOS fixups removes")
   89.23 +PERFCOUNTER(shadow_oos_fixup_flush,     "shadow OOS fixups flushes")
   89.24 +PERFCOUNTER(shadow_oos_fixup_flush_gmfn,"shadow OOS fixups gmfn flushes")
   89.25 +
   89.26 +PERFCOUNTER(shadow_unsync,         "shadow OOS unsyncs")
   89.27 +PERFCOUNTER(shadow_unsync_evict,   "shadow OOS evictions")
   89.28 +PERFCOUNTER(shadow_resync,         "shadow OOS resyncs")
   89.29 +
   89.30  /*#endif*/ /* __XEN_PERFC_DEFN_H__ */
    90.1 --- a/xen/include/public/hvm/hvm_op.h	Thu Jun 19 12:48:04 2008 +0900
    90.2 +++ b/xen/include/public/hvm/hvm_op.h	Wed Jul 02 11:30:37 2008 +0900
    90.3 @@ -92,6 +92,19 @@ struct xen_hvm_track_dirty_vram {
    90.4  typedef struct xen_hvm_track_dirty_vram xen_hvm_track_dirty_vram_t;
    90.5  DEFINE_XEN_GUEST_HANDLE(xen_hvm_track_dirty_vram_t);
    90.6  
    90.7 +/* Notify that some pages got modified by the Device Model. */
    90.8 +#define HVMOP_modified_memory    7
    90.9 +struct xen_hvm_modified_memory {
   90.10 +    /* Domain to be updated. */
   90.11 +    domid_t  domid;
   90.12 +    /* First pfn. */
   90.13 +    uint64_aligned_t first_pfn;
   90.14 +    /* Number of pages. */
   90.15 +    uint64_aligned_t nr;
   90.16 +};
   90.17 +typedef struct xen_hvm_modified_memory xen_hvm_modified_memory_t;
   90.18 +DEFINE_XEN_GUEST_HANDLE(xen_hvm_modified_memory_t);
   90.19 +
   90.20  #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
   90.21  
   90.22  #endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
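
A hedged sketch of how a device model might fill the new HVMOP_modified_memory argument for a run of dirtied guest pages. How the structure is then submitted (the hvm_op hypercall or a toolstack wrapper) lies outside this hunk, so only the argument setup is shown, assuming the public hvm_op.h above and the Xen public headers it relies on are included:

    #include <string.h>

    static void fill_modified_memory(xen_hvm_modified_memory_t *arg,
                                     domid_t domid,
                                     uint64_t first_pfn, uint64_t nr_pages)
    {
        memset(arg, 0, sizeof(*arg));
        arg->domid     = domid;       /* domain whose memory was written */
        arg->first_pfn = first_pfn;   /* first guest pfn in the range */
        arg->nr        = nr_pages;    /* number of consecutive pages */
    }
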
    91.1 --- a/xen/include/xen/domain.h	Thu Jun 19 12:48:04 2008 +0900
    91.2 +++ b/xen/include/xen/domain.h	Wed Jul 02 11:30:37 2008 +0900
    91.3 @@ -16,9 +16,6 @@ int boot_vcpu(
    91.4  struct vcpu *alloc_idle_vcpu(unsigned int cpu_id);
    91.5  void vcpu_reset(struct vcpu *v);
    91.6  
    91.7 -struct domain *alloc_domain(domid_t domid);
    91.8 -void free_domain(struct domain *d);
    91.9 -
   91.10  struct xen_domctl_getdomaininfo;
   91.11  void getdomaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info);
   91.12  
    92.1 --- a/xen/include/xen/sched.h	Thu Jun 19 12:48:04 2008 +0900
    92.2 +++ b/xen/include/xen/sched.h	Wed Jul 02 11:30:37 2008 +0900
    92.3 @@ -315,10 +315,14 @@ static inline struct domain *get_current
    92.4  struct domain *domain_create(
    92.5      domid_t domid, unsigned int domcr_flags, ssidref_t ssidref);
    92.6   /* DOMCRF_hvm: Create an HVM domain, as opposed to a PV domain. */
    92.7 -#define _DOMCRF_hvm 0
    92.8 -#define DOMCRF_hvm  (1U<<_DOMCRF_hvm)
    92.9 -#define _DOMCRF_hap 1
   92.10 -#define DOMCRF_hap  (1U<<_DOMCRF_hap)
   92.11 +#define _DOMCRF_hvm   0
   92.12 +#define DOMCRF_hvm    (1U<<_DOMCRF_hvm)
   92.13 + /* DOMCRF_hap: Create a domain with hardware-assisted paging. */
   92.14 +#define _DOMCRF_hap   1
   92.15 +#define DOMCRF_hap    (1U<<_DOMCRF_hap)
   92.16 + /* DOMCRF_dummy: Create a dummy domain (not scheduled; not on domain list) */
   92.17 +#define _DOMCRF_dummy 2
   92.18 +#define DOMCRF_dummy  (1U<<_DOMCRF_dummy)
   92.19  
   92.20  int construct_dom0(
   92.21      struct domain *d,