ia64/xen-unstable

changeset 18758:e75cb35c798b

merge with xen-unstable.hg
author Isaku Yamahata <yamahata@valinux.co.jp>
date Tue Nov 04 12:43:19 2008 +0900 (2008-11-04)
parents 10f0e1bb8e5e 43a079fd50fd
children 57c94bdbd6b3
files xen/arch/ia64/xen/cpufreq/cpufreq.c xen/arch/ia64/xen/irq.c xen/common/xmalloc.c
line diff
     1.1 --- a/.hgignore	Tue Nov 04 12:07:22 2008 +0900
     1.2 +++ b/.hgignore	Tue Nov 04 12:43:19 2008 +0900
     1.3 @@ -211,6 +211,7 @@
     1.4  ^tools/xenfb/vncfb$
     1.5  ^tools/xenmon/xentrace_setmask$
     1.6  ^tools/xenmon/xenbaked$
     1.7 +^tools/xenpmd/xenpmd$
     1.8  ^tools/xenstat/xentop/xentop$
     1.9  ^tools/xenstore/testsuite/tmp/.*$
    1.10  ^tools/xenstore/xen$
     3.1 --- a/extras/mini-os/include/sched.h	Tue Nov 04 12:07:22 2008 +0900
     3.2 +++ b/extras/mini-os/include/sched.h	Tue Nov 04 12:43:19 2008 +0900
     3.3 @@ -48,8 +48,9 @@ struct thread* create_thread(char *name,
     3.4  void exit_thread(void) __attribute__((noreturn));
     3.5  void schedule(void);
     3.6  
     3.7 +#ifdef __INSIDE_MINIOS__
     3.8  #define current get_current()
     3.9 -
    3.10 +#endif
    3.11  
    3.12  void wake(struct thread *thread);
    3.13  void block(struct thread *thread);
     4.1 --- a/extras/mini-os/include/wait.h	Tue Nov 04 12:07:22 2008 +0900
     4.2 +++ b/extras/mini-os/include/wait.h	Tue Nov 04 12:43:19 2008 +0900
     4.3 @@ -7,7 +7,7 @@
     4.4  
     4.5  #define DEFINE_WAIT(name)                               \
     4.6  struct wait_queue name = {                              \
     4.7 -    .thread       = current,                            \
     4.8 +    .thread       = get_current(),                            \
     4.9      .thread_list  = MINIOS_LIST_HEAD_INIT((name).thread_list), \
    4.10  }
    4.11  
    4.12 @@ -53,7 +53,7 @@ static inline void wake_up(struct wait_q
    4.13      unsigned long flags;        \
    4.14      local_irq_save(flags);      \
    4.15      add_wait_queue(&wq, &w);    \
    4.16 -    block(current);             \
    4.17 +    block(get_current());       \
    4.18      local_irq_restore(flags);   \
    4.19  } while (0)
    4.20  
    4.21 @@ -74,8 +74,8 @@ static inline void wake_up(struct wait_q
    4.22          /* protect the list */                                  \
    4.23          local_irq_save(flags);                                  \
    4.24          add_wait_queue(&wq, &__wait);                           \
    4.25 -        current->wakeup_time = deadline;                        \
    4.26 -        clear_runnable(current);                                \
    4.27 +        get_current()->wakeup_time = deadline;                  \
    4.28 +        clear_runnable(get_current());                          \
    4.29          local_irq_restore(flags);                               \
    4.30          if((condition) || (deadline && NOW() >= deadline))      \
    4.31              break;                                              \
    4.32 @@ -83,7 +83,7 @@ static inline void wake_up(struct wait_q
    4.33      }                                                           \
    4.34      local_irq_save(flags);                                      \
    4.35      /* need to wake up */                                       \
    4.36 -    wake(current);                                              \
    4.37 +    wake(get_current());                                        \
    4.38      remove_wait_queue(&__wait);                                 \
    4.39      local_irq_restore(flags);                                   \
    4.40  } while(0) 
     5.1 --- a/extras/mini-os/minios.mk	Tue Nov 04 12:07:22 2008 +0900
     5.2 +++ b/extras/mini-os/minios.mk	Tue Nov 04 12:43:19 2008 +0900
     5.3 @@ -26,6 +26,9 @@ else
     5.4  DEF_CFLAGS += -O3
     5.5  endif
     5.6  
     5.7 +# Make the headers define our internal stuff
     5.8 +DEF_CFLAGS += -D__INSIDE_MINIOS__
     5.9 +
    5.10  # Build the CFLAGS and ASFLAGS for compiling and assembling.
    5.11  # DEF_... flags are the common mini-os flags,
    5.12  # ARCH_... flags may be defined in arch/$(TARGET_ARCH_FAM/rules.mk
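
The three mini-os hunks above are one change: sched.h now defines the "current" shorthand only when __INSIDE_MINIOS__ is set, wait.h switches to the always-available get_current() accessor, and minios.mk defines the symbol for mini-os's own objects. The net effect is that applications built against the mini-os headers no longer have the identifier "current" macro-expanded out from under them. A minimal standalone sketch of the pattern (names are illustrative):

    #include <stdio.h>

    struct thread { const char *name; };
    static struct thread boot_thread = { "boot" };

    /* The accessor is always available... */
    static inline struct thread *get_current(void) { return &boot_thread; }

    /* ...but the convenient alias exists only for code compiled with
     * -D__INSIDE_MINIOS__, so it cannot shadow an application symbol. */
    #ifdef __INSIDE_MINIOS__
    #define current get_current()
    #endif

    int main(void)
    {
        printf("current thread: %s\n", get_current()->name);
        return 0;
    }
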
     6.1 --- a/tools/Makefile	Tue Nov 04 12:07:22 2008 +0900
     6.2 +++ b/tools/Makefile	Tue Nov 04 12:43:19 2008 +0900
     6.3 @@ -24,6 +24,7 @@ SUBDIRS-y += libfsimage
     6.4  SUBDIRS-$(LIBXENAPI_BINDINGS) += libxen
     6.5  SUBDIRS-y += fs-back
     6.6  SUBDIRS-$(CONFIG_IOEMU) += ioemu-dir
     6.7 +SUBDIRS-y += xenpmd
     6.8  
     6.9  # These don't cross-compile
    6.10  ifeq ($(XEN_COMPILE_ARCH),$(XEN_TARGET_ARCH))
     7.1 --- a/tools/blktap/drivers/block-qcow.c	Tue Nov 04 12:07:22 2008 +0900
     7.2 +++ b/tools/blktap/drivers/block-qcow.c	Tue Nov 04 12:43:19 2008 +0900
     7.3 @@ -722,11 +722,11 @@ static inline void init_fds(struct disk_
     7.4  /* Open the disk file and initialize qcow state. */
     7.5  static int tdqcow_open (struct disk_driver *dd, const char *name, td_flag_t flags)
     7.6  {
     7.7 -	int fd, len, i, shift, ret, size, l1_table_size, o_flags;
     7.8 +	int fd, len, i, shift, ret, size, l1_table_size, o_flags, l1_table_block;
     7.9  	int max_aio_reqs;
    7.10  	struct td_state     *bs = dd->td_state;
    7.11  	struct tdqcow_state *s  = (struct tdqcow_state *)dd->private;
    7.12 -	char *buf;
    7.13 +	char *buf, *buf2;
    7.14  	QCowHeader *header;
    7.15  	QCowHeader_ext *exthdr;
    7.16  	uint32_t cksum;
    7.17 @@ -734,8 +734,8 @@ static int tdqcow_open (struct disk_driv
    7.18  
    7.19   	DPRINTF("QCOW: Opening %s\n",name);
    7.20  
    7.21 -	/* Since we don't handle O_DIRECT correctly, don't use it */
    7.22 -	o_flags = O_LARGEFILE | ((flags == TD_RDONLY) ? O_RDONLY : O_RDWR);
    7.23 +	o_flags = O_DIRECT | O_LARGEFILE | 
    7.24 +		((flags == TD_RDONLY) ? O_RDONLY : O_RDWR);
    7.25  	fd = open(name, o_flags);
    7.26  	if (fd < 0) {
    7.27  		DPRINTF("Unable to open %s (%d)\n",name,0 - errno);
    7.28 @@ -819,9 +819,14 @@ static int tdqcow_open (struct disk_driv
    7.29  		(int) (s->l1_size * sizeof(uint64_t)), 
    7.30  		l1_table_size);
    7.31  
    7.32 -	lseek(fd, s->l1_table_offset, SEEK_SET);
    7.33 -	if (read(fd, s->l1_table, l1_table_size) != l1_table_size)
    7.34 +	lseek(fd, 0, SEEK_SET);
    7.35 +	l1_table_block = l1_table_size + s->l1_table_offset;
    7.36 +	l1_table_block = l1_table_block + 512 - (l1_table_block % 512); 
    7.37 +	ret = posix_memalign((void **)&buf2, 4096, l1_table_block);
    7.38 +	if (ret != 0) goto fail;
    7.39 +	if (read(fd, buf2, l1_table_block) != l1_table_block)
    7.40  		goto fail;
    7.41 +	memcpy(s->l1_table, buf2 + s->l1_table_offset, l1_table_size);
    7.42  
    7.43  	for(i = 0; i < s->l1_size; i++) {
    7.44  		be64_to_cpus(&s->l1_table[i]);
    7.45 @@ -871,8 +876,9 @@ static int tdqcow_open (struct disk_driv
    7.46  
    7.47  			DPRINTF("qcow: Converting image to big endian L1 table\n");
    7.48  
    7.49 -			lseek(fd, s->l1_table_offset, SEEK_SET);
    7.50 -			if (write(fd, s->l1_table, l1_table_size) != l1_table_size) {
    7.51 +			memcpy(buf2 + s->l1_table_offset, s->l1_table, l1_table_size);
    7.52 +			lseek(fd, 0, SEEK_SET);
    7.53 +			if (write(fd, buf2, l1_table_block) != l1_table_block) {
    7.54  				DPRINTF("qcow: Failed to write new L1 table\n");
    7.55  				goto fail;
    7.56  			}
    7.57 @@ -917,7 +923,7 @@ static int tdqcow_open (struct disk_driv
    7.58  	init_fds(dd);
    7.59  
    7.60  	if (!final_cluster)
    7.61 -		s->fd_end = s->l1_table_offset + l1_table_size;
    7.62 +		s->fd_end = l1_table_block;
    7.63  	else {
    7.64  		s->fd_end = lseek(fd, 0, SEEK_END);
    7.65  		if (s->fd_end == (off_t)-1)
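
The qcow hunks above re-enable O_DIRECT, and the rest of the churn follows from its constraints: with O_DIRECT the file offset, the transfer length, and the user buffer must all be suitably aligned (512-byte granularity for offset and length, page alignment being the safe choice for the buffer), so the L1 table can no longer be fetched with an arbitrary lseek()/read(). Instead the code reads from offset 0 through the table's end rounded up to a 512-byte boundary into a posix_memalign() buffer, then memcpy()s the table out. A standalone sketch of that pattern (hypothetical helper, same rounding as the hunk):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    /* Read the unaligned region [off, off+len) from an O_DIRECT fd by
     * transferring an aligned superset starting at offset 0. */
    static int read_region_direct(const char *path, size_t off, size_t len,
                                  void *out)
    {
        int ret = -1;
        char *buf = NULL;
        int fd = open(path, O_RDONLY | O_DIRECT | O_LARGEFILE);
        if (fd < 0)
            return -1;

        /* Round the transfer end up to the next 512-byte boundary. */
        size_t span = off + len;
        span = span + 512 - (span % 512);

        if (posix_memalign((void **)&buf, 4096, span) == 0 &&
            lseek(fd, 0, SEEK_SET) == 0 &&
            read(fd, buf, span) == (ssize_t)span) {
            memcpy(out, buf + off, len);   /* extract the payload */
            ret = 0;
        }

        free(buf);
        close(fd);
        return ret;
    }
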
     8.1 --- a/tools/firmware/hvmloader/acpi/static_tables.c	Tue Nov 04 12:07:22 2008 +0900
     8.2 +++ b/tools/firmware/hvmloader/acpi/static_tables.c	Tue Nov 04 12:43:19 2008 +0900
     8.3 @@ -67,7 +67,7 @@ struct acpi_20_fadt Fadt = {
     8.4  
     8.5      .p_lvl2_lat = 0x0fff, /* >100,  means we do not support C2 state */
     8.6      .p_lvl3_lat = 0x0fff, /* >1000, means we do not support C3 state */
     8.7 -    .iapc_boot_arch = ACPI_LEGACY_DEVICES | ACPI_8042,
     8.8 +    .iapc_boot_arch = ACPI_8042,
     8.9      .flags = (ACPI_PROC_C1 | ACPI_SLP_BUTTON |
    8.10                ACPI_WBINVD | ACPI_PWR_BUTTON |
    8.11                ACPI_FIX_RTC | ACPI_TMR_VAL_EXT),
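
Dropping ACPI_LEGACY_DEVICES clears bit 0 of the FADT's IA-PC boot-architecture flags, telling HVM guests not to assume the full ISA legacy device set but to discover devices through ACPI enumeration instead; ACPI_8042 (bit 1) is kept, so the keyboard controller is still advertised. A sketch of the flag test, with the bit positions as given in the ACPI specification:

    #include <stdint.h>
    #include <stdio.h>

    #define ACPI_LEGACY_DEVICES (1 << 0)  /* full legacy (ISA) device set */
    #define ACPI_8042           (1 << 1)  /* 8042 keyboard controller */

    int main(void)
    {
        uint16_t iapc_boot_arch = ACPI_8042;  /* value set by the hunk above */
        printf("legacy devices: %s, 8042: %s\n",
               (iapc_boot_arch & ACPI_LEGACY_DEVICES) ? "yes" : "no",
               (iapc_boot_arch & ACPI_8042) ? "yes" : "no");
        return 0;
    }
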
     9.1 --- a/tools/firmware/rombios/rombios.c	Tue Nov 04 12:07:22 2008 +0900
     9.2 +++ b/tools/firmware/rombios/rombios.c	Tue Nov 04 12:43:19 2008 +0900
     9.3 @@ -7216,7 +7216,7 @@ BX_INFO("floppy: drive>1 || head>1 ...\n
     9.4          outb(0x03f5, head);
     9.5          outb(0x03f5, sector);
     9.6          outb(0x03f5, 2); // 512 byte sector size
     9.7 -        outb(0x03f5, 0); // last sector number possible on track
     9.8 +        outb(0x03f5, sector + num_sectors - 1); // last sector to read on track
     9.9          outb(0x03f5, 0); // Gap length
    9.10          outb(0x03f5, 0xff); // Gap length
    9.11  
    9.12 @@ -7364,7 +7364,7 @@ BX_INFO("floppy: drive>1 || head>1 ...\n
    9.13          outb(0x03f5, head);
    9.14          outb(0x03f5, sector);
    9.15          outb(0x03f5, 2); // 512 byte sector size
    9.16 -        outb(0x03f5, 0); // last sector number possible on track
    9.17 +        outb(0x03f5, sector + num_sectors - 1); // last sector to write on track
    9.18          outb(0x03f5, 0); // Gap length
    9.19          outb(0x03f5, 0xff); // Gap length
    9.20  
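
Both rombios hunks fix the EOT (end of track) parameter of the floppy controller's READ/WRITE DATA commands: 0 does not name any sector on the track, so multi-sector transfers misbehaved; the correct value is the number of the last sector to transfer. For reference, a sketch of the nine command bytes the BIOS pushes to the FDC data port (NEC765 layout; note glibc's outb() takes value then port, the reverse of the bcc outb() above, and the gap/DTL values here are only illustrative):

    #include <sys/io.h>   /* outb(); needs ioperm()/root to actually run */

    #define FDC_DATA 0x3f5

    static void fdc_read_cmd(unsigned char drive, unsigned char cyl,
                             unsigned char head, unsigned char sector,
                             unsigned char num_sectors)
    {
        outb(0xe6, FDC_DATA);                     /* READ DATA (MT|MFM|SK) */
        outb((head << 2) | drive, FDC_DATA);      /* head/drive select */
        outb(cyl, FDC_DATA);                      /* C: cylinder */
        outb(head, FDC_DATA);                     /* H: head */
        outb(sector, FDC_DATA);                   /* R: first sector */
        outb(2, FDC_DATA);                        /* N: 128 << 2 = 512 bytes */
        outb(sector + num_sectors - 1, FDC_DATA); /* EOT: last sector (the fix) */
        outb(0x1b, FDC_DATA);                     /* GPL: gap length */
        outb(0xff, FDC_DATA);                     /* DTL: unused when N != 0 */
    }
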
    10.1 --- a/tools/flask/policy/policy/modules/xen/xen.te	Tue Nov 04 12:07:22 2008 +0900
    10.2 +++ b/tools/flask/policy/policy/modules/xen/xen.te	Tue Nov 04 12:43:19 2008 +0900
    10.3 @@ -74,7 +74,7 @@ allow dom0_t iomem_t:mmu {map_read map_w
    10.4  allow dom0_t pirq_t:event {vector};
    10.5  allow dom0_t xen_t:mmu {memorymap};
    10.6  
    10.7 -allow dom0_t dom0_t:mmu {pinpage map_read map_write adjust};
    10.8 +allow dom0_t dom0_t:mmu {pinpage map_read map_write adjust updatemp};
    10.9  allow dom0_t dom0_t:grant {query setup};
   10.10  allow dom0_t dom0_t:domain {scheduler getdomaininfo getvcpuinfo getvcpuaffinity};
   10.11  
   10.12 @@ -112,6 +112,7 @@ allow domU_t evchnU-0_t:event {send};
   10.13  
   10.14  allow dom0_t dom0_t:event {send};
   10.15  allow dom0_t domU_t:grant {copy};
   10.16 +allow domU_t domU_t:grant {copy};
   10.17  
   10.18  manage_domain(dom0_t, domU_t)
   10.19  
    11.1 --- a/tools/python/xen/util/diagnose.py	Tue Nov 04 12:07:22 2008 +0900
    11.2 +++ b/tools/python/xen/util/diagnose.py	Tue Nov 04 12:43:19 2008 +0900
    11.3 @@ -23,7 +23,7 @@ from xen.xend import sxp
    11.4  from xen.xend.XendClient import server
    11.5  from xen.xend.XendError import XendError
    11.6  from xen.xend.xenstore.xstransact import xstransact
    11.7 -from xen.xend.server import DevController
    11.8 +from xen.xend.server import DevConstants
    11.9  
   11.10  import xen.xend.XendProtocol
   11.11  
   11.12 @@ -169,7 +169,7 @@ def diagnose_hotplugging():
   11.13  
   11.14  
   11.15  def stateString(state):
   11.16 -    return state and DevController.xenbusState[int(state)] or '<None>'
   11.17 +    return state and DevConstants.xenbusState[int(state)] or '<None>'
   11.18  
   11.19  
   11.20  def main(argv = None):
    12.1 --- a/tools/python/xen/xend/XendConfig.py	Tue Nov 04 12:07:22 2008 +0900
    12.2 +++ b/tools/python/xen/xend/XendConfig.py	Tue Nov 04 12:43:19 2008 +0900
    12.3 @@ -1602,21 +1602,21 @@ class XendConfig(dict):
    12.4          #   [vscsi,
    12.5          #     [dev,
    12.6          #       [devid, 0], [p-devname, sdb], [p-dev, 1:0:0:1],
    12.7 -        #       [v-dev, 0:0:0:0], [state, Initialising]
    12.8 +        #       [v-dev, 0:0:0:0], [state, 1]
    12.9          #     ],
   12.10          #     [dev,
   12.11          #       [devid, 0], [p-devname, sdc], [p-dev, 1:0:0:2],
   12.12 -        #       [v-dev, 0:0:0:1], [satet, Initialising]
    12.13 +        #       [v-dev, 0:0:0:1], [state, 1]
   12.14          #     ]
   12.15          #   ],
   12.16          #   [vscsi,
   12.17          #     [dev,
   12.18          #       [devid, 1], [p-devname, sdg], [p-dev, 2:0:0:0],
   12.19 -        #       [v-dev, 1:0:0:0], [state, Initialising]
   12.20 +        #       [v-dev, 1:0:0:0], [state, 1]
   12.21          #     ],
   12.22          #     [dev,
   12.23          #       [devid, 1], [p-devname, sdh], [p-dev, 2:0:0:1],
   12.24 -        #       [v-dev, 1:0:0:1], [satet, Initialising]
    12.25 +        #       [v-dev, 1:0:0:1], [state, 1]
   12.26          #     ]
   12.27          #   ]
   12.28          # ]
   12.29 @@ -1632,18 +1632,19 @@ class XendConfig(dict):
   12.30          #   [vscsi,
   12.31          #     [dev,
   12.32          #       [devid, 0], [p-devname, sdd], [p-dev, 1:0:0:3],
   12.33 -        #       [v-dev, 0:0:0:2], [state, Initialising]
   12.34 +        #       [v-dev, 0:0:0:2], [state, 1]
   12.35          #     ]
   12.36          #   ]
   12.37          # ]
   12.38          #
   12.39 -        # state 'Initialising' indicates that the device is being attached,
   12.40 -        # while state 'Closing' indicates that the device is being detached.
   12.41 +        # state xenbusState['Initialising'] indicates that the device is 
   12.42 +        # being attached, while state xenbusState['Closing'] indicates 
   12.43 +        # that the device is being detached.
   12.44          #
   12.45          # The Dict looks like this:
   12.46          #
   12.47          # { devs: [ {devid: 0, p-devname: sdd, p-dev: 1:0:0:3,
   12.48 -        #            v-dev: 0:0:0:2, state: Initialising} ] }
   12.49 +        #            v-dev: 0:0:0:2, state: 1} ] }
   12.50  
   12.51          dev_config = {}
   12.52  
    13.1 --- a/tools/python/xen/xend/XendDomainInfo.py	Tue Nov 04 12:07:22 2008 +0900
    13.2 +++ b/tools/python/xen/xend/XendDomainInfo.py	Tue Nov 04 12:43:19 2008 +0900
    13.3 @@ -52,6 +52,7 @@ from xen.xend.xenstore.xsutil import Get
    13.4  from xen.xend.xenstore.xswatch import xswatch
    13.5  from xen.xend.XendConstants import *
    13.6  from xen.xend.XendAPIConstants import *
    13.7 +from xen.xend.server.DevConstants import xenbusState
    13.8  
    13.9  from xen.xend.XendVMMetrics import XendVMMetrics
   13.10  
   13.11 @@ -797,7 +798,7 @@ class XendDomainInfo:
   13.12          existing_dev_info = self._getDeviceInfo_vscsi(req_devid, dev['v-dev'])
   13.13          state = dev['state']
   13.14  
   13.15 -        if state == 'Initialising':
   13.16 +        if state == xenbusState['Initialising']:
   13.17              # new create
   13.18              # If request devid does not exist, create and exit.
   13.19              if existing_dev_info is None:
   13.20 @@ -806,25 +807,48 @@ class XendDomainInfo:
   13.21              elif existing_dev_info == "exists":
   13.22                  raise XendError("The virtual device %s is already defined" % dev['v-dev'])
   13.23  
   13.24 -        elif state == 'Closing':
   13.25 +        elif state == xenbusState['Closing']:
   13.26              if existing_dev_info is None:
   13.27                  raise XendError("Cannot detach vscsi device does not exist")
   13.28  
   13.29 -        # use DevController.reconfigureDevice to change device config
   13.30 -        dev_control = self.getDeviceController(dev_class)
   13.31 -        dev_uuid = dev_control.reconfigureDevice(req_devid, dev_config)
   13.32 -        dev_control.waitForDevice_reconfigure(req_devid)
   13.33 -        num_devs = dev_control.cleanupDevice(req_devid)
   13.34 -
   13.35 -        # update XendConfig with new device info
   13.36 -        if dev_uuid:
   13.37 -            new_dev_sxp = dev_control.configuration(req_devid)
   13.38 +        if self.domid is not None:
   13.39 +            # use DevController.reconfigureDevice to change device config
   13.40 +            dev_control = self.getDeviceController(dev_class)
   13.41 +            dev_uuid = dev_control.reconfigureDevice(req_devid, dev_config)
   13.42 +            dev_control.waitForDevice_reconfigure(req_devid)
   13.43 +            num_devs = dev_control.cleanupDevice(req_devid)
   13.44 +
   13.45 +            # update XendConfig with new device info
   13.46 +            if dev_uuid:
   13.47 +                new_dev_sxp = dev_control.configuration(req_devid)
   13.48 +                self.info.device_update(dev_uuid, new_dev_sxp)
   13.49 +
   13.50 +            # If there is no device left, destroy vscsi and remove config.
   13.51 +            if num_devs == 0:
   13.52 +                self.destroyDevice('vscsi', req_devid)
   13.53 +                del self.info['devices'][dev_uuid]
   13.54 +
   13.55 +        else:
   13.56 +            cur_dev_sxp = self._getDeviceInfo_vscsi(req_devid, None)
   13.57 +            new_dev_sxp = ['vscsi']
   13.58 +            for cur_dev in sxp.children(cur_dev_sxp, 'dev'):
   13.59 +                if state == xenbusState['Closing']:
   13.60 +                    cur_dev_vdev = sxp.child_value(cur_dev, 'v-dev')
   13.61 +                    if cur_dev_vdev == dev['v-dev']:
   13.62 +                        continue
   13.63 +                new_dev_sxp.append(cur_dev)
   13.64 +
   13.65 +            if state == xenbusState['Initialising']:
   13.66 +                new_dev_sxp.append(sxp.child0(dev_sxp, 'dev'))
   13.67 +
   13.68 +            dev_uuid = sxp.child_value(cur_dev_sxp, 'uuid')
   13.69              self.info.device_update(dev_uuid, new_dev_sxp)
   13.70  
   13.71 -        # If there is no device left, destroy vscsi and remove config.
   13.72 -        if num_devs == 0:
   13.73 -            self.destroyDevice('vscsi', req_devid)
   13.74 -            del self.info['devices'][dev_uuid]
   13.75 +            # If there is only 'vscsi' in new_dev_sxp, remove the config.
   13.76 +            if len(sxp.children(new_dev_sxp, 'dev')) == 0:
   13.77 +                del self.info['devices'][dev_uuid]
   13.78 +
   13.79 +        xen.xend.XendDomain.instance().managed_config_save(self)
   13.80  
   13.81          return True
   13.82  
   13.83 @@ -986,7 +1010,17 @@ class XendDomainInfo:
   13.84              sxprs = []
   13.85              dev_num = 0
   13.86              for dev_type, dev_info in self.info.all_devices_sxpr():
   13.87 -                if dev_type == deviceClass:
   13.88 +                if dev_type != deviceClass:
   13.89 +                    continue
   13.90 +
   13.91 +                if deviceClass == 'vscsi':
   13.92 +                    vscsi_devs = ['devs', []]
   13.93 +                    for vscsi_dev in sxp.children(dev_info, 'dev'):
   13.94 +                        vscsi_dev.append(['frontstate', None])
   13.95 +                        vscsi_devs[1].append(vscsi_dev)
   13.96 +                        dev_num = int(sxp.child_value(vscsi_dev, 'devid'))
   13.97 +                    sxprs.append([dev_num, [vscsi_devs]])
   13.98 +                else:
   13.99                      sxprs.append([dev_num, dev_info])
  13.100                      dev_num += 1
  13.101              return sxprs
  13.102 @@ -2380,11 +2414,10 @@ class XendDomainInfo:
  13.103              time.sleep(2)
  13.104          for paths in plist:
  13.105              if paths.find('backend') != -1:
  13.106 -                from xen.xend.server import DevController
  13.107                  # Modify online status /before/ updating state (latter is watched by
  13.108                  # drivers, so this ordering avoids a race).
  13.109                  xstransact.Write(paths, 'online', "0")
  13.110 -                xstransact.Write(paths, 'state', str(DevController.xenbusState['Closing']))
  13.111 +                xstransact.Write(paths, 'state', str(xenbusState['Closing']))
  13.112              # force
  13.113              xstransact.Remove(paths)
  13.114  
  13.115 @@ -3439,7 +3472,7 @@ class XendDomainInfo:
  13.116                      ['p-devname', pscsi.get_dev_name()],
  13.117                      ['p-dev', pscsi.get_physical_HCTL()],
  13.118                      ['v-dev', xenapi_dscsi.get('virtual_HCTL')],
  13.119 -                    ['state', 'Initialising'],
  13.120 +                    ['state', xenbusState['Initialising']],
  13.121                      ['uuid', dscsi_uuid]
  13.122                  ]
  13.123              ]
  13.124 @@ -3558,7 +3591,7 @@ class XendDomainInfo:
  13.125          if target_dev is None:
  13.126              raise XendError('Failed to destroy device')
  13.127  
  13.128 -        target_dev.append(['state', 'Closing'])
  13.129 +        target_dev.append(['state', xenbusState['Closing']])
  13.130          target_vscsi_sxp = ['vscsi', target_dev]
  13.131  
  13.132          if self._stateGet() != XEN_API_VM_POWER_STATE_RUNNING:
    14.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.2 +++ b/tools/python/xen/xend/server/DevConstants.py	Tue Nov 04 12:43:19 2008 +0900
    14.3 @@ -0,0 +1,45 @@
    14.4 +#============================================================================
    14.5 +# This library is free software; you can redistribute it and/or
    14.6 +# modify it under the terms of version 2.1 of the GNU Lesser General Public
    14.7 +# License as published by the Free Software Foundation.
    14.8 +#
    14.9 +# This library is distributed in the hope that it will be useful,
   14.10 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
   14.11 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   14.12 +# Lesser General Public License for more details.
   14.13 +#
   14.14 +# You should have received a copy of the GNU Lesser General Public
   14.15 +# License along with this library; if not, write to the Free Software
   14.16 +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   14.17 +#============================================================================
   14.18 +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@hp.com>
   14.19 +# Copyright (C) 2005 XenSource Ltd
   14.20 +#============================================================================
   14.21 +
   14.22 +DEVICE_CREATE_TIMEOUT  = 100
   14.23 +DEVICE_DESTROY_TIMEOUT = 100
   14.24 +HOTPLUG_STATUS_NODE = "hotplug-status"
   14.25 +HOTPLUG_ERROR_NODE  = "hotplug-error"
   14.26 +HOTPLUG_STATUS_ERROR = "error"
   14.27 +HOTPLUG_STATUS_BUSY  = "busy"
   14.28 +
   14.29 +Connected    = 1
   14.30 +Error        = 2
   14.31 +Missing      = 3
   14.32 +Timeout      = 4
   14.33 +Busy         = 5
   14.34 +Disconnected = 6
   14.35 +
   14.36 +xenbusState = {
   14.37 +    'Unknown'       : 0,
   14.38 +    'Initialising'  : 1,
   14.39 +    'InitWait'      : 2,
   14.40 +    'Initialised'   : 3,
   14.41 +    'Connected'     : 4,
   14.42 +    'Closing'       : 5,
   14.43 +    'Closed'        : 6,
   14.44 +    'Reconfiguring' : 7,
   14.45 +    'Reconfigured'  : 8,
   14.46 +    }
   14.47 +xenbusState.update(dict(zip(xenbusState.values(), xenbusState.keys())))
   14.48 +
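
DevConstants.py lifts the device-controller constants out of DevController.py so that modules such as XendDomainInfo, create, and main can import them without pulling in the whole controller. The final update() line folds the reverse mapping into the same dictionary, making it two-way: xenbusState['Closing'] yields 5 and xenbusState[5] yields back 'Closing' (the latter is used below when vscsiif.py formats an error message). A C analog of the same two-way lookup, as a sketch:

    #include <stdio.h>
    #include <string.h>

    static const char *state_names[] = {
        "Unknown", "Initialising", "InitWait", "Initialised",
        "Connected", "Closing", "Closed", "Reconfiguring", "Reconfigured",
    };
    #define NR_STATES (sizeof(state_names) / sizeof(state_names[0]))

    static const char *state_to_name(unsigned s)    /* number -> name */
    {
        return s < NR_STATES ? state_names[s] : "Unknown";
    }

    static int name_to_state(const char *name)      /* name -> number */
    {
        for (unsigned i = 0; i < NR_STATES; i++)
            if (strcmp(state_names[i], name) == 0)
                return i;
        return -1;
    }

    int main(void)
    {
        printf("%d %s\n", name_to_state("Closing"), state_to_name(5));
        return 0;
    }
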
    15.1 --- a/tools/python/xen/xend/server/DevController.py	Tue Nov 04 12:07:22 2008 +0900
    15.2 +++ b/tools/python/xen/xend/server/DevController.py	Tue Nov 04 12:43:19 2008 +0900
    15.3 @@ -23,42 +23,15 @@ from xen.xend import sxp, XendOptions
    15.4  from xen.xend.XendError import VmError
    15.5  from xen.xend.XendLogging import log
    15.6  import xen.xend.XendConfig
    15.7 +from xen.xend.server.DevConstants import *
    15.8  
    15.9  from xen.xend.xenstore.xstransact import xstransact, complete
   15.10  from xen.xend.xenstore.xswatch import xswatch
   15.11  
   15.12  import os
   15.13  
   15.14 -DEVICE_CREATE_TIMEOUT  = 100
   15.15 -DEVICE_DESTROY_TIMEOUT = 100
   15.16 -HOTPLUG_STATUS_NODE = "hotplug-status"
   15.17 -HOTPLUG_ERROR_NODE  = "hotplug-error"
   15.18 -HOTPLUG_STATUS_ERROR = "error"
   15.19 -HOTPLUG_STATUS_BUSY  = "busy"
   15.20 -
   15.21 -Connected    = 1
   15.22 -Error        = 2
   15.23 -Missing      = 3
   15.24 -Timeout      = 4
   15.25 -Busy         = 5
   15.26 -Disconnected = 6
   15.27 -
   15.28 -xenbusState = {
   15.29 -    'Unknown'      : 0,
   15.30 -    'Initialising' : 1,
   15.31 -    'InitWait'     : 2,
   15.32 -    'Initialised'  : 3,
   15.33 -    'Connected'    : 4,
   15.34 -    'Closing'      : 5,
   15.35 -    'Closed'       : 6,
   15.36 -    'Reconfiguring': 7,
   15.37 -    'Reconfigured' : 8,
   15.38 -    }
   15.39 -
   15.40  xoptions = XendOptions.instance()
   15.41  
   15.42 -xenbusState.update(dict(zip(xenbusState.values(), xenbusState.keys())))
   15.43 -
   15.44  
   15.45  class DevController:
   15.46      """Abstract base class for a device controller.  Device controllers create
   15.47 @@ -569,7 +542,7 @@ class DevController:
   15.48              xswatch(statusPath, hotplugStatusCallback, ev, result)
   15.49              ev.wait(DEVICE_CREATE_TIMEOUT)
   15.50              err = xstransact.Read(statusPath, HOTPLUG_ERROR_NODE)
   15.51 -            if result['status'] != 'Connected':
   15.52 +            if result['status'] != Connected:
   15.53                  return (result['status'], err)
   15.54              
   15.55          backpath = self.readVm(devid, "backend")
    16.1 --- a/tools/python/xen/xend/server/iopif.py	Tue Nov 04 12:07:22 2008 +0900
    16.2 +++ b/tools/python/xen/xend/server/iopif.py	Tue Nov 04 12:43:19 2008 +0900
    16.3 @@ -45,9 +45,22 @@ def parse_ioport(val):
    16.4  
    16.5  class IOPortsController(DevController):
    16.6  
    16.7 +    valid_cfg = ['to', 'from', 'uuid']
    16.8 +
    16.9      def __init__(self, vm):
   16.10          DevController.__init__(self, vm)
   16.11  
   16.12 +    def getDeviceConfiguration(self, devid, transaction = None):
   16.13 +        result = DevController.getDeviceConfiguration(self, devid, transaction)
   16.14 +        if transaction is None:
   16.15 +            devinfo = self.readBackend(devid, *self.valid_cfg)
   16.16 +        else:
   16.17 +            devinfo = self.readBackendTxn(transaction, devid, *self.valid_cfg)
   16.18 +        config = dict(zip(self.valid_cfg, devinfo))
   16.19 +        config = dict([(key, val) for key, val in config.items()
   16.20 +                       if val != None])
   16.21 +        return config
   16.22 +
   16.23      def getDeviceDetails(self, config):
   16.24          """@see DevController.getDeviceDetails"""
   16.25  
   16.26 @@ -81,4 +94,9 @@ class IOPortsController(DevController):
   16.27                  'ioports: Failed to configure legacy i/o range: %s - %s' %
   16.28                  (io_from, io_to))
   16.29  
   16.30 -        return (None, {}, {})
   16.31 +        back = dict([(k, config[k]) for k in self.valid_cfg if k in config])
   16.32 +        return (self.allocateDeviceID(), back, {})
   16.33 +
   16.34 +    def waitForDevice(self, devid):
   16.35 +        # don't wait for hotplug
   16.36 +        return
    17.1 --- a/tools/python/xen/xend/server/irqif.py	Tue Nov 04 12:07:22 2008 +0900
    17.2 +++ b/tools/python/xen/xend/server/irqif.py	Tue Nov 04 12:43:19 2008 +0900
    17.3 @@ -39,6 +39,18 @@ class IRQController(DevController):
    17.4      def __init__(self, vm):
    17.5          DevController.__init__(self, vm)
    17.6  
    17.7 +    valid_cfg = ['irq', 'uuid']
    17.8 +
    17.9 +    def getDeviceConfiguration(self, devid, transaction = None):
   17.10 +        result = DevController.getDeviceConfiguration(self, devid, transaction)
   17.11 +        if transaction is None:
   17.12 +            devinfo = self.readBackend(devid, *self.valid_cfg)
   17.13 +        else:
   17.14 +            devinfo = self.readBackendTxn(transaction, devid, *self.valid_cfg)
   17.15 +        config = dict(zip(self.valid_cfg, devinfo))
   17.16 +        config = dict([(key, val) for key, val in config.items()
   17.17 +                       if val != None])
   17.18 +        return config
   17.19  
   17.20      def getDeviceDetails(self, config):
   17.21          """@see DevController.getDeviceDetails"""
   17.22 @@ -75,4 +87,9 @@ class IRQController(DevController):
   17.23          if rc < 0:
   17.24              raise VmError(
   17.25                  'irq: Failed to map irq %x' % (pirq))
   17.26 -        return (None, {}, {})
   17.27 +        back = dict([(k, config[k]) for k in self.valid_cfg if k in config])
   17.28 +        return (self.allocateDeviceID(), back, {})
   17.29 +
   17.30 +    def waitForDevice(self, devid):
   17.31 +        # don't wait for hotplug
   17.32 +        return
    18.1 --- a/tools/python/xen/xend/server/pciif.py	Tue Nov 04 12:07:22 2008 +0900
    18.2 +++ b/tools/python/xen/xend/server/pciif.py	Tue Nov 04 12:43:19 2008 +0900
    18.3 @@ -25,7 +25,8 @@ from xen.xend import arch
    18.4  from xen.xend.XendError import VmError
    18.5  from xen.xend.XendLogging import log
    18.6  
    18.7 -from xen.xend.server.DevController import DevController, xenbusState
    18.8 +from xen.xend.server.DevController import DevController
    18.9 +from xen.xend.server.DevConstants import xenbusState
   18.10  
   18.11  import xen.lowlevel.xc
   18.12  
    19.1 --- a/tools/python/xen/xend/server/vscsiif.py	Tue Nov 04 12:07:22 2008 +0900
    19.2 +++ b/tools/python/xen/xend/server/vscsiif.py	Tue Nov 04 12:43:19 2008 +0900
    19.3 @@ -28,7 +28,8 @@ from xen.xend import sxp
    19.4  from xen.xend.XendError import VmError
    19.5  from xen.xend.XendLogging import log
    19.6  
    19.7 -from xen.xend.server.DevController import DevController, xenbusState
    19.8 +from xen.xend.server.DevController import DevController
    19.9 +from xen.xend.server.DevConstants import xenbusState
   19.10  from xen.xend.xenstore.xstransact import xstransact
   19.11  
   19.12  class VSCSIController(DevController):
   19.13 @@ -92,8 +93,8 @@ class VSCSIController(DevController):
   19.14              back[devpath + '/p-devname'] = pdevname
   19.15              vdev = vscsi_config.get('v-dev', '')
   19.16              back[devpath + '/v-dev'] = vdev
   19.17 -            state = vscsi_config.get('state', '')
   19.18 -            back[devpath + '/state'] = str(xenbusState[state])
   19.19 +            state = vscsi_config.get('state', xenbusState['Unknown'])
   19.20 +            back[devpath + '/state'] = str(state)
   19.21              devid = vscsi_config.get('devid', '')
   19.22              back[devpath + '/devid'] = str(devid)
   19.23  
   19.24 @@ -168,17 +169,17 @@ class VSCSIController(DevController):
   19.25          (devid, back, front) = self.getDeviceDetails(config)
   19.26          devid = int(devid)
   19.27          vscsi_config = config['devs'][0]
   19.28 -        state = vscsi_config.get('state', '')
   19.29 +        state = vscsi_config.get('state', xenbusState['Unknown'])
   19.30          driver_state = self.readBackend(devid, 'state')
   19.31          if str(xenbusState['Connected']) != driver_state:
   19.32              raise VmError("Driver status is not connected")
   19.33  
   19.34          uuid = self.readBackend(devid, 'uuid')
   19.35 -        if state == 'Initialising':
   19.36 +        if state == xenbusState['Initialising']:
   19.37              back['uuid'] = uuid
   19.38              self.writeBackend(devid, back)
   19.39  
   19.40 -        elif state == 'Closing':
   19.41 +        elif state == xenbusState['Closing']:
   19.42              found = False
   19.43              devs = self.readBackendList(devid, "vscsi-devs")
   19.44              vscsipath = "vscsi-devs/"
   19.45 @@ -198,7 +199,7 @@ class VSCSIController(DevController):
   19.46  
   19.47          else:
   19.48              raise XendError("Error configuring device invalid "
   19.49 -                            "state '%s'" % state)
   19.50 +                            "state '%s'" % xenbusState[state])
   19.51  
   19.52          self.writeBackend(devid, 'state', str(xenbusState['Reconfiguring']))
   19.53          return self.readBackend(devid, 'uuid')
    20.1 --- a/tools/python/xen/xm/create.py	Tue Nov 04 12:07:22 2008 +0900
    20.2 +++ b/tools/python/xen/xm/create.py	Tue Nov 04 12:43:19 2008 +0900
    20.3 @@ -32,6 +32,7 @@ from xen.xend import PrettyPrint as SXPP
    20.4  from xen.xend import osdep
    20.5  import xen.xend.XendClient
    20.6  from xen.xend.XendBootloader import bootloader
    20.7 +from xen.xend.server.DevConstants import xenbusState
    20.8  from xen.util import blkif
    20.9  from xen.util import vscsi_util
   20.10  import xen.util.xsm.xsm as security
   20.11 @@ -707,7 +708,7 @@ def configure_vscsis(config_devs, vals):
   20.12              vscsi_util.vscsi_get_hctl_and_devname_by(p_dev, scsi_devices)
   20.13  
   20.14          if p_hctl == None:
   20.15 -            raise ValueError("Cannot find device \"%s\"" % p_dev)
   20.16 +            raise ValueError('Cannot find device "%s"' % p_dev)
   20.17  
   20.18          for config in config_scsi:
   20.19              dev = vscsi_convert_sxp_to_dict(config)
   20.20 @@ -717,7 +718,7 @@ def configure_vscsis(config_devs, vals):
   20.21          v_hctl = v_dev.split(':')
   20.22          devid = int(v_hctl[0])
   20.23          config_scsi.append(['dev', \
   20.24 -                        ['state', 'Initialising'], \
   20.25 +                        ['state', xenbusState['Initialising']], \
   20.26                          ['devid', devid], \
   20.27                          ['p-dev', p_hctl], \
   20.28                          ['p-devname', devname], \
   20.29 @@ -1035,6 +1036,14 @@ def preprocess_ioports(vals):
   20.30          ioports.append(hexd)
   20.31      vals.ioports = ioports
   20.32          
   20.33 +def preprocess_irq(vals):
   20.34 +    if not vals.irq: return
   20.35 +    irq = []
   20.36 +    for v in vals.irq:
   20.37 +        d = repr(v)
   20.38 +        irq.append(d)
   20.39 +    vals.irq = irq
   20.40 +
   20.41  def preprocess_vtpm(vals):
   20.42      if not vals.vtpm: return
   20.43      vtpms = []
   20.44 @@ -1133,6 +1142,7 @@ def preprocess(vals):
   20.45      preprocess_vscsi(vals)
   20.46      preprocess_ioports(vals)
   20.47      preprocess_ip(vals)
   20.48 +    preprocess_irq(vals)
   20.49      preprocess_nfs(vals)
   20.50      preprocess_vtpm(vals)
   20.51      preprocess_access_control(vals)
    21.1 --- a/tools/python/xen/xm/main.py	Tue Nov 04 12:07:22 2008 +0900
    21.2 +++ b/tools/python/xen/xm/main.py	Tue Nov 04 12:43:19 2008 +0900
    21.3 @@ -47,6 +47,7 @@ from xen.xend import PrettyPrint
    21.4  from xen.xend import sxp
    21.5  from xen.xend import XendClient
    21.6  from xen.xend.XendConstants import *
    21.7 +from xen.xend.server.DevConstants import xenbusState
    21.8  
    21.9  from xen.xm.opts import OptionError, Opts, wrap, set_true
   21.10  from xen.xm import console
   21.11 @@ -2515,7 +2516,7 @@ def xm_scsi_attach(args):
   21.12      dom = args[0]
   21.13      p_scsi = args[1]
   21.14      v_hctl = args[2]
   21.15 -    scsi = parse_scsi_configuration(p_scsi, v_hctl, 'Initialising')
   21.16 +    scsi = parse_scsi_configuration(p_scsi, v_hctl, xenbusState['Initialising'])
   21.17  
   21.18      if serverType == SERVER_XEN_API:
   21.19  
   21.20 @@ -2635,7 +2636,7 @@ def xm_scsi_detach(args):
   21.21      arg_check(args, 'scsi-detach', 2)
   21.22      dom = args[0]
   21.23      v_hctl = args[1]
   21.24 -    scsi = parse_scsi_configuration(None, v_hctl, 'Closing')
   21.25 +    scsi = parse_scsi_configuration(None, v_hctl, xenbusState['Closing'])
   21.26  
   21.27      if serverType == SERVER_XEN_API:
   21.28  
    22.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    22.2 +++ b/tools/xenpmd/Makefile	Tue Nov 04 12:43:19 2008 +0900
    22.3 @@ -0,0 +1,20 @@
    22.4 +XEN_ROOT=../..
    22.5 +include $(XEN_ROOT)/tools/Rules.mk
    22.6 +
    22.7 +CFLAGS  += -Werror
    22.8 +CFLAGS  += $(CFLAGS_libxenstore)
    22.9 +LDFLAGS += $(LDFLAGS_libxenstore)
   22.10 +
   22.11 +BIN      = xenpmd
   22.12 +
   22.13 +.PHONY: all
   22.14 +all: $(BIN)
   22.15 +
   22.16 +.PHONY: install
   22.17 +install: all
   22.18 +	$(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
   22.19 +	$(INSTALL_PROG) $(BIN) $(DESTDIR)$(SBINDIR)
   22.20 +
   22.21 +.PHONY: clean
   22.22 +clean:
   22.23 +	$(RM) -f $(BIN)
    23.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    23.2 +++ b/tools/xenpmd/xenpmd.c	Tue Nov 04 12:43:19 2008 +0900
    23.3 @@ -0,0 +1,520 @@
    23.4 +/*
    23.5 + * xenpmd.c
    23.6 + *
    23.7 + * xen power management daemon - Facilitates power management 
    23.8 + * functionality within xen guests.
    23.9 + *
   23.10 + * Copyright (c) 2008  Kamala Narasimhan 
   23.11 + * Copyright (c) 2008  Citrix Systems, Inc.
   23.12 + *
   23.13 + * This program is free software; you can redistribute it and/or modify
   23.14 + * it under the terms of the GNU General Public License as published by
   23.15 + * the Free Software Foundation; either version 2 of the License, or
   23.16 + * (at your option) any later version.
   23.17 + *
   23.18 + * This program is distributed in the hope that it will be useful,
   23.19 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   23.20 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   23.21 + * GNU General Public License for more details.
   23.22 + *
   23.23 + * You should have received a copy of the GNU General Public License
   23.24 + * along with this program; if not, write to the Free Software
   23.25 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   23.26 + */
   23.27 +
   23.28 +/* Xen extended power management support provides HVM guest power management
   23.29 + * features beyond S3, S4, S5.  For example, it helps expose system level 
   23.30 + * battery status and battery meter information and in future will be extended
   23.31 + * to include more power management support.  This extended power management 
   23.32 + * support is enabled by setting xen_extended_power_mgmt to 1 or 2 in the HVM
    23.33 + * config file.  When set to 2, non-passthrough mode is enabled, which heavily
    23.34 + * relies on this power management daemon to glean battery information from
    23.35 + * dom0 and store it in xenstore, where it is then queried by qemu and
    23.36 + * passed to the guest when the appropriate battery ports are read or written.
   23.37 + */
   23.38 +
   23.39 +#include <stdio.h>
   23.40 +#include <stdarg.h>
   23.41 +#include <string.h>
   23.42 +#include <stdlib.h>
   23.43 +#include <dirent.h>
   23.44 +#include <unistd.h>
   23.45 +#include <sys/stat.h>
   23.46 +#include <xs.h>
   23.47 +
   23.48 +/* #define RUN_STANDALONE */
   23.49 +#define RUN_IN_SIMULATE_MODE
   23.50 +
   23.51 +enum BATTERY_INFO_TYPE {
   23.52 +    BIF, 
   23.53 +    BST 
   23.54 +};
   23.55 +
   23.56 +enum BATTERY_PRESENT {
   23.57 +    NO, 
   23.58 +    YES 
   23.59 +};
   23.60 +
   23.61 +enum BATTERY_TECHNOLOGY {
   23.62 +    NON_RECHARGEABLE, 
   23.63 +    RECHARGEABLE 
   23.64 +};
   23.65 +
   23.66 +struct battery_info {
   23.67 +    enum BATTERY_PRESENT    present;
   23.68 +    unsigned long           design_capacity;
   23.69 +    unsigned long           last_full_capacity;
   23.70 +    enum BATTERY_TECHNOLOGY battery_technology;
   23.71 +    unsigned long           design_voltage;
   23.72 +    unsigned long           design_capacity_warning;
   23.73 +    unsigned long           design_capacity_low;
   23.74 +    unsigned long           capacity_granularity_1;
   23.75 +    unsigned long           capacity_granularity_2;
   23.76 +    char                    model_number[32];
   23.77 +    char                    serial_number[32];
   23.78 +    char                    battery_type[32];
   23.79 +    char                    oem_info[32];
   23.80 +};
   23.81 +
   23.82 +struct battery_status {
   23.83 +    enum BATTERY_PRESENT    present;
   23.84 +    unsigned long           state;
   23.85 +    unsigned long           present_rate;
   23.86 +    unsigned long           remaining_capacity;
   23.87 +    unsigned long           present_voltage;
   23.88 +};
   23.89 +
   23.90 +static struct xs_handle *xs;
   23.91 +
   23.92 +#ifdef RUN_IN_SIMULATE_MODE
   23.93 +    #define BATTERY_DIR_PATH "/tmp/battery"
   23.94 +    #define BATTERY_INFO_FILE_PATH "/tmp/battery/%s/info" 
   23.95 +    #define BATTERY_STATE_FILE_PATH "/tmp/battery/%s/state"
   23.96 +#else
   23.97 +    #define BATTERY_DIR_PATH "/proc/acpi/battery"
   23.98 +    #define BATTERY_INFO_FILE_PATH "/proc/acpi/battery/%s/info"
   23.99 +    #define BATTERY_STATE_FILE_PATH "/proc/acpi/battery/%s/state"
  23.100 +#endif
  23.101 +
  23.102 +FILE *get_next_battery_file(DIR *battery_dir, 
  23.103 +                            enum BATTERY_INFO_TYPE battery_info_type)
  23.104 +{
  23.105 +    FILE *file = 0;
  23.106 +    struct dirent *dir_entries;
  23.107 +    char file_name[32];
  23.108 +    
  23.109 +    do 
  23.110 +    {
  23.111 +        dir_entries = readdir(battery_dir);
  23.112 +        if ( !dir_entries ) 
  23.113 +            return 0;
  23.114 +        if ( strlen(dir_entries->d_name) < 4 )
  23.115 +            continue;
  23.116 +        if ( battery_info_type == BIF ) 
  23.117 +            snprintf(file_name, 32, BATTERY_INFO_FILE_PATH,
  23.118 +                     dir_entries->d_name);
  23.119 +        else 
  23.120 +            snprintf(file_name, 32, BATTERY_STATE_FILE_PATH,
  23.121 +                     dir_entries->d_name);
  23.122 +        file = fopen(file_name, "r");
  23.123 +    } while ( !file );
  23.124 +
  23.125 +    return file;
  23.126 +}
  23.127 +
  23.128 +void set_attribute_battery_info(char *attrib_name,
  23.129 +                                char *attrib_value,
  23.130 +                                struct battery_info *info)
  23.131 +{
  23.132 +    if ( strstr(attrib_name, "present") ) 
  23.133 +    {
  23.134 +        if ( strstr(attrib_value, "yes") ) 
  23.135 +            info->present = YES;
  23.136 +        return;
  23.137 +    }
  23.138 +
  23.139 +    if ( strstr(attrib_name, "design capacity warning") ) 
  23.140 +    {
  23.141 +        info->design_capacity_warning = strtoull(attrib_value, NULL, 10);
  23.142 +        return;
  23.143 +    }
  23.144 +
  23.145 +    if ( strstr(attrib_name, "design capacity low") ) 
  23.146 +    {
  23.147 +        info->design_capacity_low = strtoull(attrib_value, NULL, 10);
  23.148 +        return;
  23.149 +    }
  23.150 +
  23.151 +    if ( strstr(attrib_name, "design capacity") ) 
  23.152 +    { 
  23.153 +        info->design_capacity = strtoull(attrib_value, NULL, 10);
  23.154 +        return;
  23.155 +    }
  23.156 +
  23.157 +    if ( strstr(attrib_name, "last full capacity") ) 
  23.158 +    {
  23.159 +        info->last_full_capacity = strtoull(attrib_value, NULL, 10);
  23.160 +        return;
  23.161 +    }
  23.162 +
  23.163 +    if ( strstr(attrib_name, "design voltage") ) 
  23.164 +    {
  23.165 +        info->design_voltage = strtoull(attrib_value, NULL, 10);
  23.166 +        return;
  23.167 +    }
  23.168 +
  23.169 +    if ( strstr(attrib_name, "capacity granularity 1") ) 
  23.170 +    {
  23.171 +        info->capacity_granularity_1 = strtoull(attrib_value, NULL, 10);
  23.172 +        return;
  23.173 +    }
  23.174 +
  23.175 +    if ( strstr(attrib_name, "capacity granularity 2") ) 
  23.176 +    {
  23.177 +        info->capacity_granularity_2 = strtoull(attrib_value, NULL, 10);
  23.178 +        return;
  23.179 +    }
  23.180 +
  23.181 +    if ( strstr(attrib_name, "battery technology") ) 
  23.182 +    {
  23.183 +        if ( strncmp(attrib_value, "rechargeable",
  23.184 +                     strlen("rechargeable")) == 0 ) 
  23.185 +            info->battery_technology = RECHARGEABLE;
  23.186 +        else 
  23.187 +            info->battery_technology = NON_RECHARGEABLE;
  23.188 +        return;
  23.189 +    }
  23.190 +
  23.191 +    if ( strstr(attrib_name, "model number") ) 
  23.192 +    {
  23.193 +        strncpy(info->model_number, attrib_value, 32);
  23.194 +        return;
  23.195 +    }
  23.196 +
  23.197 +    if ( strstr(attrib_name, "serial number") ) 
  23.198 +    {
  23.199 +        strncpy(info->serial_number, attrib_value, 32);
  23.200 +        return;
  23.201 +    }
  23.202 +
  23.203 +    if ( strstr(attrib_name, "battery type") ) 
  23.204 +    {
  23.205 +        strncpy(info->battery_type, attrib_value, 32);
  23.206 +        return;
  23.207 +    }
  23.208 +
  23.209 +    if ( strstr(attrib_name, "OEM info") ) 
  23.210 +    {
  23.211 +        strncpy(info->oem_info, attrib_value, 32);
  23.212 +        return;
  23.213 +    }
  23.214 +
  23.215 +    return;
  23.216 +}
  23.217 +
  23.218 +void set_attribute_battery_status(char *attrib_name, 
  23.219 +                                  char *attrib_value,
  23.220 +                                  struct battery_status *status)
  23.221 +{
  23.222 +    if ( strstr(attrib_name, "charging state") ) 
  23.223 +    {
  23.224 +        /* Check this, below is half baked */
  23.225 +        if ( strstr(attrib_value, "charged") ) 
  23.226 +            status->state = 0;
  23.227 +        else 
  23.228 +            status->state = 1;
  23.229 +        return;
  23.230 +    }
  23.231 +
  23.232 +    if ( strstr(attrib_name, "present rate") ) 
  23.233 +    {
  23.234 +        status->present_rate = strtoull(attrib_value, NULL, 10);
  23.235 +        return;
  23.236 +    }
  23.237 +
  23.238 +    if ( strstr(attrib_name, "remaining capacity") ) 
  23.239 +    {
  23.240 +        status->remaining_capacity = strtoull(attrib_value, NULL, 10);
  23.241 +        return;
  23.242 +    }
  23.243 +
  23.244 +    if ( strstr(attrib_name, "present voltage") ) 
  23.245 +    {
  23.246 +        status->present_voltage = strtoull(attrib_value, NULL, 10);
  23.247 +        return;
  23.248 +    }
  23.249 +
  23.250 +    if ( strstr(attrib_name, "present") ) 
  23.251 +    {
  23.252 +        if ( strstr(attrib_value, "yes") ) 
  23.253 +            status->present = YES;
  23.254 +        return;
  23.255 +    }
  23.256 +}
  23.257 +
  23.258 +void parse_battery_info_or_status(char *line_info,
  23.259 +                                  enum BATTERY_INFO_TYPE type,
  23.260 +                                  void *info_or_status)
  23.261 +{
  23.262 +    char attrib_name[128];
  23.263 +    char attrib_value[64];
  23.264 +    char *delimiter;
  23.265 +    unsigned long length;
  23.266 +
  23.267 +    length = strlen(line_info);
  23.268 +    delimiter = (char *) strchr( line_info, ':');
  23.269 +    if ( (!delimiter) || (delimiter == line_info) ||
  23.270 +         (delimiter == line_info + length) ) 
  23.271 +        return;
  23.272 +
  23.273 +    strncpy(attrib_name, line_info, delimiter-line_info);
  23.274 +    while ( *(delimiter+1) == ' ' ) 
  23.275 +    {
  23.276 +        delimiter++;
  23.277 +        if ( delimiter+1 == line_info + length)
  23.278 +            return;
  23.279 +    }
  23.280 +    strncpy(attrib_value, delimiter+1, 
  23.281 +            (unsigned long)line_info + length -(unsigned long)delimiter); 
  23.282 +    
  23.283 +    if ( type == BIF ) 
  23.284 +        set_attribute_battery_info(attrib_name, attrib_value,
  23.285 +                                   (struct battery_info *)info_or_status);
  23.286 +    else 
  23.287 +        set_attribute_battery_status(attrib_name, attrib_value,
  23.288 +                                     (struct battery_status *)info_or_status);
  23.289 +
  23.290 +    return;
  23.291 +}
  23.292 +
  23.293 +int get_next_battery_info_or_status(DIR *battery_dir,
  23.294 +                                    enum BATTERY_INFO_TYPE type,
  23.295 +                                    void *info_or_status)
  23.296 +{
  23.297 +    FILE *file;
  23.298 +    char line_info[256];
  23.299 +
  23.300 +    if  ( !info_or_status )
  23.301 +        return 0;
  23.302 +
  23.303 +    memset(line_info, 0, 256);
  23.304 +    if (type == BIF) 
  23.305 +        memset(info_or_status, 0, sizeof(struct battery_info));
  23.306 +    else 
  23.307 +        memset(info_or_status, 0, sizeof(struct battery_status));
  23.308 +
  23.309 +    file = get_next_battery_file(battery_dir, type);
  23.310 +    if ( !file )
  23.311 +        return 0;
  23.312 +
   23.313 +    while ( fgets(line_info, sizeof(line_info), file) != NULL )
  23.314 +    {
  23.315 +        parse_battery_info_or_status(line_info, type, info_or_status);
  23.316 +        memset(line_info, 0, 256);
  23.317 +    }
  23.318 +
  23.319 +    fclose(file);
  23.320 +    return 1;
  23.321 +}
  23.322 +
  23.323 +#ifdef RUN_STANDALONE
  23.324 +void print_battery_info(struct battery_info *info)
  23.325 +{
  23.326 +    printf("present:                %d\n", info->present);
  23.327 +    printf("design capacity:        %d\n", info->design_capacity);
  23.328 +    printf("last full capacity:     %d\n", info->last_full_capacity);
  23.329 +    printf("battery technology:     %d\n", info->battery_technology);
  23.330 +    printf("design voltage:         %d\n", info->design_voltage);
  23.331 +    printf("design capacity warning:%d\n", info->design_capacity_warning);
  23.332 +    printf("design capacity low:    %d\n", info->design_capacity_low);
  23.333 +    printf("capacity granularity 1: %d\n", info->capacity_granularity_1);
  23.334 +    printf("capacity granularity 2: %d\n", info->capacity_granularity_2);
  23.335 +    printf("model number:           %s\n", info->model_number);
  23.336 +    printf("serial number:          %s\n", info->serial_number);
  23.337 +    printf("battery type:           %s\n", info->battery_type);
  23.338 +    printf("OEM info:               %s\n", info->oem_info);
  23.339 +}
  23.340 +#endif /*RUN_STANDALONE*/
  23.341 +
  23.342 +void write_ulong_lsb_first(char *temp_val, unsigned long val)
  23.343 +{
  23.344 +    snprintf(temp_val, 9, "%02x%02x%02x%02x", (unsigned int)val & 0xff, 
  23.345 +    (unsigned int)(val & 0xff00) >> 8, (unsigned int)(val & 0xff0000) >> 16, 
  23.346 +    (unsigned int)(val & 0xff000000) >> 24);
  23.347 +}
  23.348 +
  23.349 +void write_battery_info_to_xenstore(struct battery_info *info)
  23.350 +{
  23.351 +    char val[1024], string_info[256];
  23.352 +
  23.353 +    xs_mkdir(xs, XBT_NULL, "/pm");
  23.354 +   
  23.355 +    memset(val, 0, 1024);
  23.356 +    memset(string_info, 0, 256);
  23.357 +    /* write 9 dwords (so 9*4) + length of 4 strings + 4 null terminators */
  23.358 +    snprintf(val, 3, "%02x", 
  23.359 +             (unsigned int)(9*4 +
  23.360 +                            strlen(info->model_number) +
  23.361 +                            strlen(info->serial_number) +
  23.362 +                            strlen(info->battery_type) +
  23.363 +                            strlen(info->oem_info) + 4));
  23.364 +    write_ulong_lsb_first(val+2, info->present);
  23.365 +    write_ulong_lsb_first(val+10, info->design_capacity);
  23.366 +    write_ulong_lsb_first(val+18, info->last_full_capacity);
  23.367 +    write_ulong_lsb_first(val+26, info->battery_technology);
  23.368 +    write_ulong_lsb_first(val+34, info->design_voltage);
  23.369 +    write_ulong_lsb_first(val+42, info->design_capacity_warning);
  23.370 +    write_ulong_lsb_first(val+50, info->design_capacity_low);
  23.371 +    write_ulong_lsb_first(val+58, info->capacity_granularity_1);
  23.372 +    write_ulong_lsb_first(val+66, info->capacity_granularity_2);
  23.373 +
  23.374 +    snprintf(string_info, 256, "%02x%s%02x%s%02x%s%02x%s", 
  23.375 +             (unsigned int)strlen(info->model_number), info->model_number,
  23.376 +             (unsigned int)strlen(info->serial_number), info->serial_number,
  23.377 +             (unsigned int)strlen(info->battery_type), info->battery_type,
  23.378 +             (unsigned int)strlen(info->oem_info), info->oem_info);
  23.379 +    strncat(val+73, string_info, 1024);
  23.380 +    xs_write(xs, XBT_NULL, "/pm/bif", 
  23.381 +             val, 73+8+strlen(info->model_number)+strlen(info->serial_number)+
  23.382 +             strlen(info->battery_type)+strlen(info->oem_info)+1);
  23.383 +}
  23.384 +
  23.385 +int write_one_time_battery_info(void)
  23.386 +{
  23.387 +    DIR *dir;
  23.388 +    int ret = 0;
  23.389 +    struct battery_info info;
  23.390 +    
  23.391 +    dir = opendir(BATTERY_DIR_PATH);
  23.392 +    if ( !dir )
  23.393 +        return 0;
  23.394 +
  23.395 +    while ( get_next_battery_info_or_status(dir, BIF, (void *)&info) ) 
  23.396 +    {
  23.397 +#ifdef RUN_STANDALONE
  23.398 +        print_battery_info(&info);
  23.399 +#endif
  23.400 +        if ( info.present == YES ) 
  23.401 +        {
  23.402 +            write_battery_info_to_xenstore(&info);
  23.403 +            ret = 1;
  23.404 +            break; /* rethink this... */
  23.405 +        }
  23.406 +    }
  23.407 +
  23.408 +    closedir(dir);
  23.409 +    return ret;
  23.410 +}
  23.411 +
  23.412 +#ifdef RUN_STANDALONE
  23.413 +void print_battery_status(struct battery_status *status)
  23.414 +{
  23.415 +    printf("present:                     %d\n", status->present);
  23.416 +    printf("Battery state                %d\n", status->state);
  23.417 +    printf("Battery present rate         %d\n", status->present_rate);
  23.418 +    printf("Battery remining capacity    %d\n", status->remaining_capacity);
  23.419 +    printf("Battery present voltage      %d\n", status->present_voltage);
  23.420 +}
  23.421 +#endif /*RUN_STANDALONE*/
  23.422 +
  23.423 +void write_battery_status_to_xenstore(struct battery_status *status)
  23.424 +{
  23.425 +    char val[35];
  23.426 +
  23.427 +    xs_mkdir(xs, XBT_NULL, "/pm");
  23.428 +
  23.429 +    memset(val, 0, 35);
  23.430 +    snprintf(val, 3, "%02x", 16);
  23.431 +    write_ulong_lsb_first(val+2, status->state);
  23.432 +    write_ulong_lsb_first(val+10, status->present_rate);
  23.433 +    write_ulong_lsb_first(val+18, status->remaining_capacity);
  23.434 +    write_ulong_lsb_first(val+26, status->present_voltage);
  23.435 +
  23.436 +    xs_write(xs, XBT_NULL, "/pm/bst", val, 35);
  23.437 +}
  23.438 +
  23.439 +int wait_for_and_update_battery_status_request(void)
  23.440 +{
  23.441 +    DIR *dir;
  23.442 +    int ret = 0;
  23.443 +    unsigned int count;
  23.444 +    struct battery_status status;
  23.445 +
  23.446 +    while ( true )
  23.447 +    {
  23.448 +        /* KN:@TODO - It is rather inefficient to not cache the file handle.
  23.449 +         *  Switch to caching file handle. 
  23.450 +         */
  23.451 +        dir = opendir(BATTERY_DIR_PATH);
  23.452 +        if ( !dir )
  23.453 +            return 0;
  23.454 +
  23.455 +        while ( get_next_battery_info_or_status(dir, BST, (void *)&status) ) 
  23.456 +        {
  23.457 +#ifdef RUN_STANDALONE
  23.458 +            print_battery_status(&status);
  23.459 +#endif
  23.460 +            if ( status.present == YES ) 
  23.461 +            {
  23.462 +                write_battery_status_to_xenstore(&status);
  23.463 +                ret = 1;
  23.464 +                /* rethink this; though I have never seen, there might be
  23.465 +                 * systems out there with more than one battery device 
  23.466 +                 * present
  23.467 +                 */
  23.468 +                break;
  23.469 +            }
  23.470 +        }
  23.471 +        closedir(dir);
  23.472 +        xs_watch(xs, "/pm/events", "refreshbatterystatus");
  23.473 +        xs_read_watch(xs, &count); 
  23.474 +    }
  23.475 +
  23.476 +    return ret;
  23.477 +}
  23.478 +
  23.479 +/* Borrowed daemonize from xenstored - Initially written by Stevens. */
  23.480 +static void daemonize(void)
  23.481 +{
  23.482 +    pid_t pid;
  23.483 +
  23.484 +    if ( (pid = fork()) < 0 )
  23.485 +        exit(1);
  23.486 +
  23.487 +    if ( pid != 0 )
  23.488 +        exit(0);
  23.489 +
  23.490 +    setsid();
  23.491 +
  23.492 +    if ( (pid = fork()) < 0 )
  23.493 +        exit(1);
  23.494 +
  23.495 +    if ( pid != 0 )
  23.496 +        exit(0);
  23.497 +
  23.498 +    if ( chdir("/") == -1 )
  23.499 +        exit(1);
  23.500 +
  23.501 +    umask(0);
  23.502 +}
  23.503 +
  23.504 +int main(int argc, char *argv[])
  23.505 +{
  23.506 +#ifndef RUN_STANDALONE
  23.507 +    daemonize();
  23.508 +#endif
  23.509 +    xs = (struct xs_handle *)xs_daemon_open();
  23.510 +    if ( xs == NULL ) 
  23.511 +        return -1;
  23.512 +
  23.513 +    if ( write_one_time_battery_info() == 0 ) 
  23.514 +    {
  23.515 +        xs_daemon_close(xs);
  23.516 +        return -1;
  23.517 +    }
  23.518 +
  23.519 +    wait_for_and_update_battery_status_request();
  23.520 +    xs_daemon_close(xs);
  23.521 +    return 0;
  23.522 +}
  23.523 +
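
A note on the new xenpmd tool above: write_battery_status_to_xenstore() packs
the four status fields into a fixed-offset hex string under /pm/bst, using a
write_ulong_lsb_first() helper defined earlier in the file (not shown in this
hunk). Assuming that helper emits each 32-bit value as eight hex digits,
least-significant byte first, a hypothetical stand-alone decoder for the
35-byte record would look like this:

    /* Hypothetical decoder for the /pm/bst string; the LSB-first layout is
     * inferred from the writer above, not from a published format. */
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    static uint32_t read_ulong_lsb_first(const char *s)
    {
        uint32_t v = 0;
        for (int i = 0; i < 4; i++) {
            char byte[3] = { s[2*i], s[2*i + 1], '\0' };
            v |= (uint32_t)strtoul(byte, NULL, 16) << (8 * i);
        }
        return v;
    }

    int main(void)
    {
        /* "10" = length marker, then state=1, rate=500, remaining=1000,
         * voltage=12000, each as 8 hex digits, LSB first. */
        const char *val = "10" "01000000" "f4010000" "e8030000" "e02e0000";

        printf("state     %u\n", read_ulong_lsb_first(val + 2));
        printf("rate      %u\n", read_ulong_lsb_first(val + 10));
        printf("remaining %u\n", read_ulong_lsb_first(val + 18));
        printf("voltage   %u\n", read_ulong_lsb_first(val + 26));
        return 0;
    }
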
    24.1 --- a/xen/arch/ia64/xen/cpufreq/cpufreq.c	Tue Nov 04 12:07:22 2008 +0900
    24.2 +++ b/xen/arch/ia64/xen/cpufreq/cpufreq.c	Tue Nov 04 12:43:19 2008 +0900
    24.3 @@ -210,21 +210,6 @@ acpi_cpufreq_cpu_init (struct cpufreq_po
    24.4  
    24.5  	data->acpi_data = &processor_pminfo[cpu]->perf;
    24.6  
    24.7 -	/* capability check */
    24.8 -	if (data->acpi_data->state_count <= 1) {
    24.9 -		printk(KERN_WARNING "P-States\n");
   24.10 -		result = -ENODEV;
   24.11 -		goto err_unreg;
   24.12 -	}
   24.13 -
   24.14 -	if ((data->acpi_data->control_register.space_id !=
   24.15 -				ACPI_ADR_SPACE_FIXED_HARDWARE) ||
   24.16 -			(data->acpi_data->status_register.space_id !=
   24.17 -			 ACPI_ADR_SPACE_FIXED_HARDWARE)) {
   24.18 -		result = -ENODEV;
   24.19 -		goto err_unreg;
   24.20 -	}
   24.21 -
   24.22  	data->freq_table = xmalloc_array(struct cpufreq_frequency_table,
   24.23  			(data->acpi_data->state_count + 1));
   24.24  	if (!data->freq_table) {
    25.1 --- a/xen/arch/ia64/xen/irq.c	Tue Nov 04 12:07:22 2008 +0900
    25.2 +++ b/xen/arch/ia64/xen/irq.c	Tue Nov 04 12:43:19 2008 +0900
    25.3 @@ -74,7 +74,7 @@ unsigned int __ia64_local_vector_to_irq 
    25.4  /*
    25.5   * Controller mappings for all interrupt sources:
    25.6   */
    25.7 -irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
    25.8 +irq_desc_t irq_desc[NR_IRQS] = {
    25.9  	[0 ... NR_IRQS-1] = {
   25.10  		.status = IRQ_DISABLED,
   25.11  		.handler = &no_irq_type,
    26.1 --- a/xen/arch/x86/acpi/cpu_idle.c	Tue Nov 04 12:07:22 2008 +0900
    26.2 +++ b/xen/arch/x86/acpi/cpu_idle.c	Tue Nov 04 12:43:19 2008 +0900
    26.3 @@ -75,13 +75,14 @@ static void print_acpi_power(uint32_t cp
    26.4  
    26.5      printk("==cpu%d==\n", cpu);
    26.6      printk("active state:\t\tC%d\n",
    26.7 -           power->last_state ? (int)(power->last_state - power->states) : -1);
    26.8 +           power->last_state ? power->last_state->idx : -1);
    26.9      printk("max_cstate:\t\tC%d\n", max_cstate);
   26.10      printk("states:\n");
   26.11      
   26.12      for ( i = 1; i < power->count; i++ )
   26.13      {
   26.14 -        printk((power->last_state == &power->states[i]) ? "   *" : "    ");
   26.15 +        printk((power->last_state && power->last_state->idx == i) ?
   26.16 +               "   *" : "    ");
   26.17          printk("C%d:\t", i);
   26.18          printk("type[C%d] ", power->states[i].type);
   26.19          printk("latency[%03d] ", power->states[i].latency);
   26.20 @@ -139,20 +140,26 @@ static void acpi_processor_ffh_cstate_en
   26.21  
   26.22  static void acpi_idle_do_entry(struct acpi_processor_cx *cx)
   26.23  {
   26.24 -    if ( cx->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE )
   26.25 +    int unused;
   26.26 +
   26.27 +    switch ( cx->entry_method )
   26.28      {
   26.29 +    case ACPI_CSTATE_EM_FFH:
   26.30          /* Call into architectural FFH based C-state */
   26.31          acpi_processor_ffh_cstate_enter(cx);
   26.32 -    }
   26.33 -    else
   26.34 -    {
   26.35 -        int unused;
   26.36 +        return;
   26.37 +    case ACPI_CSTATE_EM_SYSIO:
   26.38          /* IO port based C-state */
   26.39          inb(cx->address);
   26.40          /* Dummy wait op - must do something useless after P_LVL2 read
   26.41             because chipsets cannot guarantee that STPCLK# signal
   26.42             gets asserted in time to freeze execution properly. */
   26.43          unused = inl(pmtmr_ioport);
   26.44 +        return;
   26.45 +    case ACPI_CSTATE_EM_HALT:
   26.46 +        acpi_safe_halt();
   26.47 +        local_irq_disable();
   26.48 +        return;
   26.49      }
   26.50  }
   26.51  
   26.52 @@ -222,7 +229,7 @@ static void acpi_processor_idle(void)
   26.53          if ( power->flags.bm_check && acpi_idle_bm_check()
   26.54               && cx->type == ACPI_STATE_C3 )
   26.55              cx = power->safe_state;
   26.56 -        if ( cx - &power->states[0] > max_cstate )
   26.57 +        if ( cx->idx > max_cstate )
   26.58              cx = &power->states[max_cstate];
   26.59      }
   26.60      if ( !cx )
   26.61 @@ -252,35 +259,11 @@ static void acpi_processor_idle(void)
   26.62      switch ( cx->type )
   26.63      {
   26.64      case ACPI_STATE_C1:
   26.65 -        /* Trace cpu idle entry */
   26.66 -        TRACE_1D(TRC_PM_IDLE_ENTRY, 1);
   26.67 -
   26.68 -        /*
   26.69 -         * Invoke C1.
   26.70 -         * Use the appropriate idle routine, the one that would
   26.71 -         * be used without acpi C-states.
   26.72 -         */
   26.73 -        if ( pm_idle_save )
   26.74 -            pm_idle_save();
   26.75 -        else 
   26.76 -            acpi_safe_halt();
   26.77 -
   26.78 -        /* Trace cpu idle exit */
   26.79 -        TRACE_1D(TRC_PM_IDLE_EXIT, 1);
   26.80 -
   26.81 -        /*
   26.82 -         * TBD: Can't get time duration while in C1, as resumes
   26.83 -         *      go to an ISR rather than here.  Need to instrument
   26.84 -         *      base interrupt handler.
   26.85 -         */
   26.86 -        sleep_ticks = 0xFFFFFFFF;
   26.87 -        break;
   26.88 -
   26.89      case ACPI_STATE_C2:
   26.90 -        if ( local_apic_timer_c2_ok )
   26.91 +        if ( cx->type == ACPI_STATE_C1 || local_apic_timer_c2_ok )
   26.92          {
   26.93              /* Trace cpu idle entry */
   26.94 -            TRACE_1D(TRC_PM_IDLE_ENTRY, 2);
   26.95 +            TRACE_1D(TRC_PM_IDLE_ENTRY, cx->idx);
   26.96              /* Get start time (ticks) */
   26.97              t1 = inl(pmtmr_ioport);
   26.98              /* Invoke C2 */
   26.99 @@ -288,7 +271,7 @@ static void acpi_processor_idle(void)
  26.100              /* Get end time (ticks) */
  26.101              t2 = inl(pmtmr_ioport);
  26.102              /* Trace cpu idle exit */
  26.103 -            TRACE_1D(TRC_PM_IDLE_EXIT, 2);
  26.104 +            TRACE_1D(TRC_PM_IDLE_EXIT, cx->idx);
  26.105  
  26.106              /* Re-enable interrupts */
  26.107              local_irq_enable();
  26.108 @@ -328,7 +311,7 @@ static void acpi_processor_idle(void)
  26.109          }
  26.110  
  26.111          /* Trace cpu idle entry */
  26.112 -        TRACE_1D(TRC_PM_IDLE_ENTRY, cx - &power->states[0]);
  26.113 +        TRACE_1D(TRC_PM_IDLE_ENTRY, cx->idx);
  26.114          /*
  26.115           * Before invoking C3, be aware that TSC/APIC timer may be 
  26.116           * stopped by H/W. Without carefully handling of TSC/APIC stop issues,
  26.117 @@ -349,7 +332,7 @@ static void acpi_processor_idle(void)
  26.118          /* recovering TSC */
  26.119          cstate_restore_tsc();
  26.120          /* Trace cpu idle exit */
  26.121 -        TRACE_1D(TRC_PM_IDLE_EXIT, cx - &power->states[0]);
  26.122 +        TRACE_1D(TRC_PM_IDLE_EXIT, cx->idx);
  26.123  
  26.124          if ( power->flags.bm_check && power->flags.bm_control )
  26.125          {
  26.126 @@ -387,9 +370,15 @@ static void acpi_processor_idle(void)
  26.127  
  26.128  static int init_cx_pminfo(struct acpi_processor_power *acpi_power)
  26.129  {
  26.130 +    int i;
  26.131 +
  26.132      memset(acpi_power, 0, sizeof(*acpi_power));
  26.133  
  26.134 +    for ( i = 0; i < ACPI_PROCESSOR_MAX_POWER; i++ )
  26.135 +        acpi_power->states[i].idx = i;
  26.136 +
  26.137      acpi_power->states[ACPI_STATE_C1].type = ACPI_STATE_C1;
  26.138 +    acpi_power->states[ACPI_STATE_C1].entry_method = ACPI_CSTATE_EM_HALT;
  26.139  
  26.140      acpi_power->states[ACPI_STATE_C0].valid = 1;
  26.141      acpi_power->states[ACPI_STATE_C1].valid = 1;
  26.142 @@ -486,16 +475,13 @@ static int check_cx(struct acpi_processo
  26.143          break;
  26.144  
  26.145      case ACPI_ADR_SPACE_FIXED_HARDWARE:
  26.146 -        if ( cx->type > ACPI_STATE_C1 )
  26.147 -        {
  26.148 -            if ( cx->reg.bit_width != VENDOR_INTEL || 
  26.149 -                 cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT )
  26.150 -                return -EINVAL;
  26.151 +        if ( cx->reg.bit_width != VENDOR_INTEL || 
  26.152 +             cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT )
  26.153 +            return -EINVAL;
  26.154  
  26.155 -            /* assume all logical cpu has the same support for mwait */
  26.156 -            if ( acpi_processor_ffh_cstate_probe(cx) )
  26.157 -                return -EINVAL;
  26.158 -        }
   26.159 +        /* assume all logical cpus have the same support for mwait */
  26.160 +        if ( acpi_processor_ffh_cstate_probe(cx) )
  26.161 +            return -EINVAL;
  26.162          break;
  26.163  
  26.164      default:
  26.165 @@ -599,7 +585,23 @@ static void set_cx(
  26.166      cx->valid    = 1;
  26.167      cx->type     = xen_cx->type;
  26.168      cx->address  = xen_cx->reg.address;
  26.169 -    cx->space_id = xen_cx->reg.space_id;
  26.170 +
  26.171 +    switch ( xen_cx->reg.space_id )
  26.172 +    {
  26.173 +    case ACPI_ADR_SPACE_FIXED_HARDWARE:
  26.174 +        if ( xen_cx->reg.bit_width == VENDOR_INTEL &&
  26.175 +             xen_cx->reg.bit_offset == NATIVE_CSTATE_BEYOND_HALT )
  26.176 +            cx->entry_method = ACPI_CSTATE_EM_FFH;
  26.177 +        else
  26.178 +            cx->entry_method = ACPI_CSTATE_EM_HALT;
  26.179 +        break;
  26.180 +    case ACPI_ADR_SPACE_SYSTEM_IO:
  26.181 +        cx->entry_method = ACPI_CSTATE_EM_SYSIO;
  26.182 +        break;
  26.183 +    default:
  26.184 +        cx->entry_method = ACPI_CSTATE_EM_NONE;
  26.185 +    }
  26.186 +
  26.187      cx->latency  = xen_cx->latency;
  26.188      cx->power    = xen_cx->power;
  26.189      
  26.190 @@ -761,8 +763,7 @@ int pmstat_get_cx_stat(uint32_t cpuid, s
  26.191          return 0;
  26.192      }
  26.193  
  26.194 -    stat->last = (power->last_state) ?
  26.195 -        (int)(power->last_state - &power->states[0]) : 0;
  26.196 +    stat->last = power->last_state ? power->last_state->idx : 0;
  26.197      stat->nr = power->count;
  26.198      stat->idle_time = v->runstate.time[RUNSTATE_running];
  26.199      if ( v->is_running )
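
Two idioms are swapped out in the cpu_idle.c hunks above: pointer arithmetic
(cx - &power->states[0]) gives way to a stored idx field, and per-entry
space_id tests give way to an entry_method value resolved once in set_cx().
A minimal sketch of that dispatch pattern, using stand-in enum values (the
real ACPI_CSTATE_EM_* definitions live in a Xen header outside this
changeset):

    #include <stdio.h>

    enum cstate_entry_method { EM_NONE, EM_SYSIO, EM_FFH, EM_HALT };

    struct cx_state {
        int idx;                            /* replaces pointer arithmetic */
        enum cstate_entry_method entry_method;
    };

    static void enter_cstate(const struct cx_state *cx)
    {
        switch (cx->entry_method) {
        case EM_FFH:   puts("MWAIT-based entry");    break; /* fixed hardware */
        case EM_SYSIO: puts("I/O port read entry");  break; /* P_LVLx read */
        case EM_HALT:  puts("HLT-based entry");      break; /* C1 fallback */
        default:       break;                               /* no method */
        }
    }

    int main(void)
    {
        /* Resolving the method once at setup avoids re-deriving it from
         * register space ids on every idle entry. */
        struct cx_state c1 = { .idx = 1, .entry_method = EM_HALT };
        enter_cstate(&c1);
        return 0;
    }
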
    27.1 --- a/xen/arch/x86/acpi/cpufreq/cpufreq.c	Tue Nov 04 12:07:22 2008 +0900
    27.2 +++ b/xen/arch/x86/acpi/cpufreq/cpufreq.c	Tue Nov 04 12:43:19 2008 +0900
    27.3 @@ -370,7 +370,7 @@ static int acpi_cpufreq_target(struct cp
    27.4      if (!check_freqs(cmd.mask, freqs.new, data))
    27.5          return -EAGAIN;
    27.6  
    27.7 -    for_each_cpu_mask(j, cmd.mask)
    27.8 +    for_each_cpu_mask(j, online_policy_cpus)
    27.9          cpufreq_statistic_update(j, perf->state, next_perf_state);
   27.10  
   27.11      perf->state = next_perf_state;
   27.12 @@ -447,18 +447,6 @@ acpi_cpufreq_cpu_init(struct cpufreq_pol
   27.13      perf = data->acpi_data;
   27.14      policy->shared_type = perf->shared_type;
   27.15  
   27.16 -    /* capability check */
   27.17 -    if (perf->state_count <= 1) {
   27.18 -        printk("No P-States\n");
   27.19 -        result = -ENODEV;
   27.20 -        goto err_unreg;
   27.21 -    }
   27.22 -
   27.23 -    if (perf->control_register.space_id != perf->status_register.space_id) {
   27.24 -        result = -ENODEV;
   27.25 -        goto err_unreg;
   27.26 -    }
   27.27 -
   27.28      switch (perf->control_register.space_id) {
   27.29      case ACPI_ADR_SPACE_SYSTEM_IO:
   27.30          printk("xen_pminfo: @acpi_cpufreq_cpu_init,"
    28.1 --- a/xen/arch/x86/acpi/cpufreq/powernow.c	Tue Nov 04 12:07:22 2008 +0900
    28.2 +++ b/xen/arch/x86/acpi/cpufreq/powernow.c	Tue Nov 04 12:43:19 2008 +0900
    28.3 @@ -229,9 +229,23 @@ err_unreg:
    28.4      return result;
    28.5  }
    28.6  
    28.7 +static int powernow_cpufreq_cpu_exit(struct cpufreq_policy *policy)
    28.8 +{
    28.9 +    struct powernow_cpufreq_data *data = drv_data[policy->cpu];
   28.10 +
   28.11 +    if (data) {
   28.12 +        drv_data[policy->cpu] = NULL;
   28.13 +        xfree(data->freq_table);
   28.14 +        xfree(data);
   28.15 +    }
   28.16 +
   28.17 +    return 0;
   28.18 +}
   28.19 +
   28.20  static struct cpufreq_driver powernow_cpufreq_driver = {
   28.21      .target = powernow_cpufreq_target,
   28.22      .init   = powernow_cpufreq_cpu_init,
   28.23 +    .exit   = powernow_cpufreq_cpu_exit
   28.24  };
   28.25  
   28.26  int powernow_cpufreq_init(void)
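
The new powernow_cpufreq_cpu_exit() above uses a defensive teardown order:
unpublish the per-CPU pointer, then free what it pointed to, so a stale
reader sees NULL rather than freed memory, and a repeated exit is a no-op.
A generic sketch of the same pattern (names illustrative, not the Xen API):

    #include <stdio.h>
    #include <stdlib.h>

    struct per_cpu_data { int *freq_table; };

    static struct per_cpu_data *drv_data[64];

    static int cpu_exit(unsigned int cpu)
    {
        struct per_cpu_data *data = drv_data[cpu];

        if (data) {
            drv_data[cpu] = NULL;   /* unpublish before freeing */
            free(data->freq_table);
            free(data);
        }
        return 0;
    }

    int main(void)
    {
        drv_data[0] = calloc(1, sizeof(*drv_data[0]));
        drv_data[0]->freq_table = calloc(4, sizeof(int));
        cpu_exit(0);                /* frees and clears the slot */
        cpu_exit(0);                /* second call sees NULL: harmless */
        puts("teardown done");
        return 0;
    }
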
    29.1 --- a/xen/arch/x86/acpi/cpuidle_menu.c	Tue Nov 04 12:07:22 2008 +0900
    29.2 +++ b/xen/arch/x86/acpi/cpuidle_menu.c	Tue Nov 04 12:43:19 2008 +0900
    29.3 @@ -59,7 +59,7 @@ static int menu_select(struct acpi_proce
    29.4      data->expected_us = (u32) get_sleep_length_ns() / 1000;
    29.5  
    29.6      /* find the deepest idle state that satisfies our constraints */
    29.7 -    for ( i = 1; i < power->count; i++ )
    29.8 +    for ( i = 2; i < power->count; i++ )
    29.9      {
   29.10          struct acpi_processor_cx *s = &power->states[i];
   29.11  
   29.12 @@ -81,17 +81,7 @@ static void menu_reflect(struct acpi_pro
   29.13      unsigned int last_residency; 
   29.14      unsigned int measured_us;
   29.15  
   29.16 -    /*
   29.17 -     * Ugh, this idle state doesn't support residency measurements, so we
   29.18 -     * are basically lost in the dark.  As a compromise, assume we slept
   29.19 -     * for one full standard timer tick.  However, be aware that this
   29.20 -     * could potentially result in a suboptimal state transition.
   29.21 -     */
   29.22 -    if ( target->type == ACPI_STATE_C1 )
   29.23 -        last_residency = USEC_PER_SEC / HZ;
   29.24 -    else
   29.25 -        last_residency = power->last_residency;
   29.26 -
   29.27 +    last_residency = power->last_residency;
   29.28      measured_us = last_residency + data->elapsed_us;
   29.29  
   29.30      /* if wrapping, set to max uint (-1) */
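
With C1 folded into the shared C1/C2 entry path in cpu_idle.c, the menu
governor above starts scanning at index 2: C1 is the implicit baseline, and
only deeper states must justify themselves against the predicted sleep
length. A toy version of the selection loop, with made-up latency and
residency numbers:

    #include <stdio.h>

    struct cstate { int idx; unsigned int latency_us; unsigned int residency_us; };

    int main(void)
    {
        struct cstate states[] = {
            { 1,   1,   1 },    /* C1: always acceptable */
            { 2,  20,  60 },    /* C2 */
            { 3, 100, 300 },    /* C3 */
        };
        unsigned int expected_us = 150; /* predicted idle duration */
        int pick = 1;                   /* default to C1; loop starts past it */

        for (unsigned int i = 1; i < sizeof(states)/sizeof(states[0]); i++)
            if (states[i].latency_us <= expected_us &&
                states[i].residency_us <= expected_us)
                pick = states[i].idx;

        printf("selected C%d\n", pick); /* C2: C3 needs 300us of residency */
        return 0;
    }
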
    30.1 --- a/xen/arch/x86/domain.c	Tue Nov 04 12:07:22 2008 +0900
    30.2 +++ b/xen/arch/x86/domain.c	Tue Nov 04 12:43:19 2008 +0900
    30.3 @@ -174,9 +174,10 @@ void free_vcpu_struct(struct vcpu *v)
    30.4  
    30.5  static int setup_compat_l4(struct vcpu *v)
    30.6  {
    30.7 -    struct page_info *pg = alloc_domheap_page(NULL, 0);
    30.8 +    struct page_info *pg;
    30.9      l4_pgentry_t *l4tab;
   30.10  
   30.11 +    pg = alloc_domheap_page(NULL, MEMF_node(vcpu_to_node(v)));
   30.12      if ( pg == NULL )
   30.13          return -ENOMEM;
   30.14  
   30.15 @@ -1639,32 +1640,23 @@ static int relinquish_memory(
   30.16          }
   30.17  
   30.18          if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
   30.19 -            put_page_and_type(page);
   30.20 +            ret = put_page_and_type_preemptible(page, 1);
   30.21 +        switch ( ret )
   30.22 +        {
   30.23 +        case 0:
   30.24 +            break;
   30.25 +        case -EAGAIN:
   30.26 +        case -EINTR:
   30.27 +            set_bit(_PGT_pinned, &page->u.inuse.type_info);
   30.28 +            put_page(page);
   30.29 +            goto out;
   30.30 +        default:
   30.31 +            BUG();
   30.32 +        }
   30.33  
   30.34          if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
   30.35              put_page(page);
   30.36  
   30.37 -#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
   30.38 -        /*
   30.39 -         * Forcibly drop reference counts of page tables above top most (which
   30.40 -         * were skipped to prevent long latencies due to deep recursion - see
   30.41 -         * the special treatment in free_lX_table()).
   30.42 -         */
   30.43 -        y = page->u.inuse.type_info;
   30.44 -        if ( (type < PGT_root_page_table) &&
   30.45 -             unlikely(((y + PGT_type_mask) &
   30.46 -                       (PGT_type_mask|PGT_validated)) == type) )
   30.47 -        {
   30.48 -            BUG_ON((y & PGT_count_mask) >=
   30.49 -                   (page->count_info & PGC_count_mask));
   30.50 -            while ( y & PGT_count_mask )
   30.51 -            {
   30.52 -                put_page_and_type(page);
   30.53 -                y = page->u.inuse.type_info;
   30.54 -            }
   30.55 -        }
   30.56 -#endif
   30.57 -
   30.58          /*
   30.59           * Forcibly invalidate top-most, still valid page tables at this point
   30.60           * to break circular 'linear page table' references as well as clean up
   30.61 @@ -1685,8 +1677,31 @@ static int relinquish_memory(
   30.62                          x & ~(PGT_validated|PGT_partial));
   30.63              if ( likely(y == x) )
   30.64              {
   30.65 -                if ( free_page_type(page, x, 0) != 0 )
    30.66 +                /* No need for atomic update of type_info here: no one else updates it. */
   30.67 +                switch ( ret = free_page_type(page, x, 1) )
   30.68 +                {
   30.69 +                case 0:
   30.70 +                    break;
   30.71 +                case -EINTR:
   30.72 +                    page->u.inuse.type_info |= PGT_validated;
   30.73 +                    if ( x & PGT_partial )
   30.74 +                        put_page(page);
   30.75 +                    put_page(page);
   30.76 +                    ret = -EAGAIN;
   30.77 +                    goto out;
   30.78 +                case -EAGAIN:
   30.79 +                    page->u.inuse.type_info |= PGT_partial;
   30.80 +                    if ( x & PGT_partial )
   30.81 +                        put_page(page);
   30.82 +                    goto out;
   30.83 +                default:
   30.84                      BUG();
   30.85 +                }
   30.86 +                if ( x & PGT_partial )
   30.87 +                {
   30.88 +                    page->u.inuse.type_info--;
   30.89 +                    put_page(page);
   30.90 +                }
   30.91                  break;
   30.92              }
   30.93          }
   30.94 @@ -1831,11 +1846,6 @@ int domain_relinquish_resources(struct d
   30.95          /* fallthrough */
   30.96  
   30.97      case RELMEM_done:
   30.98 -#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
   30.99 -        ret = relinquish_memory(d, &d->page_list, PGT_l1_page_table);
  30.100 -        if ( ret )
  30.101 -            return ret;
  30.102 -#endif
  30.103          break;
  30.104  
  30.105      default:
  30.106 @@ -1892,6 +1902,54 @@ void domain_cpuid(
  30.107      *eax = *ebx = *ecx = *edx = 0;
  30.108  }
  30.109  
  30.110 +void vcpu_kick(struct vcpu *v)
  30.111 +{
  30.112 +    /*
  30.113 +     * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
  30.114 +     * pending flag. These values may fluctuate (after all, we hold no
  30.115 +     * locks) but the key insight is that each change will cause
  30.116 +     * evtchn_upcall_pending to be polled.
  30.117 +     * 
  30.118 +     * NB2. We save the running flag across the unblock to avoid a needless
  30.119 +     * IPI for domains that we IPI'd to unblock.
  30.120 +     */
  30.121 +    bool_t running = v->is_running;
  30.122 +    vcpu_unblock(v);
  30.123 +    if ( running && (in_irq() || (v != current)) )
  30.124 +        cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ);
  30.125 +}
  30.126 +
  30.127 +void vcpu_mark_events_pending(struct vcpu *v)
  30.128 +{
  30.129 +    int already_pending = test_and_set_bit(
  30.130 +        0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));
  30.131 +
  30.132 +    if ( already_pending )
  30.133 +        return;
  30.134 +
  30.135 +    if ( is_hvm_vcpu(v) )
  30.136 +        hvm_assert_evtchn_irq(v);
  30.137 +    else
  30.138 +        vcpu_kick(v);
  30.139 +}
  30.140 +
  30.141 +static void vcpu_kick_softirq(void)
  30.142 +{
  30.143 +    /*
  30.144 +     * Nothing to do here: we merely prevent notifiers from racing with checks
  30.145 +     * executed on return to guest context with interrupts enabled. See, for
  30.146 +     * example, xxx_intr_assist() executed on return to HVM guest context.
  30.147 +     */
  30.148 +}
  30.149 +
  30.150 +static int __init init_vcpu_kick_softirq(void)
  30.151 +{
  30.152 +    open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq);
  30.153 +    return 0;
  30.154 +}
  30.155 +__initcall(init_vcpu_kick_softirq);
  30.156 +
  30.157 +
  30.158  /*
  30.159   * Local variables:
  30.160   * mode: C
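
vcpu_mark_events_pending() above leans on test_and_set_bit() so that only
the caller making the 0-to-1 transition pays for the notification; everyone
racing behind it sees the bit already set and returns. The same idiom in
portable C11:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_bool upcall_pending;

    static void kick(void)
    {
        puts("kick: IPI / assert virtual IRQ");
    }

    static void mark_event_pending(void)
    {
        /* atomic_exchange returns the previous value: only the caller that
         * flips false -> true performs the expensive notification. */
        if (!atomic_exchange(&upcall_pending, true))
            kick();
    }

    int main(void)
    {
        mark_event_pending();   /* kicks */
        mark_event_pending();   /* already pending: silent */
        return 0;
    }
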
    31.1 --- a/xen/arch/x86/domain_build.c	Tue Nov 04 12:07:22 2008 +0900
    31.2 +++ b/xen/arch/x86/domain_build.c	Tue Nov 04 12:43:19 2008 +0900
    31.3 @@ -194,6 +194,30 @@ static void __init process_dom0_ioports_
    31.4      }
    31.5  }
    31.6  
    31.7 +/* We run on dom0's page tables for the final part of the build process. */
    31.8 +static void dom0_pt_enter(struct vcpu *v)
    31.9 +{
   31.10 +    struct desc_ptr gdt_desc = {
   31.11 +        .limit = LAST_RESERVED_GDT_BYTE,
   31.12 +        .base = (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY)
   31.13 +    };
   31.14 +
   31.15 +    asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
   31.16 +    write_ptbase(v);
   31.17 +}
   31.18 +
   31.19 +/* Return to idle domain's page tables. */
   31.20 +static void dom0_pt_exit(void)
   31.21 +{
   31.22 +    struct desc_ptr gdt_desc = {
   31.23 +        .limit = LAST_RESERVED_GDT_BYTE,
   31.24 +        .base = GDT_VIRT_START(current)
   31.25 +    };
   31.26 +
   31.27 +    write_ptbase(current);
   31.28 +    asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
   31.29 +}
   31.30 +
   31.31  int __init construct_dom0(
   31.32      struct domain *d,
   31.33      unsigned long _image_start, unsigned long image_len, 
   31.34 @@ -700,14 +724,12 @@ int __init construct_dom0(
   31.35          (void)alloc_vcpu(d, i, i % num_online_cpus());
   31.36  
   31.37      /* Set up CR3 value for write_ptbase */
   31.38 -    if ( paging_mode_enabled(v->domain) )
   31.39 +    if ( paging_mode_enabled(d) )
   31.40          paging_update_paging_modes(v);
   31.41      else
   31.42          update_cr3(v);
   31.43  
   31.44 -    /* Install the new page tables. */
   31.45 -    local_irq_disable();
   31.46 -    write_ptbase(v);
   31.47 +    dom0_pt_enter(v);
   31.48  
   31.49      /* Copy the OS image and free temporary buffer. */
   31.50      elf.dest = (void*)vkern_start;
   31.51 @@ -804,9 +826,7 @@ int __init construct_dom0(
   31.52          xlat_start_info(si, XLAT_start_info_console_dom0);
   31.53  #endif
   31.54  
   31.55 -    /* Reinstate the caller's page tables. */
   31.56 -    write_ptbase(current);
   31.57 -    local_irq_enable();
   31.58 +    dom0_pt_exit();
   31.59  
   31.60  #if defined(__i386__)
   31.61      /* Destroy low mappings - they were only for our convenience. */
    32.1 --- a/xen/arch/x86/hpet.c	Tue Nov 04 12:07:22 2008 +0900
    32.2 +++ b/xen/arch/x86/hpet.c	Tue Nov 04 12:43:19 2008 +0900
    32.3 @@ -14,8 +14,6 @@
    32.4  #include <asm/div64.h>
    32.5  #include <asm/hpet.h>
    32.6  
    32.7 -#define STIME_MAX ((s_time_t)((uint64_t)~0ull>>1))
    32.8 -
    32.9  #define MAX_DELTA_NS MILLISECS(10*1000)
   32.10  #define MIN_DELTA_NS MICROSECS(20)
   32.11  
   32.12 @@ -146,7 +144,7 @@ static void handle_hpet_broadcast(struct
   32.13      s_time_t now, next_event;
   32.14      int cpu;
   32.15  
   32.16 -    spin_lock(&ch->lock);
   32.17 +    spin_lock_irq(&ch->lock);
   32.18  
   32.19  again:
   32.20      ch->next_event = STIME_MAX;
   32.21 @@ -171,7 +169,7 @@ again:
   32.22          if ( reprogram_hpet_evt_channel(ch, next_event, now, 0) )
   32.23              goto again;
   32.24      }
   32.25 -    spin_unlock(&ch->lock);
   32.26 +    spin_unlock_irq(&ch->lock);
   32.27  }
   32.28  
   32.29  void hpet_broadcast_init(void)
   32.30 @@ -213,6 +211,7 @@ void hpet_broadcast_enter(void)
   32.31  {
   32.32      struct hpet_event_channel *ch = &hpet_event;
   32.33  
   32.34 +    ASSERT(!local_irq_is_enabled());
   32.35      spin_lock(&ch->lock);
   32.36  
   32.37      disable_APIC_timer();
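
The spin_lock_irq() change in handle_hpet_broadcast() above, paired with the
new ASSERT in hpet_broadcast_enter(), enforces the usual rule: a lock shared
with interrupt context must be taken with interrupts masked, or the interrupt
can preempt the holder on its own CPU and spin forever. A user-space analogy
with POSIX signals standing in for interrupts (purely illustrative, not Xen
code):

    #include <signal.h>
    #include <string.h>
    #include <unistd.h>

    static volatile sig_atomic_t in_critical;

    static void on_alarm(int sig)
    {
        (void)sig;
        const char *msg = in_critical
            ? "BUG: handler preempted the lock holder\n"
            : "handler deferred until after the critical section\n";
        write(STDOUT_FILENO, msg, strlen(msg));
    }

    int main(void)
    {
        sigset_t block, old;

        signal(SIGALRM, on_alarm);
        sigemptyset(&block);
        sigaddset(&block, SIGALRM);

        sigprocmask(SIG_BLOCK, &block, &old);  /* ~ spin_lock_irq()    */
        in_critical = 1;
        raise(SIGALRM);                        /* delivery is deferred */
        in_critical = 0;
        sigprocmask(SIG_SETMASK, &old, NULL);  /* ~ spin_unlock_irq(); the
                                                  pending signal fires now */
        return 0;
    }
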
    33.1 --- a/xen/arch/x86/hvm/emulate.c	Tue Nov 04 12:07:22 2008 +0900
    33.2 +++ b/xen/arch/x86/hvm/emulate.c	Tue Nov 04 12:43:19 2008 +0900
    33.3 @@ -14,11 +14,39 @@
    33.4  #include <xen/lib.h>
    33.5  #include <xen/sched.h>
    33.6  #include <xen/paging.h>
    33.7 +#include <xen/trace.h>
    33.8  #include <asm/event.h>
    33.9  #include <asm/hvm/emulate.h>
   33.10  #include <asm/hvm/hvm.h>
   33.11  #include <asm/hvm/support.h>
   33.12  
   33.13 +#define HVMTRACE_IO_ASSIST_WRITE 0x200
   33.14 +static void hvmtrace_io_assist(int is_mmio, ioreq_t *p)
   33.15 +{
   33.16 +    unsigned int size, event;
   33.17 +    unsigned char buffer[12];
   33.18 +
   33.19 +    if ( likely(!tb_init_done) )
   33.20 +        return;
   33.21 +
   33.22 +    event = is_mmio ? TRC_HVM_MMIO_ASSIST : TRC_HVM_IO_ASSIST;
   33.23 +    if ( !p->dir )
   33.24 +        event |= HVMTRACE_IO_ASSIST_WRITE;
   33.25 +
   33.26 +    *(uint64_t *)buffer = p->addr;
   33.27 +    size = (p->addr != (u32)p->addr) ? 8 : 4;
   33.28 +    if ( size == 8 )
   33.29 +        event |= TRC_64_FLAG;
   33.30 +
   33.31 +    if ( !p->data_is_ptr )
   33.32 +    {
   33.33 +        *(uint32_t *)&buffer[size] = p->data;
   33.34 +        size += 4;
   33.35 +    }
   33.36 +
   33.37 +    trace_var(event, 0/*!cycles*/, size, buffer);
   33.38 +}
   33.39 +
   33.40  static int hvmemul_do_io(
   33.41      int is_mmio, paddr_t addr, unsigned long *reps, int size,
   33.42      paddr_t ram_gpa, int dir, int df, void *p_data)
   33.43 @@ -111,6 +139,8 @@ static int hvmemul_do_io(
   33.44      p->data = value;
   33.45      p->io_count++;
   33.46  
   33.47 +    hvmtrace_io_assist(is_mmio, p);
   33.48 +
   33.49      if ( is_mmio )
   33.50      {
   33.51          rc = hvm_mmio_intercept(p);
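
hvmtrace_io_assist() above emits a variable-length record: the address as
four bytes, or eight (with TRC_64_FLAG set) when it does not fit in 32 bits,
followed by an optional four-byte inline data word when the request is not
data_is_ptr. A hypothetical decoder for such a record; the TRC_64_FLAG value
below is a placeholder, the real constant comes from a Xen trace header:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define TRC_64_FLAG              0x100  /* placeholder value */
    #define HVMTRACE_IO_ASSIST_WRITE 0x200  /* as defined in the hunk above */

    static void decode(unsigned int event, const unsigned char *buf,
                       unsigned int size)
    {
        uint64_t addr = 0;
        unsigned int asz = (event & TRC_64_FLAG) ? 8 : 4;

        memcpy(&addr, buf, asz);            /* little-endian host assumed */
        printf("%s addr=%#llx",
               (event & HVMTRACE_IO_ASSIST_WRITE) ? "write" : "read",
               (unsigned long long)addr);
        if (size > asz) {                   /* inline data present */
            uint32_t data;
            memcpy(&data, buf + asz, 4);
            printf(" data=%#x", data);
        }
        putchar('\n');
    }

    int main(void)
    {
        /* port 0x3f8, one data byte 'A' -> 4-byte addr + 4-byte data */
        unsigned char rec[8] = { 0xf8, 0x03, 0, 0, 0x41, 0, 0, 0 };
        decode(HVMTRACE_IO_ASSIST_WRITE, rec, 8);
        return 0;
    }
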
    34.1 --- a/xen/arch/x86/hvm/hpet.c	Tue Nov 04 12:07:22 2008 +0900
    34.2 +++ b/xen/arch/x86/hvm/hpet.c	Tue Nov 04 12:43:19 2008 +0900
    34.3 @@ -76,6 +76,7 @@
    34.4          ~0ULL : (tick) * (h)->hpet_to_ns_scale) >> 10))
    34.5  
    34.6  #define timer_config(h, n)       (h->hpet.timers[n].config)
    34.7 +#define timer_enabled(h, n)      (timer_config(h, n) & HPET_TN_ENABLE)
    34.8  #define timer_is_periodic(h, n)  (timer_config(h, n) & HPET_TN_PERIODIC)
    34.9  #define timer_is_32bit(h, n)     (timer_config(h, n) & HPET_TN_32BIT)
   34.10  #define hpet_enabled(h)          (h->hpet.config & HPET_CFG_ENABLE)
   34.11 @@ -88,9 +89,40 @@
   34.12      ((timer_config(h, n) & HPET_TN_INT_ROUTE_CAP_MASK) \
   34.13          >> HPET_TN_INT_ROUTE_CAP_SHIFT)
   34.14  
   34.15 -#define hpet_time_after(a, b)   ((int32_t)(b) - (int32_t)(a) < 0)
   34.16 -#define hpet_time_after64(a, b) ((int64_t)(b) - (int64_t)(a) < 0)
   34.17 +static inline uint64_t hpet_read_maincounter(HPETState *h)
   34.18 +{
   34.19 +    ASSERT(spin_is_locked(&h->lock));
   34.20 +
   34.21 +    if ( hpet_enabled(h) )
   34.22 +        return guest_time_hpet(h->vcpu) + h->mc_offset;
   34.23 +    else 
   34.24 +        return h->hpet.mc64;
   34.25 +}
   34.26 +
   34.27 +static uint64_t hpet_get_comparator(HPETState *h, unsigned int tn)
   34.28 +{
   34.29 +    uint64_t comparator;
   34.30 +    uint64_t elapsed;
   34.31  
   34.32 +    comparator = h->hpet.comparator64[tn];
   34.33 +    if ( timer_is_periodic(h, tn) )
   34.34 +    {
   34.35 +        /* update comparator by number of periods elapsed since last update */
   34.36 +        uint64_t period = h->hpet.period[tn];
   34.37 +        if (period)
   34.38 +        {
   34.39 +            elapsed = hpet_read_maincounter(h) + period - 1 - comparator;
   34.40 +            comparator += (elapsed / period) * period;
   34.41 +            h->hpet.comparator64[tn] = comparator;
   34.42 +        }
   34.43 +    }
   34.44 +    
   34.45 +    /* truncate if timer is in 32 bit mode */
   34.46 +    if ( timer_is_32bit(h, tn) )
   34.47 +        comparator = (uint32_t)comparator;
   34.48 +    h->hpet.timers[tn].cmp = comparator;
   34.49 +    return comparator;
   34.50 +}
   34.51  static inline uint64_t hpet_read64(HPETState *h, unsigned long addr)
   34.52  {
   34.53      addr &= ~7;
   34.54 @@ -104,7 +136,7 @@ static inline uint64_t hpet_read64(HPETS
   34.55      case HPET_STATUS:
   34.56          return h->hpet.isr;
   34.57      case HPET_COUNTER:
   34.58 -        return h->hpet.mc64;
   34.59 +        return hpet_read_maincounter(h);
   34.60      case HPET_T0_CFG:
   34.61      case HPET_T1_CFG:
   34.62      case HPET_T2_CFG:
   34.63 @@ -112,7 +144,7 @@ static inline uint64_t hpet_read64(HPETS
   34.64      case HPET_T0_CMP:
   34.65      case HPET_T1_CMP:
   34.66      case HPET_T2_CMP:
   34.67 -        return h->hpet.timers[(addr - HPET_T0_CMP) >> 5].cmp;
   34.68 +        return hpet_get_comparator(h, (addr - HPET_T0_CMP) >> 5);
   34.69      case HPET_T0_ROUTE:
   34.70      case HPET_T1_ROUTE:
   34.71      case HPET_T2_ROUTE:
   34.72 @@ -140,16 +172,6 @@ static inline int hpet_check_access_leng
   34.73      return 0;
   34.74  }
   34.75  
   34.76 -static inline uint64_t hpet_read_maincounter(HPETState *h)
   34.77 -{
   34.78 -    ASSERT(spin_is_locked(&h->lock));
   34.79 -
   34.80 -    if ( hpet_enabled(h) )
   34.81 -        return guest_time_hpet(h->vcpu) + h->mc_offset;
   34.82 -    else 
   34.83 -        return h->hpet.mc64;
   34.84 -}
   34.85 -
   34.86  static int hpet_read(
   34.87      struct vcpu *v, unsigned long addr, unsigned long length,
   34.88      unsigned long *pval)
   34.89 @@ -169,8 +191,6 @@ static int hpet_read(
   34.90      spin_lock(&h->lock);
   34.91  
   34.92      val = hpet_read64(h, addr);
   34.93 -    if ( (addr & ~7) == HPET_COUNTER )
   34.94 -        val = hpet_read_maincounter(h);
   34.95  
   34.96      result = val;
   34.97      if ( length != 8 )
   34.98 @@ -187,7 +207,10 @@ static void hpet_stop_timer(HPETState *h
   34.99  {
  34.100      ASSERT(tn < HPET_TIMER_NUM);
  34.101      ASSERT(spin_is_locked(&h->lock));
  34.102 -    stop_timer(&h->timers[tn]);
  34.103 +    destroy_periodic_time(&h->pt[tn]);
  34.104 +    /* read the comparator to get it updated so a read while stopped will
  34.105 +     * return the expected value. */
  34.106 +    hpet_get_comparator(h, tn);
  34.107  }
  34.108  
  34.109  /* the number of HPET tick that stands for
  34.110 @@ -197,6 +220,8 @@ static void hpet_stop_timer(HPETState *h
  34.111  static void hpet_set_timer(HPETState *h, unsigned int tn)
  34.112  {
  34.113      uint64_t tn_cmp, cur_tick, diff;
  34.114 +    unsigned int irq;
  34.115 +    unsigned int oneshot;
  34.116  
  34.117      ASSERT(tn < HPET_TIMER_NUM);
  34.118      ASSERT(spin_is_locked(&h->lock));
  34.119 @@ -209,7 +234,10 @@ static void hpet_set_timer(HPETState *h,
  34.120          pit_stop_channel0_irq(pit);
  34.121      }
  34.122  
  34.123 -    tn_cmp   = h->hpet.timers[tn].cmp;
  34.124 +    if ( !timer_enabled(h, tn) )
  34.125 +        return;
  34.126 +
  34.127 +    tn_cmp   = hpet_get_comparator(h, tn);
  34.128      cur_tick = hpet_read_maincounter(h);
  34.129      if ( timer_is_32bit(h, tn) )
  34.130      {
  34.131 @@ -229,7 +257,25 @@ static void hpet_set_timer(HPETState *h,
  34.132          diff = (timer_is_32bit(h, tn) && (-diff > HPET_TINY_TIME_SPAN))
  34.133              ? (uint32_t)diff : 0;
  34.134  
  34.135 -    set_timer(&h->timers[tn], NOW() + hpet_tick_to_ns(h, diff));
  34.136 +    if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) )
  34.137 +        /* if LegacyReplacementRoute bit is set, HPET specification requires
  34.138 +           timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC,
  34.139 +           timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */
  34.140 +        irq = (tn == 0) ? 0 : 8;
  34.141 +    else
  34.142 +        irq = timer_int_route(h, tn);
  34.143 +
  34.144 +    /*
   34.145 +     * diff is the time from now until the timer should first fire. For a
   34.146 +     * periodic timer we also need the period, which may differ because time
   34.147 +     * may have elapsed between the comparator being written and the timer
   34.148 +     * being enabled (now).
  34.149 +     */
  34.150 +    oneshot = !timer_is_periodic(h, tn);
  34.151 +    create_periodic_time(h->vcpu, &h->pt[tn],
  34.152 +                         hpet_tick_to_ns(h, diff),
  34.153 +                         oneshot ? 0 : hpet_tick_to_ns(h, h->hpet.period[tn]),
  34.154 +                         irq, NULL, NULL);
  34.155  }
  34.156  
  34.157  static inline uint64_t hpet_fixup_reg(
  34.158 @@ -248,6 +294,13 @@ static int hpet_write(
  34.159      uint64_t old_val, new_val;
  34.160      int tn, i;
  34.161  
   34.162 +    /* Accumulate a bit mask of timers whose state is changed by this write. */
  34.163 +    unsigned long start_timers = 0;
  34.164 +    unsigned long stop_timers  = 0;
  34.165 +#define set_stop_timer(n)    (__set_bit((n), &stop_timers))
  34.166 +#define set_start_timer(n)   (__set_bit((n), &start_timers))
  34.167 +#define set_restart_timer(n) (set_stop_timer(n),set_start_timer(n))
  34.168 +
  34.169      addr &= HPET_MMAP_SIZE-1;
  34.170  
  34.171      if ( hpet_check_access_length(addr, length) != 0 )
  34.172 @@ -256,9 +309,6 @@ static int hpet_write(
  34.173      spin_lock(&h->lock);
  34.174  
  34.175      old_val = hpet_read64(h, addr);
  34.176 -    if ( (addr & ~7) == HPET_COUNTER )
  34.177 -        old_val = hpet_read_maincounter(h);
  34.178 -
  34.179      new_val = val;
  34.180      if ( length != 8 )
  34.181          new_val = hpet_fixup_reg(
  34.182 @@ -275,22 +325,35 @@ static int hpet_write(
  34.183              /* Enable main counter and interrupt generation. */
  34.184              h->mc_offset = h->hpet.mc64 - guest_time_hpet(h->vcpu);
  34.185              for ( i = 0; i < HPET_TIMER_NUM; i++ )
  34.186 -                hpet_set_timer(h, i); 
  34.187 +            {
  34.188 +                h->hpet.comparator64[i] =
  34.189 +                            h->hpet.timers[i].config & HPET_TN_32BIT ?
  34.190 +                                          (uint32_t)h->hpet.timers[i].cmp :
  34.191 +                                                    h->hpet.timers[i].cmp;
  34.192 +                if ( timer_enabled(h, i) )
  34.193 +                    set_start_timer(i);
  34.194 +            }
  34.195          }
  34.196          else if ( (old_val & HPET_CFG_ENABLE) && !(new_val & HPET_CFG_ENABLE) )
  34.197          {
  34.198              /* Halt main counter and disable interrupt generation. */
  34.199              h->hpet.mc64 = h->mc_offset + guest_time_hpet(h->vcpu);
  34.200              for ( i = 0; i < HPET_TIMER_NUM; i++ )
  34.201 -                hpet_stop_timer(h, i);
  34.202 +                if ( timer_enabled(h, i) )
  34.203 +                    set_stop_timer(i);
  34.204          }
  34.205          break;
  34.206  
  34.207      case HPET_COUNTER:
  34.208 +        h->hpet.mc64 = new_val;
  34.209          if ( hpet_enabled(h) )
  34.210 +        {
  34.211              gdprintk(XENLOG_WARNING, 
  34.212                       "HPET: writing main counter but it's not halted!\n");
  34.213 -        h->hpet.mc64 = new_val;
  34.214 +            for ( i = 0; i < HPET_TIMER_NUM; i++ )
  34.215 +                if ( timer_enabled(h, i) )
  34.216 +                    set_restart_timer(i);
  34.217 +        }
  34.218          break;
  34.219  
  34.220      case HPET_T0_CFG:
  34.221 @@ -313,7 +376,28 @@ static int hpet_write(
  34.222              h->hpet.timers[tn].cmp = (uint32_t)h->hpet.timers[tn].cmp;
  34.223              h->hpet.period[tn] = (uint32_t)h->hpet.period[tn];
  34.224          }
  34.225 -
  34.226 +        if ( hpet_enabled(h) )
  34.227 +        {
  34.228 +            if ( new_val & HPET_TN_ENABLE )
  34.229 +            {
  34.230 +                if ( (new_val ^ old_val) & HPET_TN_PERIODIC )
   34.231 +                    /* timer is enabled but switching between periodic and
   34.232 +                     * one-shot mode; stop and restart the vpt timer to get
   34.233 +                     * it into the right mode. */
  34.234 +                    set_restart_timer(tn);
  34.235 +                else if ( (new_val & HPET_TN_32BIT) &&
  34.236 +                         !(old_val & HPET_TN_32BIT) )
   34.237 +                    /* switching from 64 bit to 32 bit mode could cause the
   34.238 +                     * timer's next fire time, or period, to change. */
  34.239 +                    set_restart_timer(tn);
  34.240 +                else if ( !(old_val & HPET_TN_ENABLE) )
  34.241 +                    /* transition from timer disabled to timer enabled. */
  34.242 +                    set_start_timer(tn);
  34.243 +            }
  34.244 +            else if ( old_val & HPET_TN_ENABLE )
  34.245 +                /* transition from timer enabled to timer disabled. */
  34.246 +                set_stop_timer(tn);
  34.247 +        }
  34.248          break;
  34.249  
  34.250      case HPET_T0_CMP:
  34.251 @@ -322,24 +406,32 @@ static int hpet_write(
  34.252          tn = (addr - HPET_T0_CMP) >> 5;
  34.253          if ( timer_is_32bit(h, tn) )
  34.254              new_val = (uint32_t)new_val;
  34.255 -        if ( !timer_is_periodic(h, tn) ||
  34.256 -             (h->hpet.timers[tn].config & HPET_TN_SETVAL) )
  34.257 -            h->hpet.timers[tn].cmp = new_val;
  34.258 -        else
  34.259 +        h->hpet.timers[tn].cmp = new_val;
  34.260 +        if ( h->hpet.timers[tn].config & HPET_TN_SETVAL )
  34.261 +            /*
  34.262 +             * When SETVAL is one, software is able to "directly set a periodic
  34.263 +             * timer's accumulator."  That is, set the comparator without
  34.264 +             * adjusting the period.  Much the same as just setting the
  34.265 +             * comparator on an enabled one-shot timer.
  34.266 +             * 
  34.267 +             * This configuration bit clears when the comparator is written.
  34.268 +             */
  34.269 +            h->hpet.timers[tn].config &= ~HPET_TN_SETVAL;
  34.270 +        else if ( timer_is_periodic(h, tn) )
  34.271          {
  34.272              /*
  34.273               * Clamp period to reasonable min/max values:
  34.274 -             *  - minimum is 900us, same as timers controlled by vpt.c
  34.275 +             *  - minimum is 100us, same as timers controlled by vpt.c
  34.276               *  - maximum is to prevent overflow in time_after() calculations
  34.277               */
  34.278 -            if ( hpet_tick_to_ns(h, new_val) < MICROSECS(900) )
  34.279 -                new_val = (MICROSECS(900) << 10) / h->hpet_to_ns_scale;
  34.280 +            if ( hpet_tick_to_ns(h, new_val) < MICROSECS(100) )
  34.281 +                new_val = (MICROSECS(100) << 10) / h->hpet_to_ns_scale;
  34.282              new_val &= (timer_is_32bit(h, tn) ? ~0u : ~0ull) >> 1;
  34.283              h->hpet.period[tn] = new_val;
  34.284          }
  34.285 -        h->hpet.timers[tn].config &= ~HPET_TN_SETVAL;
  34.286 -        if ( hpet_enabled(h) )
  34.287 -            hpet_set_timer(h, tn);
  34.288 +        h->hpet.comparator64[tn] = new_val;
  34.289 +        if ( hpet_enabled(h) && timer_enabled(h, tn) )
  34.290 +            set_restart_timer(tn);
  34.291          break;
  34.292  
  34.293      case HPET_T0_ROUTE:
  34.294 @@ -354,6 +446,25 @@ static int hpet_write(
  34.295          break;
  34.296      }
  34.297  
   34.298 +    /* stop/start timers whose state was changed by this write. */
  34.299 +    while (stop_timers)
  34.300 +    {
  34.301 +        i = find_first_set_bit(stop_timers);
  34.302 +        __clear_bit(i, &stop_timers);
  34.303 +        hpet_stop_timer(h, i);
  34.304 +    }
  34.305 +
  34.306 +    while (start_timers)
  34.307 +    {
  34.308 +        i = find_first_set_bit(start_timers);
  34.309 +        __clear_bit(i, &start_timers);
  34.310 +        hpet_set_timer(h, i);
  34.311 +    }
  34.312 +
  34.313 +#undef set_stop_timer
  34.314 +#undef set_start_timer
  34.315 +#undef set_restart_timer
  34.316 +
  34.317      spin_unlock(&h->lock);
  34.318  
  34.319   out:
  34.320 @@ -373,86 +484,6 @@ struct hvm_mmio_handler hpet_mmio_handle
  34.321      .write_handler = hpet_write
  34.322  };
  34.323  
  34.324 -static void hpet_route_interrupt(HPETState *h, unsigned int tn)
  34.325 -{
  34.326 -    unsigned int tn_int_route = timer_int_route(h, tn);
  34.327 -    struct domain *d = h->vcpu->domain;
  34.328 -
  34.329 -    ASSERT(spin_is_locked(&h->lock));
  34.330 -
  34.331 -    if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) )
  34.332 -    {
  34.333 -        /* if LegacyReplacementRoute bit is set, HPET specification requires
  34.334 -           timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC,
  34.335 -           timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */
  34.336 -        int isa_irq = (tn == 0) ? 0 : 8;
  34.337 -        hvm_isa_irq_deassert(d, isa_irq);
  34.338 -        hvm_isa_irq_assert(d, isa_irq);
  34.339 -        return;
  34.340 -    }
  34.341 -
  34.342 -    if ( !(timer_int_route_cap(h, tn) & (1U << tn_int_route)) )
  34.343 -    {
  34.344 -        gdprintk(XENLOG_ERR,
  34.345 -                 "HPET: timer%u: invalid interrupt route config\n", tn);
  34.346 -        domain_crash(d);
  34.347 -        return;
  34.348 -    }
  34.349 -
  34.350 -    /* We support only edge-triggered interrupt. */
  34.351 -    spin_lock(&d->arch.hvm_domain.irq_lock);
  34.352 -    vioapic_irq_positive_edge(d, tn_int_route);
  34.353 -    spin_unlock(&d->arch.hvm_domain.irq_lock);
  34.354 -}
  34.355 -
  34.356 -static void hpet_timer_fn(void *opaque)
  34.357 -{
  34.358 -    struct HPET_timer_fn_info *htfi = opaque;
  34.359 -    HPETState *h = htfi->hs;
  34.360 -    unsigned int tn = htfi->tn;
  34.361 -
  34.362 -    spin_lock(&h->lock);
  34.363 -
  34.364 -    if ( !hpet_enabled(h) )
  34.365 -    {
  34.366 -        spin_unlock(&h->lock);
  34.367 -        return;
  34.368 -    }
  34.369 -
  34.370 -    if ( timer_config(h, tn) & HPET_TN_ENABLE )
  34.371 -        hpet_route_interrupt(h, tn);
  34.372 -
  34.373 -    if ( timer_is_periodic(h, tn) && (h->hpet.period[tn] != 0) )
  34.374 -    {
  34.375 -        uint64_t mc = hpet_read_maincounter(h), period = h->hpet.period[tn];
  34.376 -        if ( timer_is_32bit(h, tn) )
  34.377 -        {
  34.378 -            while ( hpet_time_after(mc, h->hpet.timers[tn].cmp) )
  34.379 -                h->hpet.timers[tn].cmp = (uint32_t)(
  34.380 -                    h->hpet.timers[tn].cmp + period);
  34.381 -        }
  34.382 -        else
  34.383 -        {
  34.384 -            while ( hpet_time_after64(mc, h->hpet.timers[tn].cmp) )
  34.385 -                h->hpet.timers[tn].cmp += period;
  34.386 -        }
  34.387 -        set_timer(&h->timers[tn], NOW() + hpet_tick_to_ns(h, period));
  34.388 -    }
  34.389 -
  34.390 -    spin_unlock(&h->lock);
  34.391 -}
  34.392 -
  34.393 -void hpet_migrate_timers(struct vcpu *v)
  34.394 -{
  34.395 -    struct HPETState *h = &v->domain->arch.hvm_domain.pl_time.vhpet;
  34.396 -    int i;
  34.397 -
  34.398 -    if ( v != h->vcpu )
  34.399 -        return;
  34.400 -
  34.401 -    for ( i = 0; i < HPET_TIMER_NUM; i++ )
  34.402 -        migrate_timer(&h->timers[i], v->processor);
  34.403 -}
  34.404  
  34.405  static int hpet_save(struct domain *d, hvm_domain_context_t *h)
  34.406  {
  34.407 @@ -477,18 +508,20 @@ static int hpet_save(struct domain *d, h
  34.408          C(isr);
  34.409          C(mc64);
  34.410          C(timers[0].config);
  34.411 -        C(timers[0].cmp);
  34.412          C(timers[0].fsb);
  34.413          C(timers[1].config);
  34.414 -        C(timers[1].cmp);
  34.415          C(timers[1].fsb);
  34.416          C(timers[2].config);
  34.417 -        C(timers[2].cmp);
  34.418          C(timers[2].fsb);
  34.419          C(period[0]);
  34.420          C(period[1]);
  34.421          C(period[2]);
  34.422  #undef C
  34.423 +        /* save the 64 bit comparator in the 64 bit timer[n].cmp field
  34.424 +         * regardless of whether or not the timer is in 32 bit mode. */
  34.425 +        rec->timers[0].cmp = hp->hpet.comparator64[0];
  34.426 +        rec->timers[1].cmp = hp->hpet.comparator64[1];
  34.427 +        rec->timers[2].cmp = hp->hpet.comparator64[2];
  34.428      }
  34.429  
  34.430      spin_unlock(&hp->lock);
  34.431 @@ -500,6 +533,7 @@ static int hpet_load(struct domain *d, h
  34.432  {
  34.433      HPETState *hp = &d->arch.hvm_domain.pl_time.vhpet;
  34.434      struct hvm_hw_hpet *rec;
  34.435 +    uint64_t cmp;
  34.436      int i;
  34.437  
  34.438      spin_lock(&hp->lock);
  34.439 @@ -515,32 +549,38 @@ static int hpet_load(struct domain *d, h
  34.440      h->cur += HVM_SAVE_LENGTH(HPET);
  34.441  
  34.442  #define C(x) hp->hpet.x = rec->x
  34.443 -        C(capability);
  34.444 -        C(config);
  34.445 -        C(isr);
  34.446 -        C(mc64);
  34.447 -        C(timers[0].config);
  34.448 -        C(timers[0].cmp);
  34.449 -        C(timers[0].fsb);
  34.450 -        C(timers[1].config);
  34.451 -        C(timers[1].cmp);
  34.452 -        C(timers[1].fsb);
  34.453 -        C(timers[2].config);
  34.454 -        C(timers[2].cmp);
  34.455 -        C(timers[2].fsb);
  34.456 -        C(period[0]);
  34.457 -        C(period[1]);
  34.458 -        C(period[2]);
  34.459 +    C(capability);
  34.460 +    C(config);
  34.461 +    C(isr);
  34.462 +    C(mc64);
  34.463 +    /* The following define will generate a compiler error if HPET_TIMER_NUM
   34.464 +     * changes. This indicates an incompatibility with previous saved state. */
  34.465 +#define HPET_TIMER_NUM 3
  34.466 +    for ( i = 0; i < HPET_TIMER_NUM; i++ )
  34.467 +    {
  34.468 +        C(timers[i].config);
  34.469 +        C(timers[i].fsb);
  34.470 +        C(period[i]);
  34.471 +        /* restore the hidden 64 bit comparator and truncate the timer's
  34.472 +         * visible comparator field if in 32 bit mode. */
  34.473 +        cmp = rec->timers[i].cmp;
  34.474 +        hp->hpet.comparator64[i] = cmp;
  34.475 +        if ( timer_is_32bit(hp, i) )
  34.476 +            cmp = (uint32_t)cmp;
  34.477 +        hp->hpet.timers[i].cmp = cmp;
  34.478 +    }
  34.479  #undef C
  34.480      
  34.481      /* Recalculate the offset between the main counter and guest time */
  34.482      hp->mc_offset = hp->hpet.mc64 - guest_time_hpet(hp->vcpu);
  34.483 -                
  34.484 -    /* Restart the timers */
  34.485 -    for ( i = 0; i < HPET_TIMER_NUM; i++ )
  34.486 -        if ( hpet_enabled(hp) )
  34.487 -            hpet_set_timer(hp, i);
  34.488 +
  34.489 +    /* restart all timers */
  34.490  
  34.491 +    if ( hpet_enabled(hp) )
  34.492 +        for ( i = 0; i < HPET_TIMER_NUM; i++ )
  34.493 +            if ( timer_enabled(hp, i) )
  34.494 +                hpet_set_timer(hp, i);
  34.495 + 
  34.496      spin_unlock(&hp->lock);
  34.497  
  34.498      return 0;
  34.499 @@ -575,10 +615,7 @@ void hpet_init(struct vcpu *v)
  34.500          h->hpet.timers[i].config = 
  34.501              HPET_TN_INT_ROUTE_CAP | HPET_TN_SIZE_CAP | HPET_TN_PERIODIC_CAP;
  34.502          h->hpet.timers[i].cmp = ~0ULL;
  34.503 -        h->timer_fn_info[i].hs = h;
  34.504 -        h->timer_fn_info[i].tn = i;
  34.505 -        init_timer(&h->timers[i], hpet_timer_fn, &h->timer_fn_info[i],
  34.506 -                   v->processor);
  34.507 +        h->pt[i].source = PTSRC_isa;
  34.508      }
  34.509  }
  34.510  
  34.511 @@ -587,8 +624,14 @@ void hpet_deinit(struct domain *d)
  34.512      int i;
  34.513      HPETState *h = &d->arch.hvm_domain.pl_time.vhpet;
  34.514  
  34.515 -    for ( i = 0; i < HPET_TIMER_NUM; i++ )
  34.516 -        kill_timer(&h->timers[i]);
  34.517 +    spin_lock(&h->lock);
  34.518 +
  34.519 +    if ( hpet_enabled(h) )
  34.520 +        for ( i = 0; i < HPET_TIMER_NUM; i++ )
  34.521 +            if ( timer_enabled(h, i) )
  34.522 +                hpet_stop_timer(h, i);
  34.523 +
  34.524 +    spin_unlock(&h->lock);
  34.525  }
  34.526  
  34.527  void hpet_reset(struct domain *d)
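
A pattern worth noting in the rewritten hpet_write() above: while the
register image is being mutated, required timer restarts are only recorded
in the stop_timers/start_timers bit masks; the actual hpet_stop_timer() and
hpet_set_timer() calls are replayed afterwards, all stops strictly before
all starts, once the state they read is consistent. A generic sketch of the
deferral idiom (__builtin_ctzl stands in for Xen's find_first_set_bit()):

    #include <stdio.h>

    static void stop_timer(unsigned int i)  { printf("stop timer %u\n", i); }
    static void start_timer(unsigned int i) { printf("start timer %u\n", i); }

    int main(void)
    {
        unsigned long stop_mask = 0, start_mask = 0;

        /* phase 1: mutate state, merely recording what must happen */
        stop_mask  |= 1UL << 0;   /* timer 0 changed mode: restart it */
        start_mask |= 1UL << 0;
        start_mask |= 1UL << 2;   /* timer 2 newly enabled */

        /* phase 2: replay, stops before starts */
        while (stop_mask) {
            unsigned int i = __builtin_ctzl(stop_mask);
            stop_mask &= stop_mask - 1;
            stop_timer(i);
        }
        while (start_mask) {
            unsigned int i = __builtin_ctzl(start_mask);
            start_mask &= start_mask - 1;
            start_timer(i);
        }
        return 0;
    }
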
    35.1 --- a/xen/arch/x86/hvm/hvm.c	Tue Nov 04 12:07:22 2008 +0900
    35.2 +++ b/xen/arch/x86/hvm/hvm.c	Tue Nov 04 12:43:19 2008 +0900
    35.3 @@ -163,7 +163,6 @@ u64 hvm_get_guest_tsc(struct vcpu *v)
    35.4  void hvm_migrate_timers(struct vcpu *v)
    35.5  {
    35.6      rtc_migrate_timers(v);
    35.7 -    hpet_migrate_timers(v);
    35.8      pt_migrate(v);
    35.9  }
   35.10  
    36.1 --- a/xen/arch/x86/hvm/i8254.c	Tue Nov 04 12:07:22 2008 +0900
    36.2 +++ b/xen/arch/x86/hvm/i8254.c	Tue Nov 04 12:43:19 2008 +0900
    36.3 @@ -213,13 +213,13 @@ static void pit_load_count(PITState *pit
    36.4      case 2:
    36.5      case 3:
    36.6          /* Periodic timer. */
    36.7 -        create_periodic_time(v, &pit->pt0, period, 0, 0, pit_time_fired, 
    36.8 +        create_periodic_time(v, &pit->pt0, period, period, 0, pit_time_fired, 
    36.9                               &pit->count_load_time[channel]);
   36.10          break;
   36.11      case 1:
   36.12      case 4:
   36.13          /* One-shot timer. */
   36.14 -        create_periodic_time(v, &pit->pt0, period, 0, 1, pit_time_fired,
   36.15 +        create_periodic_time(v, &pit->pt0, period, 0, 0, pit_time_fired,
   36.16                               &pit->count_load_time[channel]);
   36.17          break;
   36.18      default:
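
Judging from the call sites touched in this changeset (i8254 above, and rtc,
vlapic and the new HPET code elsewhere), create_periodic_time() now takes an
explicit period rather than a one_shot flag, with period == 0 meaning
one-shot. A local mock of the convention, not the Xen prototype:

    #include <stdint.h>
    #include <stdio.h>

    static void create_periodic_time_mock(uint64_t delta_ns, uint64_t period_ns,
                                          unsigned int irq)
    {
        if (period_ns == 0)
            printf("one-shot: fire once in %llu ns on irq %u\n",
                   (unsigned long long)delta_ns, irq);
        else
            printf("periodic: first in %llu ns, then every %llu ns on irq %u\n",
                   (unsigned long long)delta_ns,
                   (unsigned long long)period_ns, irq);
    }

    int main(void)
    {
        create_periodic_time_mock(1000000, 1000000, 0); /* PIT modes 2/3 */
        create_periodic_time_mock(1000000, 0, 0);       /* PIT modes 1/4 */
        return 0;
    }
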
    37.1 --- a/xen/arch/x86/hvm/rtc.c	Tue Nov 04 12:07:22 2008 +0900
    37.2 +++ b/xen/arch/x86/hvm/rtc.c	Tue Nov 04 12:43:19 2008 +0900
    37.3 @@ -59,8 +59,8 @@ static void rtc_timer_update(RTCState *s
    37.4  
    37.5          period = 1 << (period_code - 1); /* period in 32 Khz cycles */
    37.6          period = DIV_ROUND((period * 1000000000ULL), 32768); /* period in ns */
    37.7 -        create_periodic_time(v, &s->pt, period, RTC_IRQ,
    37.8 -                             0, rtc_periodic_cb, s);
    37.9 +        create_periodic_time(v, &s->pt, period, period, RTC_IRQ,
   37.10 +                             rtc_periodic_cb, s);
   37.11      }
   37.12      else
   37.13      {
    38.1 --- a/xen/arch/x86/hvm/svm/entry.S	Tue Nov 04 12:07:22 2008 +0900
    38.2 +++ b/xen/arch/x86/hvm/svm/entry.S	Tue Nov 04 12:43:19 2008 +0900
    38.3 @@ -57,6 +57,8 @@
    38.4  #endif
    38.5  
    38.6  ENTRY(svm_asm_do_resume)
    38.7 +        call svm_intr_assist
    38.8 +
    38.9          get_current(bx)
   38.10          CLGI
   38.11  
   38.12 @@ -67,7 +69,6 @@ ENTRY(svm_asm_do_resume)
   38.13          jnz  .Lsvm_process_softirqs
   38.14  
   38.15          call svm_asid_handle_vmrun
   38.16 -        call svm_intr_assist
   38.17  
   38.18          cmpb $0,addr_of(tb_init_done)
   38.19          jnz  .Lsvm_trace
    39.1 --- a/xen/arch/x86/hvm/vlapic.c	Tue Nov 04 12:07:22 2008 +0900
    39.2 +++ b/xen/arch/x86/hvm/vlapic.c	Tue Nov 04 12:43:19 2008 +0900
    39.3 @@ -701,8 +701,9 @@ static int vlapic_write(struct vcpu *v, 
    39.4                              (uint32_t)val * vlapic->hw.timer_divisor;
    39.5  
    39.6          vlapic_set_reg(vlapic, APIC_TMICT, val);
    39.7 -        create_periodic_time(current, &vlapic->pt, period, vlapic->pt.irq,
    39.8 -                             !vlapic_lvtt_period(vlapic), vlapic_pt_cb,
    39.9 +        create_periodic_time(current, &vlapic->pt, period, 
   39.10 +                             vlapic_lvtt_period(vlapic) ? period : 0,
   39.11 +                             vlapic->pt.irq, vlapic_pt_cb,
   39.12                               &vlapic->timer_last_update);
   39.13          vlapic->timer_last_update = vlapic->pt.last_plt_gtime;
   39.14  
   39.15 @@ -861,8 +862,9 @@ static void lapic_rearm(struct vlapic *s
   39.16      period = ((uint64_t)APIC_BUS_CYCLE_NS *
   39.17                (uint32_t)tmict * s->hw.timer_divisor);
   39.18      s->pt.irq = vlapic_get_reg(s, APIC_LVTT) & APIC_VECTOR_MASK;
   39.19 -    create_periodic_time(vlapic_vcpu(s), &s->pt, period, s->pt.irq,
   39.20 -                         !vlapic_lvtt_period(s), vlapic_pt_cb,
   39.21 +    create_periodic_time(vlapic_vcpu(s), &s->pt, period,
   39.22 +                         vlapic_lvtt_period(s) ? period : 0,
   39.23 +                         s->pt.irq, vlapic_pt_cb,
   39.24                           &s->timer_last_update);
   39.25      s->timer_last_update = s->pt.last_plt_gtime;
   39.26  }
    40.1 --- a/xen/arch/x86/hvm/vmx/entry.S	Tue Nov 04 12:07:22 2008 +0900
    40.2 +++ b/xen/arch/x86/hvm/vmx/entry.S	Tue Nov 04 12:43:19 2008 +0900
    40.3 @@ -122,6 +122,8 @@ vmx_asm_vmexit_handler:
    40.4  
    40.5  .globl vmx_asm_do_vmentry
    40.6  vmx_asm_do_vmentry:
    40.7 +        call vmx_intr_assist
    40.8 +
    40.9          get_current(bx)
   40.10          cli
   40.11  
   40.12 @@ -131,8 +133,6 @@ vmx_asm_do_vmentry:
   40.13          cmpl $0,(r(dx),r(ax),1)
   40.14          jnz  .Lvmx_process_softirqs
   40.15  
   40.16 -        call vmx_intr_assist
   40.17 -
   40.18          testb $0xff,VCPU_vmx_emul(r(bx))
   40.19          jnz  .Lvmx_goto_realmode
   40.20  
   40.21 @@ -179,11 +179,13 @@ vmx_asm_do_vmentry:
   40.22  
   40.23  /*.Lvmx_resume:*/
   40.24          VMRESUME
   40.25 +        sti
   40.26          call vm_resume_fail
   40.27          ud2
   40.28  
   40.29  .Lvmx_launch:
   40.30          VMLAUNCH
   40.31 +        sti
   40.32          call vm_launch_fail
   40.33          ud2
   40.34  
    41.1 --- a/xen/arch/x86/hvm/vmx/vmx.c	Tue Nov 04 12:07:22 2008 +0900
    41.2 +++ b/xen/arch/x86/hvm/vmx/vmx.c	Tue Nov 04 12:43:19 2008 +0900
    41.3 @@ -49,6 +49,7 @@
    41.4  #include <asm/hvm/vpt.h>
    41.5  #include <public/hvm/save.h>
    41.6  #include <asm/hvm/trace.h>
    41.7 +#include <asm/xenoprof.h>
    41.8  
    41.9  enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
   41.10  
   41.11 @@ -132,6 +133,7 @@ static void vmx_vcpu_destroy(struct vcpu
   41.12  {
   41.13      vmx_destroy_vmcs(v);
   41.14      vpmu_destroy(v);
   41.15 +    passive_domain_destroy(v);
   41.16  }
   41.17  
   41.18  #ifdef __x86_64__
   41.19 @@ -1666,6 +1668,8 @@ static int vmx_msr_read_intercept(struct
   41.20      default:
   41.21          if ( vpmu_do_rdmsr(regs) )
   41.22              goto done;
   41.23 +        if ( passive_domain_do_rdmsr(regs) )
   41.24 +            goto done;
   41.25          switch ( long_mode_do_msr_read(regs) )
   41.26          {
   41.27              case HNDL_unhandled:
   41.28 @@ -1861,6 +1865,8 @@ static int vmx_msr_write_intercept(struc
   41.29      default:
   41.30          if ( vpmu_do_wrmsr(regs) )
   41.31              return X86EMUL_OKAY;
   41.32 +        if ( passive_domain_do_wrmsr(regs) )
   41.33 +            return X86EMUL_OKAY;
   41.34  
   41.35          if ( wrmsr_viridian_regs(ecx, regs->eax, regs->edx) ) 
   41.36              break;
   41.37 @@ -1964,27 +1970,25 @@ static void ept_handle_violation(unsigne
   41.38  {
   41.39      unsigned long gla_validity = qualification & EPT_GLA_VALIDITY_MASK;
   41.40      struct domain *d = current->domain;
   41.41 -    unsigned long gfn = gpa >> PAGE_SHIFT;
   41.42 +    unsigned long gla, gfn = gpa >> PAGE_SHIFT;
   41.43      mfn_t mfn;
   41.44      p2m_type_t t;
   41.45  
   41.46 -    if ( unlikely(qualification & EPT_GAW_VIOLATION) )
   41.47 +    mfn = gfn_to_mfn(d, gfn, &t);
   41.48 +
   41.49 +    /* There are two legitimate reasons for taking an EPT violation. 
   41.50 +     * One is a guest access to MMIO space. */
   41.51 +    if ( gla_validity == EPT_GLA_VALIDITY_MATCH && p2m_is_mmio(t) )
   41.52      {
   41.53 -        gdprintk(XENLOG_ERR, "EPT violation: guest physical address %"PRIpaddr
   41.54 -                 " exceeded its width limit.\n", gpa);
   41.55 -        goto crash;
   41.56 +        handle_mmio();
   41.57 +        return;
   41.58      }
   41.59  
   41.60 -    if ( unlikely(gla_validity == EPT_GLA_VALIDITY_RSVD) ||
   41.61 -         unlikely(gla_validity == EPT_GLA_VALIDITY_PDPTR_LOAD) )
   41.62 -    {
   41.63 -        gdprintk(XENLOG_ERR, "EPT violation: reserved bit or "
   41.64 -                 "pdptr load violation.\n");
   41.65 -        goto crash;
   41.66 -    }
   41.67 -
   41.68 -    mfn = gfn_to_mfn(d, gfn, &t);
   41.69 -    if ( (t != p2m_ram_ro) && p2m_is_ram(t) && paging_mode_log_dirty(d) )
   41.70 +    /* The other is log-dirty mode, writing to a read-only page */
   41.71 +    if ( paging_mode_log_dirty(d)
   41.72 +         && (gla_validity == EPT_GLA_VALIDITY_MATCH
   41.73 +             || gla_validity == EPT_GLA_VALIDITY_GPT_WALK)
   41.74 +         && p2m_is_ram(t) && (t != p2m_ram_ro) )
   41.75      {
   41.76          paging_mark_dirty(d, mfn_x(mfn));
   41.77          p2m_change_type(d, gfn, p2m_ram_logdirty, p2m_ram_rw);
   41.78 @@ -1992,16 +1996,39 @@ static void ept_handle_violation(unsigne
   41.79          return;
   41.80      }
   41.81  
   41.82 -    /* This can only happen in log-dirty mode, writing back A/D bits. */
   41.83 -    if ( unlikely(gla_validity == EPT_GLA_VALIDITY_GPT_WALK) )
   41.84 -        goto crash;
   41.85 +    /* Everything else is an error. */
   41.86 +    gla = __vmread(GUEST_LINEAR_ADDRESS);
   41.87 +    gdprintk(XENLOG_ERR, "EPT violation %#lx (%c%c%c/%c%c%c), "
   41.88 +             "gpa %#"PRIpaddr", mfn %#lx, type %i.\n", 
   41.89 +             qualification, 
   41.90 +             (qualification & EPT_READ_VIOLATION) ? 'r' : '-',
   41.91 +             (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-',
   41.92 +             (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-',
   41.93 +             (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-',
   41.94 +             (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-',
   41.95 +             (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-',
   41.96 +             gpa, mfn_x(mfn), t);
   41.97 +
   41.98 +    if ( qualification & EPT_GAW_VIOLATION )
   41.99 +        gdprintk(XENLOG_ERR, " --- GPA too wide (max %u bits)\n", 
  41.100 +                 9 * (unsigned) d->arch.hvm_domain.vmx.ept_control.gaw + 21);
  41.101  
  41.102 -    ASSERT(gla_validity == EPT_GLA_VALIDITY_MATCH);
  41.103 -    handle_mmio();
  41.104 +    switch ( gla_validity )
  41.105 +    {
  41.106 +    case EPT_GLA_VALIDITY_PDPTR_LOAD:
  41.107 +        gdprintk(XENLOG_ERR, " --- PDPTR load failed\n"); 
  41.108 +        break;
  41.109 +    case EPT_GLA_VALIDITY_GPT_WALK:
  41.110 +        gdprintk(XENLOG_ERR, " --- guest PT walk to %#lx failed\n", gla);
  41.111 +        break;
  41.112 +    case EPT_GLA_VALIDITY_RSVD:
  41.113 +        gdprintk(XENLOG_ERR, " --- GLA_validity 2 (reserved)\n");
  41.114 +        break;
  41.115 +    case EPT_GLA_VALIDITY_MATCH:
  41.116 +        gdprintk(XENLOG_ERR, " --- guest access to %#lx failed\n", gla);
  41.117 +        break;
  41.118 +    }
  41.119  
  41.120 -    return;
  41.121 -
  41.122 - crash:
  41.123      domain_crash(d);
  41.124  }
  41.125  
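The rewritten ept_handle_violation() now handles the two legitimate cases (MMIO emulation and log-dirty write faults) up front and funnels every other qualification through a single diagnostic path before crashing the domain. An illustrative-only helper (hypothetical name, using the EPT_* masks from the hunk above) showing how the gdprintk() decodes the exit qualification into its "rwx/rwx" string:

    /* Sketch: render the violation bits (left) and the effective
     * permissions (right) exactly as the error message above does. */
    static void ept_qual_to_str(unsigned long q, char buf[8])
    {
        buf[0] = (q & EPT_READ_VIOLATION)  ? 'r' : '-';
        buf[1] = (q & EPT_WRITE_VIOLATION) ? 'w' : '-';
        buf[2] = (q & EPT_EXEC_VIOLATION)  ? 'x' : '-';
        buf[3] = '/';
        buf[4] = (q & EPT_EFFECTIVE_READ)  ? 'r' : '-';
        buf[5] = (q & EPT_EFFECTIVE_WRITE) ? 'w' : '-';
        buf[6] = (q & EPT_EFFECTIVE_EXEC)  ? 'x' : '-';
        buf[7] = '\0';
    }
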
    42.1 --- a/xen/arch/x86/hvm/vmx/vpmu_core2.c	Tue Nov 04 12:07:22 2008 +0900
    42.2 +++ b/xen/arch/x86/hvm/vmx/vpmu_core2.c	Tue Nov 04 12:43:19 2008 +0900
    42.3 @@ -35,6 +35,26 @@
    42.4  #include <asm/hvm/vmx/vpmu.h>
    42.5  #include <asm/hvm/vmx/vpmu_core2.h>
    42.6  
    42.7 +u32 core2_counters_msr[] =   {
    42.8 +    MSR_CORE_PERF_FIXED_CTR0,
    42.9 +    MSR_CORE_PERF_FIXED_CTR1,
   42.10 +    MSR_CORE_PERF_FIXED_CTR2};
   42.11 +
   42.12 +/* Core 2 Non-architectural Performance Control MSRs. */
   42.13 +u32 core2_ctrls_msr[] = {
   42.14 +    MSR_CORE_PERF_FIXED_CTR_CTRL,
   42.15 +    MSR_IA32_PEBS_ENABLE,
   42.16 +    MSR_IA32_DS_AREA};
   42.17 +
   42.18 +struct pmumsr core2_counters = {
   42.19 +    3,
   42.20 +    core2_counters_msr
   42.21 +};
   42.22 +
   42.23 +struct pmumsr core2_ctrls = {
   42.24 +    3,
   42.25 +    core2_ctrls_msr
   42.26 +};
   42.27  static int arch_pmc_cnt;
   42.28  
   42.29  static int core2_get_pmc_count(void)
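
The new tables pair the fixed counters with their control MSRs so the passive-domain code added in op_model_ppro.c can save and restore them by index. A minimal consumption sketch; the field names (num/msr here) are assumptions about struct pmumsr's { count, array } layout:

    /* Sketch only: iterate a pmumsr table and snapshot each MSR. */
    static void core2_snapshot(const struct pmumsr *t, u64 *out)
    {
        unsigned int i;
        for ( i = 0; i < t->num; i++ )
            rdmsrl(t->msr[i], out[i]);
    }
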
    43.1 --- a/xen/arch/x86/hvm/vpt.c	Tue Nov 04 12:07:22 2008 +0900
    43.2 +++ b/xen/arch/x86/hvm/vpt.c	Tue Nov 04 12:43:19 2008 +0900
    43.3 @@ -355,8 +355,8 @@ void pt_migrate(struct vcpu *v)
    43.4  }
    43.5  
    43.6  void create_periodic_time(
    43.7 -    struct vcpu *v, struct periodic_time *pt, uint64_t period,
    43.8 -    uint8_t irq, char one_shot, time_cb *cb, void *data)
    43.9 +    struct vcpu *v, struct periodic_time *pt, uint64_t delta,
   43.10 +    uint64_t period, uint8_t irq, time_cb *cb, void *data)
   43.11  {
   43.12      ASSERT(pt->source != 0);
   43.13  
   43.14 @@ -368,13 +368,13 @@ void create_periodic_time(
   43.15      pt->do_not_freeze = 0;
   43.16      pt->irq_issued = 0;
   43.17  
   43.18 -    /* Periodic timer must be at least 0.9ms. */
   43.19 -    if ( (period < 900000) && !one_shot )
   43.20 +    /* Periodic timer must be at least 0.1ms. */
   43.21 +    if ( (period < 100000) && period )
   43.22      {
   43.23          if ( !test_and_set_bool(pt->warned_timeout_too_short) )
   43.24              gdprintk(XENLOG_WARNING, "HVM_PlatformTime: program too "
   43.25                       "small period %"PRIu64"\n", period);
   43.26 -        period = 900000;
   43.27 +        period = 100000;
   43.28      }
   43.29  
   43.30      pt->period = period;
   43.31 @@ -382,15 +382,15 @@ void create_periodic_time(
   43.32      pt->last_plt_gtime = hvm_get_guest_time(pt->vcpu);
   43.33      pt->irq = irq;
   43.34      pt->period_cycles = (u64)period;
   43.35 -    pt->one_shot = one_shot;
   43.36 -    pt->scheduled = NOW() + period;
   43.37 +    pt->one_shot = !period;
   43.38 +    pt->scheduled = NOW() + delta;
   43.39      /*
   43.40       * Offset LAPIC ticks from other timer ticks. Otherwise guests which use
   43.41       * LAPIC ticks for process accounting can see long sequences of process
   43.42       * ticks incorrectly accounted to interrupt processing.
   43.43       */
   43.44 -    if ( pt->source == PTSRC_lapic )
   43.45 -        pt->scheduled += period >> 1;
   43.46 +    if ( !pt->one_shot && (pt->source == PTSRC_lapic) )
   43.47 +        pt->scheduled += delta >> 1;
   43.48      pt->cb = cb;
   43.49      pt->priv = data;
   43.50  
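create_periodic_time() loses its explicit one_shot flag: callers now pass delta (time to first expiry) separately from period, and period == 0 means one-shot. The minimum periodic rate is also relaxed from 0.9ms (900000ns) to 0.1ms (100000ns). A hedged caller sketch; MICROSECS() is Xen's nanosecond conversion helper:

    /* One-shot: fire once, 100us from now. */
    create_periodic_time(v, pt, MICROSECS(100), 0, irq, cb, data);

    /* Periodic: first tick after one full period, then every period. */
    create_periodic_time(v, pt, period, period, irq, cb, data);
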
    44.1 --- a/xen/arch/x86/irq.c	Tue Nov 04 12:07:22 2008 +0900
    44.2 +++ b/xen/arch/x86/irq.c	Tue Nov 04 12:43:19 2008 +0900
    44.3 @@ -793,6 +793,10 @@ int map_domain_pirq(
    44.4  
    44.5      ASSERT(spin_is_locked(&d->event_lock));
    44.6  
    44.7 +    /* XXX Until pcidev and msi locking is fixed. */
    44.8 +    if ( type == MAP_PIRQ_TYPE_MSI )
    44.9 +        return -EINVAL;
   44.10 +
   44.11      if ( !IS_PRIV(current->domain) )
   44.12          return -EPERM;
   44.13  
   44.14 @@ -840,7 +844,7 @@ int map_domain_pirq(
   44.15      d->arch.pirq_vector[pirq] = vector;
   44.16      d->arch.vector_pirq[vector] = pirq;
   44.17  
   44.18 -done:
   44.19 + done:
   44.20      spin_unlock_irqrestore(&desc->lock, flags);
   44.21      return ret;
   44.22  }
    45.1 --- a/xen/arch/x86/mm.c	Tue Nov 04 12:07:22 2008 +0900
    45.2 +++ b/xen/arch/x86/mm.c	Tue Nov 04 12:43:19 2008 +0900
    45.3 @@ -566,19 +566,21 @@ static int get_page_from_pagenr(unsigned
    45.4  static int get_page_and_type_from_pagenr(unsigned long page_nr, 
    45.5                                           unsigned long type,
    45.6                                           struct domain *d,
    45.7 +                                         int partial,
    45.8                                           int preemptible)
    45.9  {
   45.10      struct page_info *page = mfn_to_page(page_nr);
   45.11      int rc;
   45.12  
   45.13 -    if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
   45.14 +    if ( likely(partial >= 0) &&
   45.15 +         unlikely(!get_page_from_pagenr(page_nr, d)) )
   45.16          return -EINVAL;
   45.17  
   45.18      rc = (preemptible ?
   45.19            get_page_type_preemptible(page, type) :
   45.20            (get_page_type(page, type) ? 0 : -EINVAL));
   45.21  
   45.22 -    if ( rc )
   45.23 +    if ( unlikely(rc) && partial >= 0 )
   45.24          put_page(page);
   45.25  
   45.26      return rc;
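
The new partial parameter threads preemption state through the page-table reference-counting paths. A hedged reconstruction of its contract, pieced together from the hunks in this file rather than from any authoritative comment:

    /*
     * partial == 0 : normal case; acquire (or drop) both the general
     *                and the type reference.
     * partial  < 0 : resuming a preempted validation; the general
     *                reference is already held, so neither retake it
     *                nor drop it on type failure.
     * partial  > 0 : on the put side, only the type reference remains
     *                to be dropped (hence the __put_page_type() call
     *                in put_page_from_l3e/l4e below).
     */
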
   45.27 @@ -761,7 +763,7 @@ get_page_from_l2e(
   45.28      }
   45.29  
   45.30      rc = get_page_and_type_from_pagenr(
   45.31 -        l2e_get_pfn(l2e), PGT_l1_page_table, d, 0);
   45.32 +        l2e_get_pfn(l2e), PGT_l1_page_table, d, 0, 0);
   45.33      if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
   45.34          rc = 0;
   45.35  
   45.36 @@ -772,7 +774,7 @@ get_page_from_l2e(
   45.37  define_get_linear_pagetable(l3);
   45.38  static int
   45.39  get_page_from_l3e(
   45.40 -    l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int preemptible)
   45.41 +    l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, int preemptible)
   45.42  {
   45.43      int rc;
   45.44  
   45.45 @@ -786,7 +788,7 @@ get_page_from_l3e(
   45.46      }
   45.47  
   45.48      rc = get_page_and_type_from_pagenr(
   45.49 -        l3e_get_pfn(l3e), PGT_l2_page_table, d, preemptible);
   45.50 +        l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible);
   45.51      if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
   45.52          rc = 0;
   45.53  
   45.54 @@ -797,7 +799,7 @@ get_page_from_l3e(
   45.55  define_get_linear_pagetable(l4);
   45.56  static int
   45.57  get_page_from_l4e(
   45.58 -    l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int preemptible)
   45.59 +    l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, int preemptible)
   45.60  {
   45.61      int rc;
   45.62  
   45.63 @@ -811,7 +813,7 @@ get_page_from_l4e(
   45.64      }
   45.65  
   45.66      rc = get_page_and_type_from_pagenr(
   45.67 -        l4e_get_pfn(l4e), PGT_l3_page_table, d, preemptible);
   45.68 +        l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible);
   45.69      if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
   45.70          rc = 0;
   45.71  
   45.72 @@ -961,23 +963,32 @@ static int put_page_from_l2e(l2_pgentry_
   45.73      return 1;
   45.74  }
   45.75  
   45.76 +static int __put_page_type(struct page_info *, int preemptible);
   45.77  
   45.78  static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
   45.79 -                             int preemptible)
   45.80 +                             int partial, int preemptible)
   45.81  {
   45.82      if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) && 
   45.83           (l3e_get_pfn(l3e) != pfn) )
   45.84 +    {
   45.85 +        if ( unlikely(partial > 0) )
   45.86 +            return __put_page_type(l3e_get_page(l3e), preemptible);
   45.87          return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
   45.88 +    }
   45.89      return 1;
   45.90  }
   45.91  
   45.92  #if CONFIG_PAGING_LEVELS >= 4
   45.93  static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
   45.94 -                             int preemptible)
   45.95 +                             int partial, int preemptible)
   45.96  {
   45.97      if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && 
   45.98           (l4e_get_pfn(l4e) != pfn) )
   45.99 +    {
  45.100 +        if ( unlikely(partial > 0) )
  45.101 +            return __put_page_type(l4e_get_page(l4e), preemptible);
  45.102          return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
  45.103 +    }
  45.104      return 1;
  45.105  }
  45.106  #endif
  45.107 @@ -1184,7 +1195,7 @@ static int alloc_l3_table(struct page_in
  45.108      unsigned long  pfn = page_to_mfn(page);
  45.109      l3_pgentry_t  *pl3e;
  45.110      unsigned int   i;
  45.111 -    int            rc = 0;
  45.112 +    int            rc = 0, partial = page->partial_pte;
  45.113  
  45.114  #if CONFIG_PAGING_LEVELS == 3
  45.115      /*
  45.116 @@ -1213,7 +1224,8 @@ static int alloc_l3_table(struct page_in
  45.117      if ( is_pv_32on64_domain(d) )
  45.118          memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
  45.119  
  45.120 -    for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; i++ )
  45.121 +    for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
  45.122 +          i++, partial = 0 )
  45.123      {
  45.124          if ( is_pv_32bit_domain(d) && (i == 3) )
  45.125          {
  45.126 @@ -1224,16 +1236,17 @@ static int alloc_l3_table(struct page_in
  45.127                  rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
  45.128                                                     PGT_l2_page_table |
  45.129                                                     PGT_pae_xen_l2,
  45.130 -                                                   d, preemptible);
  45.131 +                                                   d, partial, preemptible);
  45.132          }
  45.133          else if ( !is_guest_l3_slot(i) ||
  45.134 -                  (rc = get_page_from_l3e(pl3e[i], pfn, d, preemptible)) > 0 )
  45.135 +                  (rc = get_page_from_l3e(pl3e[i], pfn, d,
  45.136 +                                          partial, preemptible)) > 0 )
  45.137              continue;
  45.138  
  45.139          if ( rc == -EAGAIN )
  45.140          {
  45.141              page->nr_validated_ptes = i;
  45.142 -            page->partial_pte = 1;
  45.143 +            page->partial_pte = partial ?: 1;
  45.144          }
  45.145          else if ( rc == -EINTR && i )
  45.146          {
  45.147 @@ -1257,7 +1270,7 @@ static int alloc_l3_table(struct page_in
  45.148              if ( !is_guest_l3_slot(i) )
  45.149                  continue;
  45.150              unadjust_guest_l3e(pl3e[i], d);
  45.151 -            put_page_from_l3e(pl3e[i], pfn, 0);
  45.152 +            put_page_from_l3e(pl3e[i], pfn, 0, 0);
  45.153          }
  45.154      }
  45.155  
  45.156 @@ -1272,18 +1285,20 @@ static int alloc_l4_table(struct page_in
  45.157      unsigned long  pfn = page_to_mfn(page);
  45.158      l4_pgentry_t  *pl4e = page_to_virt(page);
  45.159      unsigned int   i;
  45.160 -    int            rc = 0;
  45.161 -
  45.162 -    for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; i++ )
  45.163 +    int            rc = 0, partial = page->partial_pte;
  45.164 +
  45.165 +    for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
  45.166 +          i++, partial = 0 )
  45.167      {
  45.168          if ( !is_guest_l4_slot(d, i) ||
  45.169 -             (rc = get_page_from_l4e(pl4e[i], pfn, d, preemptible)) > 0 )
  45.170 +             (rc = get_page_from_l4e(pl4e[i], pfn, d,
  45.171 +                                     partial, preemptible)) > 0 )
  45.172              continue;
  45.173  
  45.174          if ( rc == -EAGAIN )
  45.175          {
  45.176              page->nr_validated_ptes = i;
  45.177 -            page->partial_pte = 1;
  45.178 +            page->partial_pte = partial ?: 1;
  45.179          }
  45.180          else if ( rc == -EINTR )
  45.181          {
  45.182 @@ -1299,7 +1314,7 @@ static int alloc_l4_table(struct page_in
  45.183              MEM_LOG("Failure in alloc_l4_table: entry %d", i);
  45.184              while ( i-- > 0 )
  45.185                  if ( is_guest_l4_slot(d, i) )
  45.186 -                    put_page_from_l4e(pl4e[i], pfn, 0);
  45.187 +                    put_page_from_l4e(pl4e[i], pfn, 0, 0);
  45.188          }
  45.189          if ( rc < 0 )
  45.190              return rc;
  45.191 @@ -1377,24 +1392,20 @@ static int free_l3_table(struct page_inf
  45.192      struct domain *d = page_get_owner(page);
  45.193      unsigned long pfn = page_to_mfn(page);
  45.194      l3_pgentry_t *pl3e;
  45.195 -    unsigned int  i = page->nr_validated_ptes - !page->partial_pte;
  45.196 -    int rc = 0;
  45.197 -
  45.198 -#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
  45.199 -    if ( d->arch.relmem == RELMEM_l3 )
  45.200 -        return 0;
  45.201 -#endif
  45.202 +    int rc = 0, partial = page->partial_pte;
  45.203 +    unsigned int  i = page->nr_validated_ptes - !partial;
  45.204  
  45.205      pl3e = map_domain_page(pfn);
  45.206  
  45.207      do {
  45.208          if ( is_guest_l3_slot(i) )
  45.209          {
  45.210 -            rc = put_page_from_l3e(pl3e[i], pfn, preemptible);
  45.211 +            rc = put_page_from_l3e(pl3e[i], pfn, partial, preemptible);
  45.212 +            if ( rc < 0 )
  45.213 +                break;
  45.214 +            partial = 0;
  45.215              if ( rc > 0 )
  45.216                  continue;
  45.217 -            if ( rc )
  45.218 -                break;
  45.219              unadjust_guest_l3e(pl3e[i], d);
  45.220          }
  45.221      } while ( i-- );
  45.222 @@ -1404,7 +1415,7 @@ static int free_l3_table(struct page_inf
  45.223      if ( rc == -EAGAIN )
  45.224      {
  45.225          page->nr_validated_ptes = i;
  45.226 -        page->partial_pte = 1;
  45.227 +        page->partial_pte = partial ?: -1;
  45.228      }
  45.229      else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
  45.230      {
  45.231 @@ -1421,23 +1432,21 @@ static int free_l4_table(struct page_inf
  45.232      struct domain *d = page_get_owner(page);
  45.233      unsigned long pfn = page_to_mfn(page);
  45.234      l4_pgentry_t *pl4e = page_to_virt(page);
  45.235 -    unsigned int  i = page->nr_validated_ptes - !page->partial_pte;
  45.236 -    int rc = 0;
  45.237 -
  45.238 -#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
  45.239 -    if ( d->arch.relmem == RELMEM_l4 )
  45.240 -        return 0;
  45.241 -#endif
  45.242 +    int rc = 0, partial = page->partial_pte;
  45.243 +    unsigned int  i = page->nr_validated_ptes - !partial;
  45.244  
  45.245      do {
  45.246          if ( is_guest_l4_slot(d, i) )
  45.247 -            rc = put_page_from_l4e(pl4e[i], pfn, preemptible);
  45.248 -    } while ( rc >= 0 && i-- );
  45.249 +            rc = put_page_from_l4e(pl4e[i], pfn, partial, preemptible);
  45.250 +        if ( rc < 0 )
  45.251 +            break;
  45.252 +        partial = 0;
  45.253 +    } while ( i-- );
  45.254  
  45.255      if ( rc == -EAGAIN )
  45.256      {
  45.257          page->nr_validated_ptes = i;
  45.258 -        page->partial_pte = 1;
  45.259 +        page->partial_pte = partial ?: -1;
  45.260      }
  45.261      else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
  45.262      {
  45.263 @@ -1713,7 +1722,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
  45.264              return rc ? 0 : -EFAULT;
  45.265          }
  45.266  
  45.267 -        rc = get_page_from_l3e(nl3e, pfn, d, preemptible);
  45.268 +        rc = get_page_from_l3e(nl3e, pfn, d, 0, preemptible);
  45.269          if ( unlikely(rc < 0) )
  45.270              return page_unlock(l3pg), rc;
  45.271          rc = 0;
  45.272 @@ -1742,7 +1751,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
  45.273      }
  45.274  
  45.275      page_unlock(l3pg);
  45.276 -    put_page_from_l3e(ol3e, pfn, 0);
  45.277 +    put_page_from_l3e(ol3e, pfn, 0, 0);
  45.278      return rc;
  45.279  }
  45.280  
  45.281 @@ -1791,7 +1800,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
  45.282              return rc ? 0 : -EFAULT;
  45.283          }
  45.284  
  45.285 -        rc = get_page_from_l4e(nl4e, pfn, d, preemptible);
  45.286 +        rc = get_page_from_l4e(nl4e, pfn, d, 0, preemptible);
  45.287          if ( unlikely(rc < 0) )
  45.288              return page_unlock(l4pg), rc;
  45.289          rc = 0;
  45.290 @@ -1812,7 +1821,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
  45.291      }
  45.292  
  45.293      page_unlock(l4pg);
  45.294 -    put_page_from_l4e(ol4e, pfn, 0);
  45.295 +    put_page_from_l4e(ol4e, pfn, 0, 0);
  45.296      return rc;
  45.297  }
  45.298  
  45.299 @@ -1847,7 +1856,8 @@ int get_page(struct page_info *page, str
  45.300          nx = x + 1;
  45.301          d  = nd;
  45.302          if ( unlikely((x & PGC_count_mask) == 0) ||  /* Not allocated? */
  45.303 -             unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
  45.304 +             /* Keep one spare reference to be acquired by get_page_light(). */
  45.305 +             unlikely(((nx + 1) & PGC_count_mask) <= 1) || /* Overflow? */
  45.306               unlikely(d != _domain) )                /* Wrong owner? */
  45.307          {
  45.308              if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
  45.309 @@ -1869,6 +1879,28 @@ int get_page(struct page_info *page, str
  45.310      return 1;
  45.311  }
  45.312  
  45.313 +/*
  45.314 + * Special version of get_page() to be used exclusively when
  45.315 + * - a page is known to already have a non-zero reference count
  45.316 + * - the page does not need its owner to be checked
  45.317 + * - it will not be called more than once without dropping the thus
  45.318 + *   acquired reference again.
  45.319 + * Due to get_page() reserving one reference, this call cannot fail.
  45.320 + */
  45.321 +static void get_page_light(struct page_info *page)
  45.322 +{
  45.323 +    u32 x, nx, y = page->count_info;
  45.324 +
  45.325 +    do {
  45.326 +        x  = y;
  45.327 +        nx = x + 1;
  45.328 +        BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
  45.329 +        BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
  45.330 +        y = cmpxchg(&page->count_info, x, nx);
  45.331 +    }
  45.332 +    while ( unlikely(y != x) );
  45.333 +}
  45.334 +
  45.335  
  45.336  static int alloc_page_type(struct page_info *page, unsigned long type,
  45.337                             int preemptible)
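
get_page_light() underpins the reworked PGT_partial handling: get_page() now refuses to hand out the very last count (the "spare reference" in the hunk above), so a preempted validation or teardown can always take that spare and cannot fail. A summary of the pairing, stitched together from the hunks below:

    /* On preemption: take the spare reference and mark the type. */
    if ( rc == -EAGAIN )
    {
        get_page_light(page);
        page->u.inuse.type_info |= PGT_partial;
    }

    /* Whoever observes PGT_partial being cleared balances that
     * reference again (see __get_page_type()/__put_page_type()): */
    if ( (x & PGT_partial) && !(nx & PGT_partial) )
        put_page(page);
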
  45.338 @@ -1909,6 +1941,7 @@ static int alloc_page_type(struct page_i
  45.339      wmb();
  45.340      if ( rc == -EAGAIN )
  45.341      {
  45.342 +        get_page_light(page);
  45.343          page->u.inuse.type_info |= PGT_partial;
  45.344      }
  45.345      else if ( rc == -EINTR )
  45.346 @@ -1973,6 +2006,7 @@ int free_page_type(struct page_info *pag
  45.347          page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
  45.348          page->partial_pte = 0;
  45.349      }
  45.350 +
  45.351      switch ( type & PGT_type_mask )
  45.352      {
  45.353      case PGT_l1_page_table:
  45.354 @@ -1998,6 +2032,15 @@ int free_page_type(struct page_info *pag
  45.355          BUG();
  45.356      }
  45.357  
  45.358 +    return rc;
  45.359 +}
  45.360 +
  45.361 +
  45.362 +static int __put_final_page_type(
  45.363 +    struct page_info *page, unsigned long type, int preemptible)
  45.364 +{
  45.365 +    int rc = free_page_type(page, type, preemptible);
  45.366 +
  45.367      /* No need for atomic update of type_info here: noone else updates it. */
  45.368      if ( rc == 0 )
  45.369      {
  45.370 @@ -2016,8 +2059,8 @@ int free_page_type(struct page_info *pag
  45.371      }
  45.372      else if ( rc == -EINTR )
  45.373      {
  45.374 -        ASSERT(!(page->u.inuse.type_info &
  45.375 -                 (PGT_count_mask|PGT_validated|PGT_partial)));
  45.376 +        ASSERT((page->u.inuse.type_info &
  45.377 +                (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
  45.378          if ( !(shadow_mode_enabled(page_get_owner(page)) &&
  45.379                 (page->count_info & PGC_page_table)) )
  45.380              page->tlbflush_timestamp = tlbflush_current_time();
  45.381 @@ -2028,6 +2071,7 @@ int free_page_type(struct page_info *pag
  45.382      {
  45.383          BUG_ON(rc != -EAGAIN);
  45.384          wmb();
  45.385 +        get_page_light(page);
  45.386          page->u.inuse.type_info |= PGT_partial;
  45.387      }
  45.388  
  45.389 @@ -2039,6 +2083,7 @@ static int __put_page_type(struct page_i
  45.390                             int preemptible)
  45.391  {
  45.392      unsigned long nx, x, y = page->u.inuse.type_info;
  45.393 +    int rc = 0;
  45.394  
  45.395      for ( ; ; )
  45.396      {
  45.397 @@ -2062,7 +2107,10 @@ static int __put_page_type(struct page_i
  45.398                                             x, nx)) != x) )
  45.399                      continue;
  45.400                  /* We cleared the 'valid bit' so we do the clean up. */
  45.401 -                return free_page_type(page, x, preemptible);
  45.402 +                rc = __put_final_page_type(page, x, preemptible);
  45.403 +                if ( x & PGT_partial )
  45.404 +                    put_page(page);
  45.405 +                break;
  45.406              }
  45.407  
  45.408              /*
  45.409 @@ -2084,7 +2132,7 @@ static int __put_page_type(struct page_i
  45.410              return -EINTR;
  45.411      }
  45.412  
  45.413 -    return 0;
  45.414 +    return rc;
  45.415  }
  45.416  
  45.417  
  45.418 @@ -2092,6 +2140,7 @@ static int __get_page_type(struct page_i
  45.419                             int preemptible)
  45.420  {
  45.421      unsigned long nx, x, y = page->u.inuse.type_info;
  45.422 +    int rc = 0;
  45.423  
  45.424      ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
  45.425  
  45.426 @@ -2214,10 +2263,13 @@ static int __get_page_type(struct page_i
  45.427              page->nr_validated_ptes = 0;
  45.428              page->partial_pte = 0;
  45.429          }
  45.430 -        return alloc_page_type(page, type, preemptible);
  45.431 +        rc = alloc_page_type(page, type, preemptible);
  45.432      }
  45.433  
  45.434 -    return 0;
  45.435 +    if ( (x & PGT_partial) && !(nx & PGT_partial) )
  45.436 +        put_page(page);
  45.437 +
  45.438 +    return rc;
  45.439  }
  45.440  
  45.441  void put_page_type(struct page_info *page)
  45.442 @@ -2296,7 +2348,7 @@ int new_guest_cr3(unsigned long mfn)
  45.443  #endif
  45.444      okay = paging_mode_refcounts(d)
  45.445          ? get_page_from_pagenr(mfn, d)
  45.446 -        : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0);
  45.447 +        : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0);
  45.448      if ( unlikely(!okay) )
  45.449      {
  45.450          MEM_LOG("Error while installing new baseptr %lx", mfn);
  45.451 @@ -2431,6 +2483,29 @@ static inline cpumask_t vcpumask_to_pcpu
  45.452      return pmask;
  45.453  }
  45.454  
  45.455 +#ifdef __i386__
  45.456 +static inline void *fixmap_domain_page(unsigned long mfn)
  45.457 +{
  45.458 +    unsigned int cpu = smp_processor_id();
  45.459 +    void *ptr = (void *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
  45.460 +
  45.461 +    l1e_write(fix_pae_highmem_pl1e - cpu,
  45.462 +              l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
  45.463 +    flush_tlb_one_local(ptr);
  45.464 +    return ptr;
  45.465 +}
  45.466 +static inline void fixunmap_domain_page(const void *ptr)
  45.467 +{
  45.468 +    unsigned int cpu = virt_to_fix((unsigned long)ptr) - FIX_PAE_HIGHMEM_0;
  45.469 +
  45.470 +    l1e_write(fix_pae_highmem_pl1e - cpu, l1e_empty());
  45.471 +    this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
  45.472 +}
  45.473 +#else
  45.474 +#define fixmap_domain_page(mfn) mfn_to_virt(mfn)
  45.475 +#define fixunmap_domain_page(ptr) ((void)(ptr))
  45.476 +#endif
  45.477 +
  45.478  int do_mmuext_op(
  45.479      XEN_GUEST_HANDLE(mmuext_op_t) uops,
  45.480      unsigned int count,
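
On 32-bit builds an arbitrary MFN can lie above the directmap, so fixmap_domain_page() borrows a per-CPU fixmap slot (FIX_PAE_HIGHMEM_0 + cpu) for a temporary writable mapping; on 64-bit every frame is already in the 1:1 map and the helpers degenerate to mfn_to_virt() and a no-op. Usage sketch, mirroring the MMUEXT_CLEAR_PAGE body added below:

    unsigned char *p = fixmap_domain_page(mfn);   /* map        */
    clear_page(p);                                /* touch      */
    fixunmap_domain_page(p);                      /* unmap slot */
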
  45.481 @@ -2517,7 +2592,7 @@ int do_mmuext_op(
  45.482              if ( paging_mode_refcounts(FOREIGNDOM) )
  45.483                  break;
  45.484  
  45.485 -            rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 1);
  45.486 +            rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 0, 1);
  45.487              okay = !rc;
  45.488              if ( unlikely(!okay) )
  45.489              {
  45.490 @@ -2598,7 +2673,7 @@ int do_mmuext_op(
  45.491                      okay = get_page_from_pagenr(mfn, d);
  45.492                  else
  45.493                      okay = !get_page_and_type_from_pagenr(
  45.494 -                        mfn, PGT_root_page_table, d, 0);
  45.495 +                        mfn, PGT_root_page_table, d, 0, 0);
  45.496                  if ( unlikely(!okay) )
  45.497                  {
  45.498                      MEM_LOG("Error while installing new mfn %lx", mfn);
  45.499 @@ -2700,6 +2775,66 @@ int do_mmuext_op(
  45.500              break;
  45.501          }
  45.502  
  45.503 +        case MMUEXT_CLEAR_PAGE:
  45.504 +        {
  45.505 +            unsigned char *ptr;
  45.506 +
  45.507 +            okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
  45.508 +                                                  FOREIGNDOM, 0, 0);
  45.509 +            if ( unlikely(!okay) )
  45.510 +            {
  45.511 +                MEM_LOG("Error while clearing mfn %lx", mfn);
  45.512 +                break;
  45.513 +            }
  45.514 +
  45.515 +            /* A page is dirtied when it's being cleared. */
  45.516 +            paging_mark_dirty(d, mfn);
  45.517 +
  45.518 +            ptr = fixmap_domain_page(mfn);
  45.519 +            clear_page(ptr);
  45.520 +            fixunmap_domain_page(ptr);
  45.521 +
  45.522 +            put_page_and_type(page);
  45.523 +            break;
  45.524 +        }
  45.525 +
  45.526 +        case MMUEXT_COPY_PAGE:
  45.527 +        {
  45.528 +            const unsigned char *src;
  45.529 +            unsigned char *dst;
  45.530 +            unsigned long src_mfn;
  45.531 +
  45.532 +            src_mfn = gmfn_to_mfn(FOREIGNDOM, op.arg2.src_mfn);
  45.533 +            okay = get_page_from_pagenr(src_mfn, FOREIGNDOM);
  45.534 +            if ( unlikely(!okay) )
  45.535 +            {
  45.536 +                MEM_LOG("Error while copying from mfn %lx", src_mfn);
  45.537 +                break;
  45.538 +            }
  45.539 +
  45.540 +            okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
  45.541 +                                                  FOREIGNDOM, 0, 0);
  45.542 +            if ( unlikely(!okay) )
  45.543 +            {
  45.544 +                put_page(mfn_to_page(src_mfn));
  45.545 +                MEM_LOG("Error while copying to mfn %lx", mfn);
  45.546 +                break;
  45.547 +            }
  45.548 +
  45.549 +            /* A page is dirtied when it's being copied to. */
  45.550 +            paging_mark_dirty(d, mfn);
  45.551 +
  45.552 +            src = map_domain_page(src_mfn);
  45.553 +            dst = fixmap_domain_page(mfn);
  45.554 +            copy_page(dst, src);
  45.555 +            fixunmap_domain_page(dst);
  45.556 +            unmap_domain_page(src);
  45.557 +
  45.558 +            put_page_and_type(page);
  45.559 +            put_page(mfn_to_page(src_mfn));
  45.560 +            break;
  45.561 +        }
  45.562 +
  45.563          default:
  45.564              MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
  45.565              rc = -ENOSYS;
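
MMUEXT_CLEAR_PAGE and MMUEXT_COPY_PAGE let a guest (or the control domain, via FOREIGNDOM) clear or copy frames without mapping them itself. A hedged guest-side sketch; arg1.mfn as the destination follows the existing mmuext_op convention, and arg2.src_mfn is the field the COPY_PAGE handler above reads:

    struct mmuext_op op = { .cmd = MMUEXT_CLEAR_PAGE };
    op.arg1.mfn = dst_mfn;
    (void)HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);

    op.cmd = MMUEXT_COPY_PAGE;
    op.arg2.src_mfn = src_mfn;
    (void)HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
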
    46.1 --- a/xen/arch/x86/mm/hap/p2m-ept.c	Tue Nov 04 12:07:22 2008 +0900
    46.2 +++ b/xen/arch/x86/mm/hap/p2m-ept.c	Tue Nov 04 12:43:19 2008 +0900
    46.3 @@ -157,9 +157,6 @@ ept_set_entry(struct domain *d, unsigned
    46.4      {
    46.5          if ( mfn_valid(mfn_x(mfn)) || (p2mt == p2m_mmio_direct) )
    46.6          {
    46.7 -            /* Track the highest gfn for which we have ever had a valid mapping */
    46.8 -            if ( gfn > d->arch.p2m->max_mapped_pfn )
    46.9 -                d->arch.p2m->max_mapped_pfn = gfn;
   46.10              ept_entry->emt = epte_get_entry_emt(d, gfn, mfn_x(mfn));
   46.11              ept_entry->sp_avail = walk_level ? 1 : 0;
   46.12  
   46.13 @@ -234,6 +231,11 @@ ept_set_entry(struct domain *d, unsigned
   46.14          unmap_domain_page(split_table);
   46.15      }
   46.16  
   46.17 +    /* Track the highest gfn for which we have ever had a valid mapping */
   46.18 +    if ( mfn_valid(mfn_x(mfn))
   46.19 +         && (gfn + (1UL << order) - 1 > d->arch.p2m->max_mapped_pfn) )
   46.20 +        d->arch.p2m->max_mapped_pfn = gfn + (1UL << order) - 1;
   46.21 +
   46.22      /* Success */
   46.23      rv = 1;
   46.24  
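Moving the max_mapped_pfn update below the superpage-split logic also fixes its range: it now runs once per call and accounts for the whole order-sized region. A worked example of the off-by-(2^order - 1) being fixed:

    /* An order-9 (2MB) mapping at gfn 0x1000 covers 0x1000..0x11ff,
     * so the high-water mark must advance to 0x11ff, not 0x1000. */
    unsigned long last = gfn + (1UL << 9) - 1;    /* == 0x11ff */
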
    47.1 --- a/xen/arch/x86/mm/p2m.c	Tue Nov 04 12:07:22 2008 +0900
    47.2 +++ b/xen/arch/x86/mm/p2m.c	Tue Nov 04 12:43:19 2008 +0900
    47.3 @@ -322,7 +322,8 @@ p2m_set_entry(struct domain *d, unsigned
    47.4      }
    47.5  
    47.6      /* Track the highest gfn for which we have ever had a valid mapping */
    47.7 -    if ( mfn_valid(mfn) && (gfn > d->arch.p2m->max_mapped_pfn) )
    47.8 +    if ( mfn_valid(mfn) 
    47.9 +         && (gfn + (1UL << page_order) - 1 > d->arch.p2m->max_mapped_pfn) )
   47.10          d->arch.p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1;
   47.11  
   47.12      if ( iommu_enabled && (is_hvm_domain(d) || need_iommu(d)) )
   47.13 @@ -956,18 +957,18 @@ guest_physmap_add_entry(struct domain *d
   47.14      /* First, remove m->p mappings for existing p->m mappings */
   47.15      for ( i = 0; i < (1UL << page_order); i++ )
   47.16      {
   47.17 -        omfn = gfn_to_mfn(d, gfn, &ot);
   47.18 +        omfn = gfn_to_mfn(d, gfn + i, &ot);
   47.19          if ( p2m_is_ram(ot) )
   47.20          {
   47.21              ASSERT(mfn_valid(omfn));
   47.22 -            set_gpfn_from_mfn(mfn_x(omfn)+i, INVALID_M2P_ENTRY);
   47.23 +            set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
   47.24          }
   47.25      }
   47.26  
   47.27      /* Then, look for m->p mappings for this range and deal with them */
   47.28      for ( i = 0; i < (1UL << page_order); i++ )
   47.29      {
   47.30 -        ogfn = mfn_to_gfn(d, _mfn(mfn));
   47.31 +        ogfn = mfn_to_gfn(d, _mfn(mfn+i));
   47.32          if (
   47.33  #ifdef __x86_64__
   47.34              (ogfn != 0x5555555555555555L)
   47.35 @@ -975,20 +976,20 @@ guest_physmap_add_entry(struct domain *d
   47.36              (ogfn != 0x55555555L)
   47.37  #endif
   47.38              && (ogfn != INVALID_M2P_ENTRY)
   47.39 -            && (ogfn != gfn) )
   47.40 +            && (ogfn != gfn + i) )
   47.41          {
   47.42              /* This machine frame is already mapped at another physical
   47.43               * address */
   47.44              P2M_DEBUG("aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
   47.45 -                      mfn, ogfn, gfn);
   47.46 +                      mfn + i, ogfn, gfn + i);
   47.47              omfn = gfn_to_mfn(d, ogfn, &ot);
   47.48              if ( p2m_is_ram(ot) )
   47.49              {
   47.50                  ASSERT(mfn_valid(omfn));
   47.51                  P2M_DEBUG("old gfn=%#lx -> mfn %#lx\n",
   47.52                            ogfn , mfn_x(omfn));
   47.53 -                if ( mfn_x(omfn) == mfn )
   47.54 -                    p2m_remove_page(d, ogfn, mfn, 0);
   47.55 +                if ( mfn_x(omfn) == (mfn + i) )
   47.56 +                    p2m_remove_page(d, ogfn, mfn + i, 0);
   47.57              }
   47.58          }
   47.59      }
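
Both cleanup loops in guest_physmap_add_entry() previously inspected only the first frame of the range, leaving stale m2p/p2m state for the remaining 2^order - 1 frames. The corrected pattern indexes every frame:

    for ( i = 0; i < (1UL << page_order); i++ )
    {
        omfn = gfn_to_mfn(d, gfn + i, &ot);   /* p->m, per frame */
        ogfn = mfn_to_gfn(d, _mfn(mfn + i));  /* m->p, per frame */
        /* ... */
    }
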
    48.1 --- a/xen/arch/x86/msi.c	Tue Nov 04 12:07:22 2008 +0900
    48.2 +++ b/xen/arch/x86/msi.c	Tue Nov 04 12:43:19 2008 +0900
    48.3 @@ -33,8 +33,7 @@ DECLARE_BITMAP(msix_fixmap_pages, MAX_MS
    48.4  
    48.5  static int msix_fixmap_alloc(void)
    48.6  {
    48.7 -    int i;
    48.8 -    int rc = -1;
    48.9 +    int i, rc = -1;
   48.10  
   48.11      spin_lock(&msix_fixmap_lock);
   48.12      for ( i = 0; i < MAX_MSIX_PAGES; i++ )
   48.13 @@ -52,12 +51,8 @@ static int msix_fixmap_alloc(void)
   48.14  
   48.15  static void msix_fixmap_free(int idx)
   48.16  {
   48.17 -    if ( idx < FIX_MSIX_IO_RESERV_BASE )
   48.18 -        return;
   48.19 -
   48.20 -    spin_lock(&msix_fixmap_lock);
   48.21 -    clear_bit(idx - FIX_MSIX_IO_RESERV_BASE, &msix_fixmap_pages);
   48.22 -    spin_unlock(&msix_fixmap_lock);
   48.23 +    if ( idx >= FIX_MSIX_IO_RESERV_BASE )
   48.24 +        clear_bit(idx - FIX_MSIX_IO_RESERV_BASE, &msix_fixmap_pages);
   48.25  }
   48.26  
   48.27  /*
   48.28 @@ -78,19 +73,19 @@ static void msi_compose_msg(struct pci_d
   48.29          msg->address_lo =
   48.30              MSI_ADDR_BASE_LO |
   48.31              ((INT_DEST_MODE == 0) ?
   48.32 -                MSI_ADDR_DESTMODE_PHYS:
   48.33 -                MSI_ADDR_DESTMODE_LOGIC) |
   48.34 +             MSI_ADDR_DESTMODE_PHYS:
   48.35 +             MSI_ADDR_DESTMODE_LOGIC) |
   48.36              ((INT_DELIVERY_MODE != dest_LowestPrio) ?
   48.37 -                MSI_ADDR_REDIRECTION_CPU:
   48.38 -                MSI_ADDR_REDIRECTION_LOWPRI) |
   48.39 +             MSI_ADDR_REDIRECTION_CPU:
   48.40 +             MSI_ADDR_REDIRECTION_LOWPRI) |
   48.41              MSI_ADDR_DEST_ID(dest);
   48.42  
   48.43          msg->data =
   48.44              MSI_DATA_TRIGGER_EDGE |
   48.45              MSI_DATA_LEVEL_ASSERT |
   48.46              ((INT_DELIVERY_MODE != dest_LowestPrio) ?
   48.47 -                MSI_DATA_DELIVERY_FIXED:
   48.48 -                MSI_DATA_DELIVERY_LOWPRI) |
   48.49 +             MSI_DATA_DELIVERY_FIXED:
   48.50 +             MSI_DATA_DELIVERY_LOWPRI) |
   48.51              MSI_DATA_VECTOR(vector);
   48.52      }
   48.53  }
   48.54 @@ -128,7 +123,7 @@ static void read_msi_msg(struct msi_desc
   48.55      {
   48.56          void __iomem *base;
   48.57          base = entry->mask_base +
   48.58 -	    entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
   48.59 +            entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
   48.60  
   48.61          msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
   48.62          msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
   48.63 @@ -205,9 +200,9 @@ static void write_msi_msg(struct msi_des
   48.64              entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
   48.65  
   48.66          writel(msg->address_lo,
   48.67 -            base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
   48.68 +               base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
   48.69          writel(msg->address_hi,
   48.70 -            base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
   48.71 +               base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
   48.72          writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET);
   48.73          break;
   48.74      }
   48.75 @@ -230,7 +225,7 @@ void set_msi_irq_affinity(unsigned int i
   48.76      dest = cpu_mask_to_apicid(mask);
   48.77  
   48.78      if ( !desc )
   48.79 -	return;
   48.80 +        return;
   48.81  
   48.82      ASSERT(spin_is_locked(&irq_desc[irq].lock));
   48.83      spin_lock(&desc->dev->lock);
   48.84 @@ -398,8 +393,8 @@ static void msi_free_vector(int vector)
   48.85          unsigned long start;
   48.86  
   48.87          writel(1, entry->mask_base + entry->msi_attrib.entry_nr
   48.88 -              * PCI_MSIX_ENTRY_SIZE
   48.89 -              + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
   48.90 +               * PCI_MSIX_ENTRY_SIZE
   48.91 +               + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
   48.92  
   48.93          start = (unsigned long)entry->mask_base & ~(PAGE_SIZE - 1);
   48.94          msix_fixmap_free(virt_to_fix(start));
   48.95 @@ -460,20 +455,20 @@ static int msi_capability_init(struct pc
   48.96      entry->vector = vector;
   48.97      if ( is_mask_bit_support(control) )
   48.98          entry->mask_base = (void __iomem *)(long)msi_mask_bits_reg(pos,
   48.99 -                is_64bit_address(control));
  48.100 +                                                                   is_64bit_address(control));
  48.101      entry->dev = dev;
  48.102      if ( entry->msi_attrib.maskbit )
  48.103      {
  48.104          unsigned int maskbits, temp;
  48.105          /* All MSIs are unmasked by default, Mask them all */
  48.106          maskbits = pci_conf_read32(bus, slot, func,
  48.107 -                       msi_mask_bits_reg(pos, is_64bit_address(control)));
  48.108 +                                   msi_mask_bits_reg(pos, is_64bit_address(control)));
  48.109          temp = (1 << multi_msi_capable(control));
  48.110          temp = ((temp - 1) & ~temp);
  48.111          maskbits |= temp;
  48.112          pci_conf_write32(bus, slot, func,
  48.113 -            msi_mask_bits_reg(pos, is_64bit_address(control)),
  48.114 -            maskbits);
  48.115 +                         msi_mask_bits_reg(pos, is_64bit_address(control)),
  48.116 +                         maskbits);
  48.117      }
  48.118      list_add_tail(&entry->list, &dev->msi_list);
  48.119  
  48.120 @@ -575,14 +570,14 @@ static int __pci_enable_msi(struct msi_i
  48.121  
  48.122      pdev = pci_lock_pdev(msi->bus, msi->devfn);
  48.123      if ( !pdev )
  48.124 -	return -ENODEV;
  48.125 +        return -ENODEV;
  48.126  
  48.127      if ( find_msi_entry(pdev, msi->vector, PCI_CAP_ID_MSI) )
  48.128      {
  48.129 -	spin_unlock(&pdev->lock);
  48.130 +        spin_unlock(&pdev->lock);
  48.131          dprintk(XENLOG_WARNING, "vector %d has already been mapped to MSI on "
  48.132 -            "device %02x:%02x.%01x.\n", msi->vector, msi->bus,
  48.133 -            PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
  48.134 +                "device %02x:%02x.%01x.\n", msi->vector, msi->bus,
  48.135 +                PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
  48.136          return 0;
  48.137      }
  48.138  
  48.139 @@ -601,7 +596,7 @@ static void __pci_disable_msi(int vector
  48.140  
  48.141      entry = irq_desc[vector].msi_desc;
  48.142      if ( !entry )
  48.143 -	return;
  48.144 +        return;
  48.145      /*
  48.146       * Lock here is safe.  msi_desc can not be removed without holding
  48.147       * both irq_desc[].lock (which we do) and pdev->lock.
  48.148 @@ -649,20 +644,20 @@ static int __pci_enable_msix(struct msi_
  48.149  
  48.150      pdev = pci_lock_pdev(msi->bus, msi->devfn);
  48.151      if ( !pdev )
  48.152 -	return -ENODEV;
  48.153 +        return -ENODEV;
  48.154  
  48.155      pos = pci_find_cap_offset(msi->bus, slot, func, PCI_CAP_ID_MSIX);
  48.156      control = pci_conf_read16(msi->bus, slot, func, msi_control_reg(pos));
  48.157      nr_entries = multi_msix_capable(control);
  48.158      if (msi->entry_nr > nr_entries)
  48.159      {
  48.160 -	spin_unlock(&pdev->lock);
  48.161 +        spin_unlock(&pdev->lock);
  48.162          return -EINVAL;
  48.163      }
  48.164  
  48.165      if ( find_msi_entry(pdev, msi->vector, PCI_CAP_ID_MSIX) )
  48.166      {
  48.167 -	spin_unlock(&pdev->lock);
  48.168 +        spin_unlock(&pdev->lock);
  48.169          dprintk(XENLOG_WARNING, "vector %d has already been mapped to MSIX on "
  48.170                  "device %02x:%02x.%01x.\n", msi->vector, msi->bus,
  48.171                  PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
  48.172 @@ -684,7 +679,7 @@ static void __pci_disable_msix(int vecto
  48.173  
  48.174      entry = irq_desc[vector].msi_desc;
  48.175      if ( !entry )
  48.176 -	return;
  48.177 +        return;
  48.178      /*
  48.179       * Lock here is safe.  msi_desc can not be removed without holding
  48.180       * both irq_desc[].lock (which we do) and pdev->lock.
  48.181 @@ -712,7 +707,7 @@ int pci_enable_msi(struct msi_info *msi)
  48.182      ASSERT(spin_is_locked(&irq_desc[msi->vector].lock));
  48.183  
  48.184      return  msi->table_base ? __pci_enable_msix(msi) :
  48.185 -                              __pci_enable_msi(msi);
  48.186 +        __pci_enable_msi(msi);
  48.187  }
  48.188  
  48.189  void pci_disable_msi(int vector)
  48.190 @@ -720,7 +715,7 @@ void pci_disable_msi(int vector)
  48.191      irq_desc_t *desc = &irq_desc[vector];
  48.192      ASSERT(spin_is_locked(&desc->lock));
  48.193      if ( !desc->msi_desc )
  48.194 -	return;
  48.195 +        return;
  48.196  
  48.197      if ( desc->msi_desc->msi_attrib.type == PCI_CAP_ID_MSI )
  48.198          __pci_disable_msi(vector);
  48.199 @@ -734,7 +729,7 @@ static void msi_free_vectors(struct pci_
  48.200      irq_desc_t *desc;
  48.201      unsigned long flags;
  48.202  
  48.203 -retry:
  48.204 + retry:
  48.205      list_for_each_entry_safe( entry, tmp, &dev->msi_list, list )
  48.206      {
  48.207          desc = &irq_desc[entry->vector];
  48.208 @@ -742,7 +737,7 @@ retry:
  48.209          local_irq_save(flags);
  48.210          if ( !spin_trylock(&desc->lock) )
  48.211          {
  48.212 -             local_irq_restore(flags);
  48.213 +            local_irq_restore(flags);
  48.214              goto retry;
  48.215          }
  48.216  
    49.1 --- a/xen/arch/x86/oprofile/nmi_int.c	Tue Nov 04 12:07:22 2008 +0900
    49.2 +++ b/xen/arch/x86/oprofile/nmi_int.c	Tue Nov 04 12:43:19 2008 +0900
    49.3 @@ -36,6 +36,55 @@ static unsigned long saved_lvtpc[NR_CPUS
    49.4  static char *cpu_type;
    49.5  
    49.6  extern int is_active(struct domain *d);
    49.7 +extern int is_passive(struct domain *d);
    49.8 +
    49.9 +int passive_domain_do_rdmsr(struct cpu_user_regs *regs)
   49.10 +{
   49.11 +	u64 msr_content;
   49.12 +	int type, index;
   49.13 +	struct vpmu_struct *vpmu = vcpu_vpmu(current);
   49.14 +
   49.15 +	if ( model->is_arch_pmu_msr == NULL )
   49.16 +		return 0;
   49.17 +	if ( !model->is_arch_pmu_msr((u64)regs->ecx, &type, &index) )
   49.18 +		return 0;
   49.19 +	if ( !(vpmu->flags & PASSIVE_DOMAIN_ALLOCATED) )
   49.20 +		if ( !model->allocated_msr(current) )
   49.21 +			return 0;
   49.22 +
   49.23 +	model->load_msr(current, type, index, &msr_content);
   49.24 +	regs->eax = msr_content & 0xFFFFFFFF;
   49.25 +	regs->edx = msr_content >> 32;
   49.26 +	return 1;
   49.27 +}
   49.28 +
   49.29 +
   49.30 +int passive_domain_do_wrmsr(struct cpu_user_regs *regs)
   49.31 +{
   49.32 +	u64 msr_content;
   49.33 +	int type, index;
   49.34 +	struct vpmu_struct *vpmu = vcpu_vpmu(current);
   49.35 +
   49.36 +	if ( model->is_arch_pmu_msr == NULL )
   49.37 +		return 0;
   49.38 +	if ( !model->is_arch_pmu_msr((u64)regs->ecx, &type, &index) )
   49.39 +		return 0;
   49.40 +
   49.41 +	if ( !(vpmu->flags & PASSIVE_DOMAIN_ALLOCATED) )
   49.42 +		if ( !model->allocated_msr(current) )
   49.43 +			return 0;
   49.44 +
   49.45 +	msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
   49.46 +	model->save_msr(current, type, index, msr_content);
   49.47 +	return 1;
   49.48 +}
   49.49 +
   49.50 +void passive_domain_destroy(struct vcpu *v)
   49.51 +{
   49.52 +	struct vpmu_struct *vpmu = vcpu_vpmu(v);
   49.53 +	if ( vpmu->flags & PASSIVE_DOMAIN_ALLOCATED )
   49.54 +		model->free_msr(v);
   49.55 +}
   49.56  
   49.57  static int nmi_callback(struct cpu_user_regs *regs, int cpu)
   49.58  {
   49.59 @@ -46,6 +95,8 @@ static int nmi_callback(struct cpu_user_
   49.60  	if ( ovf && is_active(current->domain) && !xen_mode )
   49.61  		send_guest_vcpu_virq(current, VIRQ_XENOPROF);
   49.62  
   49.63 +	if ( ovf == 2 )
   49.64 +		test_and_set_bool(current->nmi_pending);
   49.65  	return 1;
   49.66  }
   49.67   
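
These hooks virtualize the architectural PMU MSRs of "passive" domains (profiled domains not running the oprofile daemon) against a lazily allocated per-vcpu shadow, and ovf == 2 from the model's check_ctrs hook now reflects the counter-overflow NMI into such a guest via nmi_pending. The 64-bit shadow values are marshalled through eax:edx using the same convention as rdmsr/wrmsr themselves:

    /* read: split the shadow value */
    regs->eax = (u32)msr_content;             /* low 32 bits  */
    regs->edx = (u32)(msr_content >> 32);     /* high 32 bits */

    /* write: reassemble it */
    msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
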
    50.1 --- a/xen/arch/x86/oprofile/op_model_ppro.c	Tue Nov 04 12:07:22 2008 +0900
    50.2 +++ b/xen/arch/x86/oprofile/op_model_ppro.c	Tue Nov 04 12:43:19 2008 +0900
    50.3 @@ -18,6 +18,8 @@
    50.4  #include <xen/sched.h>
    50.5  #include <asm/regs.h>
    50.6  #include <asm/current.h>
    50.7 +#include <asm/hvm/vmx/vpmu.h>
    50.8 +#include <asm/hvm/vmx/vpmu_core2.h>
    50.9   
   50.10  #include "op_x86_model.h"
   50.11  #include "op_counter.h"
   50.12 @@ -39,9 +41,11 @@
   50.13  #define CTRL_SET_KERN(val,k) (val |= ((k & 1) << 17))
   50.14  #define CTRL_SET_UM(val, m) (val |= (m << 8))
   50.15  #define CTRL_SET_EVENT(val, e) (val |= e)
   50.16 -
   50.17 +#define IS_ACTIVE(val) ((val) & (1 << 22))
   50.18 +#define IS_ENABLE(val) ((val) & (1 << 20))
   50.19  static unsigned long reset_value[NUM_COUNTERS];
   50.20  int ppro_has_global_ctrl = 0;
   50.21 +extern int is_passive(struct domain *d);
   50.22   
   50.23  static void ppro_fill_in_addresses(struct op_msrs * const msrs)
   50.24  {
   50.25 @@ -103,6 +107,7 @@ static int ppro_check_ctrs(unsigned int 
   50.26  	int ovf = 0;
   50.27  	unsigned long eip = regs->eip;
   50.28  	int mode = xenoprofile_get_mode(current, regs);
   50.29 +	struct arch_msr_pair *msrs_content = vcpu_vpmu(current)->context;
   50.30  
   50.31  	for (i = 0 ; i < NUM_COUNTERS; ++i) {
   50.32  		if (!reset_value[i])
   50.33 @@ -111,7 +116,18 @@ static int ppro_check_ctrs(unsigned int 
   50.34  		if (CTR_OVERFLOWED(low)) {
   50.35  			xenoprof_log_event(current, regs, eip, mode, i);
   50.36  			CTR_WRITE(reset_value[i], msrs, i);
   50.37 -			ovf = 1;
   50.38 +			if ( is_passive(current->domain) && (mode != 2) && 
   50.39 +				(vcpu_vpmu(current)->flags & PASSIVE_DOMAIN_ALLOCATED) ) 
   50.40 +			{
   50.41 +				if ( IS_ACTIVE(msrs_content[i].control) )
   50.42 +				{
   50.43 +					msrs_content[i].counter = (low | (u64)high << 32);
   50.44 +					if ( IS_ENABLE(msrs_content[i].control) )
   50.45 +						ovf = 2;
   50.46 +				}
   50.47 +			}
   50.48 +			if ( !ovf )
   50.49 +				ovf = 1;
   50.50  		}
   50.51  	}
   50.52  
   50.53 @@ -159,6 +175,82 @@ static void ppro_stop(struct op_msrs con
   50.54          wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
   50.55  }
   50.56  
   50.57 +static int ppro_is_arch_pmu_msr(u64 msr_index, int *type, int *index)
   50.58 +{
   50.59 +	if ( (msr_index >= MSR_IA32_PERFCTR0) &&
   50.60 +	     (msr_index < (MSR_IA32_PERFCTR0 + NUM_COUNTERS)) )
   50.61 +	{
   50.62 +		*type = MSR_TYPE_ARCH_COUNTER;
   50.63 +		*index = msr_index - MSR_IA32_PERFCTR0;
   50.64 +		return 1;
   50.65 +	}
   50.66 +	if ( (msr_index >= MSR_P6_EVNTSEL0) &&
   50.67 +	     (msr_index < (MSR_P6_EVNTSEL0 + NUM_CONTROLS)) )
   50.68 +	{
   50.69 +		*type = MSR_TYPE_ARCH_CTRL;
   50.70 +		*index = msr_index - MSR_P6_EVNTSEL0;
   50.71 +		return 1;
   50.72 +	}
   50.73 +
   50.74 +	return 0;
   50.75 +}
   50.76 +
   50.77 +static int ppro_allocate_msr(struct vcpu *v)
   50.78 +{
   50.79 +	struct vpmu_struct *vpmu = vcpu_vpmu(v);
   50.80 +	struct arch_msr_pair *msr_content;
   50.81 +
   50.82 +	msr_content = xmalloc_bytes(sizeof(struct arch_msr_pair) * NUM_COUNTERS);
   50.83 +	if ( !msr_content )
   50.84 +		goto out;
   50.85 +	memset(msr_content, 0, sizeof(struct arch_msr_pair) * NUM_COUNTERS);
   50.86 +	vpmu->context = (void *)msr_content;
   50.87 +	vpmu->flags = 0;
   50.88 +	vpmu->flags |= PASSIVE_DOMAIN_ALLOCATED;
   50.89 +	return 1;
   50.90 +out:
   50.91 +	gdprintk(XENLOG_WARNING, "Insufficient memory for oprofile; oprofile is "
   50.92 +	         "unavailable on domain %d vcpu %d.\n",
   50.93 +	         v->domain->domain_id, v->vcpu_id);
   50.94 +	return 0;
   50.95 +}
   50.96 +
   50.97 +static void ppro_free_msr(struct vcpu *v)
   50.98 +{
   50.99 +	struct vpmu_struct *vpmu = vcpu_vpmu(v);
  50.100 +
  50.101 +	xfree(vpmu->context);
  50.102 +	vpmu->flags &= ~PASSIVE_DOMAIN_ALLOCATED;
  50.103 +}
  50.104 +
  50.105 +static void ppro_load_msr(struct vcpu *v, int type, int index, u64 *msr_content)
  50.106 +{
  50.107 +	struct arch_msr_pair *msrs = vcpu_vpmu(v)->context;
  50.108 +	switch ( type )
  50.109 +	{
  50.110 +	case MSR_TYPE_ARCH_COUNTER:
  50.111 +		*msr_content = msrs[index].counter;
  50.112 +		break;
  50.113 +	case MSR_TYPE_ARCH_CTRL:
  50.114 +		*msr_content = msrs[index].control;
  50.115 +		break;
  50.116 +	}
  50.117 +}
  50.118 +
  50.119 +static void ppro_save_msr(struct vcpu *v, int type, int index, u64 msr_content)
  50.120 +{
  50.121 +	struct arch_msr_pair *msrs = vcpu_vpmu(v)->context;
  50.122 +
  50.123 +	switch ( type )
  50.124 +	{
  50.125 +	case MSR_TYPE_ARCH_COUNTER:
  50.126 +		msrs[index].counter = msr_content;
  50.127 +		break;
  50.128 +	case MSR_TYPE_ARCH_CTRL:
  50.129 +		msrs[index].control = msr_content;
  50.130 +		break;
  50.131 +	}
  50.132 +}
  50.133  
  50.134  struct op_x86_model_spec const op_ppro_spec = {
  50.135  	.num_counters = NUM_COUNTERS,
  50.136 @@ -167,5 +259,10 @@ struct op_x86_model_spec const op_ppro_s
  50.137  	.setup_ctrs = &ppro_setup_ctrs,
  50.138  	.check_ctrs = &ppro_check_ctrs,
  50.139  	.start = &ppro_start,
  50.140 -	.stop = &ppro_stop
  50.141 +	.stop = &ppro_stop,
  50.142 +	.is_arch_pmu_msr = &ppro_is_arch_pmu_msr,
  50.143 +	.allocated_msr = &ppro_allocate_msr,
  50.144 +	.free_msr = &ppro_free_msr,
  50.145 +	.load_msr = &ppro_load_msr,
  50.146 +	.save_msr = &ppro_save_msr
  50.147  };
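
ppro_allocate_msr() zeroes the shadow array by hand; an equivalent, arguably tidier allocation using Xen's xmalloc_array() (assuming it is available in this tree) would be:

    struct arch_msr_pair *p = xmalloc_array(struct arch_msr_pair,
                                            NUM_COUNTERS);
    if ( p != NULL )
        memset(p, 0, NUM_COUNTERS * sizeof(*p));
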
    51.1 --- a/xen/arch/x86/oprofile/op_x86_model.h	Tue Nov 04 12:07:22 2008 +0900
    51.2 +++ b/xen/arch/x86/oprofile/op_x86_model.h	Tue Nov 04 12:43:19 2008 +0900
    51.3 @@ -41,6 +41,11 @@ struct op_x86_model_spec {
    51.4  			  struct cpu_user_regs * const regs);
    51.5  	void (*start)(struct op_msrs const * const msrs);
    51.6  	void (*stop)(struct op_msrs const * const msrs);
    51.7 +	int (*is_arch_pmu_msr)(u64 msr_index, int *type, int *index);
    51.8 +	int (*allocated_msr)(struct vcpu *v);
    51.9 +	void (*free_msr)(struct vcpu *v);
   51.10 +	void (*load_msr)(struct vcpu * const v, int type, int index, u64 *msr_content);
   51.11 +	void (*save_msr)(struct vcpu * const v, int type, int index, u64 msr_content);
   51.12  };
   51.13  
   51.14  extern struct op_x86_model_spec const op_ppro_spec;
    52.1 --- a/xen/arch/x86/setup.c	Tue Nov 04 12:07:22 2008 +0900
    52.2 +++ b/xen/arch/x86/setup.c	Tue Nov 04 12:43:19 2008 +0900
    52.3 @@ -969,6 +969,7 @@ void __init __start_xen(unsigned long mb
    52.4      serial_init_postirq();
    52.5  
    52.6      BUG_ON(!local_irq_is_enabled());
    52.7 +    spin_debug_enable();
    52.8  
    52.9      for_each_present_cpu ( i )
   52.10      {
    53.1 --- a/xen/arch/x86/smpboot.c	Tue Nov 04 12:07:22 2008 +0900
    53.2 +++ b/xen/arch/x86/smpboot.c	Tue Nov 04 12:43:19 2008 +0900
    53.3 @@ -101,7 +101,7 @@ static cpumask_t smp_commenced_mask;
    53.4  static int __devinitdata tsc_sync_disabled;
    53.5  
    53.6  /* Per CPU bogomips and other parameters */
    53.7 -struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
    53.8 +struct cpuinfo_x86 cpu_data[NR_CPUS];
    53.9  EXPORT_SYMBOL(cpu_data);
   53.10  
   53.11  u32 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
   53.12 @@ -112,7 +112,7 @@ static void map_cpu_to_logical_apicid(vo
   53.13  /* State of each CPU. */
   53.14  DEFINE_PER_CPU(int, cpu_state) = { 0 };
   53.15  
   53.16 -static void *stack_base[NR_CPUS] __cacheline_aligned;
   53.17 +static void *stack_base[NR_CPUS];
   53.18  static DEFINE_SPINLOCK(cpu_add_remove_lock);
   53.19  
   53.20  /*
   53.21 @@ -805,14 +805,6 @@ static inline int alloc_cpu_id(void)
   53.22  	return cpu;
   53.23  }
   53.24  
   53.25 -static struct vcpu *prepare_idle_vcpu(unsigned int cpu)
   53.26 -{
   53.27 -	if (idle_vcpu[cpu])
   53.28 -		return idle_vcpu[cpu];
   53.29 -
   53.30 -	return alloc_idle_vcpu(cpu);
   53.31 -}
   53.32 -
   53.33  static void *prepare_idle_stack(unsigned int cpu)
   53.34  {
   53.35  	if (!stack_base[cpu])
   53.36 @@ -849,7 +841,7 @@ static int __devinit do_boot_cpu(int api
   53.37  
   53.38  	booting_cpu = cpu;
   53.39  
   53.40 -	v = prepare_idle_vcpu(cpu);
   53.41 +	v = alloc_idle_vcpu(cpu);
   53.42  	BUG_ON(v == NULL);
   53.43  
   53.44  	/* start_eip had better be page-aligned! */
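
Dropping prepare_idle_vcpu() assumes alloc_idle_vcpu() is itself idempotent, i.e. (hedged sketch, not verified against this tree):

    struct vcpu *alloc_idle_vcpu(unsigned int cpu)
    {
        if ( idle_vcpu[cpu] != NULL )     /* already allocated: reuse */
            return idle_vcpu[cpu];
        /* ... allocate and initialise a fresh idle vcpu ... */
    }
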
    54.1 --- a/xen/arch/x86/time.c	Tue Nov 04 12:07:22 2008 +0900
    54.2 +++ b/xen/arch/x86/time.c	Tue Nov 04 12:43:19 2008 +0900
    54.3 @@ -1063,8 +1063,6 @@ void init_percpu_time(void)
    54.4  /* Late init function (after all CPUs are booted). */
    54.5  int __init init_xen_time(void)
    54.6  {
    54.7 -    local_irq_disable();
    54.8 -
    54.9      /* check if TSC is invariant during deep C state
   54.10         this is a new feature introduced by Nehalem*/
   54.11      if ( cpuid_edx(0x80000007) & (1u<<8) )
   54.12 @@ -1079,8 +1077,6 @@ int __init init_xen_time(void)
   54.13  
   54.14      do_settime(get_cmos_time(), 0, NOW());
   54.15  
   54.16 -    local_irq_enable();
   54.17 -
   54.18      return 0;
   54.19  }
   54.20  
    55.1 --- a/xen/arch/x86/traps.c	Tue Nov 04 12:07:22 2008 +0900
    55.2 +++ b/xen/arch/x86/traps.c	Tue Nov 04 12:43:19 2008 +0900
    55.3 @@ -1030,7 +1030,7 @@ static int handle_gdt_ldt_mapping_fault(
    55.4  #endif
    55.5  
    55.6  static int __spurious_page_fault(
    55.7 -    unsigned long addr, struct cpu_user_regs *regs)
    55.8 +    unsigned long addr, unsigned int error_code)
    55.9  {
   55.10      unsigned long mfn, cr3 = read_cr3();
   55.11  #if CONFIG_PAGING_LEVELS >= 4
   55.12 @@ -1052,17 +1052,17 @@ static int __spurious_page_fault(
   55.13          return 0;
   55.14  
   55.15      /* Reserved bit violations are never spurious faults. */
   55.16 -    if ( regs->error_code & PFEC_reserved_bit )
   55.17 +    if ( error_code & PFEC_reserved_bit )
   55.18          return 0;
   55.19  
   55.20      required_flags  = _PAGE_PRESENT;
   55.21 -    if ( regs->error_code & PFEC_write_access )
   55.22 +    if ( error_code & PFEC_write_access )
   55.23          required_flags |= _PAGE_RW;
   55.24 -    if ( regs->error_code & PFEC_user_mode )
   55.25 +    if ( error_code & PFEC_user_mode )
   55.26          required_flags |= _PAGE_USER;
   55.27  
   55.28      disallowed_flags = 0;
   55.29 -    if ( regs->error_code & PFEC_insn_fetch )
   55.30 +    if ( error_code & PFEC_insn_fetch )
   55.31          disallowed_flags |= _PAGE_NX;
   55.32  
   55.33      mfn = cr3 >> PAGE_SHIFT;
   55.34 @@ -1120,7 +1120,7 @@ static int __spurious_page_fault(
   55.35      dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
   55.36              "at addr %lx, e/c %04x\n",
   55.37              current->domain->domain_id, current->vcpu_id,
   55.38 -            addr, regs->error_code);
   55.39 +            addr, error_code);
   55.40  #if CONFIG_PAGING_LEVELS >= 4
   55.41      dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
   55.42  #endif
   55.43 @@ -1129,14 +1129,11 @@ static int __spurious_page_fault(
   55.44  #endif
   55.45      dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
   55.46      dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
   55.47 -#ifndef NDEBUG
   55.48 -    show_registers(regs);
   55.49 -#endif
   55.50      return 1;
   55.51  }
   55.52  
   55.53  static int spurious_page_fault(
   55.54 -    unsigned long addr, struct cpu_user_regs *regs)
   55.55 +    unsigned long addr, unsigned int error_code)
   55.56  {
   55.57      unsigned long flags;
   55.58      int           is_spurious;
   55.59 @@ -1146,7 +1143,7 @@ static int spurious_page_fault(
   55.60       * page tables from becoming invalid under our feet during the walk.
   55.61       */
   55.62      local_irq_save(flags);
   55.63 -    is_spurious = __spurious_page_fault(addr, regs);
   55.64 +    is_spurious = __spurious_page_fault(addr, error_code);
   55.65      local_irq_restore(flags);
   55.66  
   55.67      return is_spurious;
   55.68 @@ -1208,9 +1205,13 @@ static int fixup_page_fault(unsigned lon
   55.69  asmlinkage void do_page_fault(struct cpu_user_regs *regs)
   55.70  {
   55.71      unsigned long addr, fixup;
   55.72 +    unsigned int error_code;
   55.73  
   55.74      addr = read_cr2();
   55.75  
   55.76 +    /* fixup_page_fault() might change regs->error_code, so cache it here. */
   55.77 +    error_code = regs->error_code;
   55.78 +
   55.79      DEBUGGER_trap_entry(TRAP_page_fault, regs);
   55.80  
   55.81      perfc_incr(page_faults);
   55.82 @@ -1220,7 +1221,7 @@ asmlinkage void do_page_fault(struct cpu
   55.83  
   55.84      if ( unlikely(!guest_mode(regs)) )
   55.85      {
   55.86 -        if ( spurious_page_fault(addr, regs) )
   55.87 +        if ( spurious_page_fault(addr, error_code) )
   55.88              return;
   55.89  
   55.90          if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
   55.91 @@ -1239,11 +1240,11 @@ asmlinkage void do_page_fault(struct cpu
   55.92          panic("FATAL PAGE FAULT\n"
   55.93                "[error_code=%04x]\n"
   55.94                "Faulting linear address: %p\n",
   55.95 -              regs->error_code, _p(addr));
   55.96 +              error_code, _p(addr));
   55.97      }
   55.98  
   55.99      if ( unlikely(current->domain->arch.suppress_spurious_page_faults
  55.100 -                  && spurious_page_fault(addr, regs)) )
  55.101 +                  && spurious_page_fault(addr, error_code)) )
  55.102          return;
  55.103  
  55.104      propagate_page_fault(addr, regs->error_code);
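
The traps.c change is a snapshot-before-clobber pattern: the new in-tree comment notes that fixup_page_fault() may rewrite regs->error_code, so the spurious-fault logic must judge the value the hardware actually delivered, while the final propagate_page_fault() deliberately still passes the live field to the guest. Condensed:

    unsigned int error_code = regs->error_code;    /* hardware value     */

    if ( fixup_page_fault(addr, regs) )            /* may rewrite field  */
        return;
    if ( spurious_page_fault(addr, error_code) )   /* judge the snapshot */
        return;
    propagate_page_fault(addr, regs->error_code);  /* guest sees live one */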
    56.1 --- a/xen/arch/x86/x86_32/domain_page.c	Tue Nov 04 12:07:22 2008 +0900
    56.2 +++ b/xen/arch/x86/x86_32/domain_page.c	Tue Nov 04 12:43:19 2008 +0900
    56.3 @@ -43,7 +43,7 @@ static inline struct vcpu *mapcache_curr
    56.4  void *map_domain_page(unsigned long mfn)
    56.5  {
    56.6      unsigned long va;
    56.7 -    unsigned int idx, i;
    56.8 +    unsigned int idx, i, flags;
    56.9      struct vcpu *v;
   56.10      struct mapcache_domain *dcache;
   56.11      struct mapcache_vcpu *vcache;
   56.12 @@ -69,7 +69,7 @@ void *map_domain_page(unsigned long mfn)
   56.13          goto out;
   56.14      }
   56.15  
   56.16 -    spin_lock(&dcache->lock);
   56.17 +    spin_lock_irqsave(&dcache->lock, flags);
   56.18  
   56.19      /* Has some other CPU caused a wrap? We must flush if so. */
   56.20      if ( unlikely(dcache->epoch != vcache->shadow_epoch) )
   56.21 @@ -105,7 +105,7 @@ void *map_domain_page(unsigned long mfn)
   56.22      set_bit(idx, dcache->inuse);
   56.23      dcache->cursor = idx + 1;
   56.24  
   56.25 -    spin_unlock(&dcache->lock);
   56.26 +    spin_unlock_irqrestore(&dcache->lock, flags);
   56.27  
   56.28      l1e_write(&dcache->l1tab[idx], l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
   56.29  
   56.30 @@ -114,7 +114,7 @@ void *map_domain_page(unsigned long mfn)
   56.31      return (void *)va;
   56.32  }
   56.33  
   56.34 -void unmap_domain_page(void *va)
   56.35 +void unmap_domain_page(const void *va)
   56.36  {
   56.37      unsigned int idx;
   56.38      struct vcpu *v;
   56.39 @@ -241,7 +241,7 @@ void *map_domain_page_global(unsigned lo
   56.40      return (void *)va;
   56.41  }
   56.42  
   56.43 -void unmap_domain_page_global(void *va)
   56.44 +void unmap_domain_page_global(const void *va)
   56.45  {
   56.46      unsigned long __va = (unsigned long)va;
   56.47      l2_pgentry_t *pl2e;
    57.1 --- a/xen/arch/x86/x86_64/compat/mm.c	Tue Nov 04 12:07:22 2008 +0900
    57.2 +++ b/xen/arch/x86/x86_64/compat/mm.c	Tue Nov 04 12:43:19 2008 +0900
    57.3 @@ -231,6 +231,8 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mm
    57.4              case MMUEXT_PIN_L4_TABLE:
    57.5              case MMUEXT_UNPIN_TABLE:
    57.6              case MMUEXT_NEW_BASEPTR:
    57.7 +            case MMUEXT_CLEAR_PAGE:
    57.8 +            case MMUEXT_COPY_PAGE:
    57.9                  arg1 = XLAT_mmuext_op_arg1_mfn;
   57.10                  break;
   57.11              default:
   57.12 @@ -258,6 +260,9 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mm
   57.13              case MMUEXT_INVLPG_MULTI:
   57.14                  arg2 = XLAT_mmuext_op_arg2_vcpumask;
   57.15                  break;
   57.16 +            case MMUEXT_COPY_PAGE:
   57.17 +                arg2 = XLAT_mmuext_op_arg2_src_mfn;
   57.18 +                break;
   57.19              default:
   57.20                  arg2 = -1;
   57.21                  break;
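
MMUEXT_CLEAR_PAGE and MMUEXT_COPY_PAGE are new ops (advertised via XENFEAT_highmem_assist in xen/common/kernel.c below); the compat layer only has to translate their MFN arguments. A hedged sketch of a 32-on-64 guest issuing a copy, with dst_mfn/src_mfn as illustrative stand-ins:

    struct mmuext_op op = {
        .cmd          = MMUEXT_COPY_PAGE,
        .arg1.mfn     = dst_mfn,    /* via XLAT_mmuext_op_arg1_mfn     */
        .arg2.src_mfn = src_mfn,    /* via XLAT_mmuext_op_arg2_src_mfn */
    };
    HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);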
    58.1 --- a/xen/arch/x86/x86_64/cpufreq.c	Tue Nov 04 12:07:22 2008 +0900
    58.2 +++ b/xen/arch/x86/x86_64/cpufreq.c	Tue Nov 04 12:43:19 2008 +0900
    58.3 @@ -56,34 +56,13 @@ compat_set_px_pminfo(uint32_t cpu, struc
    58.4  	return -EFAULT;
    58.5  
    58.6  #define XLAT_processor_performance_HNDL_states(_d_, _s_) do { \
    58.7 -    xen_processor_px_t *xen_states = NULL; \
    58.8 -\
    58.9 -    if ( likely((_s_)->state_count > 0) ) \
   58.10 -    { \
   58.11 -        XEN_GUEST_HANDLE(compat_processor_px_t) states; \
   58.12 -        compat_processor_px_t state; \
   58.13 -        int i; \
   58.14 -\
   58.15 -        xen_states = xlat_malloc_array(xlat_page_current, \
   58.16 -                               xen_processor_px_t, (_s_)->state_count); \
   58.17 -        if ( unlikely(xen_states == NULL) ) \
   58.18 -            return -EFAULT; \
   58.19 -\
   58.20 -        if ( unlikely(!compat_handle_okay((_s_)->states, \
   58.21 -                                (_s_)->state_count)) ) \
   58.22 -            return -EFAULT; \
   58.23 -        guest_from_compat_handle(states, (_s_)->states); \
   58.24 -\
   58.25 -        for ( i = 0; i < _s_->state_count; i++ ) \
   58.26 -        { \
   58.27 -           if ( unlikely(copy_from_guest_offset(&state, states, i, 1)) ) \
   58.28 -               return -EFAULT; \
   58.29 -           XLAT_processor_px(&xen_states[i], &state); \
   58.30 -        } \
   58.31 -    } \
   58.32 -\
   58.33 -    set_xen_guest_handle((_d_)->states, xen_states); \
   58.34 +    XEN_GUEST_HANDLE(compat_processor_px_t) states; \
   58.35 +    if ( unlikely(!compat_handle_okay((_s_)->states, (_s_)->state_count)) ) \
   58.36 +        return -EFAULT; \
   58.37 +    guest_from_compat_handle(states, (_s_)->states); \
   58.38 +    (_d_)->states = guest_handle_cast(states, xen_processor_px_t); \
   58.39  } while (0)
   58.40 +
   58.41      XLAT_processor_performance(xen_perf, perf);
   58.42  #undef XLAT_processor_performance_HNDL_states
   58.43  
    59.1 --- a/xen/common/event_channel.c	Tue Nov 04 12:07:22 2008 +0900
    59.2 +++ b/xen/common/event_channel.c	Tue Nov 04 12:43:19 2008 +0900
    59.3 @@ -386,7 +386,7 @@ static long __evtchn_close(struct domain
    59.4              if ( v->virq_to_evtchn[chn1->u.virq] != port1 )
    59.5                  continue;
    59.6              v->virq_to_evtchn[chn1->u.virq] = 0;
    59.7 -            spin_barrier(&v->virq_lock);
    59.8 +            spin_barrier_irq(&v->virq_lock);
    59.9          }
   59.10          break;
   59.11  
    60.1 --- a/xen/common/kernel.c	Tue Nov 04 12:07:22 2008 +0900
    60.2 +++ b/xen/common/kernel.c	Tue Nov 04 12:43:19 2008 +0900
    60.3 @@ -221,7 +221,8 @@ DO(xen_version)(int cmd, XEN_GUEST_HANDL
    60.4                  fi.submap |= 1U << XENFEAT_supervisor_mode_kernel;
    60.5  #ifdef CONFIG_X86
    60.6              if ( !is_hvm_vcpu(current) )
    60.7 -                fi.submap |= 1U << XENFEAT_mmu_pt_update_preserve_ad;
    60.8 +                fi.submap |= (1U << XENFEAT_mmu_pt_update_preserve_ad) |
    60.9 +                             (1U << XENFEAT_highmem_assist);
   60.10  #endif
   60.11              break;
   60.12          default:
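
XENFEAT_highmem_assist tells PV guests they may hand page clearing/copying to the hypervisor (useful where the guest would otherwise need a temporary highmem mapping). A guest-side probe, sketched against the public version interface; the flag variable is hypothetical:

    xen_feature_info_t fi = { .submap_idx = 0 };

    if ( HYPERVISOR_xen_version(XENVER_get_features, &fi) == 0 &&
         (fi.submap & (1U << XENFEAT_highmem_assist)) )
        can_use_clear_copy = 1;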
    61.1 --- a/xen/common/keyhandler.c	Tue Nov 04 12:07:22 2008 +0900
    61.2 +++ b/xen/common/keyhandler.c	Tue Nov 04 12:43:19 2008 +0900
    61.3 @@ -183,9 +183,9 @@ static void dump_domains(unsigned char k
    61.4      {
    61.5          printk("General information for domain %u:\n", d->domain_id);
    61.6          cpuset_print(tmpstr, sizeof(tmpstr), d->domain_dirty_cpumask);
    61.7 -        printk("    refcnt=%d nr_pages=%d xenheap_pages=%d "
    61.8 +        printk("    refcnt=%d dying=%d nr_pages=%d xenheap_pages=%d "
    61.9                 "dirty_cpus=%s\n",
   61.10 -               atomic_read(&d->refcnt),
   61.11 +               atomic_read(&d->refcnt), d->is_dying,
   61.12                 d->tot_pages, d->xenheap_pages, tmpstr);
   61.13          printk("    handle=%02x%02x%02x%02x-%02x%02x-%02x%02x-"
   61.14                 "%02x%02x-%02x%02x%02x%02x%02x%02x vm_assist=%08lx\n",
    62.1 --- a/xen/common/spinlock.c	Tue Nov 04 12:07:22 2008 +0900
    62.2 +++ b/xen/common/spinlock.c	Tue Nov 04 12:43:19 2008 +0900
    62.3 @@ -1,15 +1,56 @@
    62.4  #include <xen/config.h>
    62.5 +#include <xen/irq.h>
    62.6  #include <xen/smp.h>
    62.7  #include <xen/spinlock.h>
    62.8  
    62.9 +#ifndef NDEBUG
   62.10 +
   62.11 +static atomic_t spin_debug __read_mostly = ATOMIC_INIT(0);
   62.12 +
   62.13 +static void check_lock(struct lock_debug *debug)
   62.14 +{
   62.15 +    int irq_safe = !local_irq_is_enabled();
   62.16 +
   62.17 +    if ( unlikely(atomic_read(&spin_debug) <= 0) )
   62.18 +        return;
   62.19 +
   62.20 +    /* A few places take liberties with this. */
   62.21 +    /* BUG_ON(in_irq() && !irq_safe); */
   62.22 +
   62.23 +    if ( unlikely(debug->irq_safe != irq_safe) )
   62.24 +    {
   62.25 +        int seen = cmpxchg(&debug->irq_safe, -1, irq_safe);
   62.26 +        BUG_ON(seen == !irq_safe);
   62.27 +    }
   62.28 +}
   62.29 +
   62.30 +void spin_debug_enable(void)
   62.31 +{
   62.32 +    atomic_inc(&spin_debug);
   62.33 +}
   62.34 +
   62.35 +void spin_debug_disable(void)
   62.36 +{
   62.37 +    atomic_dec(&spin_debug);
   62.38 +}
   62.39 +
   62.40 +#else /* defined(NDEBUG) */
   62.41 +
   62.42 +#define check_lock(l) ((void)0)
   62.43 +
   62.44 +#endif
   62.45 +
   62.46  void _spin_lock(spinlock_t *lock)
   62.47  {
   62.48 +    check_lock(&lock->debug);
   62.49      _raw_spin_lock(&lock->raw);
   62.50  }
   62.51  
   62.52  void _spin_lock_irq(spinlock_t *lock)
   62.53  {
   62.54 +    ASSERT(local_irq_is_enabled());
   62.55      local_irq_disable();
   62.56 +    check_lock(&lock->debug);
   62.57      _raw_spin_lock(&lock->raw);
   62.58  }
   62.59  
   62.60 @@ -17,6 +58,7 @@ unsigned long _spin_lock_irqsave(spinloc
   62.61  {
   62.62      unsigned long flags;
   62.63      local_irq_save(flags);
   62.64 +    check_lock(&lock->debug);
   62.65      _raw_spin_lock(&lock->raw);
   62.66      return flags;
   62.67  }
   62.68 @@ -40,20 +82,31 @@ void _spin_unlock_irqrestore(spinlock_t 
   62.69  
   62.70  int _spin_is_locked(spinlock_t *lock)
   62.71  {
   62.72 +    check_lock(&lock->debug);
   62.73      return _raw_spin_is_locked(&lock->raw);
   62.74  }
   62.75  
   62.76  int _spin_trylock(spinlock_t *lock)
   62.77  {
   62.78 +    check_lock(&lock->debug);
   62.79      return _raw_spin_trylock(&lock->raw);
   62.80  }
   62.81  
   62.82  void _spin_barrier(spinlock_t *lock)
   62.83  {
   62.84 +    check_lock(&lock->debug);
   62.85      do { mb(); } while ( _raw_spin_is_locked(&lock->raw) );
   62.86      mb();
   62.87  }
   62.88  
   62.89 +void _spin_barrier_irq(spinlock_t *lock)
   62.90 +{
   62.91 +    unsigned long flags;
   62.92 +    local_irq_save(flags);
   62.93 +    _spin_barrier(lock);
   62.94 +    local_irq_restore(flags);
   62.95 +}
   62.96 +
   62.97  void _spin_lock_recursive(spinlock_t *lock)
   62.98  {
   62.99      int cpu = smp_processor_id();
  62.100 @@ -61,6 +114,8 @@ void _spin_lock_recursive(spinlock_t *lo
  62.101      /* Don't allow overflow of recurse_cpu field. */
  62.102      BUILD_BUG_ON(NR_CPUS > 0xfffu);
  62.103  
  62.104 +    check_lock(&lock->debug);
  62.105 +
  62.106      if ( likely(lock->recurse_cpu != cpu) )
  62.107      {
  62.108          spin_lock(lock);
  62.109 @@ -83,12 +138,15 @@ void _spin_unlock_recursive(spinlock_t *
  62.110  
  62.111  void _read_lock(rwlock_t *lock)
  62.112  {
  62.113 +    check_lock(&lock->debug);
  62.114      _raw_read_lock(&lock->raw);
  62.115  }
  62.116  
  62.117  void _read_lock_irq(rwlock_t *lock)
  62.118  {
  62.119 +    ASSERT(local_irq_is_enabled());
  62.120      local_irq_disable();
  62.121 +    check_lock(&lock->debug);
  62.122      _raw_read_lock(&lock->raw);
  62.123  }
  62.124  
  62.125 @@ -96,6 +154,7 @@ unsigned long _read_lock_irqsave(rwlock_
  62.126  {
  62.127      unsigned long flags;
  62.128      local_irq_save(flags);
  62.129 +    check_lock(&lock->debug);
  62.130      _raw_read_lock(&lock->raw);
  62.131      return flags;
  62.132  }
  62.133 @@ -119,12 +178,15 @@ void _read_unlock_irqrestore(rwlock_t *l
  62.134  
  62.135  void _write_lock(rwlock_t *lock)
  62.136  {
  62.137 +    check_lock(&lock->debug);
  62.138      _raw_write_lock(&lock->raw);
  62.139  }
  62.140  
  62.141  void _write_lock_irq(rwlock_t *lock)
  62.142  {
  62.143 +    ASSERT(local_irq_is_enabled());
  62.144      local_irq_disable();
  62.145 +    check_lock(&lock->debug);
  62.146      _raw_write_lock(&lock->raw);
  62.147  }
  62.148  
  62.149 @@ -132,6 +194,7 @@ unsigned long _write_lock_irqsave(rwlock
  62.150  {
  62.151      unsigned long flags;
  62.152      local_irq_save(flags);
  62.153 +    check_lock(&lock->debug);
  62.154      _raw_write_lock(&lock->raw);
  62.155      return flags;
  62.156  }
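
check_lock() enforces a single invariant: any given lock is taken either always with interrupts enabled or always with them disabled. The first acquisition stamps debug->irq_safe via cmpxchg() (-1 meaning "not yet observed"); a later acquisition in the opposite state hits the BUG_ON. The deadlock this catches, in miniature (hypothetical lock L):

    void task_context(void)
    {
        spin_lock(&L);      /* IRQs enabled: records irq_safe == 0        */
        /* interrupt arrives here ... */
        spin_unlock(&L);
    }

    void irq_handler(void)
    {
        spin_lock(&L);      /* would spin forever against task_context(); */
    }                       /* with debugging live, check_lock() BUGs at
                               this mixed-context acquisition instead     */

The new _spin_barrier_irq() covers the converse case: waiting out an IRQ-safe lock such as virq_lock (see the event_channel.c hunk above) requires interrupts to be masked around the barrier, or the barrier's own check_lock() would flag the inconsistency.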
    63.1 --- a/xen/common/timer.c	Tue Nov 04 12:07:22 2008 +0900
    63.2 +++ b/xen/common/timer.c	Tue Nov 04 12:43:19 2008 +0900
    63.3 @@ -25,10 +25,12 @@
     63.4   * We pull handlers off the timer list this far in the future,
    63.5   * rather than reprogramming the time hardware.
    63.6   */
    63.7 -#define TIMER_SLOP (50*1000) /* ns */
    63.8 +static unsigned int timer_slop __read_mostly = 50000; /* 50 us */
    63.9 +integer_param("timer_slop", timer_slop);
   63.10  
   63.11  struct timers {
   63.12      spinlock_t     lock;
   63.13 +    bool_t         overflow;
   63.14      struct timer **heap;
   63.15      struct timer  *list;
   63.16      struct timer  *running;
   63.17 @@ -200,6 +202,7 @@ static int add_entry(struct timers *time
   63.18          return rc;
   63.19  
   63.20      /* Fall back to adding to the slower linked list. */
   63.21 +    timers->overflow = 1;
   63.22      t->status = TIMER_STATUS_in_list;
   63.23      return add_to_list(&timers->list, t);
   63.24  }
   63.25 @@ -258,6 +261,7 @@ void set_timer(struct timer *timer, s_ti
   63.26          __stop_timer(timer);
   63.27  
   63.28      timer->expires = expires;
   63.29 +    timer->expires_end = expires + timer_slop;
   63.30  
   63.31      if ( likely(timer->status != TIMER_STATUS_killed) )
   63.32          __add_timer(timer);
   63.33 @@ -344,19 +348,30 @@ void kill_timer(struct timer *timer)
   63.34  }
   63.35  
   63.36  
   63.37 +static void execute_timer(struct timers *ts, struct timer *t)
   63.38 +{
   63.39 +    void (*fn)(void *) = t->function;
   63.40 +    void *data = t->data;
   63.41 +
   63.42 +    ts->running = t;
   63.43 +    spin_unlock_irq(&ts->lock);
   63.44 +    (*fn)(data);
   63.45 +    spin_lock_irq(&ts->lock);
   63.46 +    ts->running = NULL;
   63.47 +}
   63.48 +
   63.49 +
   63.50  static void timer_softirq_action(void)
   63.51  {
   63.52      struct timer  *t, **heap, *next;
   63.53      struct timers *ts;
   63.54 -    s_time_t       now, deadline;
   63.55 -    void         (*fn)(void *);
   63.56 -    void          *data;
   63.57 +    s_time_t       now;
   63.58  
   63.59      ts = &this_cpu(timers);
   63.60      heap = ts->heap;
   63.61  
   63.62 -    /* If we are using overflow linked list, try to allocate a larger heap. */
   63.63 -    if ( unlikely(ts->list != NULL) )
   63.64 +    /* If we overflowed the heap, try to allocate a larger heap. */
   63.65 +    if ( unlikely(ts->overflow) )
   63.66      {
   63.67          /* old_limit == (2^n)-1; new_limit == (2^(n+4))-1 */
   63.68          int old_limit = GET_HEAP_LIMIT(heap);
   63.69 @@ -377,7 +392,26 @@ static void timer_softirq_action(void)
   63.70  
   63.71      spin_lock_irq(&ts->lock);
   63.72  
   63.73 -    /* Try to move timers from overflow linked list to more efficient heap. */
   63.74 +    now = NOW();
   63.75 +
   63.76 +    /* Execute ready heap timers. */
   63.77 +    while ( (GET_HEAP_SIZE(heap) != 0) &&
   63.78 +            ((t = heap[1])->expires_end < now) )
   63.79 +    {
   63.80 +        remove_from_heap(heap, t);
   63.81 +        t->status = TIMER_STATUS_inactive;
   63.82 +        execute_timer(ts, t);
   63.83 +    }
   63.84 +
   63.85 +    /* Execute ready list timers. */
   63.86 +    while ( ((t = ts->list) != NULL) && (t->expires_end < now) )
   63.87 +    {
   63.88 +        ts->list = t->list_next;
   63.89 +        t->status = TIMER_STATUS_inactive;
   63.90 +        execute_timer(ts, t);
   63.91 +    }
   63.92 +
   63.93 +    /* Try to move timers from linked list to more efficient heap. */
   63.94      next = ts->list;
   63.95      ts->list = NULL;
   63.96      while ( unlikely((t = next) != NULL) )
   63.97 @@ -387,51 +421,44 @@ static void timer_softirq_action(void)
   63.98          add_entry(ts, t);
   63.99      }
  63.100  
  63.101 -    now = NOW();
  63.102 -
  63.103 -    while ( (GET_HEAP_SIZE(heap) != 0) &&
  63.104 -            ((t = heap[1])->expires < (now + TIMER_SLOP)) )
  63.105 +    ts->overflow = (ts->list != NULL);
  63.106 +    if ( unlikely(ts->overflow) )
  63.107      {
  63.108 -        remove_entry(ts, t);
  63.109 +        /* Find earliest deadline at head of list or top of heap. */
  63.110 +        this_cpu(timer_deadline) = ts->list->expires;
  63.111 +        if ( (GET_HEAP_SIZE(heap) != 0) &&
  63.112 +             ((t = heap[1])->expires < this_cpu(timer_deadline)) )
  63.113 +            this_cpu(timer_deadline) = t->expires;
  63.114 +    }
  63.115 +    else
  63.116 +    {
  63.117 +        /*
   63.118 +         * Find the earliest deadline that encompasses the largest number of timers
  63.119 +         * on the heap. To do this we take timers from the heap while their
  63.120 +         * valid deadline ranges continue to intersect.
  63.121 +         */
  63.122 +        s_time_t start = 0, end = STIME_MAX;
  63.123 +        struct timer **list_tail = &ts->list;
  63.124  
  63.125 -        ts->running = t;
  63.126 +        while ( (GET_HEAP_SIZE(heap) != 0) &&
  63.127 +                ((t = heap[1])->expires <= end) )
  63.128 +        {
  63.129 +            remove_entry(ts, t);
  63.130  
  63.131 -        fn   = t->function;
  63.132 -        data = t->data;
  63.133 +            t->status = TIMER_STATUS_in_list;
  63.134 +            t->list_next = NULL;
  63.135 +            *list_tail = t;
  63.136 +            list_tail = &t->list_next;
  63.137  
  63.138 -        spin_unlock_irq(&ts->lock);
  63.139 -        (*fn)(data);
  63.140 -        spin_lock_irq(&ts->lock);
  63.141 +            start = t->expires;
  63.142 +            if ( end > t->expires_end )
  63.143 +                end = t->expires_end;
  63.144 +        }
  63.145 +
  63.146 +        this_cpu(timer_deadline) = start;
  63.147      }
  63.148  
  63.149 -    deadline = GET_HEAP_SIZE(heap) ? heap[1]->expires : 0;
  63.150 -
  63.151 -    while ( unlikely((t = ts->list) != NULL) )
  63.152 -    {
  63.153 -        if ( t->expires >= (now + TIMER_SLOP) )
  63.154 -        {
  63.155 -            if ( (deadline == 0) || (deadline > t->expires) )
  63.156 -                deadline = t->expires;
  63.157 -            break;
  63.158 -        }
  63.159 -
  63.160 -        ts->list = t->list_next;
  63.161 -        t->status = TIMER_STATUS_inactive;
  63.162 -
  63.163 -        ts->running = t;
  63.164 -
  63.165 -        fn   = t->function;
  63.166 -        data = t->data;
  63.167 -
  63.168 -        spin_unlock_irq(&ts->lock);
  63.169 -        (*fn)(data);
  63.170 -        spin_lock_irq(&ts->lock);
  63.171 -    }
  63.172 -
  63.173 -    ts->running = NULL;
  63.174 -
  63.175 -    this_cpu(timer_deadline) = deadline;
  63.176 -    if ( !reprogram_timer(deadline) )
  63.177 +    if ( !reprogram_timer(this_cpu(timer_deadline)) )
  63.178          raise_softirq(TIMER_SOFTIRQ);
  63.179  
  63.180      spin_unlock_irq(&ts->lock);
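
The timer rework gives every timer a validity window [expires, expires_end], with expires_end = expires + timer_slop; the slop is now a boot parameter (e.g. timer_slop=100000 on the Xen command line for 100us of coalescing slack). After firing everything already past its window, the softirq batches future heap timers whose windows still intersect, so a single hardware interrupt at the common point serves the whole batch. The core of the intersection walk, condensed from the hunk above (queue_for_batch() is a hypothetical stand-in for linking onto ts->list):

    s_time_t start = 0, end = STIME_MAX;

    while ( (GET_HEAP_SIZE(heap) != 0) && ((t = heap[1])->expires <= end) )
    {
        remove_entry(ts, t);        /* heap is ordered by 'expires'   */
        queue_for_batch(t);
        start = t->expires;         /* latest start seen so far wins  */
        if ( end > t->expires_end )
            end = t->expires_end;   /* narrow the shared window       */
    }
    reprogram_timer(start);         /* one interrupt serves the batch */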
    64.1 --- a/xen/common/xenoprof.c	Tue Nov 04 12:07:22 2008 +0900
    64.2 +++ b/xen/common/xenoprof.c	Tue Nov 04 12:43:19 2008 +0900
    64.3 @@ -85,7 +85,7 @@ int is_active(struct domain *d)
    64.4      return ((x != NULL) && (x->domain_type == XENOPROF_DOMAIN_ACTIVE));
    64.5  }
    64.6  
    64.7 -static int is_passive(struct domain *d)
    64.8 +int is_passive(struct domain *d)
    64.9  {
   64.10      struct xenoprof *x = d->xenoprof;
   64.11      return ((x != NULL) && (x->domain_type == XENOPROF_DOMAIN_PASSIVE));
    65.1 --- a/xen/common/xmalloc.c	Tue Nov 04 12:07:22 2008 +0900
    65.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    65.3 @@ -1,286 +0,0 @@
    65.4 -/******************************************************************************
    65.5 - * Simple allocator for Xen.  If larger than a page, simply use the
    65.6 - * page-order allocator.
    65.7 - *
    65.8 - * Copyright (C) 2005 Rusty Russell IBM Corporation
    65.9 - *
   65.10 - * This program is free software; you can redistribute it and/or modify
   65.11 - * it under the terms of the GNU General Public License as published by
   65.12 - * the Free Software Foundation; either version 2 of the License, or
   65.13 - * (at your option) any later version.
   65.14 - *
   65.15 - * This program is distributed in the hope that it will be useful,
   65.16 - * but WITHOUT ANY WARRANTY; without even the implied warranty of
   65.17 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   65.18 - * GNU General Public License for more details.
   65.19 - *
   65.20 - * You should have received a copy of the GNU General Public License
   65.21 - * along with this program; if not, write to the Free Software
   65.22 - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   65.23 - */
   65.24 -
   65.25 -/*
   65.26 - * TODO (Keir, 17/2/05):
   65.27 - *  1. Use space in page_info to avoid xmalloc_hdr in allocated blocks.
   65.28 - *  2. page_info points into free list to make xfree() O(1) complexity.
   65.29 - *  3. Perhaps make this a sub-page buddy allocator? xmalloc() == O(1).
   65.30 - *     (Disadvantage is potentially greater internal fragmentation).
   65.31 - */
   65.32 -
   65.33 -#include <xen/config.h>
   65.34 -#include <xen/mm.h>
   65.35 -#include <xen/spinlock.h>
   65.36 -#include <xen/timer.h>
   65.37 -#include <xen/cache.h>
   65.38 -#include <xen/prefetch.h>
   65.39 -#include <xen/irq.h>
   65.40 -#include <xen/smp.h>
   65.41 -
   65.42 -/*
   65.43 - * XMALLOC_DEBUG:
   65.44 - *  1. Free data blocks are filled with poison bytes.
   65.45 - *  2. In-use data blocks have guard bytes at the start and end.
   65.46 - */
   65.47 -#ifndef NDEBUG
   65.48 -#define XMALLOC_DEBUG 1
   65.49 -#endif
   65.50 -
   65.51 -static LIST_HEAD(freelist);
   65.52 -static DEFINE_SPINLOCK(freelist_lock);
   65.53 -
   65.54 -struct xmalloc_hdr
   65.55 -{
   65.56 -    /* Size is total including this header. */
   65.57 -    size_t size;
   65.58 -    struct list_head freelist;
   65.59 -} __cacheline_aligned;
   65.60 -
   65.61 -static void add_to_freelist(struct xmalloc_hdr *hdr)
   65.62 -{
   65.63 -#if XMALLOC_DEBUG
   65.64 -    memset(hdr + 1, 0xa5, hdr->size - sizeof(*hdr));
   65.65 -#endif
   65.66 -    list_add(&hdr->freelist, &freelist);
   65.67 -}
   65.68 -
   65.69 -static void del_from_freelist(struct xmalloc_hdr *hdr)
   65.70 -{
   65.71 -#if XMALLOC_DEBUG
   65.72 -    size_t i;
   65.73 -    unsigned char *data = (unsigned char *)(hdr + 1);
   65.74 -    for ( i = 0; i < (hdr->size - sizeof(*hdr)); i++ )
   65.75 -        BUG_ON(data[i] != 0xa5);
   65.76 -    BUG_ON((hdr->size <= 0) || (hdr->size >= PAGE_SIZE));
   65.77 -#endif
   65.78 -    list_del(&hdr->freelist);
   65.79 -}
   65.80 -
   65.81 -static void *data_from_header(struct xmalloc_hdr *hdr)
   65.82 -{
   65.83 -#if XMALLOC_DEBUG
   65.84 -    /* Data block contain SMP_CACHE_BYTES of guard canary. */
   65.85 -    unsigned char *data = (unsigned char *)(hdr + 1);
   65.86 -    memset(data, 0x5a, SMP_CACHE_BYTES);
   65.87 -    memset(data + hdr->size - sizeof(*hdr) - SMP_CACHE_BYTES,
   65.88 -           0x5a, SMP_CACHE_BYTES);
   65.89 -    return data + SMP_CACHE_BYTES;
   65.90 -#else
   65.91 -    return hdr + 1;
   65.92 -#endif
   65.93 -}
   65.94 -
   65.95 -static struct xmalloc_hdr *header_from_data(void *p)
   65.96 -{
   65.97 -#if XMALLOC_DEBUG
   65.98 -    unsigned char *data = (unsigned char *)p - SMP_CACHE_BYTES;
   65.99 -    struct xmalloc_hdr *hdr = (struct xmalloc_hdr *)data - 1;
  65.100 -    size_t i;
  65.101 -
  65.102 -    /* Check header guard canary. */
  65.103 -    for ( i = 0; i < SMP_CACHE_BYTES; i++ )
  65.104 -        BUG_ON(data[i] != 0x5a);
  65.105 -
  65.106 -    /* Check footer guard canary. */
  65.107 -    data += hdr->size - sizeof(*hdr) - SMP_CACHE_BYTES;
  65.108 -    for ( i = 0; i < SMP_CACHE_BYTES; i++ )
  65.109 -        BUG_ON(data[i] != 0x5a);
  65.110 -
  65.111 -    return hdr;
  65.112 -#else
  65.113 -    return (struct xmalloc_hdr *)p - 1;
  65.114 -#endif
  65.115 -}
  65.116 -
  65.117 -static void maybe_split(struct xmalloc_hdr *hdr, size_t size, size_t block)
  65.118 -{
  65.119 -    struct xmalloc_hdr *extra;
  65.120 -    size_t leftover = block - size;
  65.121 -
  65.122 -    /* If enough is left to make a block, put it on free list. */
  65.123 -    if ( leftover >= (2 * sizeof(struct xmalloc_hdr)) )
  65.124 -    {
  65.125 -        extra = (struct xmalloc_hdr *)((unsigned long)hdr + size);
  65.126 -        extra->size = leftover;
  65.127 -        add_to_freelist(extra);
  65.128 -    }
  65.129 -    else
  65.130 -    {
  65.131 -        size = block;
  65.132 -    }
  65.133 -
  65.134 -    hdr->size = size;
  65.135 -    /* Debugging aid. */
  65.136 -    hdr->freelist.next = hdr->freelist.prev = NULL;
  65.137 -}
  65.138 -
  65.139 -static void *xmalloc_new_page(size_t size)
  65.140 -{
  65.141 -    struct xmalloc_hdr *hdr;
  65.142 -
  65.143 -    hdr = alloc_xenheap_page();
  65.144 -    if ( hdr == NULL )
  65.145 -        return NULL;
  65.146 -
  65.147 -    spin_lock(&freelist_lock);
  65.148 -    maybe_split(hdr, size, PAGE_SIZE);
  65.149 -    spin_unlock(&freelist_lock);
  65.150 -
  65.151 -    return data_from_header(hdr);
  65.152 -}
  65.153 -
  65.154 -/* Big object?  Just use the page allocator. */
  65.155 -static void *xmalloc_whole_pages(size_t size)
  65.156 -{
  65.157 -    struct xmalloc_hdr *hdr;
  65.158 -    unsigned int pageorder = get_order_from_bytes(size);
  65.159 -
  65.160 -    hdr = alloc_xenheap_pages(pageorder);
  65.161 -    if ( hdr == NULL )
  65.162 -        return NULL;
  65.163 -
  65.164 -    hdr->size = (1 << (pageorder + PAGE_SHIFT));
  65.165 -    /* Debugging aid. */
  65.166 -    hdr->freelist.next = hdr->freelist.prev = NULL;
  65.167 -
  65.168 -    return data_from_header(hdr);
  65.169 -}
  65.170 -
  65.171 -/* Return size, increased to alignment with align. */
  65.172 -static inline size_t align_up(size_t size, size_t align)
  65.173 -{
  65.174 -    return (size + align - 1) & ~(align - 1);
  65.175 -}
  65.176 -
  65.177 -void *_xmalloc(size_t size, size_t align)
  65.178 -{
  65.179 -    struct xmalloc_hdr *i;
  65.180 -
  65.181 -    ASSERT(!in_irq());
  65.182 -
  65.183 -    /* We currently always return cacheline aligned. */
  65.184 -    BUG_ON(align > SMP_CACHE_BYTES);
  65.185 -
  65.186 -#if XMALLOC_DEBUG
  65.187 -    /* Add room for canaries at start and end of data block. */
  65.188 -    size += 2 * SMP_CACHE_BYTES;
  65.189 -#endif
  65.190 -
  65.191 -    /* Add room for header, pad to align next header. */
  65.192 -    size += sizeof(struct xmalloc_hdr);
  65.193 -    size = align_up(size, __alignof__(struct xmalloc_hdr));
  65.194 -
  65.195 -    /* For big allocs, give them whole pages. */
  65.196 -    if ( size >= PAGE_SIZE )
  65.197 -        return xmalloc_whole_pages(size);
  65.198 -
  65.199 -    /* Search free list. */
  65.200 -    spin_lock(&freelist_lock);
  65.201 -    list_for_each_entry( i, &freelist, freelist )
  65.202 -    {
  65.203 -        if ( i->size < size )
  65.204 -            continue;
  65.205 -        del_from_freelist(i);
  65.206 -        maybe_split(i, size, i->size);
  65.207 -        spin_unlock(&freelist_lock);
  65.208 -        return data_from_header(i);
  65.209 -    }
  65.210 -    spin_unlock(&freelist_lock);
  65.211 -
  65.212 -    /* Alloc a new page and return from that. */
  65.213 -    return xmalloc_new_page(size);
  65.214 -}
  65.215 -
  65.216 -void xfree(void *p)
  65.217 -{
  65.218 -    struct xmalloc_hdr *i, *tmp, *hdr;
  65.219 -
  65.220 -    ASSERT(!in_irq());
  65.221 -
  65.222 -    if ( p == NULL )
  65.223 -        return;
  65.224 -
  65.225 -    hdr = header_from_data(p);
  65.226 -
  65.227 -    /* We know hdr will be on same page. */
  65.228 -    BUG_ON(((long)p & PAGE_MASK) != ((long)hdr & PAGE_MASK));
  65.229 -
  65.230 -    /* Not previously freed. */
  65.231 -    BUG_ON(hdr->freelist.next || hdr->freelist.prev);
  65.232 -
  65.233 -    /* Big allocs free directly. */
  65.234 -    if ( hdr->size >= PAGE_SIZE )
  65.235 -    {
  65.236 -        free_xenheap_pages(hdr, get_order_from_bytes(hdr->size));
  65.237 -        return;
  65.238 -    }
  65.239 -
  65.240 -    /* Merge with other free block, or put in list. */
  65.241 -    spin_lock(&freelist_lock);
  65.242 -    list_for_each_entry_safe( i, tmp, &freelist, freelist )
  65.243 -    {
  65.244 -        unsigned long _i   = (unsigned long)i;
  65.245 -        unsigned long _hdr = (unsigned long)hdr;
  65.246 -
  65.247 -        /* Do not merge across page boundaries. */
  65.248 -        if ( ((_i ^ _hdr) & PAGE_MASK) != 0 )
  65.249 -            continue;
  65.250 -
  65.251 -        /* We follow this block?  Swallow it. */
  65.252 -        if ( (_i + i->size) == _hdr )
  65.253 -        {
  65.254 -            del_from_freelist(i);
  65.255 -            i->size += hdr->size;
  65.256 -            hdr = i;
  65.257 -        }
  65.258 -
  65.259 -        /* We precede this block? Swallow it. */
  65.260 -        if ( (_hdr + hdr->size) == _i )
  65.261 -        {
  65.262 -            del_from_freelist(i);
  65.263 -            hdr->size += i->size;
  65.264 -        }
  65.265 -    }
  65.266 -
  65.267 -    /* Did we merge an entire page? */
  65.268 -    if ( hdr->size == PAGE_SIZE )
  65.269 -    {
  65.270 -        BUG_ON((((unsigned long)hdr) & (PAGE_SIZE-1)) != 0);
  65.271 -        free_xenheap_pages(hdr, 0);
  65.272 -    }
  65.273 -    else
  65.274 -    {
  65.275 -        add_to_freelist(hdr);
  65.276 -    }
  65.277 -
  65.278 -    spin_unlock(&freelist_lock);
  65.279 -}
  65.280 -
  65.281 -/*
  65.282 - * Local variables:
  65.283 - * mode: C
  65.284 - * c-set-style: "BSD"
  65.285 - * c-basic-offset: 4
  65.286 - * tab-width: 4
  65.287 - * indent-tabs-mode: nil
  65.288 - * End:
  65.289 - */
    66.1 --- a/xen/drivers/char/serial.c	Tue Nov 04 12:07:22 2008 +0900
    66.2 +++ b/xen/drivers/char/serial.c	Tue Nov 04 12:43:19 2008 +0900
    66.3 @@ -74,7 +74,7 @@ void serial_tx_interrupt(struct serial_p
    66.4      while ( !spin_trylock(&port->tx_lock) )
    66.5      {
    66.6          if ( !port->driver->tx_empty(port) )
    66.7 -            return;
    66.8 +            goto out;
    66.9          cpu_relax();
   66.10      }
   66.11  
   66.12 @@ -89,7 +89,10 @@ void serial_tx_interrupt(struct serial_p
   66.13          }
   66.14      }
   66.15  
   66.16 -    spin_unlock_irqrestore(&port->tx_lock, flags);
   66.17 +    spin_unlock(&port->tx_lock);
   66.18 +
   66.19 + out:
   66.20 +    local_irq_restore(flags);
   66.21  }
   66.22  
   66.23  static void __serial_putc(struct serial_port *port, char c)
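
The serial fix closes an IRQ-flags leak: when the trylock loop bailed out because the transmitter went non-empty, the old code returned without restoring the flags saved at function entry (assumption: flags is captured by a local_irq_save() earlier in the function, outside this hunk). The corrected shape:

    local_irq_save(flags);
    while ( !spin_trylock(&port->tx_lock) )
    {
        if ( !port->driver->tx_empty(port) )
            goto out;                 /* was: bare return -- flags leaked */
        cpu_relax();
    }
    /* ... push pending bytes to the UART ... */
    spin_unlock(&port->tx_lock);
 out:
    local_irq_restore(flags);         /* every exit path restores flags */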
    67.1 --- a/xen/drivers/cpufreq/cpufreq.c	Tue Nov 04 12:07:22 2008 +0900
    67.2 +++ b/xen/drivers/cpufreq/cpufreq.c	Tue Nov 04 12:43:19 2008 +0900
    67.3 @@ -31,6 +31,7 @@
    67.4  #include <xen/errno.h>
    67.5  #include <xen/delay.h>
    67.6  #include <xen/cpumask.h>
    67.7 +#include <xen/list.h>
    67.8  #include <xen/sched.h>
    67.9  #include <xen/timer.h>
   67.10  #include <xen/xmalloc.h>
   67.11 @@ -44,8 +45,12 @@
   67.12  #include <acpi/acpi.h>
   67.13  #include <acpi/cpufreq/cpufreq.h>
   67.14  
   67.15 -/* TODO: change to link list later as domain number may be sparse */
   67.16 -static cpumask_t cpufreq_dom_map[NR_CPUS];
   67.17 +struct cpufreq_dom {
   67.18 +    unsigned int	dom;
   67.19 +    cpumask_t		map;
   67.20 +    struct list_head	node;
   67.21 +};
   67.22 +static LIST_HEAD(cpufreq_dom_list_head);
   67.23  
   67.24  int cpufreq_limit_change(unsigned int cpu)
   67.25  {
   67.26 @@ -72,48 +77,80 @@ int cpufreq_add_cpu(unsigned int cpu)
   67.27  {
   67.28      int ret = 0;
   67.29      unsigned int firstcpu;
   67.30 -    unsigned int dom;
   67.31 +    unsigned int dom, domexist = 0;
   67.32      unsigned int j;
   67.33 +    struct list_head *pos;
   67.34 +    struct cpufreq_dom *cpufreq_dom = NULL;
   67.35      struct cpufreq_policy new_policy;
   67.36      struct cpufreq_policy *policy;
   67.37      struct processor_performance *perf = &processor_pminfo[cpu]->perf;
   67.38  
   67.39      /* to protect the case when Px was not controlled by xen */
   67.40 -    if (!processor_pminfo[cpu] || !(perf->init & XEN_PX_INIT))
   67.41 -        return 0;
   67.42 +    if (!processor_pminfo[cpu]      ||
   67.43 +        !(perf->init & XEN_PX_INIT) ||
   67.44 +        !cpu_online(cpu))
   67.45 +        return -EINVAL;
   67.46  
   67.47 -    if (!cpu_online(cpu) || cpufreq_cpu_policy[cpu])
   67.48 -        return -EINVAL;
   67.49 +    if (cpufreq_cpu_policy[cpu])
   67.50 +        return 0;
   67.51  
   67.52      ret = cpufreq_statistic_init(cpu);
   67.53      if (ret)
   67.54          return ret;
   67.55  
   67.56      dom = perf->domain_info.domain;
   67.57 -    if (cpus_weight(cpufreq_dom_map[dom])) {
   67.58 +
   67.59 +    list_for_each(pos, &cpufreq_dom_list_head) {
   67.60 +        cpufreq_dom = list_entry(pos, struct cpufreq_dom, node);
   67.61 +        if (dom == cpufreq_dom->dom) {
   67.62 +            domexist = 1;
   67.63 +            break;
   67.64 +        }
   67.65 +    }
   67.66 +
   67.67 +    if (domexist) {
   67.68          /* share policy with the first cpu since on same boat */
   67.69 -        firstcpu = first_cpu(cpufreq_dom_map[dom]);
   67.70 +        firstcpu = first_cpu(cpufreq_dom->map);
   67.71          policy = cpufreq_cpu_policy[firstcpu];
   67.72  
   67.73          cpufreq_cpu_policy[cpu] = policy;
   67.74 -        cpu_set(cpu, cpufreq_dom_map[dom]);
   67.75 +        cpu_set(cpu, cpufreq_dom->map);
   67.76          cpu_set(cpu, policy->cpus);
   67.77  
   67.78 +        /* domain coordination sanity check */
   67.79 +        if ((perf->domain_info.coord_type !=
   67.80 +             processor_pminfo[firstcpu]->perf.domain_info.coord_type) ||
   67.81 +            (perf->domain_info.num_processors !=
   67.82 +             processor_pminfo[firstcpu]->perf.domain_info.num_processors)) {
   67.83 +            ret = -EINVAL;
   67.84 +            goto err2;
   67.85 +        }
   67.86 +
   67.87          printk(KERN_EMERG"adding CPU %u\n", cpu);
   67.88      } else {
   67.89 +        cpufreq_dom = xmalloc(struct cpufreq_dom);
   67.90 +        if (!cpufreq_dom) {
   67.91 +            cpufreq_statistic_exit(cpu);
   67.92 +            return -ENOMEM;
   67.93 +        }
   67.94 +        memset(cpufreq_dom, 0, sizeof(struct cpufreq_dom));
   67.95 +        cpufreq_dom->dom = dom;
   67.96 +        cpu_set(cpu, cpufreq_dom->map);
   67.97 +        list_add(&cpufreq_dom->node, &cpufreq_dom_list_head);
   67.98 +
   67.99          /* for the first cpu, setup policy and do init work */
  67.100          policy = xmalloc(struct cpufreq_policy);
  67.101          if (!policy) {
  67.102 +            list_del(&cpufreq_dom->node);
  67.103 +            xfree(cpufreq_dom);
  67.104              cpufreq_statistic_exit(cpu);
  67.105              return -ENOMEM;
  67.106          }
  67.107          memset(policy, 0, sizeof(struct cpufreq_policy));
  67.108 -
  67.109 +        policy->cpu = cpu;
  67.110 +        cpu_set(cpu, policy->cpus);
  67.111          cpufreq_cpu_policy[cpu] = policy;
  67.112 -        cpu_set(cpu, cpufreq_dom_map[dom]);
  67.113 -        cpu_set(cpu, policy->cpus);
  67.114  
  67.115 -        policy->cpu = cpu;
  67.116          ret = cpufreq_driver->init(policy);
  67.117          if (ret)
  67.118              goto err1;
  67.119 @@ -124,7 +161,7 @@ int cpufreq_add_cpu(unsigned int cpu)
  67.120       * After get full cpumap of the coordination domain,
  67.121       * we can safely start gov here.
  67.122       */
  67.123 -    if (cpus_weight(cpufreq_dom_map[dom]) ==
  67.124 +    if (cpus_weight(cpufreq_dom->map) ==
  67.125          perf->domain_info.num_processors) {
  67.126          memcpy(&new_policy, policy, sizeof(struct cpufreq_policy));
  67.127          policy->governor = NULL;
  67.128 @@ -138,51 +175,68 @@ int cpufreq_add_cpu(unsigned int cpu)
  67.129  err2:
  67.130      cpufreq_driver->exit(policy);
  67.131  err1:
  67.132 -    for_each_cpu_mask(j, cpufreq_dom_map[dom]) {
  67.133 +    for_each_cpu_mask(j, cpufreq_dom->map) {
  67.134          cpufreq_cpu_policy[j] = NULL;
  67.135          cpufreq_statistic_exit(j);
  67.136      }
  67.137  
  67.138 -    cpus_clear(cpufreq_dom_map[dom]);
  67.139 +    list_del(&cpufreq_dom->node);
  67.140 +    xfree(cpufreq_dom);
  67.141      xfree(policy);
  67.142      return ret;
  67.143  }
  67.144  
  67.145  int cpufreq_del_cpu(unsigned int cpu)
  67.146  {
  67.147 -    unsigned int dom;
  67.148 +    unsigned int dom, domexist = 0;
  67.149 +    struct list_head *pos;
  67.150 +    struct cpufreq_dom *cpufreq_dom = NULL;
  67.151      struct cpufreq_policy *policy;
  67.152      struct processor_performance *perf = &processor_pminfo[cpu]->perf;
  67.153  
  67.154      /* to protect the case when Px was not controlled by xen */
  67.155 -    if (!processor_pminfo[cpu] || !(perf->init & XEN_PX_INIT))
  67.156 -        return 0;
  67.157 +    if (!processor_pminfo[cpu]      ||
  67.158 +        !(perf->init & XEN_PX_INIT) ||
  67.159 +        !cpu_online(cpu))
  67.160 +        return -EINVAL;
  67.161  
  67.162 -    if (!cpu_online(cpu) || !cpufreq_cpu_policy[cpu])
  67.163 -        return -EINVAL;
  67.164 +    if (!cpufreq_cpu_policy[cpu])
  67.165 +        return 0;
  67.166  
  67.167      dom = perf->domain_info.domain;
  67.168      policy = cpufreq_cpu_policy[cpu];
  67.169  
  67.170 -    printk(KERN_EMERG"deleting CPU %u\n", cpu);
  67.171 +    list_for_each(pos, &cpufreq_dom_list_head) {
  67.172 +        cpufreq_dom = list_entry(pos, struct cpufreq_dom, node);
  67.173 +        if (dom == cpufreq_dom->dom) {
  67.174 +            domexist = 1;
  67.175 +            break;
  67.176 +        }
  67.177 +    }
  67.178 +
  67.179 +    if (!domexist)
  67.180 +        return -EINVAL;
  67.181  
  67.182      /* for the first cpu of the domain, stop gov */
  67.183 -    if (cpus_weight(cpufreq_dom_map[dom]) ==
  67.184 +    if (cpus_weight(cpufreq_dom->map) ==
  67.185          perf->domain_info.num_processors)
  67.186          __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
  67.187  
  67.188      cpufreq_cpu_policy[cpu] = NULL;
  67.189      cpu_clear(cpu, policy->cpus);
  67.190 -    cpu_clear(cpu, cpufreq_dom_map[dom]);
  67.191 +    cpu_clear(cpu, cpufreq_dom->map);
  67.192      cpufreq_statistic_exit(cpu);
  67.193  
  67.194      /* for the last cpu of the domain, clean room */
  67.195      /* It's safe here to free freq_table, drv_data and policy */
  67.196 -    if (!cpus_weight(cpufreq_dom_map[dom])) {
  67.197 +    if (!cpus_weight(cpufreq_dom->map)) {
  67.198          cpufreq_driver->exit(policy);
  67.199 +        list_del(&cpufreq_dom->node);
  67.200 +        xfree(cpufreq_dom);
  67.201          xfree(policy);
  67.202      }
  67.203  
  67.204 +    printk(KERN_EMERG"deleting CPU %u\n", cpu);
  67.205      return 0;
  67.206  }
  67.207  
  67.208 @@ -258,6 +312,24 @@ int set_px_pminfo(uint32_t acpi_id, stru
  67.209  
  67.210      if ( dom0_px_info->flags & XEN_PX_PCT )
  67.211      {
  67.212 +        /* space_id check */
  67.213 +        if (dom0_px_info->control_register.space_id != 
  67.214 +            dom0_px_info->status_register.space_id)
  67.215 +        {
  67.216 +            ret = -EINVAL;
  67.217 +            goto out;
  67.218 +        }
  67.219 +
  67.220 +#ifdef CONFIG_IA64
   67.221 +        /* for IA64, only FFH is currently supported */
  67.222 +        if (dom0_px_info->control_register.space_id !=
  67.223 +            ACPI_ADR_SPACE_FIXED_HARDWARE)
  67.224 +        {
  67.225 +            ret = -EINVAL;
  67.226 +            goto out;
  67.227 +        }
  67.228 +#endif
  67.229 +
  67.230          memcpy ((void *)&pxpt->control_register,
  67.231                  (void *)&dom0_px_info->control_register,
  67.232                  sizeof(struct xen_pct_register));
  67.233 @@ -267,8 +339,16 @@ int set_px_pminfo(uint32_t acpi_id, stru
  67.234          print_PCT(&pxpt->control_register);
  67.235          print_PCT(&pxpt->status_register);
  67.236      }
  67.237 +
  67.238      if ( dom0_px_info->flags & XEN_PX_PSS ) 
  67.239      {
  67.240 +        /* capability check */
  67.241 +        if (dom0_px_info->state_count <= 1)
  67.242 +        {
  67.243 +            ret = -EINVAL;
  67.244 +            goto out;
  67.245 +        }
  67.246 +
  67.247          if ( !(pxpt->states = xmalloc_array(struct xen_processor_px,
  67.248                          dom0_px_info->state_count)) )
  67.249          {
  67.250 @@ -280,14 +360,28 @@ int set_px_pminfo(uint32_t acpi_id, stru
  67.251          pxpt->state_count = dom0_px_info->state_count;
  67.252          print_PSS(pxpt->states,pxpt->state_count);
  67.253      }
  67.254 +
  67.255      if ( dom0_px_info->flags & XEN_PX_PSD )
  67.256      {
  67.257 +#ifdef CONFIG_X86
  67.258 +        /* for X86, check domain coordination */
  67.259 +        /* for IA64, _PSD is optional for current IA64 cpufreq algorithm */
  67.260 +        if (dom0_px_info->shared_type != CPUFREQ_SHARED_TYPE_ALL &&
  67.261 +            dom0_px_info->shared_type != CPUFREQ_SHARED_TYPE_ANY &&
  67.262 +            dom0_px_info->shared_type != CPUFREQ_SHARED_TYPE_HW)
  67.263 +        {
  67.264 +            ret = -EINVAL;
  67.265 +            goto out;
  67.266 +        }
  67.267 +#endif
  67.268 +
  67.269          pxpt->shared_type = dom0_px_info->shared_type;
  67.270          memcpy ((void *)&pxpt->domain_info,
  67.271                  (void *)&dom0_px_info->domain_info,
  67.272                  sizeof(struct xen_psd_package));
  67.273          print_PSD(&pxpt->domain_info);
  67.274      }
  67.275 +
  67.276      if ( dom0_px_info->flags & XEN_PX_PPC )
  67.277      {
  67.278          pxpt->platform_limit = dom0_px_info->platform_limit;
  67.279 @@ -295,7 +389,6 @@ int set_px_pminfo(uint32_t acpi_id, stru
  67.280  
  67.281          if ( pxpt->init == XEN_PX_INIT )
  67.282          {
  67.283 -
  67.284              ret = cpufreq_limit_change(cpuid); 
  67.285              goto out;
  67.286          }
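
Replacing the NR_CPUS-indexed cpumask array keyed by ACPI coordination-domain number with a linked list matters because _PSD domain IDs are arbitrary values, not dense small integers (the deleted TODO already flagged this), so indexing a fixed array with them was never safe. The lookup the patch open-codes in both add and del paths, extracted as a helper for clarity (the helper itself is hypothetical):

    static struct cpufreq_dom *find_cpufreq_dom(unsigned int dom)
    {
        struct list_head *pos;

        list_for_each(pos, &cpufreq_dom_list_head)
        {
            struct cpufreq_dom *cd = list_entry(pos, struct cpufreq_dom, node);
            if ( cd->dom == dom )
                return cd;
        }
        return NULL;    /* first CPU of the domain: caller allocates a node */
    }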
    68.1 --- a/xen/include/asm-x86/config.h	Tue Nov 04 12:07:22 2008 +0900
    68.2 +++ b/xen/include/asm-x86/config.h	Tue Nov 04 12:43:19 2008 +0900
    68.3 @@ -41,14 +41,6 @@
    68.4  #define CONFIG_HOTPLUG 1
    68.5  #define CONFIG_HOTPLUG_CPU 1
    68.6  
    68.7 -/*
    68.8 - * Avoid deep recursion when tearing down pagetables during domain destruction,
    68.9 - * causing dom0 to become unresponsive and Xen to miss time-critical softirq
   68.10 - * deadlines. This will ultimately be replaced by built-in preemptibility of
   68.11 - * get_page_type().
   68.12 - */
   68.13 -#define DOMAIN_DESTRUCT_AVOID_RECURSION 1
   68.14 -
   68.15  #define HZ 100
   68.16  
   68.17  #define OPT_CONSOLE_STR "vga"
    69.1 --- a/xen/include/asm-x86/event.h	Tue Nov 04 12:07:22 2008 +0900
    69.2 +++ b/xen/include/asm-x86/event.h	Tue Nov 04 12:43:19 2008 +0900
    69.3 @@ -11,36 +11,8 @@
    69.4  
    69.5  #include <xen/shared.h>
    69.6  
    69.7 -static inline void vcpu_kick(struct vcpu *v)
    69.8 -{
    69.9 -    /*
   69.10 -     * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
   69.11 -     * pending flag. These values may fluctuate (after all, we hold no
   69.12 -     * locks) but the key insight is that each change will cause
   69.13 -     * evtchn_upcall_pending to be polled.
   69.14 -     * 
   69.15 -     * NB2. We save the running flag across the unblock to avoid a needless
   69.16 -     * IPI for domains that we IPI'd to unblock.
   69.17 -     */
   69.18 -    int running = v->is_running;
   69.19 -    vcpu_unblock(v);
   69.20 -    if ( running )
   69.21 -        smp_send_event_check_cpu(v->processor);
   69.22 -}
   69.23 -
   69.24 -static inline void vcpu_mark_events_pending(struct vcpu *v)
   69.25 -{
   69.26 -    int already_pending = test_and_set_bit(
   69.27 -        0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));
   69.28 -
   69.29 -    if ( already_pending )
   69.30 -        return;
   69.31 -
   69.32 -    if ( is_hvm_vcpu(v) )
   69.33 -        hvm_assert_evtchn_irq(v);
   69.34 -    else
   69.35 -        vcpu_kick(v);
   69.36 -}
   69.37 +void vcpu_kick(struct vcpu *v);
   69.38 +void vcpu_mark_events_pending(struct vcpu *v);
   69.39  
   69.40  int hvm_local_events_need_delivery(struct vcpu *v);
   69.41  static inline int local_events_need_delivery(void)
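
vcpu_kick() and vcpu_mark_events_pending() become out-of-line; the header keeps only declarations, and the removed inline bodies above still describe what any replacement must do. The move pairs with the new VCPU_KICK_SOFTIRQ in asm-x86/softirq.h below, presumably so kicks issued from awkward contexts can be deferred to softirq time rather than IPI'd immediately -- an inference, since the .c side is not part of this section. A hedged sketch of that shape; everything except VCPU_KICK_SOFTIRQ, open_softirq() and raise_softirq() is hypothetical:

    static void vcpu_kick_softirq_fn(void)
    {
        /* drain a per-CPU list of vcpus recorded by vcpu_kick() and
         * deliver the deferred smp_send_event_check_cpu() calls */
    }

    open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq_fn);
    raise_softirq(VCPU_KICK_SOFTIRQ);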
    70.1 --- a/xen/include/asm-x86/fixmap.h	Tue Nov 04 12:07:22 2008 +0900
    70.2 +++ b/xen/include/asm-x86/fixmap.h	Tue Nov 04 12:43:19 2008 +0900
    70.3 @@ -29,6 +29,7 @@
    70.4   * from the end of virtual memory backwards.
    70.5   */
    70.6  enum fixed_addresses {
    70.7 +    FIX_RESERVED, /* Index 0 is reserved since fix_to_virt(0) > FIXADDR_TOP. */
    70.8  #ifdef __i386__
    70.9      FIX_PAE_HIGHMEM_0,
   70.10      FIX_PAE_HIGHMEM_END = FIX_PAE_HIGHMEM_0 + NR_CPUS-1,
    71.1 --- a/xen/include/asm-x86/hvm/vmx/vpmu.h	Tue Nov 04 12:07:22 2008 +0900
    71.2 +++ b/xen/include/asm-x86/hvm/vmx/vpmu.h	Tue Nov 04 12:43:19 2008 +0900
    71.3 @@ -67,7 +67,7 @@ struct vpmu_struct {
    71.4  #define VPMU_CONTEXT_ALLOCATED              0x1
    71.5  #define VPMU_CONTEXT_LOADED                 0x2
    71.6  #define VPMU_RUNNING                        0x4
    71.7 -
    71.8 +#define PASSIVE_DOMAIN_ALLOCATED	    0x8
    71.9  int vpmu_do_wrmsr(struct cpu_user_regs *regs);
   71.10  int vpmu_do_rdmsr(struct cpu_user_regs *regs);
   71.11  int vpmu_do_interrupt(struct cpu_user_regs *regs);
    72.1 --- a/xen/include/asm-x86/hvm/vmx/vpmu_core2.h	Tue Nov 04 12:07:22 2008 +0900
    72.2 +++ b/xen/include/asm-x86/hvm/vmx/vpmu_core2.h	Tue Nov 04 12:43:19 2008 +0900
    72.3 @@ -23,28 +23,6 @@
    72.4  #ifndef __ASM_X86_HVM_VPMU_CORE_H_
    72.5  #define __ASM_X86_HVM_VPMU_CORE_H_
    72.6  
    72.7 -/* Core 2 Non-architectual Performance Counter MSRs. */
    72.8 -u32 core2_counters_msr[] =   {
    72.9 -    MSR_CORE_PERF_FIXED_CTR0,
   72.10 -    MSR_CORE_PERF_FIXED_CTR1,
   72.11 -    MSR_CORE_PERF_FIXED_CTR2};
   72.12 -
   72.13 -/* Core 2 Non-architectual Performance Control MSRs. */
   72.14 -u32 core2_ctrls_msr[] = {
   72.15 -    MSR_CORE_PERF_FIXED_CTR_CTRL,
   72.16 -    MSR_IA32_PEBS_ENABLE,
   72.17 -    MSR_IA32_DS_AREA};
   72.18 -
   72.19 -struct pmumsr core2_counters = {
   72.20 -    3,
   72.21 -    core2_counters_msr
   72.22 -};
   72.23 -
   72.24 -struct pmumsr core2_ctrls = {
   72.25 -    3,
   72.26 -    core2_ctrls_msr
   72.27 -};
   72.28 -
   72.29  struct arch_msr_pair {
   72.30      u64 counter;
   72.31      u64 control;
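
Non-static array definitions in a header emit a copy (and a duplicate-symbol conflict) in every translation unit that includes it; the tables now presumably live in exactly one .c file (vpmu_core2.c is an assumption -- the consumer is not part of this section). The pattern:

    /* header: declaration only */
    extern u32 core2_counters_msr[];

    /* exactly one .c file: the single definition */
    u32 core2_counters_msr[] = {
        MSR_CORE_PERF_FIXED_CTR0,
        MSR_CORE_PERF_FIXED_CTR1,
        MSR_CORE_PERF_FIXED_CTR2
    };

Together with the PASSIVE_DOMAIN_ALLOCATED flag in vpmu.h above and the export of is_passive() from xen/common/xenoprof.c, this points at the vPMU state being extended to passively profiled domains.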
    73.1 --- a/xen/include/asm-x86/hvm/vpt.h	Tue Nov 04 12:07:22 2008 +0900
    73.2 +++ b/xen/include/asm-x86/hvm/vpt.h	Tue Nov 04 12:43:19 2008 +0900
    73.3 @@ -32,41 +32,6 @@
    73.4  #include <asm/hvm/irq.h>
    73.5  #include <public/hvm/save.h>
    73.6  
    73.7 -struct HPETState;
    73.8 -struct HPET_timer_fn_info {
    73.9 -    struct HPETState *hs;
   73.10 -    unsigned int tn;
   73.11 -};
   73.12 -
   73.13 -struct hpet_registers {
   73.14 -    /* Memory-mapped, software visible registers */
   73.15 -    uint64_t capability;        /* capabilities */
   73.16 -    uint64_t config;            /* configuration */
   73.17 -    uint64_t isr;               /* interrupt status reg */
   73.18 -    uint64_t mc64;              /* main counter */
   73.19 -    struct {                    /* timers */
   73.20 -        uint64_t config;        /* configuration/cap */
   73.21 -        uint64_t cmp;           /* comparator */
   73.22 -        uint64_t fsb;           /* FSB route, not supported now */
   73.23 -    } timers[HPET_TIMER_NUM];
   73.24 -
   73.25 -    /* Hidden register state */
   73.26 -    uint64_t period[HPET_TIMER_NUM]; /* Last value written to comparator */
   73.27 -};
   73.28 -
   73.29 -typedef struct HPETState {
   73.30 -    struct hpet_registers hpet;
   73.31 -    struct vcpu *vcpu;
   73.32 -    uint64_t stime_freq;
   73.33 -    uint64_t hpet_to_ns_scale; /* hpet ticks to ns (multiplied by 2^10) */
   73.34 -    uint64_t hpet_to_ns_limit; /* max hpet ticks convertable to ns      */
   73.35 -    uint64_t mc_offset;
   73.36 -    struct timer timers[HPET_TIMER_NUM];
   73.37 -    struct HPET_timer_fn_info timer_fn_info[HPET_TIMER_NUM]; 
   73.38 -    spinlock_t lock;
   73.39 -} HPETState;
   73.40 -
   73.41 -
   73.42  /*
   73.43   * Abstract layer of periodic time, one short time.
   73.44   */
   73.45 @@ -108,6 +73,34 @@ typedef struct PITState {
   73.46      spinlock_t lock;
   73.47  } PITState;
   73.48  
   73.49 +struct hpet_registers {
   73.50 +    /* Memory-mapped, software visible registers */
   73.51 +    uint64_t capability;        /* capabilities */
   73.52 +    uint64_t config;            /* configuration */
   73.53 +    uint64_t isr;               /* interrupt status reg */
   73.54 +    uint64_t mc64;              /* main counter */
   73.55 +    struct {                    /* timers */
   73.56 +        uint64_t config;        /* configuration/cap */
   73.57 +        uint64_t cmp;           /* comparator */
   73.58 +        uint64_t fsb;           /* FSB route, not supported now */
   73.59 +    } timers[HPET_TIMER_NUM];
   73.60 +
   73.61 +    /* Hidden register state */
   73.62 +    uint64_t period[HPET_TIMER_NUM]; /* Last value written to comparator */
   73.63 +    uint64_t comparator64[HPET_TIMER_NUM]; /* 64 bit running comparator */
   73.64 +};
   73.65 +
   73.66 +typedef struct HPETState {
   73.67 +    struct hpet_registers hpet;
   73.68 +    struct vcpu *vcpu;
   73.69 +    uint64_t stime_freq;
   73.70 +    uint64_t hpet_to_ns_scale; /* hpet ticks to ns (multiplied by 2^10) */
    73.71 +    uint64_t hpet_to_ns_limit; /* max hpet ticks convertible to ns      */
   73.72 +    uint64_t mc_offset;
   73.73 +    struct periodic_time pt[HPET_TIMER_NUM];
   73.74 +    spinlock_t lock;
   73.75 +} HPETState;
   73.76 +
   73.77  typedef struct RTCState {
   73.78      /* Hardware state */
   73.79      struct hvm_hw_rtc hw;
   73.80 @@ -160,13 +153,13 @@ void pt_migrate(struct vcpu *v);
   73.81   * The given periodic timer structure must be initialised with zero bytes,
   73.82   * except for the 'source' field which must be initialised with the
   73.83   * correct PTSRC_ value. The initialised timer structure can then be passed
   73.84 - * to {create,destroy}_periodic_time() and number of times and in any order.
   73.85 + * to {create,destroy}_periodic_time() any number of times and in any order.
   73.86   * Note that, for a given periodic timer, invocations of these functions MUST
   73.87   * be serialised.
   73.88   */
   73.89  void create_periodic_time(
   73.90 -    struct vcpu *v, struct periodic_time *pt, uint64_t period,
   73.91 -    uint8_t irq, char one_shot, time_cb *cb, void *data);
   73.92 +    struct vcpu *v, struct periodic_time *pt, uint64_t delta,
   73.93 +    uint64_t period, uint8_t irq, time_cb *cb, void *data);
   73.94  void destroy_periodic_time(struct periodic_time *pt);
   73.95  
   73.96  int pv_pit_handler(int port, int data, int write);
   73.97 @@ -185,7 +178,6 @@ void pmtimer_init(struct vcpu *v);
   73.98  void pmtimer_deinit(struct domain *d);
   73.99  void pmtimer_reset(struct domain *d);
  73.100  
  73.101 -void hpet_migrate_timers(struct vcpu *v);
  73.102  void hpet_init(struct vcpu *v);
  73.103  void hpet_deinit(struct domain *d);
  73.104  void hpet_reset(struct domain *d);
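
The HPET emulation drops its private struct timer array and HPET_timer_fn_info glue in favour of the generic periodic_time machinery, which is also why hpet_migrate_timers() disappears (pt_migrate() covers it). create_periodic_time() gains a separate first-expiry delta, so callers can program "first tick after delta, then every period". Sketched call with hypothetical locals; treating period == 0 as one-shot is an assumption based on the dropped one_shot flag:

    create_periodic_time(v, &h->pt[tn],
                         delta,   /* ns until the first expiry   */
                         period,  /* ns between subsequent ticks */
                         irq, cb, data);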
    74.1 --- a/xen/include/asm-x86/mm.h	Tue Nov 04 12:07:22 2008 +0900
    74.2 +++ b/xen/include/asm-x86/mm.h	Tue Nov 04 12:43:19 2008 +0900
    74.3 @@ -61,12 +61,36 @@ struct page_info
    74.4          /*
    74.5           * When PGT_partial is true then this field is valid and indicates
    74.6           * that PTEs in the range [0, @nr_validated_ptes) have been validated.
    74.7 -         * If @partial_pte is true then PTE at @nr_validated_ptes+1 has been
    74.8 -         * partially validated.
    74.9 +         * An extra page reference must be acquired (or not dropped) whenever
   74.10 +         * PGT_partial gets set, and it must be dropped when the flag gets
   74.11 +         * cleared. This is so that a get() leaving a page in partially
   74.12 +         * validated state (where the caller would drop the reference acquired
   74.13 +         * due to the getting of the type [apparently] failing [-EAGAIN])
   74.14 +         * would not accidentally result in a page left with zero general
   74.15 +         * reference count, but non-zero type reference count (possible when
   74.16 +         * the partial get() is followed immediately by domain destruction).
   74.17 +         * Likewise, the ownership of the single type reference for partially
   74.18 +         * (in-)validated pages is tied to this flag, i.e. the instance
   74.19 +         * setting the flag must not drop that reference, whereas the instance
   74.20 +         * clearing it will have to.
   74.21 +         *
   74.22 +         * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has
   74.23 +         * been partially validated. This implies that the general reference
   74.24 +         * to the page (acquired from get_page_from_lNe()) would be dropped
   74.25 +         * (again due to the apparent failure) and hence must be re-acquired
   74.26 +         * when resuming the validation, but must not be dropped when picking
   74.27 +         * up the page for invalidation.
   74.28 +         *
   74.29 +         * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has
   74.30 +         * been partially invalidated. This is basically the opposite case of
   74.31 +         * above, i.e. the general reference to the page was not dropped in
   74.32 +         * put_page_from_lNe() (due to the apparent failure), and hence it
   74.33 +         * must be dropped when the put operation is resumed (and completes),
   74.34 +         * but it must not be acquired if picking up the page for validation.
   74.35           */
   74.36          struct {
   74.37              u16 nr_validated_ptes;
   74.38 -            bool_t partial_pte;
   74.39 +            s8 partial_pte;
   74.40          };
   74.41  
   74.42          /*
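The widened comment replaces a boolean with a signed tri-state, which is easy to misread; the rules it encodes boil down to the following sketch (the resuming_validation flag, pg and owner variables are placeholders for illustration):

    /* Sketch of the tri-state rules (helpers are placeholders):
     *
     *   resume as  | partial_pte > 0        | partial_pte < 0
     *   -----------+------------------------+------------------------
     *   validate   | re-acquire general ref | must not acquire
     *   invalidate | must not drop          | drop ref on completion
     */
    if ( page->partial_pte > 0 && resuming_validation &&
         !get_page(pg, owner) )
        return -EINVAL;   /* the reference was dropped at the -EAGAIN point */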
    75.1 --- a/xen/include/asm-x86/page.h	Tue Nov 04 12:07:22 2008 +0900
    75.2 +++ b/xen/include/asm-x86/page.h	Tue Nov 04 12:43:19 2008 +0900
    75.3 @@ -314,6 +314,9 @@ unsigned long clone_idle_pagetable(struc
    75.4  #define __PAGE_HYPERVISOR_NOCACHE \
    75.5      (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED)
    75.6  
    75.7 +#define GRANT_PTE_FLAGS \
    75.8 +    (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_NX | _PAGE_GNTTAB)
    75.9 +
   75.10  #ifndef __ASSEMBLY__
   75.11  
   75.12  static inline int get_order_from_bytes(paddr_t size)
    76.1 --- a/xen/include/asm-x86/softirq.h	Tue Nov 04 12:07:22 2008 +0900
    76.2 +++ b/xen/include/asm-x86/softirq.h	Tue Nov 04 12:43:19 2008 +0900
    76.3 @@ -3,7 +3,8 @@
    76.4  
    76.5  #define NMI_MCE_SOFTIRQ        (NR_COMMON_SOFTIRQS + 0)
    76.6  #define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1)
    76.7 +#define VCPU_KICK_SOFTIRQ      (NR_COMMON_SOFTIRQS + 2)
    76.8  
    76.9 -#define NR_ARCH_SOFTIRQS       2
   76.10 +#define NR_ARCH_SOFTIRQS       3
   76.11  
   76.12  #endif /* __ASM_SOFTIRQ_H__ */
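NR_ARCH_SOFTIRQS grows to cover the new VCPU_KICK_SOFTIRQ. A softirq is inert until a handler is registered; a minimal sketch of the wiring (handler body and init hook are illustrative):

    #include <xen/softirq.h>

    static void vcpu_kick_softirq(void)
    {
        /* deliver any pending vcpu kicks for this physical CPU */
    }

    static int __init setup_vcpu_kick(void)
    {
        open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq);
        return 0;
    }
    __initcall(setup_vcpu_kick);

It is then raised from interrupt context with raise_softirq(VCPU_KICK_SOFTIRQ).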
    77.1 --- a/xen/include/asm-x86/x86_32/page.h	Tue Nov 04 12:07:22 2008 +0900
    77.2 +++ b/xen/include/asm-x86/x86_32/page.h	Tue Nov 04 12:43:19 2008 +0900
    77.3 @@ -105,9 +105,6 @@ extern unsigned int PAGE_HYPERVISOR_NOCA
    77.4  #define get_pte_flags(x) (((int)((x) >> 32) & ~0xFFF) | ((int)(x) & 0xFFF))
    77.5  #define put_pte_flags(x) (((intpte_t)((x) & ~0xFFF) << 32) | ((x) & 0xFFF))
    77.6  
    77.7 -#define GRANT_PTE_FLAGS \
    77.8 -    (_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_GNTTAB)
    77.9 -
   77.10  /*
   77.11   * Disallow unused flag bits plus PAT/PSE, PCD, PWT and GLOBAL.
   77.12   * Permit the NX bit if the hardware supports it.
    78.1 --- a/xen/include/asm-x86/x86_64/page.h	Tue Nov 04 12:07:22 2008 +0900
    78.2 +++ b/xen/include/asm-x86/x86_64/page.h	Tue Nov 04 12:43:19 2008 +0900
    78.3 @@ -119,14 +119,11 @@ typedef l4_pgentry_t root_pgentry_t;
    78.4  #define L3_DISALLOW_MASK (BASE_DISALLOW_MASK)
    78.5  #define L4_DISALLOW_MASK (BASE_DISALLOW_MASK)
    78.6  
    78.7 -#define COMPAT_L3_DISALLOW_MASK 0xFFFFF1FEU
    78.8 +#define COMPAT_L3_DISALLOW_MASK 0xFFFFF198U
    78.9  
   78.10  #define PAGE_HYPERVISOR         (__PAGE_HYPERVISOR         | _PAGE_GLOBAL)
   78.11  #define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL)
   78.12  
   78.13 -#define GRANT_PTE_FLAGS \
   78.14 -    (_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_GNTTAB|_PAGE_USER)
   78.15 -
   78.16  #define USER_MAPPINGS_ARE_GLOBAL
   78.17  #ifdef USER_MAPPINGS_ARE_GLOBAL
   78.18  /*
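The compat L3 mask relaxes from 0xFFFFF1FEU to 0xFFFFF198U by clearing bits 1, 2, 5 and 6, i.e. it additionally permits _PAGE_RW, _PAGE_USER, _PAGE_ACCESSED and _PAGE_DIRTY in compat-mode L3 entries. A sketch deriving the constant from the standard x86 flag positions (redefined locally here purely for illustration):

    /* Standard low PTE flag bits (illustrative local copies). */
    enum {
        PRESENT = 0x001, RW  = 0x002, USER     = 0x004,
        PWT     = 0x008, PCD = 0x010, ACCESSED = 0x020,
        DIRTY   = 0x040, PSE = 0x080, GLOBAL   = 0x100,
        AVAIL   = 0xE00,               /* three software-available bits */
    };
    /* Allowed in a compat L3 entry: P, RW, US, A, D and the AVAIL bits. */
    #define COMPAT_L3_ALLOW (PRESENT|RW|USER|ACCESSED|DIRTY|AVAIL)
    /* ~COMPAT_L3_ALLOW == 0xFFFFF198U, the new mask above. */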
    79.1 --- a/xen/include/asm-x86/xenoprof.h	Tue Nov 04 12:07:22 2008 +0900
    79.2 +++ b/xen/include/asm-x86/xenoprof.h	Tue Nov 04 12:43:19 2008 +0900
    79.3 @@ -64,6 +64,9 @@ void xenoprof_backtrace(
     79.4                   "xenoprof/x86 with autotranslated mode enabled "   \
     79.5                   "isn't supported yet\n");                          \
    79.6      } while (0)
    79.7 +int passive_domain_do_rdmsr(struct cpu_user_regs *regs);
    79.8 +int passive_domain_do_wrmsr(struct cpu_user_regs *regs);
    79.9 +void passive_domain_destroy(struct vcpu *v);
   79.10  
   79.11  #endif /* __ASM_X86_XENOPROF_H__ */
   79.12  
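The three new declarations hook xenoprof into the paths that emulate MSR accesses and tear down vCPUs, so that "passive" (profiled but not profiling) HVM domains get their performance-counter MSRs handled. A hedged sketch of the intended call sites (the surrounding code and return convention are illustrative):

    /* In an MSR-read intercept: let xenoprof claim the access first. */
    if ( passive_domain_do_rdmsr(regs) )
        return X86EMUL_OKAY;    /* illustrative return convention */
    /* ... otherwise fall through to normal MSR emulation. */

    /* And from vCPU destruction: */
    passive_domain_destroy(v);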
    80.1 --- a/xen/include/public/features.h	Tue Nov 04 12:07:22 2008 +0900
    80.2 +++ b/xen/include/public/features.h	Tue Nov 04 12:43:19 2008 +0900
    80.3 @@ -59,6 +59,9 @@
    80.4  /* x86: Does this Xen host support the MMU_PT_UPDATE_PRESERVE_AD hypercall? */
    80.5  #define XENFEAT_mmu_pt_update_preserve_ad  5
    80.6  
    80.7 +/* x86: Does this Xen host support the MMU_{CLEAR,COPY}_PAGE hypercall? */
    80.8 +#define XENFEAT_highmem_assist             6
    80.9 +
   80.10  #define XENFEAT_NR_SUBMAPS 1
   80.11  
   80.12  #endif /* __XEN_PUBLIC_FEATURES_H__ */
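A guest should probe the feature submap before relying on the new MMU_{CLEAR,COPY}_PAGE support. A minimal guest-side sketch, assuming the public XENVER_get_features interface and a HYPERVISOR_xen_version hypercall wrapper:

    #include <xen/version.h>

    static int has_highmem_assist(void)
    {
        struct xen_feature_info fi = { .submap_idx = 0 };

        if ( HYPERVISOR_xen_version(XENVER_get_features, &fi) != 0 )
            return 0;
        return !!(fi.submap & (1u << XENFEAT_highmem_assist));
    }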
    81.1 --- a/xen/include/public/trace.h	Tue Nov 04 12:07:22 2008 +0900
    81.2 +++ b/xen/include/public/trace.h	Tue Nov 04 12:43:19 2008 +0900
    81.3 @@ -142,7 +142,9 @@
    81.4  #define TRC_HVM_INVLPG64        (TRC_HVM_HANDLER + TRC_64_FLAG + 0x14)
    81.5  #define TRC_HVM_MCE             (TRC_HVM_HANDLER + 0x15)
    81.6  #define TRC_HVM_IO_ASSIST       (TRC_HVM_HANDLER + 0x16)
    81.7 +#define TRC_HVM_IO_ASSIST64     (TRC_HVM_HANDLER + TRC_64_FLAG + 0x16)
    81.8  #define TRC_HVM_MMIO_ASSIST     (TRC_HVM_HANDLER + 0x17)
    81.9 +#define TRC_HVM_MMIO_ASSIST64   (TRC_HVM_HANDLER + TRC_64_FLAG + 0x17)
   81.10  #define TRC_HVM_CLTS            (TRC_HVM_HANDLER + 0x18)
   81.11  #define TRC_HVM_LMSW            (TRC_HVM_HANDLER + 0x19)
   81.12  #define TRC_HVM_LMSW64          (TRC_HVM_HANDLER + TRC_64_FLAG + 0x19)
    82.1 --- a/xen/include/public/xen.h	Tue Nov 04 12:07:22 2008 +0900
    82.2 +++ b/xen/include/public/xen.h	Tue Nov 04 12:43:19 2008 +0900
    82.3 @@ -231,6 +231,13 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
    82.4   * cmd: MMUEXT_SET_LDT
    82.5   * linear_addr: Linear address of LDT base (NB. must be page-aligned).
    82.6   * nr_ents: Number of entries in LDT.
    82.7 + *
    82.8 + * cmd: MMUEXT_CLEAR_PAGE
    82.9 + * mfn: Machine frame number to be cleared.
   82.10 + *
   82.11 + * cmd: MMUEXT_COPY_PAGE
   82.12 + * mfn: Machine frame number of the destination page.
   82.13 + * src_mfn: Machine frame number of the source page.
   82.14   */
   82.15  #define MMUEXT_PIN_L1_TABLE      0
   82.16  #define MMUEXT_PIN_L2_TABLE      1
   82.17 @@ -247,12 +254,15 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
   82.18  #define MMUEXT_FLUSH_CACHE      12
   82.19  #define MMUEXT_SET_LDT          13
   82.20  #define MMUEXT_NEW_USER_BASEPTR 15
   82.21 +#define MMUEXT_CLEAR_PAGE       16
   82.22 +#define MMUEXT_COPY_PAGE        17
   82.23  
   82.24  #ifndef __ASSEMBLY__
   82.25  struct mmuext_op {
   82.26      unsigned int cmd;
   82.27      union {
   82.28 -        /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
   82.29 +        /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR
   82.30 +         * CLEAR_PAGE, COPY_PAGE */
   82.31          xen_pfn_t     mfn;
   82.32          /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
   82.33          unsigned long linear_addr;
   82.34 @@ -266,6 +276,8 @@ struct mmuext_op {
   82.35  #else
   82.36          void *vcpumask;
   82.37  #endif
   82.38 +        /* COPY_PAGE */
   82.39 +        xen_pfn_t src_mfn;
   82.40      } arg2;
   82.41  };
   82.42  typedef struct mmuext_op mmuext_op_t;
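The two new commands make page clear/copy a single hypercall, sparing the guest a temporary mapping of (possibly highmem) pages; this is what XENFEAT_highmem_assist advertises. A guest-side usage sketch (assuming the usual HYPERVISOR_mmuext_op wrapper):

    /* Clear one machine page and copy another, in one batch. */
    static int clear_and_copy(xen_pfn_t zero_mfn, xen_pfn_t dst, xen_pfn_t src)
    {
        struct mmuext_op ops[2] = {
            { .cmd = MMUEXT_CLEAR_PAGE, .arg1.mfn = zero_mfn },
            { .cmd = MMUEXT_COPY_PAGE,  .arg1.mfn = dst,
              .arg2.src_mfn = src },
        };

        return HYPERVISOR_mmuext_op(ops, 2, NULL, DOMID_SELF);
    }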
    83.1 --- a/xen/include/xen/cpuidle.h	Tue Nov 04 12:07:22 2008 +0900
    83.2 +++ b/xen/include/xen/cpuidle.h	Tue Nov 04 12:43:19 2008 +0900
    83.3 @@ -30,12 +30,18 @@
    83.4  #define ACPI_PROCESSOR_MAX_POWER        8
    83.5  #define CPUIDLE_NAME_LEN                16
    83.6  
    83.7 +#define ACPI_CSTATE_EM_NONE     0
    83.8 +#define ACPI_CSTATE_EM_SYSIO    1
    83.9 +#define ACPI_CSTATE_EM_FFH      2
   83.10 +#define ACPI_CSTATE_EM_HALT     3
   83.11 +
   83.12  struct acpi_processor_cx
   83.13  {
   83.14 +    u8 idx;
   83.15      u8 valid;
   83.16      u8 type;
   83.17      u32 address;
   83.18 -    u8 space_id;
   83.19 +    u8 entry_method; /* ACPI_CSTATE_EM_xxx */
   83.20      u32 latency;
   83.21      u32 latency_ticks;
   83.22      u32 power;
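Pre-classifying the entry method at table-parse time, instead of keeping the raw ACPI space_id, lets the idle loop dispatch without re-deriving the method on every C-state entry. A sketch of such a dispatch (the FFH helper name is illustrative):

    static void acpi_idle_do_entry(struct acpi_processor_cx *cx)
    {
        switch ( cx->entry_method )
        {
        case ACPI_CSTATE_EM_SYSIO:
            inb(cx->address);     /* the port read triggers the C-state */
            break;
        case ACPI_CSTATE_EM_FFH:
            mwait_idle_with_hints(cx->address, 0);   /* illustrative */
            break;
        case ACPI_CSTATE_EM_HALT:
        default:
            safe_halt();
            break;
        }
    }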
    84.1 --- a/xen/include/xen/domain_page.h	Tue Nov 04 12:07:22 2008 +0900
    84.2 +++ b/xen/include/xen/domain_page.h	Tue Nov 04 12:43:19 2008 +0900
    84.3 @@ -24,7 +24,7 @@ void *map_domain_page(unsigned long mfn)
    84.4   * Pass a VA within a page previously mapped in the context of the
    84.5   * currently-executing VCPU via a call to map_domain_page().
    84.6   */
    84.7 -void unmap_domain_page(void *va);
    84.8 +void unmap_domain_page(const void *va);
    84.9  
   84.10  /*
   84.11   * Similar to the above calls, except the mapping is accessible in all
   84.12 @@ -32,7 +32,7 @@ void unmap_domain_page(void *va);
   84.13   * mappings can also be unmapped from any context.
   84.14   */
   84.15  void *map_domain_page_global(unsigned long mfn);
   84.16 -void unmap_domain_page_global(void *va);
   84.17 +void unmap_domain_page_global(const void *va);
   84.18  
   84.19  #define DMCACHE_ENTRY_VALID 1U
   84.20  #define DMCACHE_ENTRY_HELD  2U
   84.21 @@ -75,7 +75,7 @@ map_domain_page_with_cache(unsigned long
   84.22  }
   84.23  
   84.24  static inline void
   84.25 -unmap_domain_page_with_cache(void *va, struct domain_mmap_cache *cache)
   84.26 +unmap_domain_page_with_cache(const void *va, struct domain_mmap_cache *cache)
   84.27  {
   84.28      ASSERT(cache != NULL);
   84.29      cache->flags &= ~DMCACHE_ENTRY_HELD;
    85.1 --- a/xen/include/xen/spinlock.h	Tue Nov 04 12:07:22 2008 +0900
    85.2 +++ b/xen/include/xen/spinlock.h	Tue Nov 04 12:43:19 2008 +0900
    85.3 @@ -5,21 +5,38 @@
    85.4  #include <asm/system.h>
    85.5  #include <asm/spinlock.h>
    85.6  
    85.7 +#ifndef NDEBUG
    85.8 +struct lock_debug {
    85.9 +    int irq_safe; /* +1: IRQ-safe; 0: not IRQ-safe; -1: don't know yet */
   85.10 +};
   85.11 +#define _LOCK_DEBUG { -1 }
   85.12 +void spin_debug_enable(void);
   85.13 +void spin_debug_disable(void);
   85.14 +#else
   85.15 +struct lock_debug { };
   85.16 +#define _LOCK_DEBUG { }
   85.17 +#define spin_debug_enable() ((void)0)
   85.18 +#define spin_debug_disable() ((void)0)
   85.19 +#endif
   85.20 +
   85.21  typedef struct {
   85.22      raw_spinlock_t raw;
   85.23      u16 recurse_cpu:12;
   85.24      u16 recurse_cnt:4;
   85.25 +    struct lock_debug debug;
   85.26  } spinlock_t;
   85.27  
   85.28 -#define SPIN_LOCK_UNLOCKED { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0 }
   85.29 +
   85.30 +#define SPIN_LOCK_UNLOCKED { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0, _LOCK_DEBUG }
   85.31  #define DEFINE_SPINLOCK(l) spinlock_t l = SPIN_LOCK_UNLOCKED
   85.32  #define spin_lock_init(l) (*(l) = (spinlock_t)SPIN_LOCK_UNLOCKED)
   85.33  
   85.34  typedef struct {
   85.35      raw_rwlock_t raw;
   85.36 +    struct lock_debug debug;
   85.37  } rwlock_t;
   85.38  
   85.39 -#define RW_LOCK_UNLOCKED { _RAW_RW_LOCK_UNLOCKED }
   85.40 +#define RW_LOCK_UNLOCKED { _RAW_RW_LOCK_UNLOCKED, _LOCK_DEBUG }
   85.41  #define DEFINE_RWLOCK(l) rwlock_t l = RW_LOCK_UNLOCKED
   85.42  #define rwlock_init(l) (*(l) = (rwlock_t)RW_LOCK_UNLOCKED)
   85.43  
   85.44 @@ -34,6 +51,7 @@ void _spin_unlock_irqrestore(spinlock_t 
   85.45  int _spin_is_locked(spinlock_t *lock);
   85.46  int _spin_trylock(spinlock_t *lock);
   85.47  void _spin_barrier(spinlock_t *lock);
   85.48 +void _spin_barrier_irq(spinlock_t *lock);
   85.49  
   85.50  void _spin_lock_recursive(spinlock_t *lock);
   85.51  void _spin_unlock_recursive(spinlock_t *lock);
   85.52 @@ -67,6 +85,7 @@ void _write_unlock_irqrestore(rwlock_t *
   85.53  
   85.54  /* Ensure a lock is quiescent between two critical operations. */
   85.55  #define spin_barrier(l)               _spin_barrier(l)
   85.56 +#define spin_barrier_irq(l)           _spin_barrier_irq(l)
   85.57  
   85.58  /*
   85.59   * spin_[un]lock_recursive(): Use these forms when the lock can (safely!) be
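The new debug field latches, on first acquisition, whether the lock is taken with interrupts disabled; any later acquisition in the opposite mode is a bug (a potential deadlock against the lock's own IRQ context). A sketch of the check a debug build can run on every acquire (name and exact policy are illustrative; the real check lives in common/spinlock.c):

    static void check_lock(struct lock_debug *debug)
    {
        int irq_safe = !local_irq_is_enabled();

        if ( unlikely(debug->irq_safe < 0) )
            debug->irq_safe = irq_safe;           /* first use: latch mode */
        else
            ASSERT(debug->irq_safe == irq_safe);  /* mode must not change */
    }

spin_debug_enable()/spin_debug_disable() presumably exist so that boot phases where IRQ state is not yet meaningful can run without tripping the assertion.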
    86.1 --- a/xen/include/xen/time.h	Tue Nov 04 12:07:22 2008 +0900
    86.2 +++ b/xen/include/xen/time.h	Tue Nov 04 12:43:19 2008 +0900
    86.3 @@ -52,6 +52,7 @@ struct tm gmtime(unsigned long t);
    86.4  #define SECONDS(_s)     ((s_time_t)((_s)  * 1000000000ULL))
    86.5  #define MILLISECS(_ms)  ((s_time_t)((_ms) * 1000000ULL))
    86.6  #define MICROSECS(_us)  ((s_time_t)((_us) * 1000ULL))
    86.7 +#define STIME_MAX ((s_time_t)((uint64_t)~0ull>>1))
    86.8  
    86.9  extern void update_vcpu_system_time(struct vcpu *v);
   86.10  extern void update_domain_wallclock_time(struct domain *d);
    87.1 --- a/xen/include/xen/timer.h	Tue Nov 04 12:07:22 2008 +0900
    87.2 +++ b/xen/include/xen/timer.h	Tue Nov 04 12:43:19 2008 +0900
    87.3 @@ -15,12 +15,13 @@
    87.4  struct timer {
    87.5      /* System time expiry value (nanoseconds since boot). */
    87.6      s_time_t expires;
    87.7 +    s_time_t expires_end;
    87.8  
    87.9      /* Position in active-timer data structure. */
   87.10      union {
   87.11          /* Timer-heap offset. */
   87.12          unsigned int heap_offset;
   87.13 -        /* Overflow linked list. */
   87.14 +        /* Linked list. */
   87.15          struct timer *list_next;
   87.16      };
   87.17  
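The new expires_end gives each timer an [expires, expires_end] window rather than a point deadline, which appears intended to let nearby expiries be coalesced into a single hardware interrupt; the former overflow list accordingly becomes a general-purpose linked list. An illustrative check only, not in-tree code:

    /* A timer can share an already-programmed deadline if that deadline
     * falls inside the timer's permissible window. */
    static int can_coalesce(const struct timer *t, s_time_t deadline)
    {
        return (t->expires <= deadline) && (deadline <= t->expires_end);
    }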
    89.1 --- a/xen/include/xlat.lst	Tue Nov 04 12:07:22 2008 +0900
    89.2 +++ b/xen/include/xlat.lst	Tue Nov 04 12:43:19 2008 +0900
    89.3 @@ -56,6 +56,6 @@
    89.4  !	processor_flags			platform.h
    89.5  !	processor_power			platform.h
    89.6  !	pct_register			platform.h
    89.7 -!	processor_px			platform.h
    89.8 +?	processor_px			platform.h
    89.9  !	psd_package			platform.h
   89.10  !	processor_performance		platform.h
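For context on this last hunk: in xlat.lst the leading marker selects what the compat header generator emits for each structure. '!' requests full XLAT_* translation helpers (native and compat layouts differ), while '?' requests only a CHECK_* compile-time layout assertion. Switching processor_px to '?' therefore declares it layout-identical across ABIs; call sites assert that roughly like so (illustrative placement):

    CHECK_processor_px;   /* expands to compile-time size/offset checks */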