ia64/xen-unstable

changeset 18758:e75cb35c798b

merge with xen-unstable.hg
author Isaku Yamahata <yamahata@valinux.co.jp>
date Tue Nov 04 12:43:19 2008 +0900 (2008-11-04)
parents 10f0e1bb8e5e 43a079fd50fd
children 57c94bdbd6b3
files xen/arch/ia64/xen/cpufreq/cpufreq.c xen/arch/ia64/xen/irq.c xen/common/xmalloc.c
line diff
     1.1 --- a/.hgignore	Tue Nov 04 12:07:22 2008 +0900
     1.2 +++ b/.hgignore	Tue Nov 04 12:43:19 2008 +0900
     1.3 @@ -211,6 +211,7 @@
     1.4  ^tools/xenfb/vncfb$
     1.5  ^tools/xenmon/xentrace_setmask$
     1.6  ^tools/xenmon/xenbaked$
     1.7 +^tools/xenpmd/xenpmd$
     1.8  ^tools/xenstat/xentop/xentop$
     1.9  ^tools/xenstore/testsuite/tmp/.*$
    1.10  ^tools/xenstore/xen$
     2.1 --- a/extras/mini-os/include/sched.h	Tue Nov 04 12:07:22 2008 +0900
     2.2 +++ b/extras/mini-os/include/sched.h	Tue Nov 04 12:43:19 2008 +0900
     2.3 @@ -48,8 +48,9 @@ struct thread* create_thread(char *name,
     2.4  void exit_thread(void) __attribute__((noreturn));
     2.5  void schedule(void);
     2.6  
     2.7 +#ifdef __INSIDE_MINIOS__
     2.8  #define current get_current()
     2.9 -
    2.10 +#endif
    2.11  
    2.12  void wake(struct thread *thread);
    2.13  void block(struct thread *thread);
     3.1 --- a/extras/mini-os/include/wait.h	Tue Nov 04 12:07:22 2008 +0900
     3.2 +++ b/extras/mini-os/include/wait.h	Tue Nov 04 12:43:19 2008 +0900
     3.3 @@ -7,7 +7,7 @@
     3.4  
     3.5  #define DEFINE_WAIT(name)                               \
     3.6  struct wait_queue name = {                              \
     3.7 -    .thread       = current,                            \
     3.8 +    .thread       = get_current(),                            \
     3.9      .thread_list  = MINIOS_LIST_HEAD_INIT((name).thread_list), \
    3.10  }
    3.11  
    3.12 @@ -53,7 +53,7 @@ static inline void wake_up(struct wait_q
    3.13      unsigned long flags;        \
    3.14      local_irq_save(flags);      \
    3.15      add_wait_queue(&wq, &w);    \
    3.16 -    block(current);             \
    3.17 +    block(get_current());       \
    3.18      local_irq_restore(flags);   \
    3.19  } while (0)
    3.20  
    3.21 @@ -74,8 +74,8 @@ static inline void wake_up(struct wait_q
    3.22          /* protect the list */                                  \
    3.23          local_irq_save(flags);                                  \
    3.24          add_wait_queue(&wq, &__wait);                           \
    3.25 -        current->wakeup_time = deadline;                        \
    3.26 -        clear_runnable(current);                                \
    3.27 +        get_current()->wakeup_time = deadline;                  \
    3.28 +        clear_runnable(get_current());                          \
    3.29          local_irq_restore(flags);                               \
    3.30          if((condition) || (deadline && NOW() >= deadline))      \
    3.31              break;                                              \
    3.32 @@ -83,7 +83,7 @@ static inline void wake_up(struct wait_q
    3.33      }                                                           \
    3.34      local_irq_save(flags);                                      \
    3.35      /* need to wake up */                                       \
    3.36 -    wake(current);                                              \
    3.37 +    wake(get_current());                                        \
    3.38      remove_wait_queue(&__wait);                                 \
    3.39      local_irq_restore(flags);                                   \
    3.40  } while(0) 
     4.1 --- a/extras/mini-os/minios.mk	Tue Nov 04 12:07:22 2008 +0900
     4.2 +++ b/extras/mini-os/minios.mk	Tue Nov 04 12:43:19 2008 +0900
     4.3 @@ -26,6 +26,9 @@ else
     4.4  DEF_CFLAGS += -O3
     4.5  endif
     4.6  
     4.7 +# Make the headers define our internal stuff
     4.8 +DEF_CFLAGS += -D__INSIDE_MINIOS__
     4.9 +
    4.10  # Build the CFLAGS and ASFLAGS for compiling and assembling.
    4.11  # DEF_... flags are the common mini-os flags,
     4.12  # ARCH_... flags may be defined in arch/$(TARGET_ARCH_FAM)/rules.mk
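
Taken together, the mini-os hunks above (sched.h, wait.h, minios.mk) hide the `current` macro from external applications: sched.h now defines it only when __INSIDE_MINIOS__ is set, wait.h calls get_current() directly so its macros still expand correctly in application code, and minios.mk defines the guard for mini-os' own objects. A minimal C sketch of the resulting header contract follows; the struct fields and function bodies are illustrative, not mini-os code:

    /* sched.h, as patched: `current` is visible to mini-os only. */
    struct thread { unsigned long wakeup_time; };   /* illustrative fields */
    struct thread *get_current(void);
    #ifdef __INSIDE_MINIOS__
    #define current get_current()
    #endif

    #ifdef __INSIDE_MINIOS__
    /* Compiled with -D__INSIDE_MINIOS__ (mini-os itself): macro available. */
    void internal_code(void)
    {
        current->wakeup_time = 0;   /* expands to get_current()->wakeup_time */
    }
    #endif

    /* Compiled without the guard (an application built against the headers):
     * `current` is an ordinary identifier again and cannot clash with
     * application names; thread access goes through the function. */
    void application_code(void)
    {
        struct thread *self = get_current();
        self->wakeup_time = 0;
    }

This is also why the wait.h macros were converted: they expand inside application translation units, where the `current` shorthand is no longer defined.
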
     5.1 --- a/tools/Makefile	Tue Nov 04 12:07:22 2008 +0900
     5.2 +++ b/tools/Makefile	Tue Nov 04 12:43:19 2008 +0900
     5.3 @@ -24,6 +24,7 @@ SUBDIRS-y += libfsimage
     5.4  SUBDIRS-$(LIBXENAPI_BINDINGS) += libxen
     5.5  SUBDIRS-y += fs-back
     5.6  SUBDIRS-$(CONFIG_IOEMU) += ioemu-dir
     5.7 +SUBDIRS-y += xenpmd
     5.8  
     5.9  # These don't cross-compile
    5.10  ifeq ($(XEN_COMPILE_ARCH),$(XEN_TARGET_ARCH))
     6.1 --- a/tools/blktap/drivers/block-qcow.c	Tue Nov 04 12:07:22 2008 +0900
     6.2 +++ b/tools/blktap/drivers/block-qcow.c	Tue Nov 04 12:43:19 2008 +0900
     6.3 @@ -722,11 +722,11 @@ static inline void init_fds(struct disk_
     6.4  /* Open the disk file and initialize qcow state. */
     6.5  static int tdqcow_open (struct disk_driver *dd, const char *name, td_flag_t flags)
     6.6  {
     6.7 -	int fd, len, i, shift, ret, size, l1_table_size, o_flags;
     6.8 +	int fd, len, i, shift, ret, size, l1_table_size, o_flags, l1_table_block;
     6.9  	int max_aio_reqs;
    6.10  	struct td_state     *bs = dd->td_state;
    6.11  	struct tdqcow_state *s  = (struct tdqcow_state *)dd->private;
    6.12 -	char *buf;
    6.13 +	char *buf, *buf2;
    6.14  	QCowHeader *header;
    6.15  	QCowHeader_ext *exthdr;
    6.16  	uint32_t cksum;
    6.17 @@ -734,8 +734,8 @@ static int tdqcow_open (struct disk_driv
    6.18  
    6.19   	DPRINTF("QCOW: Opening %s\n",name);
    6.20  
    6.21 -	/* Since we don't handle O_DIRECT correctly, don't use it */
    6.22 -	o_flags = O_LARGEFILE | ((flags == TD_RDONLY) ? O_RDONLY : O_RDWR);
    6.23 +	o_flags = O_DIRECT | O_LARGEFILE | 
    6.24 +		((flags == TD_RDONLY) ? O_RDONLY : O_RDWR);
    6.25  	fd = open(name, o_flags);
    6.26  	if (fd < 0) {
    6.27  		DPRINTF("Unable to open %s (%d)\n",name,0 - errno);
    6.28 @@ -819,9 +819,14 @@ static int tdqcow_open (struct disk_driv
    6.29  		(int) (s->l1_size * sizeof(uint64_t)), 
    6.30  		l1_table_size);
    6.31  
    6.32 -	lseek(fd, s->l1_table_offset, SEEK_SET);
    6.33 -	if (read(fd, s->l1_table, l1_table_size) != l1_table_size)
    6.34 +	lseek(fd, 0, SEEK_SET);
    6.35 +	l1_table_block = l1_table_size + s->l1_table_offset;
    6.36 +	l1_table_block = l1_table_block + 512 - (l1_table_block % 512); 
    6.37 +	ret = posix_memalign((void **)&buf2, 4096, l1_table_block);
    6.38 +	if (ret != 0) goto fail;
    6.39 +	if (read(fd, buf2, l1_table_block) != l1_table_block)
    6.40  		goto fail;
    6.41 +	memcpy(s->l1_table, buf2 + s->l1_table_offset, l1_table_size);
    6.42  
    6.43  	for(i = 0; i < s->l1_size; i++) {
    6.44  		be64_to_cpus(&s->l1_table[i]);
    6.45 @@ -871,8 +876,9 @@ static int tdqcow_open (struct disk_driv
    6.46  
    6.47  			DPRINTF("qcow: Converting image to big endian L1 table\n");
    6.48  
    6.49 -			lseek(fd, s->l1_table_offset, SEEK_SET);
    6.50 -			if (write(fd, s->l1_table, l1_table_size) != l1_table_size) {
    6.51 +			memcpy(buf2 + s->l1_table_offset, s->l1_table, l1_table_size);
    6.52 +			lseek(fd, 0, SEEK_SET);
    6.53 +			if (write(fd, buf2, l1_table_block) != l1_table_block) {
    6.54  				DPRINTF("qcow: Failed to write new L1 table\n");
    6.55  				goto fail;
    6.56  			}
    6.57 @@ -917,7 +923,7 @@ static int tdqcow_open (struct disk_driv
    6.58  	init_fds(dd);
    6.59  
    6.60  	if (!final_cluster)
    6.61 -		s->fd_end = s->l1_table_offset + l1_table_size;
    6.62 +		s->fd_end = l1_table_block;
    6.63  	else {
    6.64  		s->fd_end = lseek(fd, 0, SEEK_END);
    6.65  		if (s->fd_end == (off_t)-1)
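
The block-qcow.c change re-enables O_DIRECT, which requires every transfer to use a sector-aligned file offset, length, and buffer. That is why the L1 table is no longer read at s->l1_table_offset directly: the patch reads from offset 0 through the end of the table into a posix_memalign()ed bounce buffer (buf2), rounds the length up to a 512-byte boundary, and memcpy()s the table out; the big-endian conversion write goes back through the same buffer. A self-contained sketch of the bounce-buffer pattern, with illustrative names that are not the driver's:

    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    /* Read [offset, offset+len) from an fd opened with O_DIRECT by widening
     * the transfer to aligned bounds, as tdqcow_open() does for the L1
     * table. */
    static int aligned_read(int fd, void *dst, size_t offset, size_t len)
    {
        size_t end = offset + len;
        size_t span = end + 512 - (end % 512);  /* round up to sector size */
        char *bounce;

        if (posix_memalign((void **)&bounce, 4096, span) != 0)
            return -1;
        /* Offset 0, a 512-multiple length and a 4096-aligned buffer all
         * satisfy the O_DIRECT alignment rules. */
        if (pread(fd, bounce, span, 0) < (ssize_t)end) {
            free(bounce);
            return -1;
        }
        memcpy(dst, bounce + offset, len);      /* extract the payload */
        free(bounce);
        return 0;
    }

Reading from offset 0 keeps the sketch (and the patch) simple at the cost of re-reading the file prefix. Note that the rounding above, like the patch's, adds a full sector even when the end is already aligned; that only over-reads, which is harmless here.
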
     7.1 --- a/tools/firmware/hvmloader/acpi/static_tables.c	Tue Nov 04 12:07:22 2008 +0900
     7.2 +++ b/tools/firmware/hvmloader/acpi/static_tables.c	Tue Nov 04 12:43:19 2008 +0900
     7.3 @@ -67,7 +67,7 @@ struct acpi_20_fadt Fadt = {
     7.4  
     7.5      .p_lvl2_lat = 0x0fff, /* >100,  means we do not support C2 state */
     7.6      .p_lvl3_lat = 0x0fff, /* >1000, means we do not support C3 state */
     7.7 -    .iapc_boot_arch = ACPI_LEGACY_DEVICES | ACPI_8042,
     7.8 +    .iapc_boot_arch = ACPI_8042,
     7.9      .flags = (ACPI_PROC_C1 | ACPI_SLP_BUTTON |
    7.10                ACPI_WBINVD | ACPI_PWR_BUTTON |
    7.11                ACPI_FIX_RTC | ACPI_TMR_VAL_EXT),
     8.1 --- a/tools/firmware/rombios/rombios.c	Tue Nov 04 12:07:22 2008 +0900
     8.2 +++ b/tools/firmware/rombios/rombios.c	Tue Nov 04 12:43:19 2008 +0900
     8.3 @@ -7216,7 +7216,7 @@ BX_INFO("floppy: drive>1 || head>1 ...\n
     8.4          outb(0x03f5, head);
     8.5          outb(0x03f5, sector);
     8.6          outb(0x03f5, 2); // 512 byte sector size
     8.7 -        outb(0x03f5, 0); // last sector number possible on track
     8.8 +        outb(0x03f5, sector + num_sectors - 1); // last sector to read on track
     8.9          outb(0x03f5, 0); // Gap length
    8.10          outb(0x03f5, 0xff); // Gap length
    8.11  
    8.12 @@ -7364,7 +7364,7 @@ BX_INFO("floppy: drive>1 || head>1 ...\n
    8.13          outb(0x03f5, head);
    8.14          outb(0x03f5, sector);
    8.15          outb(0x03f5, 2); // 512 byte sector size
    8.16 -        outb(0x03f5, 0); // last sector number possible on track
    8.17 +        outb(0x03f5, sector + num_sectors - 1); // last sector to write on track
    8.18          outb(0x03f5, 0); // Gap length
    8.19          outb(0x03f5, 0xff); // Gap length
    8.20  
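
In the FDC READ/WRITE DATA commands programmed above, the parameter byte after the sector-size code is EOT, the last sector number to transfer on the track: the controller keeps transferring consecutive sectors until it passes EOT or the DMA terminal count fires. Hardcoding 0 there made multi-sector transfers depend solely on the DMA count; the fix sets EOT to the actual final sector of the request. A sketch of the full nine-byte command sequence for a read, assuming the 0xe6 command byte and outb() helper that rombios.c uses elsewhere (the wrapper itself is illustrative):

    void outb(unsigned short port, unsigned char val);  /* as in rombios.c */

    /* Illustrative layout of the FDC "read data" command issued through
     * the data FIFO at port 0x3f5. */
    static void fdc_read_cmd(unsigned char drive, unsigned char head,
                             unsigned char track, unsigned char sector,
                             unsigned char num_sectors)
    {
        outb(0x03f5, 0xe6);                     /* MT|MFM|SK, read data    */
        outb(0x03f5, (head << 2) | drive);      /* HDS and drive select    */
        outb(0x03f5, track);                    /* C: cylinder             */
        outb(0x03f5, head);                     /* H: head                 */
        outb(0x03f5, sector);                   /* R: first sector         */
        outb(0x03f5, 2);                        /* N: 512-byte sectors     */
        outb(0x03f5, sector + num_sectors - 1); /* EOT: last sector (fix)  */
        outb(0x03f5, 0);                        /* GPL: gap length         */
        outb(0x03f5, 0xff);                     /* DTL: unused when N != 0 */
    }
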
     9.1 --- a/tools/flask/policy/policy/modules/xen/xen.te	Tue Nov 04 12:07:22 2008 +0900
     9.2 +++ b/tools/flask/policy/policy/modules/xen/xen.te	Tue Nov 04 12:43:19 2008 +0900
     9.3 @@ -74,7 +74,7 @@ allow dom0_t iomem_t:mmu {map_read map_w
     9.4  allow dom0_t pirq_t:event {vector};
     9.5  allow dom0_t xen_t:mmu {memorymap};
     9.6  
     9.7 -allow dom0_t dom0_t:mmu {pinpage map_read map_write adjust};
     9.8 +allow dom0_t dom0_t:mmu {pinpage map_read map_write adjust updatemp};
     9.9  allow dom0_t dom0_t:grant {query setup};
    9.10  allow dom0_t dom0_t:domain {scheduler getdomaininfo getvcpuinfo getvcpuaffinity};
    9.11  
    9.12 @@ -112,6 +112,7 @@ allow domU_t evchnU-0_t:event {send};
    9.13  
    9.14  allow dom0_t dom0_t:event {send};
    9.15  allow dom0_t domU_t:grant {copy};
    9.16 +allow domU_t domU_t:grant {copy};
    9.17  
    9.18  manage_domain(dom0_t, domU_t)
    9.19  
    10.1 --- a/tools/python/xen/util/diagnose.py	Tue Nov 04 12:07:22 2008 +0900
    10.2 +++ b/tools/python/xen/util/diagnose.py	Tue Nov 04 12:43:19 2008 +0900
    10.3 @@ -23,7 +23,7 @@ from xen.xend import sxp
    10.4  from xen.xend.XendClient import server
    10.5  from xen.xend.XendError import XendError
    10.6  from xen.xend.xenstore.xstransact import xstransact
    10.7 -from xen.xend.server import DevController
    10.8 +from xen.xend.server import DevConstants
    10.9  
   10.10  import xen.xend.XendProtocol
   10.11  
   10.12 @@ -169,7 +169,7 @@ def diagnose_hotplugging():
   10.13  
   10.14  
   10.15  def stateString(state):
   10.16 -    return state and DevController.xenbusState[int(state)] or '<None>'
   10.17 +    return state and DevConstants.xenbusState[int(state)] or '<None>'
   10.18  
   10.19  
   10.20  def main(argv = None):
    11.1 --- a/tools/python/xen/xend/XendConfig.py	Tue Nov 04 12:07:22 2008 +0900
    11.2 +++ b/tools/python/xen/xend/XendConfig.py	Tue Nov 04 12:43:19 2008 +0900
    11.3 @@ -1602,21 +1602,21 @@ class XendConfig(dict):
    11.4          #   [vscsi,
    11.5          #     [dev,
    11.6          #       [devid, 0], [p-devname, sdb], [p-dev, 1:0:0:1],
    11.7 -        #       [v-dev, 0:0:0:0], [state, Initialising]
    11.8 +        #       [v-dev, 0:0:0:0], [state, 1]
    11.9          #     ],
   11.10          #     [dev,
   11.11          #       [devid, 0], [p-devname, sdc], [p-dev, 1:0:0:2],
   11.12 -        #       [v-dev, 0:0:0:1], [satet, Initialising]
   11.13 +        #       [v-dev, 0:0:0:1], [state, 1]
   11.14          #     ]
   11.15          #   ],
   11.16          #   [vscsi,
   11.17          #     [dev,
   11.18          #       [devid, 1], [p-devname, sdg], [p-dev, 2:0:0:0],
   11.19 -        #       [v-dev, 1:0:0:0], [state, Initialising]
   11.20 +        #       [v-dev, 1:0:0:0], [state, 1]
   11.21          #     ],
   11.22          #     [dev,
   11.23          #       [devid, 1], [p-devname, sdh], [p-dev, 2:0:0:1],
   11.24 -        #       [v-dev, 1:0:0:1], [satet, Initialising]
   11.25 +        #       [v-dev, 1:0:0:1], [state, 1]
   11.26          #     ]
   11.27          #   ]
   11.28          # ]
   11.29 @@ -1632,18 +1632,19 @@ class XendConfig(dict):
   11.30          #   [vscsi,
   11.31          #     [dev,
   11.32          #       [devid, 0], [p-devname, sdd], [p-dev, 1:0:0:3],
   11.33 -        #       [v-dev, 0:0:0:2], [state, Initialising]
   11.34 +        #       [v-dev, 0:0:0:2], [state, 1]
   11.35          #     ]
   11.36          #   ]
   11.37          # ]
   11.38          #
   11.39 -        # state 'Initialising' indicates that the device is being attached,
   11.40 -        # while state 'Closing' indicates that the device is being detached.
   11.41 +        # state xenbusState['Initialising'] indicates that the device is 
   11.42 +        # being attached, while state xenbusState['Closing'] indicates 
   11.43 +        # that the device is being detached.
   11.44          #
   11.45          # The Dict looks like this:
   11.46          #
   11.47          # { devs: [ {devid: 0, p-devname: sdd, p-dev: 1:0:0:3,
   11.48 -        #            v-dev: 0:0:0:2, state: Initialising} ] }
   11.49 +        #            v-dev: 0:0:0:2, state: 1} ] }
   11.50  
   11.51          dev_config = {}
   11.52  
    12.1 --- a/tools/python/xen/xend/XendDomainInfo.py	Tue Nov 04 12:07:22 2008 +0900
    12.2 +++ b/tools/python/xen/xend/XendDomainInfo.py	Tue Nov 04 12:43:19 2008 +0900
    12.3 @@ -52,6 +52,7 @@ from xen.xend.xenstore.xsutil import Get
    12.4  from xen.xend.xenstore.xswatch import xswatch
    12.5  from xen.xend.XendConstants import *
    12.6  from xen.xend.XendAPIConstants import *
    12.7 +from xen.xend.server.DevConstants import xenbusState
    12.8  
    12.9  from xen.xend.XendVMMetrics import XendVMMetrics
   12.10  
   12.11 @@ -797,7 +798,7 @@ class XendDomainInfo:
   12.12          existing_dev_info = self._getDeviceInfo_vscsi(req_devid, dev['v-dev'])
   12.13          state = dev['state']
   12.14  
   12.15 -        if state == 'Initialising':
   12.16 +        if state == xenbusState['Initialising']:
   12.17              # new create
   12.18              # If request devid does not exist, create and exit.
   12.19              if existing_dev_info is None:
   12.20 @@ -806,25 +807,48 @@ class XendDomainInfo:
   12.21              elif existing_dev_info == "exists":
   12.22                  raise XendError("The virtual device %s is already defined" % dev['v-dev'])
   12.23  
   12.24 -        elif state == 'Closing':
   12.25 +        elif state == xenbusState['Closing']:
   12.26              if existing_dev_info is None:
   12.27 +                raise XendError("Cannot detach vscsi device: device does not exist")
   12.28  
   12.29 -        # use DevController.reconfigureDevice to change device config
   12.30 -        dev_control = self.getDeviceController(dev_class)
   12.31 -        dev_uuid = dev_control.reconfigureDevice(req_devid, dev_config)
   12.32 -        dev_control.waitForDevice_reconfigure(req_devid)
   12.33 -        num_devs = dev_control.cleanupDevice(req_devid)
   12.34 -
   12.35 -        # update XendConfig with new device info
   12.36 -        if dev_uuid:
   12.37 -            new_dev_sxp = dev_control.configuration(req_devid)
   12.38 +        if self.domid is not None:
   12.39 +            # use DevController.reconfigureDevice to change device config
   12.40 +            dev_control = self.getDeviceController(dev_class)
   12.41 +            dev_uuid = dev_control.reconfigureDevice(req_devid, dev_config)
   12.42 +            dev_control.waitForDevice_reconfigure(req_devid)
   12.43 +            num_devs = dev_control.cleanupDevice(req_devid)
   12.44 +
   12.45 +            # update XendConfig with new device info
   12.46 +            if dev_uuid:
   12.47 +                new_dev_sxp = dev_control.configuration(req_devid)
   12.48 +                self.info.device_update(dev_uuid, new_dev_sxp)
   12.49 +
   12.50 +            # If there is no device left, destroy vscsi and remove config.
   12.51 +            if num_devs == 0:
   12.52 +                self.destroyDevice('vscsi', req_devid)
   12.53 +                del self.info['devices'][dev_uuid]
   12.54 +
   12.55 +        else:
   12.56 +            cur_dev_sxp = self._getDeviceInfo_vscsi(req_devid, None)
   12.57 +            new_dev_sxp = ['vscsi']
   12.58 +            for cur_dev in sxp.children(cur_dev_sxp, 'dev'):
   12.59 +                if state == xenbusState['Closing']:
   12.60 +                    cur_dev_vdev = sxp.child_value(cur_dev, 'v-dev')
   12.61 +                    if cur_dev_vdev == dev['v-dev']:
   12.62 +                        continue
   12.63 +                new_dev_sxp.append(cur_dev)
   12.64 +
   12.65 +            if state == xenbusState['Initialising']:
   12.66 +                new_dev_sxp.append(sxp.child0(dev_sxp, 'dev'))
   12.67 +
   12.68 +            dev_uuid = sxp.child_value(cur_dev_sxp, 'uuid')
   12.69              self.info.device_update(dev_uuid, new_dev_sxp)
   12.70  
   12.71 -        # If there is no device left, destroy vscsi and remove config.
   12.72 -        if num_devs == 0:
   12.73 -            self.destroyDevice('vscsi', req_devid)
   12.74 -            del self.info['devices'][dev_uuid]
   12.75 +            # If there is only 'vscsi' in new_dev_sxp, remove the config.
   12.76 +            if len(sxp.children(new_dev_sxp, 'dev')) == 0:
   12.77 +                del self.info['devices'][dev_uuid]
   12.78 +
   12.79 +        xen.xend.XendDomain.instance().managed_config_save(self)
   12.80  
   12.81          return True
   12.82  
   12.83 @@ -986,7 +1010,17 @@ class XendDomainInfo:
   12.84              sxprs = []
   12.85              dev_num = 0
   12.86              for dev_type, dev_info in self.info.all_devices_sxpr():
   12.87 -                if dev_type == deviceClass:
   12.88 +                if dev_type != deviceClass:
   12.89 +                    continue
   12.90 +
   12.91 +                if deviceClass == 'vscsi':
   12.92 +                    vscsi_devs = ['devs', []]
   12.93 +                    for vscsi_dev in sxp.children(dev_info, 'dev'):
   12.94 +                        vscsi_dev.append(['frontstate', None])
   12.95 +                        vscsi_devs[1].append(vscsi_dev)
   12.96 +                        dev_num = int(sxp.child_value(vscsi_dev, 'devid'))
   12.97 +                    sxprs.append([dev_num, [vscsi_devs]])
   12.98 +                else:
   12.99                      sxprs.append([dev_num, dev_info])
  12.100                      dev_num += 1
  12.101              return sxprs
  12.102 @@ -2380,11 +2414,10 @@ class XendDomainInfo:
  12.103              time.sleep(2)
  12.104          for paths in plist:
  12.105              if paths.find('backend') != -1:
  12.106 -                from xen.xend.server import DevController
  12.107                  # Modify online status /before/ updating state (latter is watched by
  12.108                  # drivers, so this ordering avoids a race).
  12.109                  xstransact.Write(paths, 'online', "0")
  12.110 -                xstransact.Write(paths, 'state', str(DevController.xenbusState['Closing']))
  12.111 +                xstransact.Write(paths, 'state', str(xenbusState['Closing']))
  12.112              # force
  12.113              xstransact.Remove(paths)
  12.114  
  12.115 @@ -3439,7 +3472,7 @@ class XendDomainInfo:
  12.116                      ['p-devname', pscsi.get_dev_name()],
  12.117                      ['p-dev', pscsi.get_physical_HCTL()],
  12.118                      ['v-dev', xenapi_dscsi.get('virtual_HCTL')],
  12.119 -                    ['state', 'Initialising'],
  12.120 +                    ['state', xenbusState['Initialising']],
  12.121                      ['uuid', dscsi_uuid]
  12.122                  ]
  12.123              ]
  12.124 @@ -3558,7 +3591,7 @@ class XendDomainInfo:
  12.125          if target_dev is None:
  12.126              raise XendError('Failed to destroy device')
  12.127  
  12.128 -        target_dev.append(['state', 'Closing'])
  12.129 +        target_dev.append(['state', xenbusState['Closing']])
  12.130          target_vscsi_sxp = ['vscsi', target_dev]
  12.131  
  12.132          if self._stateGet() != XEN_API_VM_POWER_STATE_RUNNING:
    13.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.2 +++ b/tools/python/xen/xend/server/DevConstants.py	Tue Nov 04 12:43:19 2008 +0900
    13.3 @@ -0,0 +1,45 @@
    13.4 +#============================================================================
    13.5 +# This library is free software; you can redistribute it and/or
    13.6 +# modify it under the terms of version 2.1 of the GNU Lesser General Public
    13.7 +# License as published by the Free Software Foundation.
    13.8 +#
    13.9 +# This library is distributed in the hope that it will be useful,
   13.10 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
   13.11 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   13.12 +# Lesser General Public License for more details.
   13.13 +#
   13.14 +# You should have received a copy of the GNU Lesser General Public
   13.15 +# License along with this library; if not, write to the Free Software
   13.16 +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   13.17 +#============================================================================
   13.18 +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@hp.com>
   13.19 +# Copyright (C) 2005 XenSource Ltd
   13.20 +#============================================================================
   13.21 +
   13.22 +DEVICE_CREATE_TIMEOUT  = 100
   13.23 +DEVICE_DESTROY_TIMEOUT = 100
   13.24 +HOTPLUG_STATUS_NODE = "hotplug-status"
   13.25 +HOTPLUG_ERROR_NODE  = "hotplug-error"
   13.26 +HOTPLUG_STATUS_ERROR = "error"
   13.27 +HOTPLUG_STATUS_BUSY  = "busy"
   13.28 +
   13.29 +Connected    = 1
   13.30 +Error        = 2
   13.31 +Missing      = 3
   13.32 +Timeout      = 4
   13.33 +Busy         = 5
   13.34 +Disconnected = 6
   13.35 +
   13.36 +xenbusState = {
   13.37 +    'Unknown'       : 0,
   13.38 +    'Initialising'  : 1,
   13.39 +    'InitWait'      : 2,
   13.40 +    'Initialised'   : 3,
   13.41 +    'Connected'     : 4,
   13.42 +    'Closing'       : 5,
   13.43 +    'Closed'        : 6,
   13.44 +    'Reconfiguring' : 7,
   13.45 +    'Reconfigured'  : 8,
   13.46 +    }
   13.47 +xenbusState.update(dict(zip(xenbusState.values(), xenbusState.keys())))
   13.48 +
    14.1 --- a/tools/python/xen/xend/server/DevController.py	Tue Nov 04 12:07:22 2008 +0900
    14.2 +++ b/tools/python/xen/xend/server/DevController.py	Tue Nov 04 12:43:19 2008 +0900
    14.3 @@ -23,42 +23,15 @@ from xen.xend import sxp, XendOptions
    14.4  from xen.xend.XendError import VmError
    14.5  from xen.xend.XendLogging import log
    14.6  import xen.xend.XendConfig
    14.7 +from xen.xend.server.DevConstants import *
    14.8  
    14.9  from xen.xend.xenstore.xstransact import xstransact, complete
   14.10  from xen.xend.xenstore.xswatch import xswatch
   14.11  
   14.12  import os
   14.13  
   14.14 -DEVICE_CREATE_TIMEOUT  = 100
   14.15 -DEVICE_DESTROY_TIMEOUT = 100
   14.16 -HOTPLUG_STATUS_NODE = "hotplug-status"
   14.17 -HOTPLUG_ERROR_NODE  = "hotplug-error"
   14.18 -HOTPLUG_STATUS_ERROR = "error"
   14.19 -HOTPLUG_STATUS_BUSY  = "busy"
   14.20 -
   14.21 -Connected    = 1
   14.22 -Error        = 2
   14.23 -Missing      = 3
   14.24 -Timeout      = 4
   14.25 -Busy         = 5
   14.26 -Disconnected = 6
   14.27 -
   14.28 -xenbusState = {
   14.29 -    'Unknown'      : 0,
   14.30 -    'Initialising' : 1,
   14.31 -    'InitWait'     : 2,
   14.32 -    'Initialised'  : 3,
   14.33 -    'Connected'    : 4,
   14.34 -    'Closing'      : 5,
   14.35 -    'Closed'       : 6,
   14.36 -    'Reconfiguring': 7,
   14.37 -    'Reconfigured' : 8,
   14.38 -    }
   14.39 -
   14.40  xoptions = XendOptions.instance()
   14.41  
   14.42 -xenbusState.update(dict(zip(xenbusState.values(), xenbusState.keys())))
   14.43 -
   14.44  
   14.45  class DevController:
   14.46      """Abstract base class for a device controller.  Device controllers create
   14.47 @@ -569,7 +542,7 @@ class DevController:
   14.48              xswatch(statusPath, hotplugStatusCallback, ev, result)
   14.49              ev.wait(DEVICE_CREATE_TIMEOUT)
   14.50              err = xstransact.Read(statusPath, HOTPLUG_ERROR_NODE)
   14.51 -            if result['status'] != 'Connected':
   14.52 +            if result['status'] != Connected:
   14.53                  return (result['status'], err)
   14.54              
   14.55          backpath = self.readVm(devid, "backend")
    15.1 --- a/tools/python/xen/xend/server/iopif.py	Tue Nov 04 12:07:22 2008 +0900
    15.2 +++ b/tools/python/xen/xend/server/iopif.py	Tue Nov 04 12:43:19 2008 +0900
    15.3 @@ -45,9 +45,22 @@ def parse_ioport(val):
    15.4  
    15.5  class IOPortsController(DevController):
    15.6  
    15.7 +    valid_cfg = ['to', 'from', 'uuid']
    15.8 +
    15.9      def __init__(self, vm):
   15.10          DevController.__init__(self, vm)
   15.11  
   15.12 +    def getDeviceConfiguration(self, devid, transaction = None):
   15.13 +        result = DevController.getDeviceConfiguration(self, devid, transaction)
   15.14 +        if transaction is None:
   15.15 +            devinfo = self.readBackend(devid, *self.valid_cfg)
   15.16 +        else:
   15.17 +            devinfo = self.readBackendTxn(transaction, devid, *self.valid_cfg)
   15.18 +        config = dict(zip(self.valid_cfg, devinfo))
   15.19 +        config = dict([(key, val) for key, val in config.items()
   15.20 +                       if val != None])
   15.21 +        return config
   15.22 +
   15.23      def getDeviceDetails(self, config):
   15.24          """@see DevController.getDeviceDetails"""
   15.25  
   15.26 @@ -81,4 +94,9 @@ class IOPortsController(DevController):
   15.27                  'ioports: Failed to configure legacy i/o range: %s - %s' %
   15.28                  (io_from, io_to))
   15.29  
   15.30 -        return (None, {}, {})
   15.31 +        back = dict([(k, config[k]) for k in self.valid_cfg if k in config])
   15.32 +        return (self.allocateDeviceID(), back, {})
   15.33 +
   15.34 +    def waitForDevice(self, devid):
   15.35 +        # don't wait for hotplug
   15.36 +        return
    16.1 --- a/tools/python/xen/xend/server/irqif.py	Tue Nov 04 12:07:22 2008 +0900
    16.2 +++ b/tools/python/xen/xend/server/irqif.py	Tue Nov 04 12:43:19 2008 +0900
    16.3 @@ -39,6 +39,18 @@ class IRQController(DevController):
    16.4      def __init__(self, vm):
    16.5          DevController.__init__(self, vm)
    16.6  
    16.7 +    valid_cfg = ['irq', 'uuid']
    16.8 +
    16.9 +    def getDeviceConfiguration(self, devid, transaction = None):
   16.10 +        result = DevController.getDeviceConfiguration(self, devid, transaction)
   16.11 +        if transaction is None:
   16.12 +            devinfo = self.readBackend(devid, *self.valid_cfg)
   16.13 +        else:
   16.14 +            devinfo = self.readBackendTxn(transaction, devid, *self.valid_cfg)
   16.15 +        config = dict(zip(self.valid_cfg, devinfo))
   16.16 +        config = dict([(key, val) for key, val in config.items()
   16.17 +                       if val != None])
   16.18 +        return config
   16.19  
   16.20      def getDeviceDetails(self, config):
   16.21          """@see DevController.getDeviceDetails"""
   16.22 @@ -75,4 +87,9 @@ class IRQController(DevController):
   16.23          if rc < 0:
   16.24              raise VmError(
   16.25                  'irq: Failed to map irq %x' % (pirq))
   16.26 -        return (None, {}, {})
   16.27 +        back = dict([(k, config[k]) for k in self.valid_cfg if k in config])
   16.28 +        return (self.allocateDeviceID(), back, {})
   16.29 +
   16.30 +    def waitForDevice(self, devid):
   16.31 +        # don't wait for hotplug
   16.32 +        return
    17.1 --- a/tools/python/xen/xend/server/pciif.py	Tue Nov 04 12:07:22 2008 +0900
    17.2 +++ b/tools/python/xen/xend/server/pciif.py	Tue Nov 04 12:43:19 2008 +0900
    17.3 @@ -25,7 +25,8 @@ from xen.xend import arch
    17.4  from xen.xend.XendError import VmError
    17.5  from xen.xend.XendLogging import log
    17.6  
    17.7 -from xen.xend.server.DevController import DevController, xenbusState
    17.8 +from xen.xend.server.DevController import DevController
    17.9 +from xen.xend.server.DevConstants import xenbusState
   17.10  
   17.11  import xen.lowlevel.xc
   17.12  
    18.1 --- a/tools/python/xen/xend/server/vscsiif.py	Tue Nov 04 12:07:22 2008 +0900
    18.2 +++ b/tools/python/xen/xend/server/vscsiif.py	Tue Nov 04 12:43:19 2008 +0900
    18.3 @@ -28,7 +28,8 @@ from xen.xend import sxp
    18.4  from xen.xend.XendError import VmError
    18.5  from xen.xend.XendLogging import log
    18.6  
    18.7 -from xen.xend.server.DevController import DevController, xenbusState
    18.8 +from xen.xend.server.DevController import DevController
    18.9 +from xen.xend.server.DevConstants import xenbusState
   18.10  from xen.xend.xenstore.xstransact import xstransact
   18.11  
   18.12  class VSCSIController(DevController):
   18.13 @@ -92,8 +93,8 @@ class VSCSIController(DevController):
   18.14              back[devpath + '/p-devname'] = pdevname
   18.15              vdev = vscsi_config.get('v-dev', '')
   18.16              back[devpath + '/v-dev'] = vdev
   18.17 -            state = vscsi_config.get('state', '')
   18.18 -            back[devpath + '/state'] = str(xenbusState[state])
   18.19 +            state = vscsi_config.get('state', xenbusState['Unknown'])
   18.20 +            back[devpath + '/state'] = str(state)
   18.21              devid = vscsi_config.get('devid', '')
   18.22              back[devpath + '/devid'] = str(devid)
   18.23  
   18.24 @@ -168,17 +169,17 @@ class VSCSIController(DevController):
   18.25          (devid, back, front) = self.getDeviceDetails(config)
   18.26          devid = int(devid)
   18.27          vscsi_config = config['devs'][0]
   18.28 -        state = vscsi_config.get('state', '')
   18.29 +        state = vscsi_config.get('state', xenbusState['Unknown'])
   18.30          driver_state = self.readBackend(devid, 'state')
   18.31          if str(xenbusState['Connected']) != driver_state:
   18.32              raise VmError("Driver status is not connected")
   18.33  
   18.34          uuid = self.readBackend(devid, 'uuid')
   18.35 -        if state == 'Initialising':
   18.36 +        if state == xenbusState['Initialising']:
   18.37              back['uuid'] = uuid
   18.38              self.writeBackend(devid, back)
   18.39  
   18.40 -        elif state == 'Closing':
   18.41 +        elif state == xenbusState['Closing']:
   18.42              found = False
   18.43              devs = self.readBackendList(devid, "vscsi-devs")
   18.44              vscsipath = "vscsi-devs/"
   18.45 @@ -198,7 +199,7 @@ class VSCSIController(DevController):
   18.46  
   18.47          else:
   18.48              raise XendError("Error configuring device: invalid "
   18.49 -                            "state '%s'" % state)
   18.50 +                            "state '%s'" % xenbusState[state])
   18.51  
   18.52          self.writeBackend(devid, 'state', str(xenbusState['Reconfiguring']))
   18.53          return self.readBackend(devid, 'uuid')
    19.1 --- a/tools/python/xen/xm/create.py	Tue Nov 04 12:07:22 2008 +0900
    19.2 +++ b/tools/python/xen/xm/create.py	Tue Nov 04 12:43:19 2008 +0900
    19.3 @@ -32,6 +32,7 @@ from xen.xend import PrettyPrint as SXPP
    19.4  from xen.xend import osdep
    19.5  import xen.xend.XendClient
    19.6  from xen.xend.XendBootloader import bootloader
    19.7 +from xen.xend.server.DevConstants import xenbusState
    19.8  from xen.util import blkif
    19.9  from xen.util import vscsi_util
   19.10  import xen.util.xsm.xsm as security
   19.11 @@ -707,7 +708,7 @@ def configure_vscsis(config_devs, vals):
   19.12              vscsi_util.vscsi_get_hctl_and_devname_by(p_dev, scsi_devices)
   19.13  
   19.14          if p_hctl == None:
   19.15 -            raise ValueError("Cannot find device \"%s\"" % p_dev)
   19.16 +            raise ValueError('Cannot find device "%s"' % p_dev)
   19.17  
   19.18          for config in config_scsi:
   19.19              dev = vscsi_convert_sxp_to_dict(config)
   19.20 @@ -717,7 +718,7 @@ def configure_vscsis(config_devs, vals):
   19.21          v_hctl = v_dev.split(':')
   19.22          devid = int(v_hctl[0])
   19.23          config_scsi.append(['dev', \
   19.24 -                        ['state', 'Initialising'], \
   19.25 +                        ['state', xenbusState['Initialising']], \
   19.26                          ['devid', devid], \
   19.27                          ['p-dev', p_hctl], \
   19.28                          ['p-devname', devname], \
   19.29 @@ -1035,6 +1036,14 @@ def preprocess_ioports(vals):
   19.30          ioports.append(hexd)
   19.31      vals.ioports = ioports
   19.32          
   19.33 +def preprocess_irq(vals):
   19.34 +    if not vals.irq: return
   19.35 +    irq = []
   19.36 +    for v in vals.irq:
   19.37 +        d = repr(v)
   19.38 +        irq.append(d)
   19.39 +    vals.irq = irq
   19.40 +
   19.41  def preprocess_vtpm(vals):
   19.42      if not vals.vtpm: return
   19.43      vtpms = []
   19.44 @@ -1133,6 +1142,7 @@ def preprocess(vals):
   19.45      preprocess_vscsi(vals)
   19.46      preprocess_ioports(vals)
   19.47      preprocess_ip(vals)
   19.48 +    preprocess_irq(vals)
   19.49      preprocess_nfs(vals)
   19.50      preprocess_vtpm(vals)
   19.51      preprocess_access_control(vals)
    20.1 --- a/tools/python/xen/xm/main.py	Tue Nov 04 12:07:22 2008 +0900
    20.2 +++ b/tools/python/xen/xm/main.py	Tue Nov 04 12:43:19 2008 +0900
    20.3 @@ -47,6 +47,7 @@ from xen.xend import PrettyPrint
    20.4  from xen.xend import sxp
    20.5  from xen.xend import XendClient
    20.6  from xen.xend.XendConstants import *
    20.7 +from xen.xend.server.DevConstants import xenbusState
    20.8  
    20.9  from xen.xm.opts import OptionError, Opts, wrap, set_true
   20.10  from xen.xm import console
   20.11 @@ -2515,7 +2516,7 @@ def xm_scsi_attach(args):
   20.12      dom = args[0]
   20.13      p_scsi = args[1]
   20.14      v_hctl = args[2]
   20.15 -    scsi = parse_scsi_configuration(p_scsi, v_hctl, 'Initialising')
   20.16 +    scsi = parse_scsi_configuration(p_scsi, v_hctl, xenbusState['Initialising'])
   20.17  
   20.18      if serverType == SERVER_XEN_API:
   20.19  
   20.20 @@ -2635,7 +2636,7 @@ def xm_scsi_detach(args):
   20.21      arg_check(args, 'scsi-detach', 2)
   20.22      dom = args[0]
   20.23      v_hctl = args[1]
   20.24 -    scsi = parse_scsi_configuration(None, v_hctl, 'Closing')
   20.25 +    scsi = parse_scsi_configuration(None, v_hctl, xenbusState['Closing'])
   20.26  
   20.27      if serverType == SERVER_XEN_API:
   20.28  
    21.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    21.2 +++ b/tools/xenpmd/Makefile	Tue Nov 04 12:43:19 2008 +0900
    21.3 @@ -0,0 +1,20 @@
    21.4 +XEN_ROOT=../..
    21.5 +include $(XEN_ROOT)/tools/Rules.mk
    21.6 +
    21.7 +CFLAGS  += -Werror
    21.8 +CFLAGS  += $(CFLAGS_libxenstore)
    21.9 +LDFLAGS += $(LDFLAGS_libxenstore)
   21.10 +
   21.11 +BIN      = xenpmd
   21.12 +
   21.13 +.PHONY: all
   21.14 +all: $(BIN)
   21.15 +
   21.16 +.PHONY: install
   21.17 +install: all
   21.18 +	$(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
   21.19 +	$(INSTALL_PROG) $(BIN) $(DESTDIR)$(SBINDIR)
   21.20 +
   21.21 +.PHONY: clean
   21.22 +clean:
   21.23 +	$(RM) -f $(BIN)
    22.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    22.2 +++ b/tools/xenpmd/xenpmd.c	Tue Nov 04 12:43:19 2008 +0900
    22.3 @@ -0,0 +1,520 @@
    22.4 +/*
    22.5 + * xenpmd.c
    22.6 + *
    22.7 + * xen power management daemon - Facilitates power management 
    22.8 + * functionality within xen guests.
    22.9 + *
   22.10 + * Copyright (c) 2008  Kamala Narasimhan 
   22.11 + * Copyright (c) 2008  Citrix Systems, Inc.
   22.12 + *
   22.13 + * This program is free software; you can redistribute it and/or modify
   22.14 + * it under the terms of the GNU General Public License as published by
   22.15 + * the Free Software Foundation; either version 2 of the License, or
   22.16 + * (at your option) any later version.
   22.17 + *
   22.18 + * This program is distributed in the hope that it will be useful,
   22.19 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   22.20 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   22.21 + * GNU General Public License for more details.
   22.22 + *
   22.23 + * You should have received a copy of the GNU General Public License
   22.24 + * along with this program; if not, write to the Free Software
   22.25 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   22.26 + */
   22.27 +
   22.28 +/* Xen extended power management support provides HVM guest power management
   22.29 + * features beyond S3, S4, S5.  For example, it helps expose system level 
   22.30 + * battery status and battery meter information and in the future will
   22.31 + * be extended to include more power management support.  This support is
   22.32 + * enabled by setting xen_extended_power_mgmt to 1 or 2 in the HVM config
   22.33 + * file.  When set to 2, non-passthrough mode is enabled, which relies
   22.34 + * heavily on this power management daemon to glean battery information
   22.35 + * from dom0 and store it in xenstore, where it is then queried by qemu and
   22.36 + * passed to the guest when the appropriate battery ports are read or written.
   22.37 + */
   22.38 +
   22.39 +#include <stdio.h>
   22.40 +#include <stdarg.h>
   22.41 +#include <string.h>
   22.42 +#include <stdlib.h>
   22.43 +#include <dirent.h>
   22.44 +#include <unistd.h>
   22.45 +#include <sys/stat.h>
   22.46 +#include <xs.h>
   22.47 +
   22.48 +/* #define RUN_STANDALONE */
   22.49 +#define RUN_IN_SIMULATE_MODE
   22.50 +
   22.51 +enum BATTERY_INFO_TYPE {
   22.52 +    BIF, 
   22.53 +    BST 
   22.54 +};
   22.55 +
   22.56 +enum BATTERY_PRESENT {
   22.57 +    NO, 
   22.58 +    YES 
   22.59 +};
   22.60 +
   22.61 +enum BATTERY_TECHNOLOGY {
   22.62 +    NON_RECHARGEABLE, 
   22.63 +    RECHARGEABLE 
   22.64 +};
   22.65 +
   22.66 +struct battery_info {
   22.67 +    enum BATTERY_PRESENT    present;
   22.68 +    unsigned long           design_capacity;
   22.69 +    unsigned long           last_full_capacity;
   22.70 +    enum BATTERY_TECHNOLOGY battery_technology;
   22.71 +    unsigned long           design_voltage;
   22.72 +    unsigned long           design_capacity_warning;
   22.73 +    unsigned long           design_capacity_low;
   22.74 +    unsigned long           capacity_granularity_1;
   22.75 +    unsigned long           capacity_granularity_2;
   22.76 +    char                    model_number[32];
   22.77 +    char                    serial_number[32];
   22.78 +    char                    battery_type[32];
   22.79 +    char                    oem_info[32];
   22.80 +};
   22.81 +
   22.82 +struct battery_status {
   22.83 +    enum BATTERY_PRESENT    present;
   22.84 +    unsigned long           state;
   22.85 +    unsigned long           present_rate;
   22.86 +    unsigned long           remaining_capacity;
   22.87 +    unsigned long           present_voltage;
   22.88 +};
   22.89 +
   22.90 +static struct xs_handle *xs;
   22.91 +
   22.92 +#ifdef RUN_IN_SIMULATE_MODE
   22.93 +    #define BATTERY_DIR_PATH "/tmp/battery"
   22.94 +    #define BATTERY_INFO_FILE_PATH "/tmp/battery/%s/info" 
   22.95 +    #define BATTERY_STATE_FILE_PATH "/tmp/battery/%s/state"
   22.96 +#else
   22.97 +    #define BATTERY_DIR_PATH "/proc/acpi/battery"
   22.98 +    #define BATTERY_INFO_FILE_PATH "/proc/acpi/battery/%s/info"
   22.99 +    #define BATTERY_STATE_FILE_PATH "/proc/acpi/battery/%s/state"
  22.100 +#endif
  22.101 +
  22.102 +FILE *get_next_battery_file(DIR *battery_dir, 
  22.103 +                            enum BATTERY_INFO_TYPE battery_info_type)
  22.104 +{
  22.105 +    FILE *file = 0;
  22.106 +    struct dirent *dir_entries;
  22.107 +    char file_name[32];
  22.108 +    
  22.109 +    do 
  22.110 +    {
  22.111 +        dir_entries = readdir(battery_dir);
  22.112 +        if ( !dir_entries ) 
  22.113 +            return 0;
  22.114 +        if ( strlen(dir_entries->d_name) < 4 )
  22.115 +            continue;
  22.116 +        if ( battery_info_type == BIF ) 
  22.117 +            snprintf(file_name, 32, BATTERY_INFO_FILE_PATH,
  22.118 +                     dir_entries->d_name);
  22.119 +        else 
  22.120 +            snprintf(file_name, 32, BATTERY_STATE_FILE_PATH,
  22.121 +                     dir_entries->d_name);
  22.122 +        file = fopen(file_name, "r");
  22.123 +    } while ( !file );
  22.124 +
  22.125 +    return file;
  22.126 +}
  22.127 +
  22.128 +void set_attribute_battery_info(char *attrib_name,
  22.129 +                                char *attrib_value,
  22.130 +                                struct battery_info *info)
  22.131 +{
  22.132 +    if ( strstr(attrib_name, "present") ) 
  22.133 +    {
  22.134 +        if ( strstr(attrib_value, "yes") ) 
  22.135 +            info->present = YES;
  22.136 +        return;
  22.137 +    }
  22.138 +
  22.139 +    if ( strstr(attrib_name, "design capacity warning") ) 
  22.140 +    {
  22.141 +        info->design_capacity_warning = strtoull(attrib_value, NULL, 10);
  22.142 +        return;
  22.143 +    }
  22.144 +
  22.145 +    if ( strstr(attrib_name, "design capacity low") ) 
  22.146 +    {
  22.147 +        info->design_capacity_low = strtoull(attrib_value, NULL, 10);
  22.148 +        return;
  22.149 +    }
  22.150 +
  22.151 +    if ( strstr(attrib_name, "design capacity") ) 
  22.152 +    { 
  22.153 +        info->design_capacity = strtoull(attrib_value, NULL, 10);
  22.154 +        return;
  22.155 +    }
  22.156 +
  22.157 +    if ( strstr(attrib_name, "last full capacity") ) 
  22.158 +    {
  22.159 +        info->last_full_capacity = strtoull(attrib_value, NULL, 10);
  22.160 +        return;
  22.161 +    }
  22.162 +
  22.163 +    if ( strstr(attrib_name, "design voltage") ) 
  22.164 +    {
  22.165 +        info->design_voltage = strtoull(attrib_value, NULL, 10);
  22.166 +        return;
  22.167 +    }
  22.168 +
  22.169 +    if ( strstr(attrib_name, "capacity granularity 1") ) 
  22.170 +    {
  22.171 +        info->capacity_granularity_1 = strtoull(attrib_value, NULL, 10);
  22.172 +        return;
  22.173 +    }
  22.174 +
  22.175 +    if ( strstr(attrib_name, "capacity granularity 2") ) 
  22.176 +    {
  22.177 +        info->capacity_granularity_2 = strtoull(attrib_value, NULL, 10);
  22.178 +        return;
  22.179 +    }
  22.180 +
  22.181 +    if ( strstr(attrib_name, "battery technology") ) 
  22.182 +    {
  22.183 +        if ( strncmp(attrib_value, "rechargeable",
  22.184 +                     strlen("rechargeable")) == 0 ) 
  22.185 +            info->battery_technology = RECHARGEABLE;
  22.186 +        else 
  22.187 +            info->battery_technology = NON_RECHARGEABLE;
  22.188 +        return;
  22.189 +    }
  22.190 +
  22.191 +    if ( strstr(attrib_name, "model number") ) 
  22.192 +    {
  22.193 +        strncpy(info->model_number, attrib_value, 32);
  22.194 +        return;
  22.195 +    }
  22.196 +
  22.197 +    if ( strstr(attrib_name, "serial number") ) 
  22.198 +    {
  22.199 +        strncpy(info->serial_number, attrib_value, 32);
  22.200 +        return;
  22.201 +    }
  22.202 +
  22.203 +    if ( strstr(attrib_name, "battery type") ) 
  22.204 +    {
  22.205 +        strncpy(info->battery_type, attrib_value, 32);
  22.206 +        return;
  22.207 +    }
  22.208 +
  22.209 +    if ( strstr(attrib_name, "OEM info") ) 
  22.210 +    {
  22.211 +        strncpy(info->oem_info, attrib_value, 32);
  22.212 +        return;
  22.213 +    }
  22.214 +
  22.215 +    return;
  22.216 +}
  22.217 +
  22.218 +void set_attribute_battery_status(char *attrib_name, 
  22.219 +                                  char *attrib_value,
  22.220 +                                  struct battery_status *status)
  22.221 +{
  22.222 +    if ( strstr(attrib_name, "charging state") ) 
  22.223 +    {
  22.224 +        /* Check this, below is half baked */
  22.225 +        if ( strstr(attrib_value, "charged") ) 
  22.226 +            status->state = 0;
  22.227 +        else 
  22.228 +            status->state = 1;
  22.229 +        return;
  22.230 +    }
  22.231 +
  22.232 +    if ( strstr(attrib_name, "present rate") ) 
  22.233 +    {
  22.234 +        status->present_rate = strtoull(attrib_value, NULL, 10);
  22.235 +        return;
  22.236 +    }
  22.237 +
  22.238 +    if ( strstr(attrib_name, "remaining capacity") ) 
  22.239 +    {
  22.240 +        status->remaining_capacity = strtoull(attrib_value, NULL, 10);
  22.241 +        return;
  22.242 +    }
  22.243 +
  22.244 +    if ( strstr(attrib_name, "present voltage") ) 
  22.245 +    {
  22.246 +        status->present_voltage = strtoull(attrib_value, NULL, 10);
  22.247 +        return;
  22.248 +    }
  22.249 +
  22.250 +    if ( strstr(attrib_name, "present") ) 
  22.251 +    {
  22.252 +        if ( strstr(attrib_value, "yes") ) 
  22.253 +            status->present = YES;
  22.254 +        return;
  22.255 +    }
  22.256 +}
  22.257 +
  22.258 +void parse_battery_info_or_status(char *line_info,
  22.259 +                                  enum BATTERY_INFO_TYPE type,
  22.260 +                                  void *info_or_status)
  22.261 +{
  22.262 +    char attrib_name[128];
  22.263 +    char attrib_value[64];
  22.264 +    char *delimiter;
  22.265 +    unsigned long length;
  22.266 +
  22.267 +    length = strlen(line_info);
  22.268 +    delimiter = (char *) strchr( line_info, ':');
  22.269 +    if ( (!delimiter) || (delimiter == line_info) ||
  22.270 +         (delimiter == line_info + length) ) 
  22.271 +        return;
  22.272 +
  22.273 +    strncpy(attrib_name, line_info, delimiter-line_info);
  22.274 +    while ( *(delimiter+1) == ' ' ) 
  22.275 +    {
  22.276 +        delimiter++;
  22.277 +        if ( delimiter+1 == line_info + length)
  22.278 +            return;
  22.279 +    }
  22.280 +    strncpy(attrib_value, delimiter+1, 
  22.281 +            (unsigned long)line_info + length -(unsigned long)delimiter); 
  22.282 +    
  22.283 +    if ( type == BIF ) 
  22.284 +        set_attribute_battery_info(attrib_name, attrib_value,
  22.285 +                                   (struct battery_info *)info_or_status);
  22.286 +    else 
  22.287 +        set_attribute_battery_status(attrib_name, attrib_value,
  22.288 +                                     (struct battery_status *)info_or_status);
  22.289 +
  22.290 +    return;
  22.291 +}
  22.292 +
  22.293 +int get_next_battery_info_or_status(DIR *battery_dir,
  22.294 +                                    enum BATTERY_INFO_TYPE type,
  22.295 +                                    void *info_or_status)
  22.296 +{
  22.297 +    FILE *file;
  22.298 +    char line_info[256];
  22.299 +
  22.300 +    if  ( !info_or_status )
  22.301 +        return 0;
  22.302 +
  22.303 +    memset(line_info, 0, 256);
  22.304 +    if (type == BIF) 
  22.305 +        memset(info_or_status, 0, sizeof(struct battery_info));
  22.306 +    else 
  22.307 +        memset(info_or_status, 0, sizeof(struct battery_status));
  22.308 +
  22.309 +    file = get_next_battery_file(battery_dir, type);
  22.310 +    if ( !file )
  22.311 +        return 0;
  22.312 +
  22.313 +    while ( fgets(line_info, sizeof(line_info), file) != NULL ) 
  22.314 +    {
  22.315 +        parse_battery_info_or_status(line_info, type, info_or_status);
  22.316 +        memset(line_info, 0, 256);
  22.317 +    }
  22.318 +
  22.319 +    fclose(file);
  22.320 +    return 1;
  22.321 +}
  22.322 +
  22.323 +#ifdef RUN_STANDALONE
  22.324 +void print_battery_info(struct battery_info *info)
  22.325 +{
  22.326 +    printf("present:                %d\n", info->present);
  22.327 +    printf("design capacity:        %lu\n", info->design_capacity);
  22.328 +    printf("last full capacity:     %lu\n", info->last_full_capacity);
  22.329 +    printf("battery technology:     %d\n", info->battery_technology);
  22.330 +    printf("design voltage:         %lu\n", info->design_voltage);
  22.331 +    printf("design capacity warning:%lu\n", info->design_capacity_warning);
  22.332 +    printf("design capacity low:    %lu\n", info->design_capacity_low);
  22.333 +    printf("capacity granularity 1: %lu\n", info->capacity_granularity_1);
  22.334 +    printf("capacity granularity 2: %lu\n", info->capacity_granularity_2);
  22.335 +    printf("model number:           %s\n", info->model_number);
  22.336 +    printf("serial number:          %s\n", info->serial_number);
  22.337 +    printf("battery type:           %s\n", info->battery_type);
  22.338 +    printf("OEM info:               %s\n", info->oem_info);
  22.339 +}
  22.340 +#endif /*RUN_STANDALONE*/
  22.341 +
  22.342 +void write_ulong_lsb_first(char *temp_val, unsigned long val)
  22.343 +{
  22.344 +    snprintf(temp_val, 9, "%02x%02x%02x%02x", (unsigned int)val & 0xff, 
  22.345 +    (unsigned int)(val & 0xff00) >> 8, (unsigned int)(val & 0xff0000) >> 16, 
  22.346 +    (unsigned int)(val & 0xff000000) >> 24);
  22.347 +}
  22.348 +
  22.349 +void write_battery_info_to_xenstore(struct battery_info *info)
  22.350 +{
  22.351 +    char val[1024], string_info[256];
  22.352 +
  22.353 +    xs_mkdir(xs, XBT_NULL, "/pm");
  22.354 +   
  22.355 +    memset(val, 0, 1024);
  22.356 +    memset(string_info, 0, 256);
  22.357 +    /* write 9 dwords (so 9*4) + length of 4 strings + 4 null terminators */
  22.358 +    snprintf(val, 3, "%02x", 
  22.359 +             (unsigned int)(9*4 +
  22.360 +                            strlen(info->model_number) +
  22.361 +                            strlen(info->serial_number) +
  22.362 +                            strlen(info->battery_type) +
  22.363 +                            strlen(info->oem_info) + 4));
  22.364 +    write_ulong_lsb_first(val+2, info->present);
  22.365 +    write_ulong_lsb_first(val+10, info->design_capacity);
  22.366 +    write_ulong_lsb_first(val+18, info->last_full_capacity);
  22.367 +    write_ulong_lsb_first(val+26, info->battery_technology);
  22.368 +    write_ulong_lsb_first(val+34, info->design_voltage);
  22.369 +    write_ulong_lsb_first(val+42, info->design_capacity_warning);
  22.370 +    write_ulong_lsb_first(val+50, info->design_capacity_low);
  22.371 +    write_ulong_lsb_first(val+58, info->capacity_granularity_1);
  22.372 +    write_ulong_lsb_first(val+66, info->capacity_granularity_2);
  22.373 +
  22.374 +    snprintf(string_info, 256, "%02x%s%02x%s%02x%s%02x%s", 
  22.375 +             (unsigned int)strlen(info->model_number), info->model_number,
  22.376 +             (unsigned int)strlen(info->serial_number), info->serial_number,
  22.377 +             (unsigned int)strlen(info->battery_type), info->battery_type,
  22.378 +             (unsigned int)strlen(info->oem_info), info->oem_info);
  22.379 +    strncat(val+73, string_info, 1024);
  22.380 +    xs_write(xs, XBT_NULL, "/pm/bif", 
  22.381 +             val, 73+8+strlen(info->model_number)+strlen(info->serial_number)+
  22.382 +             strlen(info->battery_type)+strlen(info->oem_info)+1);
  22.383 +}
  22.384 +
  22.385 +int write_one_time_battery_info(void)
  22.386 +{
  22.387 +    DIR *dir;
  22.388 +    int ret = 0;
  22.389 +    struct battery_info info;
  22.390 +    
  22.391 +    dir = opendir(BATTERY_DIR_PATH);
  22.392 +    if ( !dir )
  22.393 +        return 0;
  22.394 +
  22.395 +    while ( get_next_battery_info_or_status(dir, BIF, (void *)&info) ) 
  22.396 +    {
  22.397 +#ifdef RUN_STANDALONE
  22.398 +        print_battery_info(&info);
  22.399 +#endif
  22.400 +        if ( info.present == YES ) 
  22.401 +        {
  22.402 +            write_battery_info_to_xenstore(&info);
  22.403 +            ret = 1;
  22.404 +            break; /* rethink this... */
  22.405 +        }
  22.406 +    }
  22.407 +
  22.408 +    closedir(dir);
  22.409 +    return ret;
  22.410 +}
  22.411 +
  22.412 +#ifdef RUN_STANDALONE
  22.413 +void print_battery_status(struct battery_status *status)
  22.414 +{
  22.415 +    printf("present:                     %d\n", status->present);
  22.416 +    printf("Battery state                %lu\n", status->state);
  22.417 +    printf("Battery present rate         %lu\n", status->present_rate);
  22.418 +    printf("Battery remaining capacity   %lu\n", status->remaining_capacity);
  22.419 +    printf("Battery present voltage      %lu\n", status->present_voltage);
  22.420 +}
  22.421 +#endif /*RUN_STANDALONE*/
  22.422 +
  22.423 +void write_battery_status_to_xenstore(struct battery_status *status)
  22.424 +{
  22.425 +    char val[35];
  22.426 +
  22.427 +    xs_mkdir(xs, XBT_NULL, "/pm");
  22.428 +
  22.429 +    memset(val, 0, 35);
  22.430 +    snprintf(val, 3, "%02x", 16);
  22.431 +    write_ulong_lsb_first(val+2, status->state);
  22.432 +    write_ulong_lsb_first(val+10, status->present_rate);
  22.433 +    write_ulong_lsb_first(val+18, status->remaining_capacity);
  22.434 +    write_ulong_lsb_first(val+26, status->present_voltage);
  22.435 +
  22.436 +    xs_write(xs, XBT_NULL, "/pm/bst", val, 35);
  22.437 +}
  22.438 +
  22.439 +int wait_for_and_update_battery_status_request(void)
  22.440 +{
  22.441 +    DIR *dir;
  22.442 +    int ret = 0;
  22.443 +    unsigned int count;
  22.444 +    struct battery_status status;
  22.445 +
  22.446 +    while ( true )
  22.447 +    {
  22.448 +        /* KN:@TODO - It is rather inefficient to not cache the file handle.
  22.449 +         *  Switch to caching file handle. 
  22.450 +         */
  22.451 +        dir = opendir(BATTERY_DIR_PATH);
  22.452 +        if ( !dir )
  22.453 +            return 0;
  22.454 +
  22.455 +        while ( get_next_battery_info_or_status(dir, BST, (void *)&status) ) 
  22.456 +        {
  22.457 +#ifdef RUN_STANDALONE
  22.458 +            print_battery_status(&status);
  22.459 +#endif
  22.460 +            if ( status.present == YES ) 
  22.461 +            {
  22.462 +                write_battery_status_to_xenstore(&status);
  22.463 +                ret = 1;
  22.464 +                /* rethink this; though I have never seen one, there might be
  22.465 +                 * systems out there with more than one battery device 
  22.466 +                 * present
  22.467 +                 */
  22.468 +                break;
  22.469 +            }
  22.470 +        }
  22.471 +        closedir(dir);
  22.472 +        xs_watch(xs, "/pm/events", "refreshbatterystatus");
  22.473 +        xs_read_watch(xs, &count); 
  22.474 +    }
  22.475 +
  22.476 +    return ret;
  22.477 +}
  22.478 +
  22.479 +/* Borrowed daemonize from xenstored - Initially written by Stevens. */
  22.480 +static void daemonize(void)
  22.481 +{
  22.482 +    pid_t pid;
  22.483 +
  22.484 +    if ( (pid = fork()) < 0 )
  22.485 +        exit(1);
  22.486 +
  22.487 +    if ( pid != 0 )
  22.488 +        exit(0);
  22.489 +
  22.490 +    setsid();
  22.491 +
  22.492 +    if ( (pid = fork()) < 0 )
  22.493 +        exit(1);
  22.494 +
  22.495 +    if ( pid != 0 )
  22.496 +        exit(0);
  22.497 +
  22.498 +    if ( chdir("/") == -1 )
  22.499 +        exit(1);
  22.500 +
  22.501 +    umask(0);
  22.502 +}
  22.503 +
  22.504 +int main(int argc, char *argv[])
  22.505 +{
  22.506 +#ifndef RUN_STANDALONE
  22.507 +    daemonize();
  22.508 +#endif
  22.509 +    xs = (struct xs_handle *)xs_daemon_open();
  22.510 +    if ( xs == NULL ) 
  22.511 +        return -1;
  22.512 +
  22.513 +    if ( write_one_time_battery_info() == 0 ) 
  22.514 +    {
  22.515 +        xs_daemon_close(xs);
  22.516 +        return -1;
  22.517 +    }
  22.518 +
  22.519 +    wait_for_and_update_battery_status_request();
  22.520 +    xs_daemon_close(xs);
  22.521 +    return 0;
  22.522 +}
  22.523 +
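
write_ulong_lsb_first() is not shown in this hunk, but the caller pins its layout down fairly well: a two-digit hex header ("10", i.e. 16, presumably the payload length in bytes) followed by four fields at val+2, +10, +18 and +26 — eight hex digits per value, 34 characters plus a NUL in the 35-byte buffer. A plausible sketch under that assumption, emitting the low four bytes least-significant byte first as the name suggests (not taken from this patch):

    #include <stdio.h>

    /* Hypothetical helper matching the 8-character stride used by
     * write_battery_status_to_xenstore(): low four bytes of 'v' as
     * eight hex digits, least-significant byte first. */
    static void write_ulong_lsb_first(char *str, unsigned long v)
    {
        unsigned int i;

        for ( i = 0; i < 4; i++ )
        {
            /* two digits per byte; the trailing NUL is overwritten by
             * the next iteration (the buffer is pre-zeroed anyway) */
            snprintf(str + 2 * i, 3, "%02x", (unsigned int)(v & 0xff));
            v >>= 8;
        }
    }
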
    23.1 --- a/xen/arch/ia64/xen/cpufreq/cpufreq.c	Tue Nov 04 12:07:22 2008 +0900
    23.2 +++ b/xen/arch/ia64/xen/cpufreq/cpufreq.c	Tue Nov 04 12:43:19 2008 +0900
    23.3 @@ -210,21 +210,6 @@ acpi_cpufreq_cpu_init (struct cpufreq_po
    23.4  
    23.5  	data->acpi_data = &processor_pminfo[cpu]->perf;
    23.6  
    23.7 -	/* capability check */
    23.8 -	if (data->acpi_data->state_count <= 1) {
    23.9 -		printk(KERN_WARNING "P-States\n");
   23.10 -		result = -ENODEV;
   23.11 -		goto err_unreg;
   23.12 -	}
   23.13 -
   23.14 -	if ((data->acpi_data->control_register.space_id !=
   23.15 -				ACPI_ADR_SPACE_FIXED_HARDWARE) ||
   23.16 -			(data->acpi_data->status_register.space_id !=
   23.17 -			 ACPI_ADR_SPACE_FIXED_HARDWARE)) {
   23.18 -		result = -ENODEV;
   23.19 -		goto err_unreg;
   23.20 -	}
   23.21 -
   23.22  	data->freq_table = xmalloc_array(struct cpufreq_frequency_table,
   23.23  			(data->acpi_data->state_count + 1));
   23.24  	if (!data->freq_table) {
    24.1 --- a/xen/arch/ia64/xen/irq.c	Tue Nov 04 12:07:22 2008 +0900
    24.2 +++ b/xen/arch/ia64/xen/irq.c	Tue Nov 04 12:43:19 2008 +0900
    24.3 @@ -74,7 +74,7 @@ unsigned int __ia64_local_vector_to_irq 
    24.4  /*
    24.5   * Controller mappings for all interrupt sources:
    24.6   */
    24.7 -irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
    24.8 +irq_desc_t irq_desc[NR_IRQS] = {
    24.9  	[0 ... NR_IRQS-1] = {
   24.10  		.status = IRQ_DISABLED,
   24.11  		.handler = &no_irq_type,
    25.1 --- a/xen/arch/x86/acpi/cpu_idle.c	Tue Nov 04 12:07:22 2008 +0900
    25.2 +++ b/xen/arch/x86/acpi/cpu_idle.c	Tue Nov 04 12:43:19 2008 +0900
    25.3 @@ -75,13 +75,14 @@ static void print_acpi_power(uint32_t cp
    25.4  
    25.5      printk("==cpu%d==\n", cpu);
    25.6      printk("active state:\t\tC%d\n",
    25.7 -           power->last_state ? (int)(power->last_state - power->states) : -1);
    25.8 +           power->last_state ? power->last_state->idx : -1);
    25.9      printk("max_cstate:\t\tC%d\n", max_cstate);
   25.10      printk("states:\n");
   25.11      
   25.12      for ( i = 1; i < power->count; i++ )
   25.13      {
   25.14 -        printk((power->last_state == &power->states[i]) ? "   *" : "    ");
   25.15 +        printk((power->last_state && power->last_state->idx == i) ?
   25.16 +               "   *" : "    ");
   25.17          printk("C%d:\t", i);
   25.18          printk("type[C%d] ", power->states[i].type);
   25.19          printk("latency[%03d] ", power->states[i].latency);
   25.20 @@ -139,20 +140,26 @@ static void acpi_processor_ffh_cstate_en
   25.21  
   25.22  static void acpi_idle_do_entry(struct acpi_processor_cx *cx)
   25.23  {
   25.24 -    if ( cx->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE )
   25.25 +    int unused;
   25.26 +
   25.27 +    switch ( cx->entry_method )
   25.28      {
   25.29 +    case ACPI_CSTATE_EM_FFH:
   25.30          /* Call into architectural FFH based C-state */
   25.31          acpi_processor_ffh_cstate_enter(cx);
   25.32 -    }
   25.33 -    else
   25.34 -    {
   25.35 -        int unused;
   25.36 +        return;
   25.37 +    case ACPI_CSTATE_EM_SYSIO:
   25.38          /* IO port based C-state */
   25.39          inb(cx->address);
   25.40          /* Dummy wait op - must do something useless after P_LVL2 read
   25.41             because chipsets cannot guarantee that STPCLK# signal
   25.42             gets asserted in time to freeze execution properly. */
   25.43          unused = inl(pmtmr_ioport);
   25.44 +        return;
   25.45 +    case ACPI_CSTATE_EM_HALT:
   25.46 +        acpi_safe_halt();
   25.47 +        local_irq_disable();
   25.48 +        return;
   25.49      }
   25.50  }
   25.51  
   25.52 @@ -222,7 +229,7 @@ static void acpi_processor_idle(void)
   25.53          if ( power->flags.bm_check && acpi_idle_bm_check()
   25.54               && cx->type == ACPI_STATE_C3 )
   25.55              cx = power->safe_state;
   25.56 -        if ( cx - &power->states[0] > max_cstate )
   25.57 +        if ( cx->idx > max_cstate )
   25.58              cx = &power->states[max_cstate];
   25.59      }
   25.60      if ( !cx )
   25.61 @@ -252,35 +259,11 @@ static void acpi_processor_idle(void)
   25.62      switch ( cx->type )
   25.63      {
   25.64      case ACPI_STATE_C1:
   25.65 -        /* Trace cpu idle entry */
   25.66 -        TRACE_1D(TRC_PM_IDLE_ENTRY, 1);
   25.67 -
   25.68 -        /*
   25.69 -         * Invoke C1.
   25.70 -         * Use the appropriate idle routine, the one that would
   25.71 -         * be used without acpi C-states.
   25.72 -         */
   25.73 -        if ( pm_idle_save )
   25.74 -            pm_idle_save();
   25.75 -        else 
   25.76 -            acpi_safe_halt();
   25.77 -
   25.78 -        /* Trace cpu idle exit */
   25.79 -        TRACE_1D(TRC_PM_IDLE_EXIT, 1);
   25.80 -
   25.81 -        /*
   25.82 -         * TBD: Can't get time duration while in C1, as resumes
   25.83 -         *      go to an ISR rather than here.  Need to instrument
   25.84 -         *      base interrupt handler.
   25.85 -         */
   25.86 -        sleep_ticks = 0xFFFFFFFF;
   25.87 -        break;
   25.88 -
   25.89      case ACPI_STATE_C2:
   25.90 -        if ( local_apic_timer_c2_ok )
   25.91 +        if ( cx->type == ACPI_STATE_C1 || local_apic_timer_c2_ok )
   25.92          {
   25.93              /* Trace cpu idle entry */
   25.94 -            TRACE_1D(TRC_PM_IDLE_ENTRY, 2);
   25.95 +            TRACE_1D(TRC_PM_IDLE_ENTRY, cx->idx);
   25.96              /* Get start time (ticks) */
   25.97              t1 = inl(pmtmr_ioport);
   25.98              /* Invoke C2 */
   25.99 @@ -288,7 +271,7 @@ static void acpi_processor_idle(void)
  25.100              /* Get end time (ticks) */
  25.101              t2 = inl(pmtmr_ioport);
  25.102              /* Trace cpu idle exit */
  25.103 -            TRACE_1D(TRC_PM_IDLE_EXIT, 2);
  25.104 +            TRACE_1D(TRC_PM_IDLE_EXIT, cx->idx);
  25.105  
  25.106              /* Re-enable interrupts */
  25.107              local_irq_enable();
  25.108 @@ -328,7 +311,7 @@ static void acpi_processor_idle(void)
  25.109          }
  25.110  
  25.111          /* Trace cpu idle entry */
  25.112 -        TRACE_1D(TRC_PM_IDLE_ENTRY, cx - &power->states[0]);
  25.113 +        TRACE_1D(TRC_PM_IDLE_ENTRY, cx->idx);
  25.114          /*
  25.115           * Before invoking C3, be aware that TSC/APIC timer may be 
  25.116           * stopped by H/W. Without carefully handling of TSC/APIC stop issues,
  25.117 @@ -349,7 +332,7 @@ static void acpi_processor_idle(void)
  25.118          /* recovering TSC */
  25.119          cstate_restore_tsc();
  25.120          /* Trace cpu idle exit */
  25.121 -        TRACE_1D(TRC_PM_IDLE_EXIT, cx - &power->states[0]);
  25.122 +        TRACE_1D(TRC_PM_IDLE_EXIT, cx->idx);
  25.123  
  25.124          if ( power->flags.bm_check && power->flags.bm_control )
  25.125          {
  25.126 @@ -387,9 +370,15 @@ static void acpi_processor_idle(void)
  25.127  
  25.128  static int init_cx_pminfo(struct acpi_processor_power *acpi_power)
  25.129  {
  25.130 +    int i;
  25.131 +
  25.132      memset(acpi_power, 0, sizeof(*acpi_power));
  25.133  
  25.134 +    for ( i = 0; i < ACPI_PROCESSOR_MAX_POWER; i++ )
  25.135 +        acpi_power->states[i].idx = i;
  25.136 +
  25.137      acpi_power->states[ACPI_STATE_C1].type = ACPI_STATE_C1;
  25.138 +    acpi_power->states[ACPI_STATE_C1].entry_method = ACPI_CSTATE_EM_HALT;
  25.139  
  25.140      acpi_power->states[ACPI_STATE_C0].valid = 1;
  25.141      acpi_power->states[ACPI_STATE_C1].valid = 1;
  25.142 @@ -486,16 +475,13 @@ static int check_cx(struct acpi_processo
  25.143          break;
  25.144  
  25.145      case ACPI_ADR_SPACE_FIXED_HARDWARE:
  25.146 -        if ( cx->type > ACPI_STATE_C1 )
  25.147 -        {
  25.148 -            if ( cx->reg.bit_width != VENDOR_INTEL || 
  25.149 -                 cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT )
  25.150 -                return -EINVAL;
  25.151 +        if ( cx->reg.bit_width != VENDOR_INTEL || 
  25.152 +             cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT )
  25.153 +            return -EINVAL;
  25.154  
  25.155 -            /* assume all logical cpu has the same support for mwait */
  25.156 -            if ( acpi_processor_ffh_cstate_probe(cx) )
  25.157 -                return -EINVAL;
  25.158 -        }
  25.159 +        /* assume all logical cpus have the same support for mwait */
  25.160 +        if ( acpi_processor_ffh_cstate_probe(cx) )
  25.161 +            return -EINVAL;
  25.162          break;
  25.163  
  25.164      default:
  25.165 @@ -599,7 +585,23 @@ static void set_cx(
  25.166      cx->valid    = 1;
  25.167      cx->type     = xen_cx->type;
  25.168      cx->address  = xen_cx->reg.address;
  25.169 -    cx->space_id = xen_cx->reg.space_id;
  25.170 +
  25.171 +    switch ( xen_cx->reg.space_id )
  25.172 +    {
  25.173 +    case ACPI_ADR_SPACE_FIXED_HARDWARE:
  25.174 +        if ( xen_cx->reg.bit_width == VENDOR_INTEL &&
  25.175 +             xen_cx->reg.bit_offset == NATIVE_CSTATE_BEYOND_HALT )
  25.176 +            cx->entry_method = ACPI_CSTATE_EM_FFH;
  25.177 +        else
  25.178 +            cx->entry_method = ACPI_CSTATE_EM_HALT;
  25.179 +        break;
  25.180 +    case ACPI_ADR_SPACE_SYSTEM_IO:
  25.181 +        cx->entry_method = ACPI_CSTATE_EM_SYSIO;
  25.182 +        break;
  25.183 +    default:
  25.184 +        cx->entry_method = ACPI_CSTATE_EM_NONE;
  25.185 +    }
  25.186 +
  25.187      cx->latency  = xen_cx->latency;
  25.188      cx->power    = xen_cx->power;
  25.189      
  25.190 @@ -761,8 +763,7 @@ int pmstat_get_cx_stat(uint32_t cpuid, s
  25.191          return 0;
  25.192      }
  25.193  
  25.194 -    stat->last = (power->last_state) ?
  25.195 -        (int)(power->last_state - &power->states[0]) : 0;
  25.196 +    stat->last = power->last_state ? power->last_state->idx : 0;
  25.197      stat->nr = power->count;
  25.198      stat->idle_time = v->runstate.time[RUNSTATE_running];
  25.199      if ( v->is_running )
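
The mechanical change running through this file replaces pointer arithmetic (`cx - &power->states[0]`) with a cached cx->idx, which init_cx_pminfo() now fills in once per state. A minimal standalone illustration of the two forms (toy code, not from the patch):

    #include <stdio.h>

    struct cx_state { int idx; };

    int main(void)
    {
        struct cx_state states[8], *cx;
        int i;

        for ( i = 0; i < 8; i++ )
            states[i].idx = i;        /* as init_cx_pminfo() now does */

        cx = &states[3];
        /* Equivalent while cx points into states[]; only the cached idx
         * stays meaningful if the state is ever referenced from
         * somewhere other than this array. */
        printf("%td vs %d\n", cx - &states[0], cx->idx);
        return 0;
    }
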
    26.1 --- a/xen/arch/x86/acpi/cpufreq/cpufreq.c	Tue Nov 04 12:07:22 2008 +0900
    26.2 +++ b/xen/arch/x86/acpi/cpufreq/cpufreq.c	Tue Nov 04 12:43:19 2008 +0900
    26.3 @@ -370,7 +370,7 @@ static int acpi_cpufreq_target(struct cp
    26.4      if (!check_freqs(cmd.mask, freqs.new, data))
    26.5          return -EAGAIN;
    26.6  
    26.7 -    for_each_cpu_mask(j, cmd.mask)
    26.8 +    for_each_cpu_mask(j, online_policy_cpus)
    26.9          cpufreq_statistic_update(j, perf->state, next_perf_state);
   26.10  
   26.11      perf->state = next_perf_state;
   26.12 @@ -447,18 +447,6 @@ acpi_cpufreq_cpu_init(struct cpufreq_pol
   26.13      perf = data->acpi_data;
   26.14      policy->shared_type = perf->shared_type;
   26.15  
   26.16 -    /* capability check */
   26.17 -    if (perf->state_count <= 1) {
   26.18 -        printk("No P-States\n");
   26.19 -        result = -ENODEV;
   26.20 -        goto err_unreg;
   26.21 -    }
   26.22 -
   26.23 -    if (perf->control_register.space_id != perf->status_register.space_id) {
   26.24 -        result = -ENODEV;
   26.25 -        goto err_unreg;
   26.26 -    }
   26.27 -
   26.28      switch (perf->control_register.space_id) {
   26.29      case ACPI_ADR_SPACE_SYSTEM_IO:
   26.30          printk("xen_pminfo: @acpi_cpufreq_cpu_init,"
    27.1 --- a/xen/arch/x86/acpi/cpufreq/powernow.c	Tue Nov 04 12:07:22 2008 +0900
    27.2 +++ b/xen/arch/x86/acpi/cpufreq/powernow.c	Tue Nov 04 12:43:19 2008 +0900
    27.3 @@ -229,9 +229,23 @@ err_unreg:
    27.4      return result;
    27.5  }
    27.6  
    27.7 +static int powernow_cpufreq_cpu_exit(struct cpufreq_policy *policy)
    27.8 +{
    27.9 +    struct powernow_cpufreq_data *data = drv_data[policy->cpu];
   27.10 +
   27.11 +    if (data) {
   27.12 +        drv_data[policy->cpu] = NULL;
   27.13 +        xfree(data->freq_table);
   27.14 +        xfree(data);
   27.15 +    }
   27.16 +
   27.17 +    return 0;
   27.18 +}
   27.19 +
   27.20  static struct cpufreq_driver powernow_cpufreq_driver = {
   27.21      .target = powernow_cpufreq_target,
   27.22      .init   = powernow_cpufreq_cpu_init,
   27.23 +    .exit   = powernow_cpufreq_cpu_exit
   27.24  };
   27.25  
   27.26  int powernow_cpufreq_init(void)
    28.1 --- a/xen/arch/x86/acpi/cpuidle_menu.c	Tue Nov 04 12:07:22 2008 +0900
    28.2 +++ b/xen/arch/x86/acpi/cpuidle_menu.c	Tue Nov 04 12:43:19 2008 +0900
    28.3 @@ -59,7 +59,7 @@ static int menu_select(struct acpi_proce
    28.4      data->expected_us = (u32) get_sleep_length_ns() / 1000;
    28.5  
    28.6      /* find the deepest idle state that satisfies our constraints */
    28.7 -    for ( i = 1; i < power->count; i++ )
    28.8 +    for ( i = 2; i < power->count; i++ )
    28.9      {
   28.10          struct acpi_processor_cx *s = &power->states[i];
   28.11  
   28.12 @@ -81,17 +81,7 @@ static void menu_reflect(struct acpi_pro
   28.13      unsigned int last_residency; 
   28.14      unsigned int measured_us;
   28.15  
   28.16 -    /*
   28.17 -     * Ugh, this idle state doesn't support residency measurements, so we
   28.18 -     * are basically lost in the dark.  As a compromise, assume we slept
   28.19 -     * for one full standard timer tick.  However, be aware that this
   28.20 -     * could potentially result in a suboptimal state transition.
   28.21 -     */
   28.22 -    if ( target->type == ACPI_STATE_C1 )
   28.23 -        last_residency = USEC_PER_SEC / HZ;
   28.24 -    else
   28.25 -        last_residency = power->last_residency;
   28.26 -
   28.27 +    last_residency = power->last_residency;
   28.28      measured_us = last_residency + data->elapsed_us;
   28.29  
   28.30      /* if wrapping, set to max uint (-1) */
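
With C1 now entered through the same timed path as C2 (see the cpu_idle.c hunk above), its residency is measured like any other state, so the one-tick guess is gone and the governor scans for deeper states starting at index 2, leaving state 1 as the default floor. A toy model of that selection shape — the real cost function in menu_select() differs, and the field names here are illustrative only:

    #include <stdio.h>

    struct cx_model { unsigned int latency_us; unsigned int break_even_us; };

    static int menu_select_model(const struct cx_model *s, int count,
                                 unsigned int expected_us,
                                 unsigned int latency_limit_us)
    {
        int i, best = 1;                 /* C1 is always acceptable */

        for ( i = 2; i < count; i++ )    /* deeper states only */
        {
            if ( s[i].break_even_us > expected_us ||
                 s[i].latency_us > latency_limit_us )
                break;
            best = i;
        }
        return best;
    }

    int main(void)
    {
        struct cx_model states[4] = {
            {0, 0}, {1, 1}, {20, 80}, {150, 800} /* C0 unused, C1..C3 */
        };
        printf("selected C%d\n", menu_select_model(states, 4, 500, 100));
        return 0;
    }
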
    29.1 --- a/xen/arch/x86/domain.c	Tue Nov 04 12:07:22 2008 +0900
    29.2 +++ b/xen/arch/x86/domain.c	Tue Nov 04 12:43:19 2008 +0900
    29.3 @@ -174,9 +174,10 @@ void free_vcpu_struct(struct vcpu *v)
    29.4  
    29.5  static int setup_compat_l4(struct vcpu *v)
    29.6  {
    29.7 -    struct page_info *pg = alloc_domheap_page(NULL, 0);
    29.8 +    struct page_info *pg;
    29.9      l4_pgentry_t *l4tab;
   29.10  
   29.11 +    pg = alloc_domheap_page(NULL, MEMF_node(vcpu_to_node(v)));
   29.12      if ( pg == NULL )
   29.13          return -ENOMEM;
   29.14  
   29.15 @@ -1639,32 +1640,23 @@ static int relinquish_memory(
   29.16          }
   29.17  
   29.18          if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
   29.19 -            put_page_and_type(page);
   29.20 +            ret = put_page_and_type_preemptible(page, 1);
   29.21 +        switch ( ret )
   29.22 +        {
   29.23 +        case 0:
   29.24 +            break;
   29.25 +        case -EAGAIN:
   29.26 +        case -EINTR:
   29.27 +            set_bit(_PGT_pinned, &page->u.inuse.type_info);
   29.28 +            put_page(page);
   29.29 +            goto out;
   29.30 +        default:
   29.31 +            BUG();
   29.32 +        }
   29.33  
   29.34          if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
   29.35              put_page(page);
   29.36  
   29.37 -#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
   29.38 -        /*
   29.39 -         * Forcibly drop reference counts of page tables above top most (which
   29.40 -         * were skipped to prevent long latencies due to deep recursion - see
   29.41 -         * the special treatment in free_lX_table()).
   29.42 -         */
   29.43 -        y = page->u.inuse.type_info;
   29.44 -        if ( (type < PGT_root_page_table) &&
   29.45 -             unlikely(((y + PGT_type_mask) &
   29.46 -                       (PGT_type_mask|PGT_validated)) == type) )
   29.47 -        {
   29.48 -            BUG_ON((y & PGT_count_mask) >=
   29.49 -                   (page->count_info & PGC_count_mask));
   29.50 -            while ( y & PGT_count_mask )
   29.51 -            {
   29.52 -                put_page_and_type(page);
   29.53 -                y = page->u.inuse.type_info;
   29.54 -            }
   29.55 -        }
   29.56 -#endif
   29.57 -
   29.58          /*
   29.59           * Forcibly invalidate top-most, still valid page tables at this point
   29.60           * to break circular 'linear page table' references as well as clean up
   29.61 @@ -1685,8 +1677,31 @@ static int relinquish_memory(
   29.62                          x & ~(PGT_validated|PGT_partial));
   29.63              if ( likely(y == x) )
   29.64              {
   29.65 -                if ( free_page_type(page, x, 0) != 0 )
   29.66 +                /* No need for atomic update of type_info here: no one else updates it. */
   29.67 +                switch ( ret = free_page_type(page, x, 1) )
   29.68 +                {
   29.69 +                case 0:
   29.70 +                    break;
   29.71 +                case -EINTR:
   29.72 +                    page->u.inuse.type_info |= PGT_validated;
   29.73 +                    if ( x & PGT_partial )
   29.74 +                        put_page(page);
   29.75 +                    put_page(page);
   29.76 +                    ret = -EAGAIN;
   29.77 +                    goto out;
   29.78 +                case -EAGAIN:
   29.79 +                    page->u.inuse.type_info |= PGT_partial;
   29.80 +                    if ( x & PGT_partial )
   29.81 +                        put_page(page);
   29.82 +                    goto out;
   29.83 +                default:
   29.84                      BUG();
   29.85 +                }
   29.86 +                if ( x & PGT_partial )
   29.87 +                {
   29.88 +                    page->u.inuse.type_info--;
   29.89 +                    put_page(page);
   29.90 +                }
   29.91                  break;
   29.92              }
   29.93          }
   29.94 @@ -1831,11 +1846,6 @@ int domain_relinquish_resources(struct d
   29.95          /* fallthrough */
   29.96  
   29.97      case RELMEM_done:
   29.98 -#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
   29.99 -        ret = relinquish_memory(d, &d->page_list, PGT_l1_page_table);
  29.100 -        if ( ret )
  29.101 -            return ret;
  29.102 -#endif
  29.103          break;
  29.104  
  29.105      default:
  29.106 @@ -1892,6 +1902,54 @@ void domain_cpuid(
  29.107      *eax = *ebx = *ecx = *edx = 0;
  29.108  }
  29.109  
  29.110 +void vcpu_kick(struct vcpu *v)
  29.111 +{
  29.112 +    /*
  29.113 +     * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
  29.114 +     * pending flag. These values may fluctuate (after all, we hold no
  29.115 +     * locks) but the key insight is that each change will cause
  29.116 +     * evtchn_upcall_pending to be polled.
  29.117 +     * 
  29.118 +     * NB2. We save the running flag across the unblock to avoid a needless
  29.119 +     * IPI for domains that we IPI'd to unblock.
  29.120 +     */
  29.121 +    bool_t running = v->is_running;
  29.122 +    vcpu_unblock(v);
  29.123 +    if ( running && (in_irq() || (v != current)) )
  29.124 +        cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ);
  29.125 +}
  29.126 +
  29.127 +void vcpu_mark_events_pending(struct vcpu *v)
  29.128 +{
  29.129 +    int already_pending = test_and_set_bit(
  29.130 +        0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));
  29.131 +
  29.132 +    if ( already_pending )
  29.133 +        return;
  29.134 +
  29.135 +    if ( is_hvm_vcpu(v) )
  29.136 +        hvm_assert_evtchn_irq(v);
  29.137 +    else
  29.138 +        vcpu_kick(v);
  29.139 +}
  29.140 +
  29.141 +static void vcpu_kick_softirq(void)
  29.142 +{
  29.143 +    /*
  29.144 +     * Nothing to do here: we merely prevent notifiers from racing with checks
  29.145 +     * executed on return to guest context with interrupts enabled. See, for
  29.146 +     * example, xxx_intr_assist() executed on return to HVM guest context.
  29.147 +     */
  29.148 +}
  29.149 +
  29.150 +static int __init init_vcpu_kick_softirq(void)
  29.151 +{
  29.152 +    open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq);
  29.153 +    return 0;
  29.154 +}
  29.155 +__initcall(init_vcpu_kick_softirq);
  29.156 +
  29.157 +
  29.158  /*
  29.159   * Local variables:
  29.160   * mode: C
    30.1 --- a/xen/arch/x86/domain_build.c	Tue Nov 04 12:07:22 2008 +0900
    30.2 +++ b/xen/arch/x86/domain_build.c	Tue Nov 04 12:43:19 2008 +0900
    30.3 @@ -194,6 +194,30 @@ static void __init process_dom0_ioports_
    30.4      }
    30.5  }
    30.6  
    30.7 +/* We run on dom0's page tables for the final part of the build process. */
    30.8 +static void dom0_pt_enter(struct vcpu *v)
    30.9 +{
   30.10 +    struct desc_ptr gdt_desc = {
   30.11 +        .limit = LAST_RESERVED_GDT_BYTE,
   30.12 +        .base = (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY)
   30.13 +    };
   30.14 +
   30.15 +    asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
   30.16 +    write_ptbase(v);
   30.17 +}
   30.18 +
   30.19 +/* Return to idle domain's page tables. */
   30.20 +static void dom0_pt_exit(void)
   30.21 +{
   30.22 +    struct desc_ptr gdt_desc = {
   30.23 +        .limit = LAST_RESERVED_GDT_BYTE,
   30.24 +        .base = GDT_VIRT_START(current)
   30.25 +    };
   30.26 +
   30.27 +    write_ptbase(current);
   30.28 +    asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
   30.29 +}
   30.30 +
   30.31  int __init construct_dom0(
   30.32      struct domain *d,
   30.33      unsigned long _image_start, unsigned long image_len, 
   30.34 @@ -700,14 +724,12 @@ int __init construct_dom0(
   30.35          (void)alloc_vcpu(d, i, i % num_online_cpus());
   30.36  
   30.37      /* Set up CR3 value for write_ptbase */
   30.38 -    if ( paging_mode_enabled(v->domain) )
   30.39 +    if ( paging_mode_enabled(d) )
   30.40          paging_update_paging_modes(v);
   30.41      else
   30.42          update_cr3(v);
   30.43  
   30.44 -    /* Install the new page tables. */
   30.45 -    local_irq_disable();
   30.46 -    write_ptbase(v);
   30.47 +    dom0_pt_enter(v);
   30.48  
   30.49      /* Copy the OS image and free temporary buffer. */
   30.50      elf.dest = (void*)vkern_start;
   30.51 @@ -804,9 +826,7 @@ int __init construct_dom0(
   30.52          xlat_start_info(si, XLAT_start_info_console_dom0);
   30.53  #endif
   30.54  
   30.55 -    /* Reinstate the caller's page tables. */
   30.56 -    write_ptbase(current);
   30.57 -    local_irq_enable();
   30.58 +    dom0_pt_exit();
   30.59  
   30.60  #if defined(__i386__)
   30.61      /* Destroy low mappings - they were only for our convenience. */
    31.1 --- a/xen/arch/x86/hpet.c	Tue Nov 04 12:07:22 2008 +0900
    31.2 +++ b/xen/arch/x86/hpet.c	Tue Nov 04 12:43:19 2008 +0900
    31.3 @@ -14,8 +14,6 @@
    31.4  #include <asm/div64.h>
    31.5  #include <asm/hpet.h>
    31.6  
    31.7 -#define STIME_MAX ((s_time_t)((uint64_t)~0ull>>1))
    31.8 -
    31.9  #define MAX_DELTA_NS MILLISECS(10*1000)
   31.10  #define MIN_DELTA_NS MICROSECS(20)
   31.11  
   31.12 @@ -146,7 +144,7 @@ static void handle_hpet_broadcast(struct
   31.13      s_time_t now, next_event;
   31.14      int cpu;
   31.15  
   31.16 -    spin_lock(&ch->lock);
   31.17 +    spin_lock_irq(&ch->lock);
   31.18  
   31.19  again:
   31.20      ch->next_event = STIME_MAX;
   31.21 @@ -171,7 +169,7 @@ again:
   31.22          if ( reprogram_hpet_evt_channel(ch, next_event, now, 0) )
   31.23              goto again;
   31.24      }
   31.25 -    spin_unlock(&ch->lock);
   31.26 +    spin_unlock_irq(&ch->lock);
   31.27  }
   31.28  
   31.29  void hpet_broadcast_init(void)
   31.30 @@ -213,6 +211,7 @@ void hpet_broadcast_enter(void)
   31.31  {
   31.32      struct hpet_event_channel *ch = &hpet_event;
   31.33  
   31.34 +    ASSERT(!local_irq_is_enabled());
   31.35      spin_lock(&ch->lock);
   31.36  
   31.37      disable_APIC_timer();
    32.1 --- a/xen/arch/x86/hvm/emulate.c	Tue Nov 04 12:07:22 2008 +0900
    32.2 +++ b/xen/arch/x86/hvm/emulate.c	Tue Nov 04 12:43:19 2008 +0900
    32.3 @@ -14,11 +14,39 @@
    32.4  #include <xen/lib.h>
    32.5  #include <xen/sched.h>
    32.6  #include <xen/paging.h>
    32.7 +#include <xen/trace.h>
    32.8  #include <asm/event.h>
    32.9  #include <asm/hvm/emulate.h>
   32.10  #include <asm/hvm/hvm.h>
   32.11  #include <asm/hvm/support.h>
   32.12  
   32.13 +#define HVMTRACE_IO_ASSIST_WRITE 0x200
   32.14 +static void hvmtrace_io_assist(int is_mmio, ioreq_t *p)
   32.15 +{
   32.16 +    unsigned int size, event;
   32.17 +    unsigned char buffer[12];
   32.18 +
   32.19 +    if ( likely(!tb_init_done) )
   32.20 +        return;
   32.21 +
   32.22 +    event = is_mmio ? TRC_HVM_MMIO_ASSIST : TRC_HVM_IO_ASSIST;
   32.23 +    if ( !p->dir )
   32.24 +        event |= HVMTRACE_IO_ASSIST_WRITE;
   32.25 +
   32.26 +    *(uint64_t *)buffer = p->addr;
   32.27 +    size = (p->addr != (u32)p->addr) ? 8 : 4;
   32.28 +    if ( size == 8 )
   32.29 +        event |= TRC_64_FLAG;
   32.30 +
   32.31 +    if ( !p->data_is_ptr )
   32.32 +    {
   32.33 +        *(uint32_t *)&buffer[size] = p->data;
   32.34 +        size += 4;
   32.35 +    }
   32.36 +
   32.37 +    trace_var(event, 0/*!cycles*/, size, buffer);
   32.38 +}
   32.39 +
   32.40  static int hvmemul_do_io(
   32.41      int is_mmio, paddr_t addr, unsigned long *reps, int size,
   32.42      paddr_t ram_gpa, int dir, int df, void *p_data)
   32.43 @@ -111,6 +139,8 @@ static int hvmemul_do_io(
   32.44      p->data = value;
   32.45      p->io_count++;
   32.46  
   32.47 +    hvmtrace_io_assist(is_mmio, p);
   32.48 +
   32.49      if ( is_mmio )
   32.50      {
   32.51          rc = hvm_mmio_intercept(p);
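
On the consumer side, a record produced by hvmtrace_io_assist() carries a 4- or 8-byte address (TRC_64_FLAG marks the 8-byte form, HVMTRACE_IO_ASSIST_WRITE the direction), optionally followed by a 4-byte data value when data_is_ptr was clear. A hypothetical little-endian unpacker, with the flag tests left to the caller:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    /* Illustrative decoder for the buffer layout written above; it only
     * unpacks the payload, it is not part of the Xen trace tooling. */
    static void decode_io_assist(const unsigned char *buf, unsigned int size,
                                 int is_64bit, int is_write)
    {
        uint64_t addr = 0;
        unsigned int hdr = is_64bit ? 8 : 4;

        memcpy(&addr, buf, hdr);          /* little-endian host assumed */
        printf("%s addr=%#llx", is_write ? "write" : "read",
               (unsigned long long)addr);
        if ( size > hdr )                 /* data present iff !data_is_ptr */
        {
            uint32_t data;
            memcpy(&data, buf + hdr, 4);
            printf(" data=%#x", data);
        }
        printf("\n");
    }
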
    33.1 --- a/xen/arch/x86/hvm/hpet.c	Tue Nov 04 12:07:22 2008 +0900
    33.2 +++ b/xen/arch/x86/hvm/hpet.c	Tue Nov 04 12:43:19 2008 +0900
    33.3 @@ -76,6 +76,7 @@
    33.4          ~0ULL : (tick) * (h)->hpet_to_ns_scale) >> 10))
    33.5  
    33.6  #define timer_config(h, n)       (h->hpet.timers[n].config)
    33.7 +#define timer_enabled(h, n)      (timer_config(h, n) & HPET_TN_ENABLE)
    33.8  #define timer_is_periodic(h, n)  (timer_config(h, n) & HPET_TN_PERIODIC)
    33.9  #define timer_is_32bit(h, n)     (timer_config(h, n) & HPET_TN_32BIT)
   33.10  #define hpet_enabled(h)          (h->hpet.config & HPET_CFG_ENABLE)
   33.11 @@ -88,9 +89,40 @@
   33.12      ((timer_config(h, n) & HPET_TN_INT_ROUTE_CAP_MASK) \
   33.13          >> HPET_TN_INT_ROUTE_CAP_SHIFT)
   33.14  
   33.15 -#define hpet_time_after(a, b)   ((int32_t)(b) - (int32_t)(a) < 0)
   33.16 -#define hpet_time_after64(a, b) ((int64_t)(b) - (int64_t)(a) < 0)
   33.17 +static inline uint64_t hpet_read_maincounter(HPETState *h)
   33.18 +{
   33.19 +    ASSERT(spin_is_locked(&h->lock));
   33.20  
   33.21 +    if ( hpet_enabled(h) )
   33.22 +        return guest_time_hpet(h->vcpu) + h->mc_offset;
   33.23 +    else 
   33.24 +        return h->hpet.mc64;
   33.25 +}
   33.26 +
   33.27 +static uint64_t hpet_get_comparator(HPETState *h, unsigned int tn)
   33.28 +{
   33.29 +    uint64_t comparator;
   33.30 +    uint64_t elapsed;
   33.31 +
   33.32 +    comparator = h->hpet.comparator64[tn];
   33.33 +    if ( timer_is_periodic(h, tn) )
   33.34 +    {
   33.35 +        /* update comparator by number of periods elapsed since last update */
   33.36 +        uint64_t period = h->hpet.period[tn];
   33.37 +        if (period)
   33.38 +        {
   33.39 +            elapsed = hpet_read_maincounter(h) + period - 1 - comparator;
   33.40 +            comparator += (elapsed / period) * period;
   33.41 +            h->hpet.comparator64[tn] = comparator;
   33.42 +        }
   33.43 +    }
   33.44 +    
   33.45 +    /* truncate if timer is in 32 bit mode */
   33.46 +    if ( timer_is_32bit(h, tn) )
   33.47 +        comparator = (uint32_t)comparator;
   33.48 +    h->hpet.timers[tn].cmp = comparator;
   33.49 +    return comparator;
   33.50 +}
   33.51  static inline uint64_t hpet_read64(HPETState *h, unsigned long addr)
   33.52  {
   33.53      addr &= ~7;
   33.54 @@ -104,7 +136,7 @@ static inline uint64_t hpet_read64(HPETS
   33.55      case HPET_STATUS:
   33.56          return h->hpet.isr;
   33.57      case HPET_COUNTER:
   33.58 -        return h->hpet.mc64;
   33.59 +        return hpet_read_maincounter(h);
   33.60      case HPET_T0_CFG:
   33.61      case HPET_T1_CFG:
   33.62      case HPET_T2_CFG:
   33.63 @@ -112,7 +144,7 @@ static inline uint64_t hpet_read64(HPETS
   33.64      case HPET_T0_CMP:
   33.65      case HPET_T1_CMP:
   33.66      case HPET_T2_CMP:
   33.67 -        return h->hpet.timers[(addr - HPET_T0_CMP) >> 5].cmp;
   33.68 +        return hpet_get_comparator(h, (addr - HPET_T0_CMP) >> 5);
   33.69      case HPET_T0_ROUTE:
   33.70      case HPET_T1_ROUTE:
   33.71      case HPET_T2_ROUTE:
   33.72 @@ -140,16 +172,6 @@ static inline int hpet_check_access_leng
   33.73      return 0;
   33.74  }
   33.75  
   33.76 -static inline uint64_t hpet_read_maincounter(HPETState *h)
   33.77 -{
   33.78 -    ASSERT(spin_is_locked(&h->lock));
   33.79 -
   33.80 -    if ( hpet_enabled(h) )
   33.81 -        return guest_time_hpet(h->vcpu) + h->mc_offset;
   33.82 -    else 
   33.83 -        return h->hpet.mc64;
   33.84 -}
   33.85 -
   33.86  static int hpet_read(
   33.87      struct vcpu *v, unsigned long addr, unsigned long length,
   33.88      unsigned long *pval)
   33.89 @@ -169,8 +191,6 @@ static int hpet_read(
   33.90      spin_lock(&h->lock);
   33.91  
   33.92      val = hpet_read64(h, addr);
   33.93 -    if ( (addr & ~7) == HPET_COUNTER )
   33.94 -        val = hpet_read_maincounter(h);
   33.95  
   33.96      result = val;
   33.97      if ( length != 8 )
   33.98 @@ -187,7 +207,10 @@ static void hpet_stop_timer(HPETState *h
   33.99  {
  33.100      ASSERT(tn < HPET_TIMER_NUM);
  33.101      ASSERT(spin_is_locked(&h->lock));
  33.102 -    stop_timer(&h->timers[tn]);
  33.103 +    destroy_periodic_time(&h->pt[tn]);
  33.104 +    /* Read the comparator to force an update, so that a read while the
  33.105 +     * timer is stopped returns the expected value. */
  33.106 +    hpet_get_comparator(h, tn);
  33.107  }
  33.108  
  33.109  /* the number of HPET tick that stands for
  33.110 @@ -197,6 +220,8 @@ static void hpet_stop_timer(HPETState *h
  33.111  static void hpet_set_timer(HPETState *h, unsigned int tn)
  33.112  {
  33.113      uint64_t tn_cmp, cur_tick, diff;
  33.114 +    unsigned int irq;
  33.115 +    unsigned int oneshot;
  33.116  
  33.117      ASSERT(tn < HPET_TIMER_NUM);
  33.118      ASSERT(spin_is_locked(&h->lock));
  33.119 @@ -209,7 +234,10 @@ static void hpet_set_timer(HPETState *h,
  33.120          pit_stop_channel0_irq(pit);
  33.121      }
  33.122  
  33.123 -    tn_cmp   = h->hpet.timers[tn].cmp;
  33.124 +    if ( !timer_enabled(h, tn) )
  33.125 +        return;
  33.126 +
  33.127 +    tn_cmp   = hpet_get_comparator(h, tn);
  33.128      cur_tick = hpet_read_maincounter(h);
  33.129      if ( timer_is_32bit(h, tn) )
  33.130      {
  33.131 @@ -229,7 +257,25 @@ static void hpet_set_timer(HPETState *h,
  33.132          diff = (timer_is_32bit(h, tn) && (-diff > HPET_TINY_TIME_SPAN))
  33.133              ? (uint32_t)diff : 0;
  33.134  
  33.135 -    set_timer(&h->timers[tn], NOW() + hpet_tick_to_ns(h, diff));
  33.136 +    if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) )
  33.137 +        /* if LegacyReplacementRoute bit is set, HPET specification requires
  33.138 +           timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC,
  33.139 +           timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */
  33.140 +        irq = (tn == 0) ? 0 : 8;
  33.141 +    else
  33.142 +        irq = timer_int_route(h, tn);
  33.143 +
  33.144 +    /*
  33.145 +     * diff is the time from now until the timer should fire.  For a
  33.146 +     * periodic timer we must also pass the period, which may differ from
  33.147 +     * diff because time may have elapsed between the comparator being
  33.148 +     * written and the timer being enabled (now).
  33.149 +     */
  33.150 +    oneshot = !timer_is_periodic(h, tn);
  33.151 +    create_periodic_time(h->vcpu, &h->pt[tn],
  33.152 +                         hpet_tick_to_ns(h, diff),
  33.153 +                         oneshot ? 0 : hpet_tick_to_ns(h, h->hpet.period[tn]),
  33.154 +                         irq, NULL, NULL);
  33.155  }
  33.156  
  33.157  static inline uint64_t hpet_fixup_reg(
  33.158 @@ -248,6 +294,13 @@ static int hpet_write(
  33.159      uint64_t old_val, new_val;
  33.160      int tn, i;
  33.161  
  33.162 +    /* Accumulate a bit mask of the timers whose state is changed by this write. */
  33.163 +    unsigned long start_timers = 0;
  33.164 +    unsigned long stop_timers  = 0;
  33.165 +#define set_stop_timer(n)    (__set_bit((n), &stop_timers))
  33.166 +#define set_start_timer(n)   (__set_bit((n), &start_timers))
  33.167 +#define set_restart_timer(n) (set_stop_timer(n),set_start_timer(n))
  33.168 +
  33.169      addr &= HPET_MMAP_SIZE-1;
  33.170  
  33.171      if ( hpet_check_access_length(addr, length) != 0 )
  33.172 @@ -256,9 +309,6 @@ static int hpet_write(
  33.173      spin_lock(&h->lock);
  33.174  
  33.175      old_val = hpet_read64(h, addr);
  33.176 -    if ( (addr & ~7) == HPET_COUNTER )
  33.177 -        old_val = hpet_read_maincounter(h);
  33.178 -
  33.179      new_val = val;
  33.180      if ( length != 8 )
  33.181          new_val = hpet_fixup_reg(
  33.182 @@ -275,22 +325,35 @@ static int hpet_write(
  33.183              /* Enable main counter and interrupt generation. */
  33.184              h->mc_offset = h->hpet.mc64 - guest_time_hpet(h->vcpu);
  33.185              for ( i = 0; i < HPET_TIMER_NUM; i++ )
  33.186 -                hpet_set_timer(h, i); 
  33.187 +            {
  33.188 +                h->hpet.comparator64[i] =
  33.189 +                            h->hpet.timers[i].config & HPET_TN_32BIT ?
  33.190 +                                          (uint32_t)h->hpet.timers[i].cmp :
  33.191 +                                                    h->hpet.timers[i].cmp;
  33.192 +                if ( timer_enabled(h, i) )
  33.193 +                    set_start_timer(i);
  33.194 +            }
  33.195          }
  33.196          else if ( (old_val & HPET_CFG_ENABLE) && !(new_val & HPET_CFG_ENABLE) )
  33.197          {
  33.198              /* Halt main counter and disable interrupt generation. */
  33.199              h->hpet.mc64 = h->mc_offset + guest_time_hpet(h->vcpu);
  33.200              for ( i = 0; i < HPET_TIMER_NUM; i++ )
  33.201 -                hpet_stop_timer(h, i);
  33.202 +                if ( timer_enabled(h, i) )
  33.203 +                    set_stop_timer(i);
  33.204          }
  33.205          break;
  33.206  
  33.207      case HPET_COUNTER:
  33.208 +        h->hpet.mc64 = new_val;
  33.209          if ( hpet_enabled(h) )
  33.210 +        {
  33.211              gdprintk(XENLOG_WARNING, 
  33.212                       "HPET: writing main counter but it's not halted!\n");
  33.213 -        h->hpet.mc64 = new_val;
  33.214 +            for ( i = 0; i < HPET_TIMER_NUM; i++ )
  33.215 +                if ( timer_enabled(h, i) )
  33.216 +                    set_restart_timer(i);
  33.217 +        }
  33.218          break;
  33.219  
  33.220      case HPET_T0_CFG:
  33.221 @@ -313,7 +376,28 @@ static int hpet_write(
  33.222              h->hpet.timers[tn].cmp = (uint32_t)h->hpet.timers[tn].cmp;
  33.223              h->hpet.period[tn] = (uint32_t)h->hpet.period[tn];
  33.224          }
  33.225 -
  33.226 +        if ( hpet_enabled(h) )
  33.227 +        {
  33.228 +            if ( new_val & HPET_TN_ENABLE )
  33.229 +            {
  33.230 +                if ( (new_val ^ old_val) & HPET_TN_PERIODIC )
  33.231 +                    /* Timer is enabled but is switching between
  33.232 +                     * periodic and one-shot mode; stop and restart the
  33.233 +                     * vpt timer to get it into the right mode. */
  33.234 +                    set_restart_timer(tn);
  33.235 +                else if ( (new_val & HPET_TN_32BIT) &&
  33.236 +                         !(old_val & HPET_TN_32BIT) )
  33.237 +                    /* switching from 64-bit to 32-bit mode could change
  33.238 +                     * the timer's next fire time or its period. */
  33.239 +                    set_restart_timer(tn);
  33.240 +                else if ( !(old_val & HPET_TN_ENABLE) )
  33.241 +                    /* transition from timer disabled to timer enabled. */
  33.242 +                    set_start_timer(tn);
  33.243 +            }
  33.244 +            else if ( old_val & HPET_TN_ENABLE )
  33.245 +                /* transition from timer enabled to timer disabled. */
  33.246 +                set_stop_timer(tn);
  33.247 +        }
  33.248          break;
  33.249  
  33.250      case HPET_T0_CMP:
  33.251 @@ -322,24 +406,32 @@ static int hpet_write(
  33.252          tn = (addr - HPET_T0_CMP) >> 5;
  33.253          if ( timer_is_32bit(h, tn) )
  33.254              new_val = (uint32_t)new_val;
  33.255 -        if ( !timer_is_periodic(h, tn) ||
  33.256 -             (h->hpet.timers[tn].config & HPET_TN_SETVAL) )
  33.257 -            h->hpet.timers[tn].cmp = new_val;
  33.258 -        else
  33.259 +        h->hpet.timers[tn].cmp = new_val;
  33.260 +        if ( h->hpet.timers[tn].config & HPET_TN_SETVAL )
  33.261 +            /*
  33.262 +             * When SETVAL is one, software is able to "directly set a periodic
  33.263 +             * timer's accumulator."  That is, set the comparator without
  33.264 +             * adjusting the period.  Much the same as just setting the
  33.265 +             * comparator on an enabled one-shot timer.
  33.266 +             * 
  33.267 +             * This configuration bit clears when the comparator is written.
  33.268 +             */
  33.269 +            h->hpet.timers[tn].config &= ~HPET_TN_SETVAL;
  33.270 +        else if ( timer_is_periodic(h, tn) )
  33.271          {
  33.272              /*
  33.273               * Clamp period to reasonable min/max values:
  33.274 -             *  - minimum is 900us, same as timers controlled by vpt.c
  33.275 +             *  - minimum is 100us, same as timers controlled by vpt.c
  33.276               *  - maximum is to prevent overflow in time_after() calculations
  33.277               */
  33.278 -            if ( hpet_tick_to_ns(h, new_val) < MICROSECS(900) )
  33.279 -                new_val = (MICROSECS(900) << 10) / h->hpet_to_ns_scale;
  33.280 +            if ( hpet_tick_to_ns(h, new_val) < MICROSECS(100) )
  33.281 +                new_val = (MICROSECS(100) << 10) / h->hpet_to_ns_scale;
  33.282              new_val &= (timer_is_32bit(h, tn) ? ~0u : ~0ull) >> 1;
  33.283              h->hpet.period[tn] = new_val;
  33.284          }
  33.285 -        h->hpet.timers[tn].config &= ~HPET_TN_SETVAL;
  33.286 -        if ( hpet_enabled(h) )
  33.287 -            hpet_set_timer(h, tn);
  33.288 +        h->hpet.comparator64[tn] = new_val;
  33.289 +        if ( hpet_enabled(h) && timer_enabled(h, tn) )
  33.290 +            set_restart_timer(tn);
  33.291          break;
  33.292  
  33.293      case HPET_T0_ROUTE:
  33.294 @@ -354,6 +446,25 @@ static int hpet_write(
  33.295          break;
  33.296      }
  33.297  
  33.298 +    /* Stop/start the timers whose state was changed by this write. */
  33.299 +    while (stop_timers)
  33.300 +    {
  33.301 +        i = find_first_set_bit(stop_timers);
  33.302 +        __clear_bit(i, &stop_timers);
  33.303 +        hpet_stop_timer(h, i);
  33.304 +    }
  33.305 +
  33.306 +    while (start_timers)
  33.307 +    {
  33.308 +        i = find_first_set_bit(start_timers);
  33.309 +        __clear_bit(i, &start_timers);
  33.310 +        hpet_set_timer(h, i);
  33.311 +    }
  33.312 +
  33.313 +#undef set_stop_timer
  33.314 +#undef set_start_timer
  33.315 +#undef set_restart_timer
  33.316 +
  33.317      spin_unlock(&h->lock);
  33.318  
  33.319   out:
  33.320 @@ -373,86 +484,6 @@ struct hvm_mmio_handler hpet_mmio_handle
  33.321      .write_handler = hpet_write
  33.322  };
  33.323  
  33.324 -static void hpet_route_interrupt(HPETState *h, unsigned int tn)
  33.325 -{
  33.326 -    unsigned int tn_int_route = timer_int_route(h, tn);
  33.327 -    struct domain *d = h->vcpu->domain;
  33.328 -
  33.329 -    ASSERT(spin_is_locked(&h->lock));
  33.330 -
  33.331 -    if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) )
  33.332 -    {
  33.333 -        /* if LegacyReplacementRoute bit is set, HPET specification requires
  33.334 -           timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC,
  33.335 -           timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */
  33.336 -        int isa_irq = (tn == 0) ? 0 : 8;
  33.337 -        hvm_isa_irq_deassert(d, isa_irq);
  33.338 -        hvm_isa_irq_assert(d, isa_irq);
  33.339 -        return;
  33.340 -    }
  33.341 -
  33.342 -    if ( !(timer_int_route_cap(h, tn) & (1U << tn_int_route)) )
  33.343 -    {
  33.344 -        gdprintk(XENLOG_ERR,
  33.345 -                 "HPET: timer%u: invalid interrupt route config\n", tn);
  33.346 -        domain_crash(d);
  33.347 -        return;
  33.348 -    }
  33.349 -
  33.350 -    /* We support only edge-triggered interrupt. */
  33.351 -    spin_lock(&d->arch.hvm_domain.irq_lock);
  33.352 -    vioapic_irq_positive_edge(d, tn_int_route);
  33.353 -    spin_unlock(&d->arch.hvm_domain.irq_lock);
  33.354 -}
  33.355 -
  33.356 -static void hpet_timer_fn(void *opaque)
  33.357 -{
  33.358 -    struct HPET_timer_fn_info *htfi = opaque;
  33.359 -    HPETState *h = htfi->hs;
  33.360 -    unsigned int tn = htfi->tn;
  33.361 -
  33.362 -    spin_lock(&h->lock);
  33.363 -
  33.364 -    if ( !hpet_enabled(h) )
  33.365 -    {
  33.366 -        spin_unlock(&h->lock);
  33.367 -        return;
  33.368 -    }
  33.369 -
  33.370 -    if ( timer_config(h, tn) & HPET_TN_ENABLE )
  33.371 -        hpet_route_interrupt(h, tn);
  33.372 -
  33.373 -    if ( timer_is_periodic(h, tn) && (h->hpet.period[tn] != 0) )
  33.374 -    {
  33.375 -        uint64_t mc = hpet_read_maincounter(h), period = h->hpet.period[tn];
  33.376 -        if ( timer_is_32bit(h, tn) )
  33.377 -        {
  33.378 -            while ( hpet_time_after(mc, h->hpet.timers[tn].cmp) )
  33.379 -                h->hpet.timers[tn].cmp = (uint32_t)(
  33.380 -                    h->hpet.timers[tn].cmp + period);
  33.381 -        }
  33.382 -        else
  33.383 -        {
  33.384 -            while ( hpet_time_after64(mc, h->hpet.timers[tn].cmp) )
  33.385 -                h->hpet.timers[tn].cmp += period;
  33.386 -        }
  33.387 -        set_timer(&h->timers[tn], NOW() + hpet_tick_to_ns(h, period));
  33.388 -    }
  33.389 -
  33.390 -    spin_unlock(&h->lock);
  33.391 -}
  33.392 -
  33.393 -void hpet_migrate_timers(struct vcpu *v)
  33.394 -{
  33.395 -    struct HPETState *h = &v->domain->arch.hvm_domain.pl_time.vhpet;
  33.396 -    int i;
  33.397 -
  33.398 -    if ( v != h->vcpu )
  33.399 -        return;
  33.400 -
  33.401 -    for ( i = 0; i < HPET_TIMER_NUM; i++ )
  33.402 -        migrate_timer(&h->timers[i], v->processor);
  33.403 -}
  33.404  
  33.405  static int hpet_save(struct domain *d, hvm_domain_context_t *h)
  33.406  {
  33.407 @@ -477,18 +508,20 @@ static int hpet_save(struct domain *d, h
  33.408          C(isr);
  33.409          C(mc64);
  33.410          C(timers[0].config);
  33.411 -        C(timers[0].cmp);
  33.412          C(timers[0].fsb);
  33.413          C(timers[1].config);
  33.414 -        C(timers[1].cmp);
  33.415          C(timers[1].fsb);
  33.416          C(timers[2].config);
  33.417 -        C(timers[2].cmp);
  33.418          C(timers[2].fsb);
  33.419          C(period[0]);
  33.420          C(period[1]);
  33.421          C(period[2]);
  33.422  #undef C
  33.423 +        /* Save the 64-bit comparator in the 64-bit timer[n].cmp field
  33.424 +         * regardless of whether the timer is in 32-bit mode. */
  33.425 +        rec->timers[0].cmp = hp->hpet.comparator64[0];
  33.426 +        rec->timers[1].cmp = hp->hpet.comparator64[1];
  33.427 +        rec->timers[2].cmp = hp->hpet.comparator64[2];
  33.428      }
  33.429  
  33.430      spin_unlock(&hp->lock);
  33.431 @@ -500,6 +533,7 @@ static int hpet_load(struct domain *d, h
  33.432  {
  33.433      HPETState *hp = &d->arch.hvm_domain.pl_time.vhpet;
  33.434      struct hvm_hw_hpet *rec;
  33.435 +    uint64_t cmp;
  33.436      int i;
  33.437  
  33.438      spin_lock(&hp->lock);
  33.439 @@ -515,32 +549,38 @@ static int hpet_load(struct domain *d, h
  33.440      h->cur += HVM_SAVE_LENGTH(HPET);
  33.441  
  33.442  #define C(x) hp->hpet.x = rec->x
  33.443 -        C(capability);
  33.444 -        C(config);
  33.445 -        C(isr);
  33.446 -        C(mc64);
  33.447 -        C(timers[0].config);
  33.448 -        C(timers[0].cmp);
  33.449 -        C(timers[0].fsb);
  33.450 -        C(timers[1].config);
  33.451 -        C(timers[1].cmp);
  33.452 -        C(timers[1].fsb);
  33.453 -        C(timers[2].config);
  33.454 -        C(timers[2].cmp);
  33.455 -        C(timers[2].fsb);
  33.456 -        C(period[0]);
  33.457 -        C(period[1]);
  33.458 -        C(period[2]);
  33.459 +    C(capability);
  33.460 +    C(config);
  33.461 +    C(isr);
  33.462 +    C(mc64);
  33.463 +    /* The following define will generate a compiler error if HPET_TIMER_NUM
  33.464 +     * changes. This indicates an incompatibility with previous saved state. */
  33.465 +#define HPET_TIMER_NUM 3
  33.466 +    for ( i = 0; i < HPET_TIMER_NUM; i++ )
  33.467 +    {
  33.468 +        C(timers[i].config);
  33.469 +        C(timers[i].fsb);
  33.470 +        C(period[i]);
  33.471 +        /* Restore the hidden 64-bit comparator and truncate the timer's
  33.472 +         * visible comparator field if in 32-bit mode. */
  33.473 +        cmp = rec->timers[i].cmp;
  33.474 +        hp->hpet.comparator64[i] = cmp;
  33.475 +        if ( timer_is_32bit(hp, i) )
  33.476 +            cmp = (uint32_t)cmp;
  33.477 +        hp->hpet.timers[i].cmp = cmp;
  33.478 +    }
  33.479  #undef C
  33.480      
  33.481      /* Recalculate the offset between the main counter and guest time */
  33.482      hp->mc_offset = hp->hpet.mc64 - guest_time_hpet(hp->vcpu);
  33.483 -                
  33.484 -    /* Restart the timers */
  33.485 -    for ( i = 0; i < HPET_TIMER_NUM; i++ )
  33.486 -        if ( hpet_enabled(hp) )
  33.487 -            hpet_set_timer(hp, i);
  33.488  
  33.489 +    /* restart all timers */
  33.490 +
  33.491 +    if ( hpet_enabled(hp) )
  33.492 +        for ( i = 0; i < HPET_TIMER_NUM; i++ )
  33.493 +            if ( timer_enabled(hp, i) )
  33.494 +                hpet_set_timer(hp, i);
  33.495 + 
  33.496      spin_unlock(&hp->lock);
  33.497  
  33.498      return 0;
  33.499 @@ -575,10 +615,7 @@ void hpet_init(struct vcpu *v)
  33.500          h->hpet.timers[i].config = 
  33.501              HPET_TN_INT_ROUTE_CAP | HPET_TN_SIZE_CAP | HPET_TN_PERIODIC_CAP;
  33.502          h->hpet.timers[i].cmp = ~0ULL;
  33.503 -        h->timer_fn_info[i].hs = h;
  33.504 -        h->timer_fn_info[i].tn = i;
  33.505 -        init_timer(&h->timers[i], hpet_timer_fn, &h->timer_fn_info[i],
  33.506 -                   v->processor);
  33.507 +        h->pt[i].source = PTSRC_isa;
  33.508      }
  33.509  }
  33.510  
  33.511 @@ -587,8 +624,14 @@ void hpet_deinit(struct domain *d)
  33.512      int i;
  33.513      HPETState *h = &d->arch.hvm_domain.pl_time.vhpet;
  33.514  
  33.515 -    for ( i = 0; i < HPET_TIMER_NUM; i++ )
  33.516 -        kill_timer(&h->timers[i]);
  33.517 +    spin_lock(&h->lock);
  33.518 +
  33.519 +    if ( hpet_enabled(h) )
  33.520 +        for ( i = 0; i < HPET_TIMER_NUM; i++ )
  33.521 +            if ( timer_enabled(h, i) )
  33.522 +                hpet_stop_timer(h, i);
  33.523 +
  33.524 +    spin_unlock(&h->lock);
  33.525  }
  33.526  
  33.527  void hpet_reset(struct domain *d)
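
The catch-up arithmetic in hpet_get_comparator() deserves a worked example: for a periodic timer it advances the comparator by whole periods so it lands on the first period boundary at or after the current main counter. A standalone sketch of the same computation:

    #include <stdio.h>
    #include <stdint.h>

    /* Same rounding as hpet_get_comparator(); assumes the main counter
     * has passed the comparator, as in the catch-up case. */
    static uint64_t catch_up(uint64_t comparator, uint64_t period, uint64_t mc)
    {
        uint64_t elapsed = mc + period - 1 - comparator;
        return comparator + (elapsed / period) * period;
    }

    int main(void)
    {
        /* comparator=100, period=50, main counter=260:
         * elapsed=209, four whole periods, new comparator=300 —
         * the first boundary at or after 260. */
        printf("%llu\n", (unsigned long long)catch_up(100, 50, 260));
        return 0;
    }
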
    34.1 --- a/xen/arch/x86/hvm/hvm.c	Tue Nov 04 12:07:22 2008 +0900
    34.2 +++ b/xen/arch/x86/hvm/hvm.c	Tue Nov 04 12:43:19 2008 +0900
    34.3 @@ -163,7 +163,6 @@ u64 hvm_get_guest_tsc(struct vcpu *v)
    34.4  void hvm_migrate_timers(struct vcpu *v)
    34.5  {
    34.6      rtc_migrate_timers(v);
    34.7 -    hpet_migrate_timers(v);
    34.8      pt_migrate(v);
    34.9  }
   34.10  
    35.1 --- a/xen/arch/x86/hvm/i8254.c	Tue Nov 04 12:07:22 2008 +0900
    35.2 +++ b/xen/arch/x86/hvm/i8254.c	Tue Nov 04 12:43:19 2008 +0900
    35.3 @@ -213,13 +213,13 @@ static void pit_load_count(PITState *pit
    35.4      case 2:
    35.5      case 3:
    35.6          /* Periodic timer. */
    35.7 -        create_periodic_time(v, &pit->pt0, period, 0, 0, pit_time_fired, 
    35.8 +        create_periodic_time(v, &pit->pt0, period, period, 0, pit_time_fired, 
    35.9                               &pit->count_load_time[channel]);
   35.10          break;
   35.11      case 1:
   35.12      case 4:
   35.13          /* One-shot timer. */
   35.14 -        create_periodic_time(v, &pit->pt0, period, 0, 1, pit_time_fired,
   35.15 +        create_periodic_time(v, &pit->pt0, period, 0, 0, pit_time_fired,
   35.16                               &pit->count_load_time[channel]);
   35.17          break;
   35.18      default:
    36.1 --- a/xen/arch/x86/hvm/rtc.c	Tue Nov 04 12:07:22 2008 +0900
    36.2 +++ b/xen/arch/x86/hvm/rtc.c	Tue Nov 04 12:43:19 2008 +0900
    36.3 @@ -59,8 +59,8 @@ static void rtc_timer_update(RTCState *s
    36.4  
    36.5          period = 1 << (period_code - 1); /* period in 32 Khz cycles */
    36.6          period = DIV_ROUND((period * 1000000000ULL), 32768); /* period in ns */
    36.7 -        create_periodic_time(v, &s->pt, period, RTC_IRQ,
    36.8 -                             0, rtc_periodic_cb, s);
    36.9 +        create_periodic_time(v, &s->pt, period, period, RTC_IRQ,
   36.10 +                             rtc_periodic_cb, s);
   36.11      }
   36.12      else
   36.13      {
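
As the changed call sites in the i8254, RTC, vlapic and HPET hunks show, create_periodic_time() now takes an initial delta followed by a period, with period == 0 meaning one-shot, replacing the old one_shot flag argument. A toy model of that convention (types and names hypothetical, not the vpt implementation):

    #include <stdio.h>
    #include <stdint.h>

    struct pt_model { uint64_t scheduled; uint64_t period; };

    static void create_model(struct pt_model *pt, uint64_t now,
                             uint64_t delta, uint64_t period)
    {
        pt->scheduled = now + delta;   /* first expiry */
        pt->period    = period;        /* 0 => one-shot */
    }

    static void fire_model(struct pt_model *pt)
    {
        if ( pt->period )
            pt->scheduled += pt->period;   /* periodic: re-arm */
        else
            pt->scheduled = 0;             /* one-shot: disarm */
    }

    int main(void)
    {
        struct pt_model pt;

        create_model(&pt, 1000, 50, 50);   /* periodic, period 50 */
        fire_model(&pt);
        printf("next expiry: %llu\n", (unsigned long long)pt.scheduled);
        return 0;
    }
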
    37.1 --- a/xen/arch/x86/hvm/svm/entry.S	Tue Nov 04 12:07:22 2008 +0900
    37.2 +++ b/xen/arch/x86/hvm/svm/entry.S	Tue Nov 04 12:43:19 2008 +0900
    37.3 @@ -57,6 +57,8 @@
    37.4  #endif
    37.5  
    37.6  ENTRY(svm_asm_do_resume)
    37.7 +        call svm_intr_assist
    37.8 +
    37.9          get_current(bx)
   37.10          CLGI
   37.11  
   37.12 @@ -67,7 +69,6 @@ ENTRY(svm_asm_do_resume)
   37.13          jnz  .Lsvm_process_softirqs
   37.14  
   37.15          call svm_asid_handle_vmrun
   37.16 -        call svm_intr_assist
   37.17  
   37.18          cmpb $0,addr_of(tb_init_done)
   37.19          jnz  .Lsvm_trace
    38.1 --- a/xen/arch/x86/hvm/vlapic.c	Tue Nov 04 12:07:22 2008 +0900
    38.2 +++ b/xen/arch/x86/hvm/vlapic.c	Tue Nov 04 12:43:19 2008 +0900
    38.3 @@ -701,8 +701,9 @@ static int vlapic_write(struct vcpu *v, 
    38.4                              (uint32_t)val * vlapic->hw.timer_divisor;
    38.5  
    38.6          vlapic_set_reg(vlapic, APIC_TMICT, val);
    38.7 -        create_periodic_time(current, &vlapic->pt, period, vlapic->pt.irq,
    38.8 -                             !vlapic_lvtt_period(vlapic), vlapic_pt_cb,
    38.9 +        create_periodic_time(current, &vlapic->pt, period, 
   38.10 +                             vlapic_lvtt_period(vlapic) ? period : 0,
   38.11 +                             vlapic->pt.irq, vlapic_pt_cb,
   38.12                               &vlapic->timer_last_update);
   38.13          vlapic->timer_last_update = vlapic->pt.last_plt_gtime;
   38.14  
   38.15 @@ -861,8 +862,9 @@ static void lapic_rearm(struct vlapic *s
   38.16      period = ((uint64_t)APIC_BUS_CYCLE_NS *
   38.17                (uint32_t)tmict * s->hw.timer_divisor);
   38.18      s->pt.irq = vlapic_get_reg(s, APIC_LVTT) & APIC_VECTOR_MASK;
   38.19 -    create_periodic_time(vlapic_vcpu(s), &s->pt, period, s->pt.irq,
   38.20 -                         !vlapic_lvtt_period(s), vlapic_pt_cb,
   38.21 +    create_periodic_time(vlapic_vcpu(s), &s->pt, period,
   38.22 +                         vlapic_lvtt_period(s) ? period : 0,
   38.23 +                         s->pt.irq, vlapic_pt_cb,
   38.24                           &s->timer_last_update);
   38.25      s->timer_last_update = s->pt.last_plt_gtime;
   38.26  }
    39.1 --- a/xen/arch/x86/hvm/vmx/entry.S	Tue Nov 04 12:07:22 2008 +0900
    39.2 +++ b/xen/arch/x86/hvm/vmx/entry.S	Tue Nov 04 12:43:19 2008 +0900
    39.3 @@ -122,6 +122,8 @@ vmx_asm_vmexit_handler:
    39.4  
    39.5  .globl vmx_asm_do_vmentry
    39.6  vmx_asm_do_vmentry:
    39.7 +        call vmx_intr_assist
    39.8 +
    39.9          get_current(bx)
   39.10          cli
   39.11  
   39.12 @@ -131,8 +133,6 @@ vmx_asm_do_vmentry:
   39.13          cmpl $0,(r(dx),r(ax),1)
   39.14          jnz  .Lvmx_process_softirqs
   39.15  
   39.16 -        call vmx_intr_assist
   39.17 -
   39.18          testb $0xff,VCPU_vmx_emul(r(bx))
   39.19          jnz  .Lvmx_goto_realmode
   39.20  
   39.21 @@ -179,11 +179,13 @@ vmx_asm_do_vmentry:
   39.22  
   39.23  /*.Lvmx_resume:*/
   39.24          VMRESUME
   39.25 +        sti
   39.26          call vm_resume_fail
   39.27          ud2
   39.28  
   39.29  .Lvmx_launch:
   39.30          VMLAUNCH
   39.31 +        sti
   39.32          call vm_launch_fail
   39.33          ud2
   39.34  
    40.1 --- a/xen/arch/x86/hvm/vmx/vmx.c	Tue Nov 04 12:07:22 2008 +0900
    40.2 +++ b/xen/arch/x86/hvm/vmx/vmx.c	Tue Nov 04 12:43:19 2008 +0900
    40.3 @@ -49,6 +49,7 @@
    40.4  #include <asm/hvm/vpt.h>
    40.5  #include <public/hvm/save.h>
    40.6  #include <asm/hvm/trace.h>
    40.7 +#include <asm/xenoprof.h>
    40.8  
    40.9  enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
   40.10  
   40.11 @@ -132,6 +133,7 @@ static void vmx_vcpu_destroy(struct vcpu
   40.12  {
   40.13      vmx_destroy_vmcs(v);
   40.14      vpmu_destroy(v);
   40.15 +    passive_domain_destroy(v);
   40.16  }
   40.17  
   40.18  #ifdef __x86_64__
   40.19 @@ -1666,6 +1668,8 @@ static int vmx_msr_read_intercept(struct
   40.20      default:
   40.21          if ( vpmu_do_rdmsr(regs) )
   40.22              goto done;
   40.23 +        if ( passive_domain_do_rdmsr(regs) )
   40.24 +            goto done;
   40.25          switch ( long_mode_do_msr_read(regs) )
   40.26          {
   40.27              case HNDL_unhandled:
   40.28 @@ -1861,6 +1865,8 @@ static int vmx_msr_write_intercept(struc
   40.29      default:
   40.30          if ( vpmu_do_wrmsr(regs) )
   40.31              return X86EMUL_OKAY;
   40.32 +        if ( passive_domain_do_wrmsr(regs) )
   40.33 +            return X86EMUL_OKAY;
   40.34  
   40.35          if ( wrmsr_viridian_regs(ecx, regs->eax, regs->edx) ) 
   40.36              break;
   40.37 @@ -1964,27 +1970,25 @@ static void ept_handle_violation(unsigne
   40.38  {
   40.39      unsigned long gla_validity = qualification & EPT_GLA_VALIDITY_MASK;
   40.40      struct domain *d = current->domain;
   40.41 -    unsigned long gfn = gpa >> PAGE_SHIFT;
   40.42 +    unsigned long gla, gfn = gpa >> PAGE_SHIFT;
   40.43      mfn_t mfn;
   40.44      p2m_type_t t;
   40.45  
   40.46 -    if ( unlikely(qualification & EPT_GAW_VIOLATION) )
   40.47 +    mfn = gfn_to_mfn(d, gfn, &t);
   40.48 +
   40.49 +    /* There are two legitimate reasons for taking an EPT violation. 
   40.50 +     * One is a guest access to MMIO space. */
   40.51 +    if ( gla_validity == EPT_GLA_VALIDITY_MATCH && p2m_is_mmio(t) )
   40.52      {
   40.53 -        gdprintk(XENLOG_ERR, "EPT violation: guest physical address %"PRIpaddr
   40.54 -                 " exceeded its width limit.\n", gpa);
   40.55 -        goto crash;
   40.56 +        handle_mmio();
   40.57 +        return;
   40.58      }
   40.59  
   40.60 -    if ( unlikely(gla_validity == EPT_GLA_VALIDITY_RSVD) ||
   40.61 -         unlikely(gla_validity == EPT_GLA_VALIDITY_PDPTR_LOAD) )
   40.62 -    {
   40.63 -        gdprintk(XENLOG_ERR, "EPT violation: reserved bit or "
   40.64 -                 "pdptr load violation.\n");
   40.65 -        goto crash;
   40.66 -    }
   40.67 -
   40.68 -    mfn = gfn_to_mfn(d, gfn, &t);
   40.69 -    if ( (t != p2m_ram_ro) && p2m_is_ram(t) && paging_mode_log_dirty(d) )
   40.70 +    /* The other is log-dirty mode, writing to a read-only page */
   40.71 +    if ( paging_mode_log_dirty(d)
   40.72 +         && (gla_validity == EPT_GLA_VALIDITY_MATCH
   40.73 +             || gla_validity == EPT_GLA_VALIDITY_GPT_WALK)
   40.74 +         && p2m_is_ram(t) && (t != p2m_ram_ro) )
   40.75      {
   40.76          paging_mark_dirty(d, mfn_x(mfn));
   40.77          p2m_change_type(d, gfn, p2m_ram_logdirty, p2m_ram_rw);
   40.78 @@ -1992,16 +1996,39 @@ static void ept_handle_violation(unsigne
   40.79          return;
   40.80      }
   40.81  
   40.82 -    /* This can only happen in log-dirty mode, writing back A/D bits. */
   40.83 -    if ( unlikely(gla_validity == EPT_GLA_VALIDITY_GPT_WALK) )
   40.84 -        goto crash;
   40.85 +    /* Everything else is an error. */
   40.86 +    gla = __vmread(GUEST_LINEAR_ADDRESS);
   40.87 +    gdprintk(XENLOG_ERR, "EPT violation %#lx (%c%c%c/%c%c%c), "
   40.88 +             "gpa %#"PRIpaddr", mfn %#lx, type %i.\n", 
   40.89 +             qualification, 
   40.90 +             (qualification & EPT_READ_VIOLATION) ? 'r' : '-',
   40.91 +             (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-',
   40.92 +             (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-',
   40.93 +             (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-',
   40.94 +             (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-',
   40.95 +             (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-',
   40.96 +             gpa, mfn_x(mfn), t);
   40.97  
   40.98 -    ASSERT(gla_validity == EPT_GLA_VALIDITY_MATCH);
   40.99 -    handle_mmio();
  40.100 +    if ( qualification & EPT_GAW_VIOLATION )
  40.101 +        gdprintk(XENLOG_ERR, " --- GPA too wide (max %u bits)\n", 
  40.102 +                 9 * (unsigned) d->arch.hvm_domain.vmx.ept_control.gaw + 21);
  40.103  
  40.104 -    return;
  40.105 +    switch ( gla_validity )
  40.106 +    {
  40.107 +    case EPT_GLA_VALIDITY_PDPTR_LOAD:
  40.108 +        gdprintk(XENLOG_ERR, " --- PDPTR load failed\n"); 
  40.109 +        break;
  40.110 +    case EPT_GLA_VALIDITY_GPT_WALK:
  40.111 +        gdprintk(XENLOG_ERR, " --- guest PT walk to %#lx failed\n", gla);
  40.112 +        break;
  40.113 +    case EPT_GLA_VALIDITY_RSVD:
  40.114 +        gdprintk(XENLOG_ERR, " --- GLA_validity 2 (reserved)\n");
  40.115 +        break;
  40.116 +    case EPT_GLA_VALIDITY_MATCH:
  40.117 +        gdprintk(XENLOG_ERR, " --- guest access to %#lx failed\n", gla);
  40.118 +        break;
  40.119 +    }
  40.120  
  40.121 - crash:
  40.122      domain_crash(d);
  40.123  }
  40.124  
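
The rewritten ept_handle_violation() above funnels every failure into one diagnostic whose (r/w/x | r/w/x) field pairs the attempted access with the page's effective permissions. A minimal user-space sketch of that decoding, assuming the Intel SDM exit-qualification layout (bits 0-2 attempted access, bits 3-5 effective permissions); Xen's EPT_* masks in the vmx headers are the authoritative definitions:

    #include <stdio.h>

    /* Assumed bit layout per the Intel SDM; the real EPT_* masks live in
     * Xen's asm-x86/hvm/vmx headers. */
    #define EPT_READ_VIOLATION   (1UL << 0)
    #define EPT_WRITE_VIOLATION  (1UL << 1)
    #define EPT_EXEC_VIOLATION   (1UL << 2)
    #define EPT_EFFECTIVE_READ   (1UL << 3)
    #define EPT_EFFECTIVE_WRITE  (1UL << 4)
    #define EPT_EFFECTIVE_EXEC   (1UL << 5)

    static void decode_qualification(unsigned long q, char buf[8])
    {
        buf[0] = (q & EPT_READ_VIOLATION)   ? 'r' : '-';
        buf[1] = (q & EPT_WRITE_VIOLATION)  ? 'w' : '-';
        buf[2] = (q & EPT_EXEC_VIOLATION)   ? 'x' : '-';
        buf[3] = '/';
        buf[4] = (q & EPT_EFFECTIVE_READ)   ? 'r' : '-';
        buf[5] = (q & EPT_EFFECTIVE_WRITE)  ? 'w' : '-';
        buf[6] = (q & EPT_EFFECTIVE_EXEC)   ? 'x' : '-';
        buf[7] = '\0';
    }

    int main(void)
    {
        char buf[8];
        decode_qualification(EPT_WRITE_VIOLATION | EPT_EFFECTIVE_READ, buf);
        printf("EPT violation (%s)\n", buf); /* -w-/r--: write to read-only gfn */
        return 0;
    }
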
    41.1 --- a/xen/arch/x86/hvm/vmx/vpmu_core2.c	Tue Nov 04 12:07:22 2008 +0900
    41.2 +++ b/xen/arch/x86/hvm/vmx/vpmu_core2.c	Tue Nov 04 12:43:19 2008 +0900
    41.3 @@ -35,6 +35,26 @@
    41.4  #include <asm/hvm/vmx/vpmu.h>
    41.5  #include <asm/hvm/vmx/vpmu_core2.h>
    41.6  
    41.7 +u32 core2_counters_msr[] =   {
    41.8 +    MSR_CORE_PERF_FIXED_CTR0,
    41.9 +    MSR_CORE_PERF_FIXED_CTR1,
   41.10 +    MSR_CORE_PERF_FIXED_CTR2};
   41.11 +
    41.12 +/* Core 2 Non-architectural Performance Control MSRs. */
   41.13 +u32 core2_ctrls_msr[] = {
   41.14 +    MSR_CORE_PERF_FIXED_CTR_CTRL,
   41.15 +    MSR_IA32_PEBS_ENABLE,
   41.16 +    MSR_IA32_DS_AREA};
   41.17 +
   41.18 +struct pmumsr core2_counters = {
   41.19 +    3,
   41.20 +    core2_counters_msr
   41.21 +};
   41.22 +
   41.23 +struct pmumsr core2_ctrls = {
   41.24 +    3,
   41.25 +    core2_ctrls_msr
   41.26 +};
   41.27  static int arch_pmc_cnt;
   41.28  
   41.29  static int core2_get_pmc_count(void)
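
The initializers above imply that struct pmumsr pairs a count with a table of MSR indices (its real definition is in vpmu_core2.h). A sketch of how such a table would presumably be walked when saving counter state, with read_msr() as a hypothetical stand-in for the privileged rdmsrl() accessor:

    #include <stdint.h>
    #include <stdio.h>

    struct pmumsr {
        unsigned int num;     /* number of MSRs in the table */
        uint32_t *msr_index;  /* their register indices */
    };

    /* Hypothetical stand-in for rdmsrl(). */
    static uint64_t read_msr(uint32_t index)
    {
        return (uint64_t)index;
    }

    static void save_msr_table(const struct pmumsr *t, uint64_t *out)
    {
        unsigned int i;
        for ( i = 0; i < t->num; i++ )
            out[i] = read_msr(t->msr_index[i]);
    }

    int main(void)
    {
        uint32_t fixed[3] = { 0x309, 0x30a, 0x30b }; /* FIXED_CTR0..2 per the SDM */
        struct pmumsr counters = { 3, fixed };
        uint64_t vals[3];

        save_msr_table(&counters, vals);
        printf("saved %u fixed counters\n", counters.num);
        return 0;
    }
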
    42.1 --- a/xen/arch/x86/hvm/vpt.c	Tue Nov 04 12:07:22 2008 +0900
    42.2 +++ b/xen/arch/x86/hvm/vpt.c	Tue Nov 04 12:43:19 2008 +0900
    42.3 @@ -355,8 +355,8 @@ void pt_migrate(struct vcpu *v)
    42.4  }
    42.5  
    42.6  void create_periodic_time(
    42.7 -    struct vcpu *v, struct periodic_time *pt, uint64_t period,
    42.8 -    uint8_t irq, char one_shot, time_cb *cb, void *data)
    42.9 +    struct vcpu *v, struct periodic_time *pt, uint64_t delta,
   42.10 +    uint64_t period, uint8_t irq, time_cb *cb, void *data)
   42.11  {
   42.12      ASSERT(pt->source != 0);
   42.13  
   42.14 @@ -368,13 +368,13 @@ void create_periodic_time(
   42.15      pt->do_not_freeze = 0;
   42.16      pt->irq_issued = 0;
   42.17  
   42.18 -    /* Periodic timer must be at least 0.9ms. */
   42.19 -    if ( (period < 900000) && !one_shot )
   42.20 +    /* Periodic timer must be at least 0.1ms. */
   42.21 +    if ( (period < 100000) && period )
   42.22      {
   42.23          if ( !test_and_set_bool(pt->warned_timeout_too_short) )
    42.24              gdprintk(XENLOG_WARNING, "HVM_PlatformTime: programmed "
    42.25                       "period %"PRIu64" too small\n", period);
   42.26 -        period = 900000;
   42.27 +        period = 100000;
   42.28      }
   42.29  
   42.30      pt->period = period;
   42.31 @@ -382,15 +382,15 @@ void create_periodic_time(
   42.32      pt->last_plt_gtime = hvm_get_guest_time(pt->vcpu);
   42.33      pt->irq = irq;
   42.34      pt->period_cycles = (u64)period;
   42.35 -    pt->one_shot = one_shot;
   42.36 -    pt->scheduled = NOW() + period;
   42.37 +    pt->one_shot = !period;
   42.38 +    pt->scheduled = NOW() + delta;
   42.39      /*
   42.40       * Offset LAPIC ticks from other timer ticks. Otherwise guests which use
   42.41       * LAPIC ticks for process accounting can see long sequences of process
   42.42       * ticks incorrectly accounted to interrupt processing.
   42.43       */
   42.44 -    if ( pt->source == PTSRC_lapic )
   42.45 -        pt->scheduled += period >> 1;
   42.46 +    if ( !pt->one_shot && (pt->source == PTSRC_lapic) )
   42.47 +        pt->scheduled += delta >> 1;
   42.48      pt->cb = cb;
   42.49      pt->priv = data;
   42.50  
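
The one_shot flag is gone: a zero period now means one-shot, and the new delta argument sets the first expiry independently of the period (note pt->scheduled = NOW() + delta above). A sketch of the resulting call shapes, with stub types so it stands alone; the nanosecond values are illustrative:

    #include <stdint.h>

    typedef uint64_t u64;
    struct vcpu;
    struct periodic_time;
    typedef void time_cb(struct vcpu *v, void *data);

    /* Signature after this change (see vpt.c above). */
    void create_periodic_time(struct vcpu *v, struct periodic_time *pt,
                              u64 delta, u64 period, uint8_t irq,
                              time_cb *cb, void *data);

    static void example(struct vcpu *v, struct periodic_time *pt, time_cb *cb)
    {
        /* Periodic: first tick after 1ms, then every 1ms. */
        create_periodic_time(v, pt, 1000000, 1000000, 0, cb, NULL);

        /* One-shot: period == 0, fires once, 0.5ms from now. */
        create_periodic_time(v, pt, 500000, 0, 0, cb, NULL);
    }
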
    43.1 --- a/xen/arch/x86/irq.c	Tue Nov 04 12:07:22 2008 +0900
    43.2 +++ b/xen/arch/x86/irq.c	Tue Nov 04 12:43:19 2008 +0900
    43.3 @@ -793,6 +793,10 @@ int map_domain_pirq(
    43.4  
    43.5      ASSERT(spin_is_locked(&d->event_lock));
    43.6  
    43.7 +    /* XXX Until pcidev and msi locking is fixed. */
    43.8 +    if ( type == MAP_PIRQ_TYPE_MSI )
    43.9 +        return -EINVAL;
   43.10 +
   43.11      if ( !IS_PRIV(current->domain) )
   43.12          return -EPERM;
   43.13  
   43.14 @@ -840,7 +844,7 @@ int map_domain_pirq(
   43.15      d->arch.pirq_vector[pirq] = vector;
   43.16      d->arch.vector_pirq[vector] = pirq;
   43.17  
   43.18 -done:
   43.19 + done:
   43.20      spin_unlock_irqrestore(&desc->lock, flags);
   43.21      return ret;
   43.22  }
    44.1 --- a/xen/arch/x86/mm.c	Tue Nov 04 12:07:22 2008 +0900
    44.2 +++ b/xen/arch/x86/mm.c	Tue Nov 04 12:43:19 2008 +0900
    44.3 @@ -566,19 +566,21 @@ static int get_page_from_pagenr(unsigned
    44.4  static int get_page_and_type_from_pagenr(unsigned long page_nr, 
    44.5                                           unsigned long type,
    44.6                                           struct domain *d,
    44.7 +                                         int partial,
    44.8                                           int preemptible)
    44.9  {
   44.10      struct page_info *page = mfn_to_page(page_nr);
   44.11      int rc;
   44.12  
   44.13 -    if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
   44.14 +    if ( likely(partial >= 0) &&
   44.15 +         unlikely(!get_page_from_pagenr(page_nr, d)) )
   44.16          return -EINVAL;
   44.17  
   44.18      rc = (preemptible ?
   44.19            get_page_type_preemptible(page, type) :
   44.20            (get_page_type(page, type) ? 0 : -EINVAL));
   44.21  
   44.22 -    if ( rc )
   44.23 +    if ( unlikely(rc) && partial >= 0 )
   44.24          put_page(page);
   44.25  
   44.26      return rc;
   44.27 @@ -761,7 +763,7 @@ get_page_from_l2e(
   44.28      }
   44.29  
   44.30      rc = get_page_and_type_from_pagenr(
   44.31 -        l2e_get_pfn(l2e), PGT_l1_page_table, d, 0);
   44.32 +        l2e_get_pfn(l2e), PGT_l1_page_table, d, 0, 0);
   44.33      if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
   44.34          rc = 0;
   44.35  
   44.36 @@ -772,7 +774,7 @@ get_page_from_l2e(
   44.37  define_get_linear_pagetable(l3);
   44.38  static int
   44.39  get_page_from_l3e(
   44.40 -    l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int preemptible)
   44.41 +    l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, int preemptible)
   44.42  {
   44.43      int rc;
   44.44  
   44.45 @@ -786,7 +788,7 @@ get_page_from_l3e(
   44.46      }
   44.47  
   44.48      rc = get_page_and_type_from_pagenr(
   44.49 -        l3e_get_pfn(l3e), PGT_l2_page_table, d, preemptible);
   44.50 +        l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible);
   44.51      if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
   44.52          rc = 0;
   44.53  
   44.54 @@ -797,7 +799,7 @@ get_page_from_l3e(
   44.55  define_get_linear_pagetable(l4);
   44.56  static int
   44.57  get_page_from_l4e(
   44.58 -    l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int preemptible)
   44.59 +    l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, int preemptible)
   44.60  {
   44.61      int rc;
   44.62  
   44.63 @@ -811,7 +813,7 @@ get_page_from_l4e(
   44.64      }
   44.65  
   44.66      rc = get_page_and_type_from_pagenr(
   44.67 -        l4e_get_pfn(l4e), PGT_l3_page_table, d, preemptible);
   44.68 +        l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible);
   44.69      if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
   44.70          rc = 0;
   44.71  
   44.72 @@ -961,23 +963,32 @@ static int put_page_from_l2e(l2_pgentry_
   44.73      return 1;
   44.74  }
   44.75  
   44.76 +static int __put_page_type(struct page_info *, int preemptible);
   44.77  
   44.78  static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
   44.79 -                             int preemptible)
   44.80 +                             int partial, int preemptible)
   44.81  {
   44.82      if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) && 
   44.83           (l3e_get_pfn(l3e) != pfn) )
   44.84 +    {
   44.85 +        if ( unlikely(partial > 0) )
   44.86 +            return __put_page_type(l3e_get_page(l3e), preemptible);
   44.87          return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
   44.88 +    }
   44.89      return 1;
   44.90  }
   44.91  
   44.92  #if CONFIG_PAGING_LEVELS >= 4
   44.93  static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
   44.94 -                             int preemptible)
   44.95 +                             int partial, int preemptible)
   44.96  {
   44.97      if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && 
   44.98           (l4e_get_pfn(l4e) != pfn) )
   44.99 +    {
  44.100 +        if ( unlikely(partial > 0) )
  44.101 +            return __put_page_type(l4e_get_page(l4e), preemptible);
  44.102          return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
  44.103 +    }
  44.104      return 1;
  44.105  }
  44.106  #endif
  44.107 @@ -1184,7 +1195,7 @@ static int alloc_l3_table(struct page_in
  44.108      unsigned long  pfn = page_to_mfn(page);
  44.109      l3_pgentry_t  *pl3e;
  44.110      unsigned int   i;
  44.111 -    int            rc = 0;
  44.112 +    int            rc = 0, partial = page->partial_pte;
  44.113  
  44.114  #if CONFIG_PAGING_LEVELS == 3
  44.115      /*
  44.116 @@ -1213,7 +1224,8 @@ static int alloc_l3_table(struct page_in
  44.117      if ( is_pv_32on64_domain(d) )
  44.118          memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
  44.119  
  44.120 -    for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; i++ )
  44.121 +    for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
  44.122 +          i++, partial = 0 )
  44.123      {
  44.124          if ( is_pv_32bit_domain(d) && (i == 3) )
  44.125          {
  44.126 @@ -1224,16 +1236,17 @@ static int alloc_l3_table(struct page_in
  44.127                  rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
  44.128                                                     PGT_l2_page_table |
  44.129                                                     PGT_pae_xen_l2,
  44.130 -                                                   d, preemptible);
  44.131 +                                                   d, partial, preemptible);
  44.132          }
  44.133          else if ( !is_guest_l3_slot(i) ||
  44.134 -                  (rc = get_page_from_l3e(pl3e[i], pfn, d, preemptible)) > 0 )
  44.135 +                  (rc = get_page_from_l3e(pl3e[i], pfn, d,
  44.136 +                                          partial, preemptible)) > 0 )
  44.137              continue;
  44.138  
  44.139          if ( rc == -EAGAIN )
  44.140          {
  44.141              page->nr_validated_ptes = i;
  44.142 -            page->partial_pte = 1;
  44.143 +            page->partial_pte = partial ?: 1;
  44.144          }
  44.145          else if ( rc == -EINTR && i )
  44.146          {
  44.147 @@ -1257,7 +1270,7 @@ static int alloc_l3_table(struct page_in
  44.148              if ( !is_guest_l3_slot(i) )
  44.149                  continue;
  44.150              unadjust_guest_l3e(pl3e[i], d);
  44.151 -            put_page_from_l3e(pl3e[i], pfn, 0);
  44.152 +            put_page_from_l3e(pl3e[i], pfn, 0, 0);
  44.153          }
  44.154      }
  44.155  
  44.156 @@ -1272,18 +1285,20 @@ static int alloc_l4_table(struct page_in
  44.157      unsigned long  pfn = page_to_mfn(page);
  44.158      l4_pgentry_t  *pl4e = page_to_virt(page);
  44.159      unsigned int   i;
  44.160 -    int            rc = 0;
  44.161 -
  44.162 -    for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; i++ )
  44.163 +    int            rc = 0, partial = page->partial_pte;
  44.164 +
  44.165 +    for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
  44.166 +          i++, partial = 0 )
  44.167      {
  44.168          if ( !is_guest_l4_slot(d, i) ||
  44.169 -             (rc = get_page_from_l4e(pl4e[i], pfn, d, preemptible)) > 0 )
  44.170 +             (rc = get_page_from_l4e(pl4e[i], pfn, d,
  44.171 +                                     partial, preemptible)) > 0 )
  44.172              continue;
  44.173  
  44.174          if ( rc == -EAGAIN )
  44.175          {
  44.176              page->nr_validated_ptes = i;
  44.177 -            page->partial_pte = 1;
  44.178 +            page->partial_pte = partial ?: 1;
  44.179          }
  44.180          else if ( rc == -EINTR )
  44.181          {
  44.182 @@ -1299,7 +1314,7 @@ static int alloc_l4_table(struct page_in
  44.183              MEM_LOG("Failure in alloc_l4_table: entry %d", i);
  44.184              while ( i-- > 0 )
  44.185                  if ( is_guest_l4_slot(d, i) )
  44.186 -                    put_page_from_l4e(pl4e[i], pfn, 0);
  44.187 +                    put_page_from_l4e(pl4e[i], pfn, 0, 0);
  44.188          }
  44.189          if ( rc < 0 )
  44.190              return rc;
  44.191 @@ -1377,24 +1392,20 @@ static int free_l3_table(struct page_inf
  44.192      struct domain *d = page_get_owner(page);
  44.193      unsigned long pfn = page_to_mfn(page);
  44.194      l3_pgentry_t *pl3e;
  44.195 -    unsigned int  i = page->nr_validated_ptes - !page->partial_pte;
  44.196 -    int rc = 0;
  44.197 -
  44.198 -#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
  44.199 -    if ( d->arch.relmem == RELMEM_l3 )
  44.200 -        return 0;
  44.201 -#endif
  44.202 +    int rc = 0, partial = page->partial_pte;
  44.203 +    unsigned int  i = page->nr_validated_ptes - !partial;
  44.204  
  44.205      pl3e = map_domain_page(pfn);
  44.206  
  44.207      do {
  44.208          if ( is_guest_l3_slot(i) )
  44.209          {
  44.210 -            rc = put_page_from_l3e(pl3e[i], pfn, preemptible);
  44.211 +            rc = put_page_from_l3e(pl3e[i], pfn, partial, preemptible);
  44.212 +            if ( rc < 0 )
  44.213 +                break;
  44.214 +            partial = 0;
  44.215              if ( rc > 0 )
  44.216                  continue;
  44.217 -            if ( rc )
  44.218 -                break;
  44.219              unadjust_guest_l3e(pl3e[i], d);
  44.220          }
  44.221      } while ( i-- );
  44.222 @@ -1404,7 +1415,7 @@ static int free_l3_table(struct page_inf
  44.223      if ( rc == -EAGAIN )
  44.224      {
  44.225          page->nr_validated_ptes = i;
  44.226 -        page->partial_pte = 1;
  44.227 +        page->partial_pte = partial ?: -1;
  44.228      }
  44.229      else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
  44.230      {
  44.231 @@ -1421,23 +1432,21 @@ static int free_l4_table(struct page_inf
  44.232      struct domain *d = page_get_owner(page);
  44.233      unsigned long pfn = page_to_mfn(page);
  44.234      l4_pgentry_t *pl4e = page_to_virt(page);
  44.235 -    unsigned int  i = page->nr_validated_ptes - !page->partial_pte;
  44.236 -    int rc = 0;
  44.237 -
  44.238 -#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
  44.239 -    if ( d->arch.relmem == RELMEM_l4 )
  44.240 -        return 0;
  44.241 -#endif
  44.242 +    int rc = 0, partial = page->partial_pte;
  44.243 +    unsigned int  i = page->nr_validated_ptes - !partial;
  44.244  
  44.245      do {
  44.246          if ( is_guest_l4_slot(d, i) )
  44.247 -            rc = put_page_from_l4e(pl4e[i], pfn, preemptible);
  44.248 -    } while ( rc >= 0 && i-- );
  44.249 +            rc = put_page_from_l4e(pl4e[i], pfn, partial, preemptible);
  44.250 +        if ( rc < 0 )
  44.251 +            break;
  44.252 +        partial = 0;
  44.253 +    } while ( i-- );
  44.254  
  44.255      if ( rc == -EAGAIN )
  44.256      {
  44.257          page->nr_validated_ptes = i;
  44.258 -        page->partial_pte = 1;
  44.259 +        page->partial_pte = partial ?: -1;
  44.260      }
  44.261      else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
  44.262      {
  44.263 @@ -1713,7 +1722,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
  44.264              return rc ? 0 : -EFAULT;
  44.265          }
  44.266  
  44.267 -        rc = get_page_from_l3e(nl3e, pfn, d, preemptible);
  44.268 +        rc = get_page_from_l3e(nl3e, pfn, d, 0, preemptible);
  44.269          if ( unlikely(rc < 0) )
  44.270              return page_unlock(l3pg), rc;
  44.271          rc = 0;
  44.272 @@ -1742,7 +1751,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
  44.273      }
  44.274  
  44.275      page_unlock(l3pg);
  44.276 -    put_page_from_l3e(ol3e, pfn, 0);
  44.277 +    put_page_from_l3e(ol3e, pfn, 0, 0);
  44.278      return rc;
  44.279  }
  44.280  
  44.281 @@ -1791,7 +1800,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
  44.282              return rc ? 0 : -EFAULT;
  44.283          }
  44.284  
  44.285 -        rc = get_page_from_l4e(nl4e, pfn, d, preemptible);
  44.286 +        rc = get_page_from_l4e(nl4e, pfn, d, 0, preemptible);
  44.287          if ( unlikely(rc < 0) )
  44.288              return page_unlock(l4pg), rc;
  44.289          rc = 0;
  44.290 @@ -1812,7 +1821,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
  44.291      }
  44.292  
  44.293      page_unlock(l4pg);
  44.294 -    put_page_from_l4e(ol4e, pfn, 0);
  44.295 +    put_page_from_l4e(ol4e, pfn, 0, 0);
  44.296      return rc;
  44.297  }
  44.298  
  44.299 @@ -1847,7 +1856,8 @@ int get_page(struct page_info *page, str
  44.300          nx = x + 1;
  44.301          d  = nd;
  44.302          if ( unlikely((x & PGC_count_mask) == 0) ||  /* Not allocated? */
  44.303 -             unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
  44.304 +             /* Keep one spare reference to be acquired by get_page_light(). */
  44.305 +             unlikely(((nx + 1) & PGC_count_mask) <= 1) || /* Overflow? */
  44.306               unlikely(d != _domain) )                /* Wrong owner? */
  44.307          {
  44.308              if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
  44.309 @@ -1869,6 +1879,28 @@ int get_page(struct page_info *page, str
  44.310      return 1;
  44.311  }
  44.312  
  44.313 +/*
  44.314 + * Special version of get_page() to be used exclusively when
  44.315 + * - a page is known to already have a non-zero reference count
  44.316 + * - the page does not need its owner to be checked
   44.317 + * - it will not be called again before the reference thus acquired
   44.318 + *   has been dropped.
  44.319 + * Due to get_page() reserving one reference, this call cannot fail.
  44.320 + */
  44.321 +static void get_page_light(struct page_info *page)
  44.322 +{
  44.323 +    u32 x, nx, y = page->count_info;
  44.324 +
  44.325 +    do {
  44.326 +        x  = y;
  44.327 +        nx = x + 1;
  44.328 +        BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
  44.329 +        BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
  44.330 +        y = cmpxchg(&page->count_info, x, nx);
  44.331 +    }
  44.332 +    while ( unlikely(y != x) );
  44.333 +}
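
Because get_page() above now refuses to hand out the last countable reference, this increment can never wrap, which is what lets get_page_light() return void. A user-space analogue of the same lock-free loop using C11 atomics (the 16-bit count mask is illustrative; Xen's PGC_count_mask is wider):

    #include <assert.h>
    #include <stdatomic.h>
    #include <stdint.h>

    #define PGC_COUNT_MASK 0xffffu  /* illustrative width */

    static void get_page_light(_Atomic uint32_t *count_info)
    {
        uint32_t x = atomic_load(count_info), nx;

        do {
            nx = x + 1;
            assert(x & PGC_COUNT_MASK);   /* page must already be referenced */
            assert(nx & PGC_COUNT_MASK);  /* cannot wrap: one ref is reserved */
        } while ( !atomic_compare_exchange_weak(count_info, &x, nx) );
    }

    int main(void)
    {
        _Atomic uint32_t count_info = 1;  /* one existing reference */
        get_page_light(&count_info);
        assert(atomic_load(&count_info) == 2);
        return 0;
    }
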
  44.334 +
  44.335  
  44.336  static int alloc_page_type(struct page_info *page, unsigned long type,
  44.337                             int preemptible)
  44.338 @@ -1909,6 +1941,7 @@ static int alloc_page_type(struct page_i
  44.339      wmb();
  44.340      if ( rc == -EAGAIN )
  44.341      {
  44.342 +        get_page_light(page);
  44.343          page->u.inuse.type_info |= PGT_partial;
  44.344      }
  44.345      else if ( rc == -EINTR )
  44.346 @@ -1973,6 +2006,7 @@ int free_page_type(struct page_info *pag
  44.347          page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
  44.348          page->partial_pte = 0;
  44.349      }
  44.350 +
  44.351      switch ( type & PGT_type_mask )
  44.352      {
  44.353      case PGT_l1_page_table:
  44.354 @@ -1998,6 +2032,15 @@ int free_page_type(struct page_info *pag
  44.355          BUG();
  44.356      }
  44.357  
  44.358 +    return rc;
  44.359 +}
  44.360 +
  44.361 +
  44.362 +static int __put_final_page_type(
  44.363 +    struct page_info *page, unsigned long type, int preemptible)
  44.364 +{
  44.365 +    int rc = free_page_type(page, type, preemptible);
  44.366 +
   44.367      /* No need for atomic update of type_info here: no one else updates it. */
  44.368      if ( rc == 0 )
  44.369      {
  44.370 @@ -2016,8 +2059,8 @@ int free_page_type(struct page_info *pag
  44.371      }
  44.372      else if ( rc == -EINTR )
  44.373      {
  44.374 -        ASSERT(!(page->u.inuse.type_info &
  44.375 -                 (PGT_count_mask|PGT_validated|PGT_partial)));
  44.376 +        ASSERT((page->u.inuse.type_info &
  44.377 +                (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
  44.378          if ( !(shadow_mode_enabled(page_get_owner(page)) &&
  44.379                 (page->count_info & PGC_page_table)) )
  44.380              page->tlbflush_timestamp = tlbflush_current_time();
  44.381 @@ -2028,6 +2071,7 @@ int free_page_type(struct page_info *pag
  44.382      {
  44.383          BUG_ON(rc != -EAGAIN);
  44.384          wmb();
  44.385 +        get_page_light(page);
  44.386          page->u.inuse.type_info |= PGT_partial;
  44.387      }
  44.388  
  44.389 @@ -2039,6 +2083,7 @@ static int __put_page_type(struct page_i
  44.390                             int preemptible)
  44.391  {
  44.392      unsigned long nx, x, y = page->u.inuse.type_info;
  44.393 +    int rc = 0;
  44.394  
  44.395      for ( ; ; )
  44.396      {
  44.397 @@ -2062,7 +2107,10 @@ static int __put_page_type(struct page_i
  44.398                                             x, nx)) != x) )
  44.399                      continue;
  44.400                  /* We cleared the 'valid bit' so we do the clean up. */
  44.401 -                return free_page_type(page, x, preemptible);
  44.402 +                rc = __put_final_page_type(page, x, preemptible);
  44.403 +                if ( x & PGT_partial )
  44.404 +                    put_page(page);
  44.405 +                break;
  44.406              }
  44.407  
  44.408              /*
  44.409 @@ -2084,7 +2132,7 @@ static int __put_page_type(struct page_i
  44.410              return -EINTR;
  44.411      }
  44.412  
  44.413 -    return 0;
  44.414 +    return rc;
  44.415  }
  44.416  
  44.417  
  44.418 @@ -2092,6 +2140,7 @@ static int __get_page_type(struct page_i
  44.419                             int preemptible)
  44.420  {
  44.421      unsigned long nx, x, y = page->u.inuse.type_info;
  44.422 +    int rc = 0;
  44.423  
  44.424      ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
  44.425  
  44.426 @@ -2214,10 +2263,13 @@ static int __get_page_type(struct page_i
  44.427              page->nr_validated_ptes = 0;
  44.428              page->partial_pte = 0;
  44.429          }
  44.430 -        return alloc_page_type(page, type, preemptible);
  44.431 +        rc = alloc_page_type(page, type, preemptible);
  44.432      }
  44.433  
  44.434 -    return 0;
  44.435 +    if ( (x & PGT_partial) && !(nx & PGT_partial) )
  44.436 +        put_page(page);
  44.437 +
  44.438 +    return rc;
  44.439  }
  44.440  
  44.441  void put_page_type(struct page_info *page)
  44.442 @@ -2296,7 +2348,7 @@ int new_guest_cr3(unsigned long mfn)
  44.443  #endif
  44.444      okay = paging_mode_refcounts(d)
  44.445          ? get_page_from_pagenr(mfn, d)
  44.446 -        : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0);
  44.447 +        : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0);
  44.448      if ( unlikely(!okay) )
  44.449      {
  44.450          MEM_LOG("Error while installing new baseptr %lx", mfn);
  44.451 @@ -2431,6 +2483,29 @@ static inline cpumask_t vcpumask_to_pcpu
  44.452      return pmask;
  44.453  }
  44.454  
  44.455 +#ifdef __i386__
  44.456 +static inline void *fixmap_domain_page(unsigned long mfn)
  44.457 +{
  44.458 +    unsigned int cpu = smp_processor_id();
  44.459 +    void *ptr = (void *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
  44.460 +
  44.461 +    l1e_write(fix_pae_highmem_pl1e - cpu,
  44.462 +              l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
  44.463 +    flush_tlb_one_local(ptr);
  44.464 +    return ptr;
  44.465 +}
  44.466 +static inline void fixunmap_domain_page(const void *ptr)
  44.467 +{
  44.468 +    unsigned int cpu = virt_to_fix((unsigned long)ptr) - FIX_PAE_HIGHMEM_0;
  44.469 +
  44.470 +    l1e_write(fix_pae_highmem_pl1e - cpu, l1e_empty());
  44.471 +    this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
  44.472 +}
  44.473 +#else
  44.474 +#define fixmap_domain_page(mfn) mfn_to_virt(mfn)
  44.475 +#define fixunmap_domain_page(ptr) ((void)(ptr))
  44.476 +#endif
  44.477 +
  44.478  int do_mmuext_op(
  44.479      XEN_GUEST_HANDLE(mmuext_op_t) uops,
  44.480      unsigned int count,
  44.481 @@ -2517,7 +2592,7 @@ int do_mmuext_op(
  44.482              if ( paging_mode_refcounts(FOREIGNDOM) )
  44.483                  break;
  44.484  
  44.485 -            rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 1);
  44.486 +            rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 0, 1);
  44.487              okay = !rc;
  44.488              if ( unlikely(!okay) )
  44.489              {
  44.490 @@ -2598,7 +2673,7 @@ int do_mmuext_op(
  44.491                      okay = get_page_from_pagenr(mfn, d);
  44.492                  else
  44.493                      okay = !get_page_and_type_from_pagenr(
  44.494 -                        mfn, PGT_root_page_table, d, 0);
  44.495 +                        mfn, PGT_root_page_table, d, 0, 0);
  44.496                  if ( unlikely(!okay) )
  44.497                  {
  44.498                      MEM_LOG("Error while installing new mfn %lx", mfn);
  44.499 @@ -2700,6 +2775,66 @@ int do_mmuext_op(
  44.500              break;
  44.501          }
  44.502  
  44.503 +        case MMUEXT_CLEAR_PAGE:
  44.504 +        {
  44.505 +            unsigned char *ptr;
  44.506 +
  44.507 +            okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
  44.508 +                                                  FOREIGNDOM, 0, 0);
  44.509 +            if ( unlikely(!okay) )
  44.510 +            {
  44.511 +                MEM_LOG("Error while clearing mfn %lx", mfn);
  44.512 +                break;
  44.513 +            }
  44.514 +
  44.515 +            /* A page is dirtied when it's being cleared. */
  44.516 +            paging_mark_dirty(d, mfn);
  44.517 +
  44.518 +            ptr = fixmap_domain_page(mfn);
  44.519 +            clear_page(ptr);
  44.520 +            fixunmap_domain_page(ptr);
  44.521 +
  44.522 +            put_page_and_type(page);
  44.523 +            break;
  44.524 +        }
  44.525 +
  44.526 +        case MMUEXT_COPY_PAGE:
  44.527 +        {
  44.528 +            const unsigned char *src;
  44.529 +            unsigned char *dst;
  44.530 +            unsigned long src_mfn;
  44.531 +
  44.532 +            src_mfn = gmfn_to_mfn(FOREIGNDOM, op.arg2.src_mfn);
  44.533 +            okay = get_page_from_pagenr(src_mfn, FOREIGNDOM);
  44.534 +            if ( unlikely(!okay) )
  44.535 +            {
  44.536 +                MEM_LOG("Error while copying from mfn %lx", src_mfn);
  44.537 +                break;
  44.538 +            }
  44.539 +
  44.540 +            okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
  44.541 +                                                  FOREIGNDOM, 0, 0);
  44.542 +            if ( unlikely(!okay) )
  44.543 +            {
  44.544 +                put_page(mfn_to_page(src_mfn));
  44.545 +                MEM_LOG("Error while copying to mfn %lx", mfn);
  44.546 +                break;
  44.547 +            }
  44.548 +
  44.549 +            /* A page is dirtied when it's being copied to. */
  44.550 +            paging_mark_dirty(d, mfn);
  44.551 +
  44.552 +            src = map_domain_page(src_mfn);
  44.553 +            dst = fixmap_domain_page(mfn);
  44.554 +            copy_page(dst, src);
  44.555 +            fixunmap_domain_page(dst);
  44.556 +            unmap_domain_page(src);
  44.557 +
  44.558 +            put_page_and_type(page);
  44.559 +            put_page(mfn_to_page(src_mfn));
  44.560 +            break;
  44.561 +        }
  44.562 +
  44.563          default:
  44.564              MEM_LOG("Invalid extended pt command 0x%x", op.cmd);
  44.565              rc = -ENOSYS;
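
Guest-side, the two new commands would presumably be driven like any other mmuext op; arg2.src_mfn is exactly the field the MMUEXT_COPY_PAGE handler reads above. A hedged sketch against the public interface (struct mmuext_op, HYPERVISOR_mmuext_op and DOMID_SELF come from Xen's public/kernel headers; the mfn values are placeholders):

    /* Sketch of a PV-guest caller; requires the Xen public headers. */
    static void clear_and_copy(unsigned long dst_mfn, unsigned long src_mfn)
    {
        struct mmuext_op op;

        op.cmd = MMUEXT_CLEAR_PAGE;      /* zero one page owned by the caller */
        op.arg1.mfn = dst_mfn;
        HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);

        op.cmd = MMUEXT_COPY_PAGE;       /* copy src_mfn into dst_mfn */
        op.arg1.mfn = dst_mfn;
        op.arg2.src_mfn = src_mfn;
        HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
    }
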
    45.1 --- a/xen/arch/x86/mm/hap/p2m-ept.c	Tue Nov 04 12:07:22 2008 +0900
    45.2 +++ b/xen/arch/x86/mm/hap/p2m-ept.c	Tue Nov 04 12:43:19 2008 +0900
    45.3 @@ -157,9 +157,6 @@ ept_set_entry(struct domain *d, unsigned
    45.4      {
    45.5          if ( mfn_valid(mfn_x(mfn)) || (p2mt == p2m_mmio_direct) )
    45.6          {
    45.7 -            /* Track the highest gfn for which we have ever had a valid mapping */
    45.8 -            if ( gfn > d->arch.p2m->max_mapped_pfn )
    45.9 -                d->arch.p2m->max_mapped_pfn = gfn;
   45.10              ept_entry->emt = epte_get_entry_emt(d, gfn, mfn_x(mfn));
   45.11              ept_entry->sp_avail = walk_level ? 1 : 0;
   45.12  
   45.13 @@ -234,6 +231,11 @@ ept_set_entry(struct domain *d, unsigned
   45.14          unmap_domain_page(split_table);
   45.15      }
   45.16  
   45.17 +    /* Track the highest gfn for which we have ever had a valid mapping */
   45.18 +    if ( mfn_valid(mfn_x(mfn))
   45.19 +         && (gfn + (1UL << order) - 1 > d->arch.p2m->max_mapped_pfn) )
   45.20 +        d->arch.p2m->max_mapped_pfn = gfn + (1UL << order) - 1;
   45.21 +
   45.22      /* Success */
   45.23      rv = 1;
   45.24  
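
The same adjustment is made in p2m_set_entry() below: with superpage mappings the high-water mark must cover the whole mapped range, not just its first frame. In miniature:

    /* Sketch: track the end of the mapped range, not its start. */
    static void track_mapping(unsigned long gfn, unsigned int order,
                              unsigned long *max_mapped_pfn)
    {
        /* e.g. order 9 (2MB): gfn 0x1000 maps 0x1000..0x11ff */
        unsigned long last_gfn = gfn + (1UL << order) - 1;

        if ( last_gfn > *max_mapped_pfn )
            *max_mapped_pfn = last_gfn;
    }
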
    46.1 --- a/xen/arch/x86/mm/p2m.c	Tue Nov 04 12:07:22 2008 +0900
    46.2 +++ b/xen/arch/x86/mm/p2m.c	Tue Nov 04 12:43:19 2008 +0900
    46.3 @@ -322,7 +322,8 @@ p2m_set_entry(struct domain *d, unsigned
    46.4      }
    46.5  
    46.6      /* Track the highest gfn for which we have ever had a valid mapping */
    46.7 -    if ( mfn_valid(mfn) && (gfn > d->arch.p2m->max_mapped_pfn) )
    46.8 +    if ( mfn_valid(mfn) 
    46.9 +         && (gfn + (1UL << page_order) - 1 > d->arch.p2m->max_mapped_pfn) )
   46.10          d->arch.p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1;
   46.11  
   46.12      if ( iommu_enabled && (is_hvm_domain(d) || need_iommu(d)) )
   46.13 @@ -956,18 +957,18 @@ guest_physmap_add_entry(struct domain *d
   46.14      /* First, remove m->p mappings for existing p->m mappings */
   46.15      for ( i = 0; i < (1UL << page_order); i++ )
   46.16      {
   46.17 -        omfn = gfn_to_mfn(d, gfn, &ot);
   46.18 +        omfn = gfn_to_mfn(d, gfn + i, &ot);
   46.19          if ( p2m_is_ram(ot) )
   46.20          {
   46.21              ASSERT(mfn_valid(omfn));
   46.22 -            set_gpfn_from_mfn(mfn_x(omfn)+i, INVALID_M2P_ENTRY);
   46.23 +            set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
   46.24          }
   46.25      }
   46.26  
   46.27      /* Then, look for m->p mappings for this range and deal with them */
   46.28      for ( i = 0; i < (1UL << page_order); i++ )
   46.29      {
   46.30 -        ogfn = mfn_to_gfn(d, _mfn(mfn));
   46.31 +        ogfn = mfn_to_gfn(d, _mfn(mfn+i));
   46.32          if (
   46.33  #ifdef __x86_64__
   46.34              (ogfn != 0x5555555555555555L)
   46.35 @@ -975,20 +976,20 @@ guest_physmap_add_entry(struct domain *d
   46.36              (ogfn != 0x55555555L)
   46.37  #endif
   46.38              && (ogfn != INVALID_M2P_ENTRY)
   46.39 -            && (ogfn != gfn) )
   46.40 +            && (ogfn != gfn + i) )
   46.41          {
   46.42              /* This machine frame is already mapped at another physical
   46.43               * address */
   46.44              P2M_DEBUG("aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
   46.45 -                      mfn, ogfn, gfn);
   46.46 +                      mfn + i, ogfn, gfn + i);
   46.47              omfn = gfn_to_mfn(d, ogfn, &ot);
   46.48              if ( p2m_is_ram(ot) )
   46.49              {
   46.50                  ASSERT(mfn_valid(omfn));
   46.51                  P2M_DEBUG("old gfn=%#lx -> mfn %#lx\n",
   46.52                            ogfn , mfn_x(omfn));
   46.53 -                if ( mfn_x(omfn) == mfn )
   46.54 -                    p2m_remove_page(d, ogfn, mfn, 0);
   46.55 +                if ( mfn_x(omfn) == (mfn + i) )
   46.56 +                    p2m_remove_page(d, ogfn, mfn + i, 0);
   46.57              }
   46.58          }
   46.59      }
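
The indexing fixes above all correct the same slip: with page_order > 0, every iteration previously inspected the first frame only. The intended shape, in miniature (check_alias() is a hypothetical per-frame helper):

    void check_alias(unsigned long gfn, unsigned long mfn);

    static void walk_mapping(unsigned long gfn, unsigned long mfn,
                             unsigned int page_order)
    {
        unsigned long i;

        /* frame i pairs guest frame gfn+i with machine frame mfn+i */
        for ( i = 0; i < (1UL << page_order); i++ )
            check_alias(gfn + i, mfn + i);
    }
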
    47.1 --- a/xen/arch/x86/msi.c	Tue Nov 04 12:07:22 2008 +0900
    47.2 +++ b/xen/arch/x86/msi.c	Tue Nov 04 12:43:19 2008 +0900
    47.3 @@ -33,8 +33,7 @@ DECLARE_BITMAP(msix_fixmap_pages, MAX_MS
    47.4  
    47.5  static int msix_fixmap_alloc(void)
    47.6  {
    47.7 -    int i;
    47.8 -    int rc = -1;
    47.9 +    int i, rc = -1;
   47.10  
   47.11      spin_lock(&msix_fixmap_lock);
   47.12      for ( i = 0; i < MAX_MSIX_PAGES; i++ )
   47.13 @@ -52,12 +51,8 @@ static int msix_fixmap_alloc(void)
   47.14  
   47.15  static void msix_fixmap_free(int idx)
   47.16  {
   47.17 -    if ( idx < FIX_MSIX_IO_RESERV_BASE )
   47.18 -        return;
   47.19 -
   47.20 -    spin_lock(&msix_fixmap_lock);
   47.21 -    clear_bit(idx - FIX_MSIX_IO_RESERV_BASE, &msix_fixmap_pages);
   47.22 -    spin_unlock(&msix_fixmap_lock);
   47.23 +    if ( idx >= FIX_MSIX_IO_RESERV_BASE )
   47.24 +        clear_bit(idx - FIX_MSIX_IO_RESERV_BASE, &msix_fixmap_pages);
   47.25  }
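
Dropping the spinlock here is presumably safe because clear_bit() is atomic and only the allocator needs find-and-set to be a single critical section. The split in miniature, as a user-space analogue with C11 atomics:

    #include <stdatomic.h>

    static atomic_ulong fixmap_pages;  /* one bit per MSI-X fixmap slot */

    /* Free side: one atomic clear suffices, no lock. */
    static void slot_free(int idx)
    {
        atomic_fetch_and(&fixmap_pages, ~(1UL << idx));
    }

    /* The alloc side (cf. msix_fixmap_alloc above) still scans for a clear
     * bit and sets it under the spinlock, since find-and-set must be one
     * critical section or two CPUs could claim the same slot. */
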
   47.26  
   47.27  /*
   47.28 @@ -78,19 +73,19 @@ static void msi_compose_msg(struct pci_d
   47.29          msg->address_lo =
   47.30              MSI_ADDR_BASE_LO |
   47.31              ((INT_DEST_MODE == 0) ?
   47.32 -                MSI_ADDR_DESTMODE_PHYS:
   47.33 -                MSI_ADDR_DESTMODE_LOGIC) |
   47.34 +             MSI_ADDR_DESTMODE_PHYS:
   47.35 +             MSI_ADDR_DESTMODE_LOGIC) |
   47.36              ((INT_DELIVERY_MODE != dest_LowestPrio) ?
   47.37 -                MSI_ADDR_REDIRECTION_CPU:
   47.38 -                MSI_ADDR_REDIRECTION_LOWPRI) |
   47.39 +             MSI_ADDR_REDIRECTION_CPU:
   47.40 +             MSI_ADDR_REDIRECTION_LOWPRI) |
   47.41              MSI_ADDR_DEST_ID(dest);
   47.42  
   47.43          msg->data =
   47.44              MSI_DATA_TRIGGER_EDGE |
   47.45              MSI_DATA_LEVEL_ASSERT |
   47.46              ((INT_DELIVERY_MODE != dest_LowestPrio) ?
   47.47 -                MSI_DATA_DELIVERY_FIXED:
   47.48 -                MSI_DATA_DELIVERY_LOWPRI) |
   47.49 +             MSI_DATA_DELIVERY_FIXED:
   47.50 +             MSI_DATA_DELIVERY_LOWPRI) |
   47.51              MSI_DATA_VECTOR(vector);
   47.52      }
   47.53  }
   47.54 @@ -128,7 +123,7 @@ static void read_msi_msg(struct msi_desc
   47.55      {
   47.56          void __iomem *base;
   47.57          base = entry->mask_base +
   47.58 -	    entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
   47.59 +            entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
   47.60  
   47.61          msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
   47.62          msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
   47.63 @@ -205,9 +200,9 @@ static void write_msi_msg(struct msi_des
   47.64              entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
   47.65  
   47.66          writel(msg->address_lo,
   47.67 -            base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
   47.68 +               base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
   47.69          writel(msg->address_hi,
   47.70 -            base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
   47.71 +               base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
   47.72          writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET);
   47.73          break;
   47.74      }
   47.75 @@ -230,7 +225,7 @@ void set_msi_irq_affinity(unsigned int i
   47.76      dest = cpu_mask_to_apicid(mask);
   47.77  
   47.78      if ( !desc )
   47.79 -	return;
   47.80 +        return;
   47.81  
   47.82      ASSERT(spin_is_locked(&irq_desc[irq].lock));
   47.83      spin_lock(&desc->dev->lock);
   47.84 @@ -398,8 +393,8 @@ static void msi_free_vector(int vector)
   47.85          unsigned long start;
   47.86  
   47.87          writel(1, entry->mask_base + entry->msi_attrib.entry_nr
   47.88 -              * PCI_MSIX_ENTRY_SIZE
   47.89 -              + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
   47.90 +               * PCI_MSIX_ENTRY_SIZE
   47.91 +               + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
   47.92  
   47.93          start = (unsigned long)entry->mask_base & ~(PAGE_SIZE - 1);
   47.94          msix_fixmap_free(virt_to_fix(start));
   47.95 @@ -460,20 +455,20 @@ static int msi_capability_init(struct pc
   47.96      entry->vector = vector;
   47.97      if ( is_mask_bit_support(control) )
   47.98          entry->mask_base = (void __iomem *)(long)msi_mask_bits_reg(pos,
   47.99 -                is_64bit_address(control));
  47.100 +                                                                   is_64bit_address(control));
  47.101      entry->dev = dev;
  47.102      if ( entry->msi_attrib.maskbit )
  47.103      {
  47.104          unsigned int maskbits, temp;
  47.105          /* All MSIs are unmasked by default, Mask them all */
   47.106          /* All MSIs are unmasked by default; mask them all. */
  47.107 -                       msi_mask_bits_reg(pos, is_64bit_address(control)));
  47.108 +                                   msi_mask_bits_reg(pos, is_64bit_address(control)));
  47.109          temp = (1 << multi_msi_capable(control));
  47.110          temp = ((temp - 1) & ~temp);
  47.111          maskbits |= temp;
  47.112          pci_conf_write32(bus, slot, func,
  47.113 -            msi_mask_bits_reg(pos, is_64bit_address(control)),
  47.114 -            maskbits);
  47.115 +                         msi_mask_bits_reg(pos, is_64bit_address(control)),
  47.116 +                         maskbits);
  47.117      }
  47.118      list_add_tail(&entry->list, &dev->msi_list);
  47.119  
  47.120 @@ -575,14 +570,14 @@ static int __pci_enable_msi(struct msi_i
  47.121  
  47.122      pdev = pci_lock_pdev(msi->bus, msi->devfn);
  47.123      if ( !pdev )
  47.124 -	return -ENODEV;
  47.125 +        return -ENODEV;
  47.126  
  47.127      if ( find_msi_entry(pdev, msi->vector, PCI_CAP_ID_MSI) )
  47.128      {
  47.129 -	spin_unlock(&pdev->lock);
  47.130 +        spin_unlock(&pdev->lock);
   47.131          dprintk(XENLOG_WARNING, "vector %d has already been mapped to MSI on "
  47.132 -            "device %02x:%02x.%01x.\n", msi->vector, msi->bus,
  47.133 -            PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
  47.134 +                "device %02x:%02x.%01x.\n", msi->vector, msi->bus,
  47.135 +                PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
  47.136          return 0;
  47.137      }
  47.138  
  47.139 @@ -601,7 +596,7 @@ static void __pci_disable_msi(int vector
  47.140  
  47.141      entry = irq_desc[vector].msi_desc;
  47.142      if ( !entry )
  47.143 -	return;
  47.144 +        return;
  47.145      /*
  47.146       * Lock here is safe.  msi_desc can not be removed without holding
  47.147       * both irq_desc[].lock (which we do) and pdev->lock.
  47.148 @@ -649,20 +644,20 @@ static int __pci_enable_msix(struct msi_
  47.149  
  47.150      pdev = pci_lock_pdev(msi->bus, msi->devfn);
  47.151      if ( !pdev )
  47.152 -	return -ENODEV;
  47.153 +        return -ENODEV;
  47.154  
  47.155      pos = pci_find_cap_offset(msi->bus, slot, func, PCI_CAP_ID_MSIX);
  47.156      control = pci_conf_read16(msi->bus, slot, func, msi_control_reg(pos));
  47.157      nr_entries = multi_msix_capable(control);
  47.158      if (msi->entry_nr > nr_entries)
  47.159      {
  47.160 -	spin_unlock(&pdev->lock);
  47.161 +        spin_unlock(&pdev->lock);
  47.162          return -EINVAL;
  47.163      }
  47.164  
  47.165      if ( find_msi_entry(pdev, msi->vector, PCI_CAP_ID_MSIX) )
  47.166      {
  47.167 -	spin_unlock(&pdev->lock);
  47.168 +        spin_unlock(&pdev->lock);
   47.169          dprintk(XENLOG_WARNING, "vector %d has already been mapped to MSIX on "
  47.170                  "device %02x:%02x.%01x.\n", msi->vector, msi->bus,
  47.171                  PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
  47.172 @@ -684,7 +679,7 @@ static void __pci_disable_msix(int vecto
  47.173  
  47.174      entry = irq_desc[vector].msi_desc;
  47.175      if ( !entry )
  47.176 -	return;
  47.177 +        return;
  47.178      /*
  47.179       * Lock here is safe.  msi_desc can not be removed without holding
  47.180       * both irq_desc[].lock (which we do) and pdev->lock.
  47.181 @@ -712,7 +707,7 @@ int pci_enable_msi(struct msi_info *msi)
  47.182      ASSERT(spin_is_locked(&irq_desc[msi->vector].lock));
  47.183  
  47.184      return  msi->table_base ? __pci_enable_msix(msi) :
  47.185 -                              __pci_enable_msi(msi);
  47.186 +        __pci_enable_msi(msi);
  47.187  }
  47.188  
  47.189  void pci_disable_msi(int vector)
  47.190 @@ -720,7 +715,7 @@ void pci_disable_msi(int vector)
  47.191      irq_desc_t *desc = &irq_desc[vector];
  47.192      ASSERT(spin_is_locked(&desc->lock));
  47.193      if ( !desc->msi_desc )
  47.194 -	return;
  47.195 +        return;
  47.196  
  47.197      if ( desc->msi_desc->msi_attrib.type == PCI_CAP_ID_MSI )
  47.198          __pci_disable_msi(vector);
  47.199 @@ -734,7 +729,7 @@ static void msi_free_vectors(struct pci_
  47.200      irq_desc_t *desc;
  47.201      unsigned long flags;
  47.202  
  47.203 -retry:
  47.204 + retry:
  47.205      list_for_each_entry_safe( entry, tmp, &dev->msi_list, list )
  47.206      {
  47.207          desc = &irq_desc[entry->vector];
  47.208 @@ -742,7 +737,7 @@ retry:
  47.209          local_irq_save(flags);
  47.210          if ( !spin_trylock(&desc->lock) )
  47.211          {
  47.212 -             local_irq_restore(flags);
  47.213 +            local_irq_restore(flags);
  47.214              goto retry;
  47.215          }
  47.216  
    48.1 --- a/xen/arch/x86/oprofile/nmi_int.c	Tue Nov 04 12:07:22 2008 +0900
    48.2 +++ b/xen/arch/x86/oprofile/nmi_int.c	Tue Nov 04 12:43:19 2008 +0900
    48.3 @@ -36,6 +36,55 @@ static unsigned long saved_lvtpc[NR_CPUS
    48.4  static char *cpu_type;
    48.5  
    48.6  extern int is_active(struct domain *d);
    48.7 +extern int is_passive(struct domain *d);
    48.8 +
    48.9 +int passive_domain_do_rdmsr(struct cpu_user_regs *regs)
   48.10 +{
   48.11 +	u64 msr_content;
   48.12 +	int type, index;
   48.13 +	struct vpmu_struct *vpmu = vcpu_vpmu(current);
   48.14 +
   48.15 +	if ( model->is_arch_pmu_msr == NULL )
   48.16 +		return 0;
   48.17 +	if ( !model->is_arch_pmu_msr((u64)regs->ecx, &type, &index) )
   48.18 +		return 0;
   48.19 +	if ( !(vpmu->flags & PASSIVE_DOMAIN_ALLOCATED) )
    48.20 +		if ( !model->allocated_msr(current) )
   48.21 +			return 0;
   48.22 +
   48.23 +	model->load_msr(current, type, index, &msr_content);
   48.24 +	regs->eax = msr_content & 0xFFFFFFFF;
   48.25 +	regs->edx = msr_content >> 32;
   48.26 +	return 1;
   48.27 +}
   48.28 +
   48.29 +
   48.30 +int passive_domain_do_wrmsr(struct cpu_user_regs *regs)
   48.31 +{
   48.32 +	u64 msr_content;
   48.33 +	int type, index;
   48.34 +	struct vpmu_struct *vpmu = vcpu_vpmu(current);
   48.35 +
   48.36 +	if ( model->is_arch_pmu_msr == NULL )
   48.37 +		return 0;
   48.38 +	if ( !model->is_arch_pmu_msr((u64)regs->ecx, &type, &index) )
   48.39 +		return 0;
   48.40 +
   48.41 +	if ( !(vpmu->flags & PASSIVE_DOMAIN_ALLOCATED) )
    48.42 +		if ( !model->allocated_msr(current) )
   48.43 +			return 0;
   48.44 +
   48.45 +	msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
   48.46 +	model->save_msr(current, type, index, msr_content);
   48.47 +	return 1;
   48.48 +}
   48.49 +
   48.50 +void passive_domain_destroy(struct vcpu *v)
   48.51 +{
   48.52 +	struct vpmu_struct *vpmu = vcpu_vpmu(v);
   48.53 +	if ( vpmu->flags & PASSIVE_DOMAIN_ALLOCATED )
   48.54 +		model->free_msr(v);
   48.55 +}
   48.56  
   48.57  static int nmi_callback(struct cpu_user_regs *regs, int cpu)
   48.58  {
   48.59 @@ -46,6 +95,8 @@ static int nmi_callback(struct cpu_user_
   48.60  	if ( ovf && is_active(current->domain) && !xen_mode )
   48.61  		send_guest_vcpu_virq(current, VIRQ_XENOPROF);
   48.62  
    48.63 +	if ( ovf == 2 )
    48.64 +		test_and_set_bool(current->nmi_pending);
   48.65  	return 1;
   48.66  }
   48.67   
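
These three entry points tie the changeset together: the intercepts added in vmx.c call them for guest MSR accesses, and an overflow attributed to a passive domain (ovf == 2 from the ppro model below) is reflected back as a virtual NMI above. The read-side dispatch, condensed:

    struct cpu_user_regs;
    int vpmu_do_rdmsr(struct cpu_user_regs *regs);
    int passive_domain_do_rdmsr(struct cpu_user_regs *regs);

    /* Condensed from vmx_msr_read_intercept()'s default case. */
    static int pmu_msr_read(struct cpu_user_regs *regs)
    {
        if ( vpmu_do_rdmsr(regs) )           /* active (profiled HVM) domain */
            return 1;
        if ( passive_domain_do_rdmsr(regs) ) /* passive domain: shadow MSRs */
            return 1;
        return 0;                            /* fall through to other handlers */
    }
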
    49.1 --- a/xen/arch/x86/oprofile/op_model_ppro.c	Tue Nov 04 12:07:22 2008 +0900
    49.2 +++ b/xen/arch/x86/oprofile/op_model_ppro.c	Tue Nov 04 12:43:19 2008 +0900
    49.3 @@ -18,6 +18,8 @@
    49.4  #include <xen/sched.h>
    49.5  #include <asm/regs.h>
    49.6  #include <asm/current.h>
    49.7 +#include <asm/hvm/vmx/vpmu.h>
    49.8 +#include <asm/hvm/vmx/vpmu_core2.h>
    49.9   
   49.10  #include "op_x86_model.h"
   49.11  #include "op_counter.h"
   49.12 @@ -39,9 +41,11 @@
   49.13  #define CTRL_SET_KERN(val,k) (val |= ((k & 1) << 17))
   49.14  #define CTRL_SET_UM(val, m) (val |= (m << 8))
   49.15  #define CTRL_SET_EVENT(val, e) (val |= e)
   49.16 -
    49.17 +#define IS_ACTIVE(val)  ((val) & (1 << 22))
    49.18 +#define IS_ENABLE(val)  ((val) & (1 << 20))
   49.19  static unsigned long reset_value[NUM_COUNTERS];
   49.20  int ppro_has_global_ctrl = 0;
   49.21 +extern int is_passive(struct domain *d);
   49.22   
   49.23  static void ppro_fill_in_addresses(struct op_msrs * const msrs)
   49.24  {
   49.25 @@ -103,6 +107,7 @@ static int ppro_check_ctrs(unsigned int 
   49.26  	int ovf = 0;
   49.27  	unsigned long eip = regs->eip;
   49.28  	int mode = xenoprofile_get_mode(current, regs);
   49.29 +	struct arch_msr_pair *msrs_content = vcpu_vpmu(current)->context;
   49.30  
   49.31  	for (i = 0 ; i < NUM_COUNTERS; ++i) {
   49.32  		if (!reset_value[i])
   49.33 @@ -111,7 +116,18 @@ static int ppro_check_ctrs(unsigned int 
   49.34  		if (CTR_OVERFLOWED(low)) {
   49.35  			xenoprof_log_event(current, regs, eip, mode, i);
   49.36  			CTR_WRITE(reset_value[i], msrs, i);
   49.37 -			ovf = 1;
    49.38 +			if ( is_passive(current->domain) && (mode != 2) &&
    49.39 +			     (vcpu_vpmu(current)->flags & PASSIVE_DOMAIN_ALLOCATED) )
   49.40 +			{
   49.41 +				if ( IS_ACTIVE(msrs_content[i].control) )
   49.42 +				{
   49.43 +					msrs_content[i].counter = (low | (u64)high << 32);
   49.44 +					if ( IS_ENABLE(msrs_content[i].control) )
   49.45 +						ovf = 2;
   49.46 +				}
   49.47 +			}
   49.48 +			if ( !ovf )
   49.49 +				ovf = 1;
   49.50  		}
   49.51  	}
   49.52  
   49.53 @@ -159,6 +175,82 @@ static void ppro_stop(struct op_msrs con
   49.54          wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
   49.55  }
   49.56  
    49.57 +static int ppro_is_arch_pmu_msr(u64 msr_index, int *type, int *index)
    49.58 +{
    49.59 +	if ( (msr_index >= MSR_IA32_PERFCTR0) &&
    49.60 +	     (msr_index < (MSR_IA32_PERFCTR0 + NUM_COUNTERS)) )
    49.61 +	{
    49.62 +		*type = MSR_TYPE_ARCH_COUNTER;
    49.63 +		*index = msr_index - MSR_IA32_PERFCTR0;
    49.64 +		return 1;
    49.65 +	}
    49.66 +	if ( (msr_index >= MSR_P6_EVNTSEL0) &&
    49.67 +	     (msr_index < (MSR_P6_EVNTSEL0 + NUM_CONTROLS)) )
    49.68 +	{
    49.69 +		*type = MSR_TYPE_ARCH_CTRL;
    49.70 +		*index = msr_index - MSR_P6_EVNTSEL0;
    49.71 +		return 1;
    49.72 +	}
    49.73 +
    49.74 +	return 0;
    49.75 +}
   49.76 +
   49.77 +static int ppro_allocate_msr(struct vcpu *v)
   49.78 +{
   49.79 +	struct vpmu_struct *vpmu = vcpu_vpmu(v);
   49.80 +	struct arch_msr_pair *msr_content;
    49.81 +
    49.82 +	msr_content = xmalloc_bytes(sizeof(struct arch_msr_pair) * NUM_COUNTERS);
    49.83 +	if ( !msr_content )
    49.84 +		goto out;
    49.85 +	memset(msr_content, 0, sizeof(struct arch_msr_pair) * NUM_COUNTERS);
    49.86 +	vpmu->context = (void *)msr_content;
    49.87 +	vpmu->flags = 0;
    49.88 +	vpmu->flags |= PASSIVE_DOMAIN_ALLOCATED;
    49.89 +	return 1;
    49.90 +out:
    49.91 +	gdprintk(XENLOG_WARNING, "Insufficient memory for oprofile; oprofile is "
    49.92 +		 "unavailable on domain %d vcpu %d.\n",
    49.93 +		 v->domain->domain_id, v->vcpu_id);
    49.94 +	return 0;
    49.95 +}
   49.96 +
   49.97 +static void ppro_free_msr(struct vcpu *v)
   49.98 +{
   49.99 +	struct vpmu_struct *vpmu = vcpu_vpmu(v);
  49.100 +
  49.101 +	xfree(vpmu->context);
  49.102 +	vpmu->flags &= ~PASSIVE_DOMAIN_ALLOCATED;
  49.103 +}
  49.104 +
  49.105 +static void ppro_load_msr(struct vcpu *v, int type, int index, u64 *msr_content)
  49.106 +{
  49.107 +	struct arch_msr_pair *msrs = vcpu_vpmu(v)->context;
  49.108 +	switch ( type )
  49.109 +	{
  49.110 +	case MSR_TYPE_ARCH_COUNTER:
  49.111 +		*msr_content = msrs[index].counter;
  49.112 +		break;
  49.113 +	case MSR_TYPE_ARCH_CTRL:
  49.114 +		*msr_content = msrs[index].control;
  49.115 +		break;
  49.116 +	}	
  49.117 +}
  49.118 +
  49.119 +static void ppro_save_msr(struct vcpu *v, int type, int index, u64 msr_content)
  49.120 +{
  49.121 +	struct arch_msr_pair *msrs = vcpu_vpmu(v)->context;
  49.122 +	
  49.123 +	switch ( type )
  49.124 +	{
  49.125 +	case MSR_TYPE_ARCH_COUNTER:
  49.126 +		msrs[index].counter = msr_content;
  49.127 +		break;
  49.128 +	case MSR_TYPE_ARCH_CTRL:
  49.129 +		msrs[index].control = msr_content;
  49.130 +		break;
  49.131 +	}	
  49.132 +}
  49.133  
  49.134  struct op_x86_model_spec const op_ppro_spec = {
  49.135  	.num_counters = NUM_COUNTERS,
  49.136 @@ -167,5 +259,10 @@ struct op_x86_model_spec const op_ppro_s
  49.137  	.setup_ctrs = &ppro_setup_ctrs,
  49.138  	.check_ctrs = &ppro_check_ctrs,
  49.139  	.start = &ppro_start,
  49.140 -	.stop = &ppro_stop
  49.141 +	.stop = &ppro_stop,
  49.142 +	.is_arch_pmu_msr = &ppro_is_arch_pmu_msr,
  49.143 +	.allocated_msr = &ppro_allocate_msr,
  49.144 +	.free_msr = &ppro_free_msr,
  49.145 +	.load_msr = &ppro_load_msr,
  49.146 +	.save_msr = &ppro_save_msr
  49.147  };
    50.1 --- a/xen/arch/x86/oprofile/op_x86_model.h	Tue Nov 04 12:07:22 2008 +0900
    50.2 +++ b/xen/arch/x86/oprofile/op_x86_model.h	Tue Nov 04 12:43:19 2008 +0900
    50.3 @@ -41,6 +41,11 @@ struct op_x86_model_spec {
    50.4  			  struct cpu_user_regs * const regs);
    50.5  	void (*start)(struct op_msrs const * const msrs);
    50.6  	void (*stop)(struct op_msrs const * const msrs);
    50.7 +	int (*is_arch_pmu_msr)(u64 msr_index, int *type, int *index);
    50.8 +	int (*allocated_msr)(struct vcpu *v);
    50.9 +	void (*free_msr)(struct vcpu *v);
   50.10 +	void (*load_msr)(struct vcpu * const v, int type, int index, u64 *msr_content);
   50.11 +        void (*save_msr)(struct vcpu * const v, int type, int index, u64 msr_content);
   50.12  };
   50.13  
   50.14  extern struct op_x86_model_spec const op_ppro_spec;
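
A model that wants passive-domain support fills in the five new hooks; leaving is_arch_pmu_msr NULL makes passive_domain_do_{rd,wr}msr() bail out early, as nmi_int.c shows. A hypothetical initializer (every example_* symbol is invented for illustration):

    static struct op_x86_model_spec const op_example_spec = {
    	.num_counters = 2,
    	.num_controls = 2,
    	.fill_in_addresses = &example_fill_in_addresses,
    	.setup_ctrs = &example_setup_ctrs,
    	.check_ctrs = &example_check_ctrs,
    	.start = &example_start,
    	.stop = &example_stop,
    	.is_arch_pmu_msr = &example_is_arch_pmu_msr,
    	.allocated_msr = &example_allocate_msr,
    	.free_msr = &example_free_msr,
    	.load_msr = &example_load_msr,
    	.save_msr = &example_save_msr,
    };
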
    51.1 --- a/xen/arch/x86/setup.c	Tue Nov 04 12:07:22 2008 +0900
    51.2 +++ b/xen/arch/x86/setup.c	Tue Nov 04 12:43:19 2008 +0900
    51.3 @@ -969,6 +969,7 @@ void __init __start_xen(unsigned long mb
    51.4      serial_init_postirq();
    51.5  
    51.6      BUG_ON(!local_irq_is_enabled());
    51.7 +    spin_debug_enable();
    51.8  
    51.9      for_each_present_cpu ( i )
   51.10      {
    52.1 --- a/xen/arch/x86/smpboot.c	Tue Nov 04 12:07:22 2008 +0900
    52.2 +++ b/xen/arch/x86/smpboot.c	Tue Nov 04 12:43:19 2008 +0900
    52.3 @@ -101,7 +101,7 @@ static cpumask_t smp_commenced_mask;
    52.4  static int __devinitdata tsc_sync_disabled;
    52.5  
    52.6  /* Per CPU bogomips and other parameters */
    52.7 -struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
    52.8 +struct cpuinfo_x86 cpu_data[NR_CPUS];
    52.9  EXPORT_SYMBOL(cpu_data);
   52.10  
   52.11  u32 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
   52.12 @@ -112,7 +112,7 @@ static void map_cpu_to_logical_apicid(vo
   52.13  /* State of each CPU. */
   52.14  DEFINE_PER_CPU(int, cpu_state) = { 0 };
   52.15  
   52.16 -static void *stack_base[NR_CPUS] __cacheline_aligned;
   52.17 +static void *stack_base[NR_CPUS];
   52.18  static DEFINE_SPINLOCK(cpu_add_remove_lock);
   52.19  
   52.20  /*
   52.21 @@ -805,14 +805,6 @@ static inline int alloc_cpu_id(void)
   52.22  	return cpu;
   52.23  }
   52.24  
   52.25 -static struct vcpu *prepare_idle_vcpu(unsigned int cpu)
   52.26 -{
   52.27 -	if (idle_vcpu[cpu])
   52.28 -		return idle_vcpu[cpu];
   52.29 -
   52.30 -	return alloc_idle_vcpu(cpu);
   52.31 -}
   52.32 -
   52.33  static void *prepare_idle_stack(unsigned int cpu)
   52.34  {
   52.35  	if (!stack_base[cpu])
   52.36 @@ -849,7 +841,7 @@ static int __devinit do_boot_cpu(int api
   52.37  
   52.38  	booting_cpu = cpu;
   52.39  
   52.40 -	v = prepare_idle_vcpu(cpu);
   52.41 +	v = alloc_idle_vcpu(cpu);
   52.42  	BUG_ON(v == NULL);
   52.43  
   52.44  	/* start_eip had better be page-aligned! */
    53.1 --- a/xen/arch/x86/time.c	Tue Nov 04 12:07:22 2008 +0900
    53.2 +++ b/xen/arch/x86/time.c	Tue Nov 04 12:43:19 2008 +0900
    53.3 @@ -1063,8 +1063,6 @@ void init_percpu_time(void)
    53.4  /* Late init function (after all CPUs are booted). */
    53.5  int __init init_xen_time(void)
    53.6  {
    53.7 -    local_irq_disable();
    53.8 -
     53.9      /* Check if the TSC is invariant during deep C states;
    53.10         this is a new feature introduced with Nehalem. */
   53.11      if ( cpuid_edx(0x80000007) & (1u<<8) )
   53.12 @@ -1079,8 +1077,6 @@ int __init init_xen_time(void)
   53.13  
   53.14      do_settime(get_cmos_time(), 0, NOW());
   53.15  
   53.16 -    local_irq_enable();
   53.17 -
   53.18      return 0;
   53.19  }
   53.20  
    54.1 --- a/xen/arch/x86/traps.c	Tue Nov 04 12:07:22 2008 +0900
    54.2 +++ b/xen/arch/x86/traps.c	Tue Nov 04 12:43:19 2008 +0900
    54.3 @@ -1030,7 +1030,7 @@ static int handle_gdt_ldt_mapping_fault(
    54.4  #endif
    54.5  
    54.6  static int __spurious_page_fault(
    54.7 -    unsigned long addr, struct cpu_user_regs *regs)
    54.8 +    unsigned long addr, unsigned int error_code)
    54.9  {
   54.10      unsigned long mfn, cr3 = read_cr3();
   54.11  #if CONFIG_PAGING_LEVELS >= 4
   54.12 @@ -1052,17 +1052,17 @@ static int __spurious_page_fault(
   54.13          return 0;
   54.14  
   54.15      /* Reserved bit violations are never spurious faults. */
   54.16 -    if ( regs->error_code & PFEC_reserved_bit )
   54.17 +    if ( error_code & PFEC_reserved_bit )
   54.18          return 0;
   54.19  
   54.20      required_flags  = _PAGE_PRESENT;
   54.21 -    if ( regs->error_code & PFEC_write_access )
   54.22 +    if ( error_code & PFEC_write_access )
   54.23          required_flags |= _PAGE_RW;
   54.24 -    if ( regs->error_code & PFEC_user_mode )
   54.25 +    if ( error_code & PFEC_user_mode )
   54.26          required_flags |= _PAGE_USER;
   54.27  
   54.28      disallowed_flags = 0;
   54.29 -    if ( regs->error_code & PFEC_insn_fetch )
   54.30 +    if ( error_code & PFEC_insn_fetch )
   54.31          disallowed_flags |= _PAGE_NX;
   54.32  
   54.33      mfn = cr3 >> PAGE_SHIFT;
   54.34 @@ -1120,7 +1120,7 @@ static int __spurious_page_fault(
   54.35      dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
   54.36              "at addr %lx, e/c %04x\n",
   54.37              current->domain->domain_id, current->vcpu_id,
   54.38 -            addr, regs->error_code);
   54.39 +            addr, error_code);
   54.40  #if CONFIG_PAGING_LEVELS >= 4
   54.41      dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
   54.42  #endif
   54.43 @@ -1129,14 +1129,11 @@ static int __spurious_page_fault(
   54.44  #endif
   54.45      dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
   54.46      dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
   54.47 -#ifndef NDEBUG
   54.48 -    show_registers(regs);
   54.49 -#endif
   54.50      return 1;
   54.51  }
   54.52  
   54.53  static int spurious_page_fault(
   54.54 -    unsigned long addr, struct cpu_user_regs *regs)
   54.55 +    unsigned long addr, unsigned int error_code)
   54.56  {
   54.57      unsigned long flags;
   54.58      int           is_spurious;
   54.59 @@ -1146,7 +1143,7 @@ static int spurious_page_fault(
   54.60       * page tables from becoming invalid under our feet during the walk.
   54.61       */
   54.62      local_irq_save(flags);
   54.63 -    is_spurious = __spurious_page_fault(addr, regs);
   54.64 +    is_spurious = __spurious_page_fault(addr, error_code);
   54.65      local_irq_restore(flags);
   54.66  
   54.67      return is_spurious;
   54.68 @@ -1208,9 +1205,13 @@ static int fixup_page_fault(unsigned lon
   54.69  asmlinkage void do_page_fault(struct cpu_user_regs *regs)
   54.70  {
   54.71      unsigned long addr, fixup;
   54.72 +    unsigned int error_code;
   54.73  
   54.74      addr = read_cr2();
   54.75  
   54.76 +    /* fixup_page_fault() might change regs->error_code, so cache it here. */
   54.77 +    error_code = regs->error_code;
   54.78 +
   54.79      DEBUGGER_trap_entry(TRAP_page_fault, regs);
   54.80  
   54.81      perfc_incr(page_faults);
   54.82 @@ -1220,7 +1221,7 @@ asmlinkage void do_page_fault(struct cpu
   54.83  
   54.84      if ( unlikely(!guest_mode(regs)) )
   54.85      {
   54.86 -        if ( spurious_page_fault(addr, regs) )
   54.87 +        if ( spurious_page_fault(addr, error_code) )
   54.88              return;
   54.89  
   54.90          if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
   54.91 @@ -1239,11 +1240,11 @@ asmlinkage void do_page_fault(struct cpu
   54.92          panic("FATAL PAGE FAULT\n"
   54.93                "[error_code=%04x]\n"
   54.94                "Faulting linear address: %p\n",
   54.95 -              regs->error_code, _p(addr));
   54.96 +              error_code, _p(addr));
   54.97      }
   54.98  
   54.99      if ( unlikely(current->domain->arch.suppress_spurious_page_faults
  54.100 -                  && spurious_page_fault(addr, regs)) )
  54.101 +                  && spurious_page_fault(addr, error_code)) )
  54.102          return;
  54.103  
  54.104      propagate_page_fault(addr, regs->error_code);
    55.1 --- a/xen/arch/x86/x86_32/domain_page.c	Tue Nov 04 12:07:22 2008 +0900
    55.2 +++ b/xen/arch/x86/x86_32/domain_page.c	Tue Nov 04 12:43:19 2008 +0900
    55.3 @@ -43,7 +43,7 @@ static inline struct vcpu *mapcache_curr
    55.4  void *map_domain_page(unsigned long mfn)
    55.5  {
    55.6      unsigned long va;
    55.7 -    unsigned int idx, i;
    55.8 +    unsigned int idx, i, flags;
    55.9      struct vcpu *v;
   55.10      struct mapcache_domain *dcache;
   55.11      struct mapcache_vcpu *vcache;
   55.12 @@ -69,7 +69,7 @@ void *map_domain_page(unsigned long mfn)
   55.13          goto out;
   55.14      }
   55.15  
   55.16 -    spin_lock(&dcache->lock);
   55.17 +    spin_lock_irqsave(&dcache->lock, flags);
   55.18  
   55.19      /* Has some other CPU caused a wrap? We must flush if so. */
   55.20      if ( unlikely(dcache->epoch != vcache->shadow_epoch) )
   55.21 @@ -105,7 +105,7 @@ void *map_domain_page(unsigned long mfn)
   55.22      set_bit(idx, dcache->inuse);
   55.23      dcache->cursor = idx + 1;
   55.24  
   55.25 -    spin_unlock(&dcache->lock);
   55.26 +    spin_unlock_irqrestore(&dcache->lock, flags);
   55.27  
   55.28      l1e_write(&dcache->l1tab[idx], l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
   55.29  
   55.30 @@ -114,7 +114,7 @@ void *map_domain_page(unsigned long mfn)
   55.31      return (void *)va;
   55.32  }
   55.33  
   55.34 -void unmap_domain_page(void *va)
   55.35 +void unmap_domain_page(const void *va)
   55.36  {
   55.37      unsigned int idx;
   55.38      struct vcpu *v;
   55.39 @@ -241,7 +241,7 @@ void *map_domain_page_global(unsigned lo
   55.40      return (void *)va;
   55.41  }
   55.42  
   55.43 -void unmap_domain_page_global(void *va)
   55.44 +void unmap_domain_page_global(const void *va)
   55.45  {
   55.46      unsigned long __va = (unsigned long)va;
   55.47      l2_pgentry_t *pl2e;
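Switching dcache->lock to the irqsave variants makes map_domain_page() legal in contexts that run with interrupts disabled, matching the lock/IRQ consistency checks added to xen/common/spinlock.c below. A hedged usage sketch; copy_page_from_mfn() is a hypothetical helper, not Xen code:

    static void copy_page_from_mfn(void *dst, unsigned long mfn)
    {
        const void *src = map_domain_page(mfn); /* takes dcache->lock irqsave */
        memcpy(dst, src, PAGE_SIZE);
        unmap_domain_page(src); /* const void * now accepted without a cast */
    }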
    56.1 --- a/xen/arch/x86/x86_64/compat/mm.c	Tue Nov 04 12:07:22 2008 +0900
    56.2 +++ b/xen/arch/x86/x86_64/compat/mm.c	Tue Nov 04 12:43:19 2008 +0900
    56.3 @@ -231,6 +231,8 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mm
    56.4              case MMUEXT_PIN_L4_TABLE:
    56.5              case MMUEXT_UNPIN_TABLE:
    56.6              case MMUEXT_NEW_BASEPTR:
    56.7 +            case MMUEXT_CLEAR_PAGE:
    56.8 +            case MMUEXT_COPY_PAGE:
    56.9                  arg1 = XLAT_mmuext_op_arg1_mfn;
   56.10                  break;
   56.11              default:
   56.12 @@ -258,6 +260,9 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mm
   56.13              case MMUEXT_INVLPG_MULTI:
   56.14                  arg2 = XLAT_mmuext_op_arg2_vcpumask;
   56.15                  break;
   56.16 +            case MMUEXT_COPY_PAGE:
   56.17 +                arg2 = XLAT_mmuext_op_arg2_src_mfn;
   56.18 +                break;
   56.19              default:
   56.20                  arg2 = -1;
   56.21                  break;
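MMUEXT_CLEAR_PAGE and MMUEXT_COPY_PAGE carry the destination MFN in arg1 and, for the copy, the source MFN in arg2, which is exactly what the two new XLAT cases translate. A hedged guest-side sketch, assuming the public struct mmuext_op layout with arg1.mfn and arg2.src_mfn fields:

    struct mmuext_op op = {
        .cmd = MMUEXT_COPY_PAGE,
        .arg1.mfn = dst_mfn,       /* destination page */
        .arg2.src_mfn = src_mfn,   /* source page */
    };
    if ( HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) != 0 )
        /* e.g. fall back to mapping both pages and memcpy() */;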
    57.1 --- a/xen/arch/x86/x86_64/cpufreq.c	Tue Nov 04 12:07:22 2008 +0900
    57.2 +++ b/xen/arch/x86/x86_64/cpufreq.c	Tue Nov 04 12:43:19 2008 +0900
    57.3 @@ -56,34 +56,13 @@ compat_set_px_pminfo(uint32_t cpu, struc
    57.4  	return -EFAULT;
    57.5  
    57.6  #define XLAT_processor_performance_HNDL_states(_d_, _s_) do { \
    57.7 -    xen_processor_px_t *xen_states = NULL; \
    57.8 -\
    57.9 -    if ( likely((_s_)->state_count > 0) ) \
   57.10 -    { \
   57.11 -        XEN_GUEST_HANDLE(compat_processor_px_t) states; \
   57.12 -        compat_processor_px_t state; \
   57.13 -        int i; \
   57.14 -\
   57.15 -        xen_states = xlat_malloc_array(xlat_page_current, \
   57.16 -                               xen_processor_px_t, (_s_)->state_count); \
   57.17 -        if ( unlikely(xen_states == NULL) ) \
   57.18 -            return -EFAULT; \
   57.19 -\
   57.20 -        if ( unlikely(!compat_handle_okay((_s_)->states, \
   57.21 -                                (_s_)->state_count)) ) \
   57.22 -            return -EFAULT; \
   57.23 -        guest_from_compat_handle(states, (_s_)->states); \
   57.24 -\
   57.25 -        for ( i = 0; i < _s_->state_count; i++ ) \
   57.26 -        { \
   57.27 -           if ( unlikely(copy_from_guest_offset(&state, states, i, 1)) ) \
   57.28 -               return -EFAULT; \
   57.29 -           XLAT_processor_px(&xen_states[i], &state); \
   57.30 -        } \
   57.31 -    } \
   57.32 -\
   57.33 -    set_xen_guest_handle((_d_)->states, xen_states); \
   57.34 +    XEN_GUEST_HANDLE(compat_processor_px_t) states; \
   57.35 +    if ( unlikely(!compat_handle_okay((_s_)->states, (_s_)->state_count)) ) \
   57.36 +        return -EFAULT; \
   57.37 +    guest_from_compat_handle(states, (_s_)->states); \
   57.38 +    (_d_)->states = guest_handle_cast(states, xen_processor_px_t); \
   57.39  } while (0)
   57.40 +
   57.41      XLAT_processor_performance(xen_perf, perf);
   57.42  #undef XLAT_processor_performance_HNDL_states
   57.43  
    58.1 --- a/xen/common/event_channel.c	Tue Nov 04 12:07:22 2008 +0900
    58.2 +++ b/xen/common/event_channel.c	Tue Nov 04 12:43:19 2008 +0900
    58.3 @@ -386,7 +386,7 @@ static long __evtchn_close(struct domain
    58.4              if ( v->virq_to_evtchn[chn1->u.virq] != port1 )
    58.5                  continue;
    58.6              v->virq_to_evtchn[chn1->u.virq] = 0;
    58.7 -            spin_barrier(&v->virq_lock);
    58.8 +            spin_barrier_irq(&v->virq_lock);
    58.9          }
   58.10          break;
   58.11  
    59.1 --- a/xen/common/kernel.c	Tue Nov 04 12:07:22 2008 +0900
    59.2 +++ b/xen/common/kernel.c	Tue Nov 04 12:43:19 2008 +0900
    59.3 @@ -221,7 +221,8 @@ DO(xen_version)(int cmd, XEN_GUEST_HANDL
    59.4                  fi.submap |= 1U << XENFEAT_supervisor_mode_kernel;
    59.5  #ifdef CONFIG_X86
    59.6              if ( !is_hvm_vcpu(current) )
    59.7 -                fi.submap |= 1U << XENFEAT_mmu_pt_update_preserve_ad;
    59.8 +                fi.submap |= (1U << XENFEAT_mmu_pt_update_preserve_ad) |
    59.9 +                             (1U << XENFEAT_highmem_assist);
   59.10  #endif
   59.11              break;
   59.12          default:
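A PV guest can probe for the new submap bit before relying on the highmem-assist MMU ops. A hedged sketch using the standard features hypercall (error handling elided; the guest-side flag is hypothetical):

    xen_feature_info_t fi = { .submap_idx = 0 };
    if ( (HYPERVISOR_xen_version(XENVER_get_features, &fi) == 0) &&
         (fi.submap & (1U << XENFEAT_highmem_assist)) )
        can_use_mmuext_clear_copy = 1;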
    60.1 --- a/xen/common/keyhandler.c	Tue Nov 04 12:07:22 2008 +0900
    60.2 +++ b/xen/common/keyhandler.c	Tue Nov 04 12:43:19 2008 +0900
    60.3 @@ -183,9 +183,9 @@ static void dump_domains(unsigned char k
    60.4      {
    60.5          printk("General information for domain %u:\n", d->domain_id);
    60.6          cpuset_print(tmpstr, sizeof(tmpstr), d->domain_dirty_cpumask);
    60.7 -        printk("    refcnt=%d nr_pages=%d xenheap_pages=%d "
    60.8 +        printk("    refcnt=%d dying=%d nr_pages=%d xenheap_pages=%d "
    60.9                 "dirty_cpus=%s\n",
   60.10 -               atomic_read(&d->refcnt),
   60.11 +               atomic_read(&d->refcnt), d->is_dying,
   60.12                 d->tot_pages, d->xenheap_pages, tmpstr);
   60.13          printk("    handle=%02x%02x%02x%02x-%02x%02x-%02x%02x-"
   60.14                 "%02x%02x-%02x%02x%02x%02x%02x%02x vm_assist=%08lx\n",
    61.1 --- a/xen/common/spinlock.c	Tue Nov 04 12:07:22 2008 +0900
    61.2 +++ b/xen/common/spinlock.c	Tue Nov 04 12:43:19 2008 +0900
    61.3 @@ -1,15 +1,56 @@
    61.4  #include <xen/config.h>
    61.5 +#include <xen/irq.h>
    61.6  #include <xen/smp.h>
    61.7  #include <xen/spinlock.h>
    61.8  
    61.9 +#ifndef NDEBUG
   61.10 +
   61.11 +static atomic_t spin_debug __read_mostly = ATOMIC_INIT(0);
   61.12 +
   61.13 +static void check_lock(struct lock_debug *debug)
   61.14 +{
   61.15 +    int irq_safe = !local_irq_is_enabled();
   61.16 +
   61.17 +    if ( unlikely(atomic_read(&spin_debug) <= 0) )
   61.18 +        return;
   61.19 +
   61.20 +    /* A few places take liberties with this. */
   61.21 +    /* BUG_ON(in_irq() && !irq_safe); */
   61.22 +
   61.23 +    if ( unlikely(debug->irq_safe != irq_safe) )
   61.24 +    {
   61.25 +        int seen = cmpxchg(&debug->irq_safe, -1, irq_safe);
   61.26 +        BUG_ON(seen == !irq_safe);
   61.27 +    }
   61.28 +}
   61.29 +
   61.30 +void spin_debug_enable(void)
   61.31 +{
   61.32 +    atomic_inc(&spin_debug);
   61.33 +}
   61.34 +
   61.35 +void spin_debug_disable(void)
   61.36 +{
   61.37 +    atomic_dec(&spin_debug);
   61.38 +}
   61.39 +
   61.40 +#else /* defined(NDEBUG) */
   61.41 +
   61.42 +#define check_lock(l) ((void)0)
   61.43 +
   61.44 +#endif
   61.45 +
   61.46  void _spin_lock(spinlock_t *lock)
   61.47  {
   61.48 +    check_lock(&lock->debug);
   61.49      _raw_spin_lock(&lock->raw);
   61.50  }
   61.51  
   61.52  void _spin_lock_irq(spinlock_t *lock)
   61.53  {
   61.54 +    ASSERT(local_irq_is_enabled());
   61.55      local_irq_disable();
   61.56 +    check_lock(&lock->debug);
   61.57      _raw_spin_lock(&lock->raw);
   61.58  }
   61.59  
   61.60 @@ -17,6 +58,7 @@ unsigned long _spin_lock_irqsave(spinloc
   61.61  {
   61.62      unsigned long flags;
   61.63      local_irq_save(flags);
   61.64 +    check_lock(&lock->debug);
   61.65      _raw_spin_lock(&lock->raw);
   61.66      return flags;
   61.67  }
   61.68 @@ -40,20 +82,31 @@ void _spin_unlock_irqrestore(spinlock_t 
   61.69  
   61.70  int _spin_is_locked(spinlock_t *lock)
   61.71  {
   61.72 +    check_lock(&lock->debug);
   61.73      return _raw_spin_is_locked(&lock->raw);
   61.74  }
   61.75  
   61.76  int _spin_trylock(spinlock_t *lock)
   61.77  {
   61.78 +    check_lock(&lock->debug);
   61.79      return _raw_spin_trylock(&lock->raw);
   61.80  }
   61.81  
   61.82  void _spin_barrier(spinlock_t *lock)
   61.83  {
   61.84 +    check_lock(&lock->debug);
   61.85      do { mb(); } while ( _raw_spin_is_locked(&lock->raw) );
   61.86      mb();
   61.87  }
   61.88  
   61.89 +void _spin_barrier_irq(spinlock_t *lock)
   61.90 +{
   61.91 +    unsigned long flags;
   61.92 +    local_irq_save(flags);
   61.93 +    _spin_barrier(lock);
   61.94 +    local_irq_restore(flags);
   61.95 +}
   61.96 +
   61.97  void _spin_lock_recursive(spinlock_t *lock)
   61.98  {
   61.99      int cpu = smp_processor_id();
  61.100 @@ -61,6 +114,8 @@ void _spin_lock_recursive(spinlock_t *lo
  61.101      /* Don't allow overflow of recurse_cpu field. */
  61.102      BUILD_BUG_ON(NR_CPUS > 0xfffu);
  61.103  
  61.104 +    check_lock(&lock->debug);
  61.105 +
  61.106      if ( likely(lock->recurse_cpu != cpu) )
  61.107      {
  61.108          spin_lock(lock);
  61.109 @@ -83,12 +138,15 @@ void _spin_unlock_recursive(spinlock_t *
  61.110  
  61.111  void _read_lock(rwlock_t *lock)
  61.112  {
  61.113 +    check_lock(&lock->debug);
  61.114      _raw_read_lock(&lock->raw);
  61.115  }
  61.116  
  61.117  void _read_lock_irq(rwlock_t *lock)
  61.118  {
  61.119 +    ASSERT(local_irq_is_enabled());
  61.120      local_irq_disable();
  61.121 +    check_lock(&lock->debug);
  61.122      _raw_read_lock(&lock->raw);
  61.123  }
  61.124  
  61.125 @@ -96,6 +154,7 @@ unsigned long _read_lock_irqsave(rwlock_
  61.126  {
  61.127      unsigned long flags;
  61.128      local_irq_save(flags);
  61.129 +    check_lock(&lock->debug);
  61.130      _raw_read_lock(&lock->raw);
  61.131      return flags;
  61.132  }
  61.133 @@ -119,12 +178,15 @@ void _read_unlock_irqrestore(rwlock_t *l
  61.134  
  61.135  void _write_lock(rwlock_t *lock)
  61.136  {
  61.137 +    check_lock(&lock->debug);
  61.138      _raw_write_lock(&lock->raw);
  61.139  }
  61.140  
  61.141  void _write_lock_irq(rwlock_t *lock)
  61.142  {
  61.143 +    ASSERT(local_irq_is_enabled());
  61.144      local_irq_disable();
  61.145 +    check_lock(&lock->debug);
  61.146      _raw_write_lock(&lock->raw);
  61.147  }
  61.148  
  61.149 @@ -132,6 +194,7 @@ unsigned long _write_lock_irqsave(rwlock
  61.150  {
  61.151      unsigned long flags;
  61.152      local_irq_save(flags);
  61.153 +    check_lock(&lock->debug);
  61.154      _raw_write_lock(&lock->raw);
  61.155      return flags;
  61.156  }
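check_lock() enforces a simple rule: each lock must be taken either always with interrupts enabled or always with them disabled. The first acquisition latches the observed mode into debug->irq_safe (evidently initialised to -1, i.e. unlatched) via cmpxchg(), and any later acquisition in the other mode trips the BUG_ON(). The point is deadlock avoidance: a lock ever held with IRQs enabled can be interrupted, and if the interrupt handler then spins on the same lock, the CPU deadlocks. A hedged sketch of the pattern it catches:

    spin_lock(&lk);       /* IRQs enabled here: latches irq_safe = 0   */
    spin_unlock(&lk);
    ...
    local_irq_disable();
    spin_lock(&lk);       /* IRQs now disabled: mismatch -> BUG_ON()   */

spin_debug_enable() is only flipped on from __start_xen() (see the setup.c hunk above) once interrupt state is known-sane, so early-boot acquisitions go unchecked.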
    62.1 --- a/xen/common/timer.c	Tue Nov 04 12:07:22 2008 +0900
    62.2 +++ b/xen/common/timer.c	Tue Nov 04 12:43:19 2008 +0900
    62.3 @@ -25,10 +25,12 @@
    62.4   * We pull handlers off the timer list this far in future,
    62.5   * rather than reprogramming the time hardware.
    62.6   */
    62.7 -#define TIMER_SLOP (50*1000) /* ns */
    62.8 +static unsigned int timer_slop __read_mostly = 50000; /* 50 us */
    62.9 +integer_param("timer_slop", timer_slop);
   62.10  
   62.11  struct timers {
   62.12      spinlock_t     lock;
   62.13 +    bool_t         overflow;
   62.14      struct timer **heap;
   62.15      struct timer  *list;
   62.16      struct timer  *running;
   62.17 @@ -200,6 +202,7 @@ static int add_entry(struct timers *time
   62.18          return rc;
   62.19  
   62.20      /* Fall back to adding to the slower linked list. */
   62.21 +    timers->overflow = 1;
   62.22      t->status = TIMER_STATUS_in_list;
   62.23      return add_to_list(&timers->list, t);
   62.24  }
   62.25 @@ -258,6 +261,7 @@ void set_timer(struct timer *timer, s_ti
   62.26          __stop_timer(timer);
   62.27  
   62.28      timer->expires = expires;
   62.29 +    timer->expires_end = expires + timer_slop;
   62.30  
   62.31      if ( likely(timer->status != TIMER_STATUS_killed) )
   62.32          __add_timer(timer);
   62.33 @@ -344,19 +348,30 @@ void kill_timer(struct timer *timer)
   62.34  }
   62.35  
   62.36  
   62.37 +static void execute_timer(struct timers *ts, struct timer *t)
   62.38 +{
   62.39 +    void (*fn)(void *) = t->function;
   62.40 +    void *data = t->data;
   62.41 +
   62.42 +    ts->running = t;
   62.43 +    spin_unlock_irq(&ts->lock);
   62.44 +    (*fn)(data);
   62.45 +    spin_lock_irq(&ts->lock);
   62.46 +    ts->running = NULL;
   62.47 +}
   62.48 +
   62.49 +
   62.50  static void timer_softirq_action(void)
   62.51  {
   62.52      struct timer  *t, **heap, *next;
   62.53      struct timers *ts;
   62.54 -    s_time_t       now, deadline;
   62.55 -    void         (*fn)(void *);
   62.56 -    void          *data;
   62.57 +    s_time_t       now;
   62.58  
   62.59      ts = &this_cpu(timers);
   62.60      heap = ts->heap;
   62.61  
   62.62 -    /* If we are using overflow linked list, try to allocate a larger heap. */
   62.63 -    if ( unlikely(ts->list != NULL) )
   62.64 +    /* If we overflowed the heap, try to allocate a larger heap. */
   62.65 +    if ( unlikely(ts->overflow) )
   62.66      {
   62.67          /* old_limit == (2^n)-1; new_limit == (2^(n+4))-1 */
   62.68          int old_limit = GET_HEAP_LIMIT(heap);
   62.69 @@ -377,7 +392,26 @@ static void timer_softirq_action(void)
   62.70  
   62.71      spin_lock_irq(&ts->lock);
   62.72  
   62.73 -    /* Try to move timers from overflow linked list to more efficient heap. */
   62.74 +    now = NOW();
   62.75 +
   62.76 +    /* Execute ready heap timers. */
   62.77 +    while ( (GET_HEAP_SIZE(heap) != 0) &&
   62.78 +            ((t = heap[1])->expires_end < now) )
   62.79 +    {
   62.80 +        remove_from_heap(heap, t);
   62.81 +        t->status = TIMER_STATUS_inactive;
   62.82 +        execute_timer(ts, t);
   62.83 +    }
   62.84 +
   62.85 +    /* Execute ready list timers. */
   62.86 +    while ( ((t = ts->list) != NULL) && (t->expires_end < now) )
   62.87 +    {
   62.88 +        ts->list = t->list_next;
   62.89 +        t->status = TIMER_STATUS_inactive;
   62.90 +        execute_timer(ts, t);
   62.91 +    }
   62.92 +
   62.93 +    /* Try to move timers from linked list to more efficient heap. */
   62.94      next = ts->list;
   62.95      ts->list = NULL;
   62.96      while ( unlikely((t = next) != NULL) )
   62.97 @@ -387,51 +421,44 @@ static void timer_softirq_action(void)
   62.98          add_entry(ts, t);
   62.99      }
  62.100  
  62.101 -    now = NOW();
  62.102 -
  62.103 -    while ( (GET_HEAP_SIZE(heap) != 0) &&
  62.104 -            ((t = heap[1])->expires < (now + TIMER_SLOP)) )
  62.105 +    ts->overflow = (ts->list != NULL);
  62.106 +    if ( unlikely(ts->overflow) )
  62.107      {
  62.108 -        remove_entry(ts, t);
  62.109 +        /* Find earliest deadline at head of list or top of heap. */
  62.110 +        this_cpu(timer_deadline) = ts->list->expires;
  62.111 +        if ( (GET_HEAP_SIZE(heap) != 0) &&
  62.112 +             ((t = heap[1])->expires < this_cpu(timer_deadline)) )
  62.113 +            this_cpu(timer_deadline) = t->expires;
  62.114 +    }
  62.115 +    else
  62.116 +    {
  62.117 +        /*
   62.118 +         * Find the earliest deadline that encompasses the largest number of
   62.119 +         * timers on the heap. To do this we take timers from the heap while their
  62.120 +         * valid deadline ranges continue to intersect.
  62.121 +         */
  62.122 +        s_time_t start = 0, end = STIME_MAX;
  62.123 +        struct timer **list_tail = &ts->list;
  62.124  
  62.125 -        ts->running = t;
  62.126 +        while ( (GET_HEAP_SIZE(heap) != 0) &&
  62.127 +                ((t = heap[1])->expires <= end) )
  62.128 +        {
  62.129 +            remove_entry(ts, t);
  62.130  
  62.131 -        fn   = t->function;
  62.132 -        data = t->data;
  62.133 +            t->status = TIMER_STATUS_in_list;
  62.134 +            t->list_next = NULL;
  62.135 +            *list_tail = t;
  62.136 +            list_tail = &t->list_next;
  62.137  
  62.138 -        spin_unlock_irq(&ts->lock);
  62.139 -        (*fn)(data);
  62.140 -        spin_lock_irq(&ts->lock);
  62.141 +            start = t->expires;
  62.142 +            if ( end > t->expires_end )
  62.143 +                end = t->expires_end;
  62.144 +        }
  62.145 +
  62.146 +        this_cpu(timer_deadline) = start;
  62.147      }
  62.148  
  62.149 -    deadline = GET_HEAP_SIZE(heap) ? heap[1]->expires : 0;
  62.150 -
  62.151 -    while ( unlikely((t = ts->list) != NULL) )
  62.152 -    {
  62.153 -        if ( t->expires >= (now + TIMER_SLOP) )
  62.154 -        {
  62.155 -            if ( (deadline == 0) || (deadline > t->expires) )
  62.156 -                deadline = t->expires;
  62.157 -            break;
  62.158 -        }
  62.159 -
  62.160 -        ts->list = t->list_next;
  62.161 -        t->status = TIMER_STATUS_inactive;
  62.162 -
  62.163 -        ts->running = t;
  62.164 -
  62.165 -        fn   = t->function;
  62.166 -        data = t->data;
  62.167 -
  62.168 -        spin_unlock_irq(&ts->lock);
  62.169 -        (*fn)(data);
  62.170 -        spin_lock_irq(&ts->lock);
  62.171 -    }
  62.172 -
  62.173 -    ts->running = NULL;
  62.174 -
  62.175 -    this_cpu(timer_deadline) = deadline;
  62.176 -    if ( !reprogram_timer(deadline) )
  62.177 +    if ( !reprogram_timer(this_cpu(timer_deadline)) )
  62.178          raise_softirq(TIMER_SOFTIRQ);
  62.179  
  62.180      spin_unlock_irq(&ts->lock);
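The rewrite gives every timer a firing window [expires, expires_end] = [expires, expires + timer_slop] and programs one hardware interrupt per batch of intersecting windows: timers are popped from the heap while their windows still overlap, start tracks the largest expires seen, end the smallest expires_end, and the final deadline (start) lies inside every popped window. A worked example in ns, using the default slop of 50000:

    /* heap: A expires=100000, B expires=120000, C expires=180000         */
    /* pop A: start=100000, end=min(STIME_MAX, 150000) = 150000           */
    /* pop B: 120000 <= 150000, so start=120000, end=min(150000, 170000)  */
    /* stop at C: 180000 > 150000                                         */
    /* one interrupt at 120000 serves both A and B within their slop      */

Timers popped this way are parked on ts->list in TIMER_STATUS_in_list state, so a later softirq pass executes them once NOW() passes their expires_end.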
    63.1 --- a/xen/common/xenoprof.c	Tue Nov 04 12:07:22 2008 +0900
    63.2 +++ b/xen/common/xenoprof.c	Tue Nov 04 12:43:19 2008 +0900
    63.3 @@ -85,7 +85,7 @@ int is_active(struct domain *d)
    63.4      return ((x != NULL) && (x->domain_type == XENOPROF_DOMAIN_ACTIVE));
    63.5  }
    63.6  
    63.7 -static int is_passive(struct domain *d)
    63.8 +int is_passive(struct domain *d)
    63.9  {
   63.10      struct xenoprof *x = d->xenoprof;
   63.11      return ((x != NULL) && (x->domain_type == XENOPROF_DOMAIN_PASSIVE));
    64.1 --- a/xen/common/xmalloc.c	Tue Nov 04 12:07:22 2008 +0900
    64.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    64.3 @@ -1,286 +0,0 @@
    64.4 -/******************************************************************************
    64.5 - * Simple allocator for Xen.  If larger than a page, simply use the
    64.6 - * page-order allocator.
    64.7 - *
    64.8 - * Copyright (C) 2005 Rusty Russell IBM Corporation
    64.9 - *
   64.10 - * This program is free software; you can redistribute it and/or modify
   64.11 - * it under the terms of the GNU General Public License as published by
   64.12 - * the Free Software Foundation; either version 2 of the License, or
   64.13 - * (at your option) any later version.
   64.14 - *
   64.15 - * This program is distributed in the hope that it will be useful,
   64.16 - * but WITHOUT ANY WARRANTY; without even the implied warranty of
   64.17 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   64.18 - * GNU General Public License for more details.
   64.19 - *
   64.20 - * You should have received a copy of the GNU General Public License
   64.21 - * along with this program; if not, write to the Free Software
   64.22 - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   64.23 - */
   64.24 -
   64.25 -/*
   64.26 - * TODO (Keir, 17/2/05):
   64.27 - *  1. Use space in page_info to avoid xmalloc_hdr in allocated blocks.
   64.28 - *  2. page_info points into free list to make xfree() O(1) complexity.
   64.29 - *  3. Perhaps make this a sub-page buddy allocator? xmalloc() == O(1).
   64.30 - *     (Disadvantage is potentially greater internal fragmentation).
   64.31 - */
   64.32 -
   64.33 -#include <xen/config.h>
   64.34 -#include <xen/mm.h>
   64.35 -#include <xen/spinlock.h>
   64.36 -#include <xen/timer.h>
   64.37 -#include <xen/cache.h>
   64.38 -#include <xen/prefetch.h>
   64.39 -#include <xen/irq.h>
   64.40 -#include <xen/smp.h>
   64.41 -
   64.42 -/*
   64.43 - * XMALLOC_DEBUG:
   64.44 - *  1. Free data blocks are filled with poison bytes.
   64.45 - *  2. In-use data blocks have guard bytes at the start and end.
   64.46 - */
   64.47 -#ifndef NDEBUG
   64.48 -#define XMALLOC_DEBUG 1
   64.49 -#endif
   64.50 -
   64.51 -static LIST_HEAD(freelist);
   64.52 -static DEFINE_SPINLOCK(freelist_lock);
   64.53 -
   64.54 -struct xmalloc_hdr
   64.55 -{
   64.56 -    /* Size is total including this header. */
   64.57 -    size_t size;
   64.58 -    struct list_head freelist;
   64.59 -} __cacheline_aligned;
   64.60 -
   64.61 -static void add_to_freelist(struct xmalloc_hdr *hdr)
   64.62 -{
   64.63 -#if XMALLOC_DEBUG
   64.64 -    memset(hdr + 1, 0xa5, hdr->size - sizeof(*hdr));
   64.65 -#endif
   64.66 -    list_add(&hdr->freelist, &freelist);
   64.67 -}
   64.68 -
   64.69 -static void del_from_freelist(struct xmalloc_hdr *hdr)
   64.70 -{
   64.71 -#if XMALLOC_DEBUG
   64.72 -    size_t i;
   64.73 -    unsigned char *data = (unsigned char *)(hdr + 1);
   64.74 -    for ( i = 0; i < (hdr->size - sizeof(*hdr)); i++ )
   64.75 -        BUG_ON(data[i] != 0xa5);
   64.76 -    BUG_ON((hdr->size <= 0) || (hdr->size >= PAGE_SIZE));
   64.77 -#endif
   64.78 -    list_del(&hdr->freelist);
   64.79 -}
   64.80 -
   64.81 -static void *data_from_header(struct xmalloc_hdr *hdr)
   64.82 -{
   64.83 -#if XMALLOC_DEBUG
   64.84 -    /* Data block contain SMP_CACHE_BYTES of guard canary. */
   64.85 -    unsigned char *data = (unsigned char *)(hdr + 1);
   64.86 -    memset(data, 0x5a, SMP_CACHE_BYTES);
   64.87 -    memset(data + hdr->size - sizeof(*hdr) - SMP_CACHE_BYTES,
   64.88 -           0x5a, SMP_CACHE_BYTES);
   64.89 -    return data + SMP_CACHE_BYTES;
   64.90 -#else
   64.91 -    return hdr + 1;
   64.92 -#endif
   64.93 -}
   64.94 -
   64.95 -static struct xmalloc_hdr *header_from_data(void *p)
   64.96 -{
   64.97 -#if XMALLOC_DEBUG
   64.98 -    unsigned char *data = (unsigned char *)p - SMP_CACHE_BYTES;
   64.99 -    struct xmalloc_hdr *hdr = (struct xmalloc_hdr *)data - 1;
  64.100 -    size_t i;
  64.101 -
  64.102 -    /* Check header guard canary. */
  64.103 -    for ( i = 0; i < SMP_CACHE_BYTES; i++ )
  64.104 -        BUG_ON(data[i] != 0x5a);
  64.105 -
  64.106 -    /* Check footer guard canary. */
  64.107 -    data += hdr->size - sizeof(*hdr) - SMP_CACHE_BYTES;
  64.108 -    for ( i = 0; i < SMP_CACHE_BYTES; i++ )
  64.109 -        BUG_ON(data[i] != 0x5a);
  64.110 -
  64.111 -    return hdr;
  64.112 -#else
  64.113 -    return (struct xmalloc_hdr *)p - 1;
  64.114 -#endif
  64.115 -}
  64.116 -
  64.117 -static void maybe_split(struct xmalloc_hdr *hdr, size_t size, size_t block)
  64.118 -{
  64.119 -    struct xmalloc_hdr *extra;
  64.120 -    size_t leftover = block - size;
  64.121 -
  64.122 -    /* If enough is left to make a block, put it on free list. */
  64.123 -    if ( leftover >= (2 * sizeof(struct xmalloc_hdr)) )
  64.124 -    {
  64.125 -        extra = (struct xmalloc_hdr *)((unsigned long)hdr + size);
  64.126 -        extra->size = leftover;
  64.127 -        add_to_freelist(extra);
  64.128 -    }
  64.129 -    else
  64.130 -    {
  64.131 -        size = block;
  64.132 -    }
  64.133 -
  64.134 -    hdr->size = size;
  64.135 -    /* Debugging aid. */
  64.136 -    hdr->freelist.next = hdr->freelist.prev = NULL;
  64.137 -}
  64.138 -
  64.139 -static void *xmalloc_new_page(size_t size)
  64.140 -{
  64.141 -    struct xmalloc_hdr *hdr;
  64.142 -
  64.143 -    hdr = alloc_xenheap_page();
  64.144 -    if ( hdr == NULL )
  64.145 -        return NULL;
  64.146 -
  64.147 -    spin_lock(&freelist_lock);
  64.148 -    maybe_split(hdr, size, PAGE_SIZE);
  64.149 -    spin_unlock(&freelist_lock);
  64.150 -
  64.151 -    return data_from_header(hdr);
  64.152 -}
  64.153 -
  64.154 -/* Big object?  Just use the page allocator. */
  64.155 -static void *xmalloc_whole_pages(size_t size)
  64.156 -{
  64.157 -    struct xmalloc_hdr *hdr;
  64.158 -    unsigned int pageorder = get_order_from_bytes(size);
  64.159 -
  64.160 -    hdr = alloc_xenheap_pages(pageorder);
  64.161 -    if ( hdr == NULL )
  64.162 -        return NULL;
  64.163 -
  64.164 -    hdr->size = (1 << (pageorder + PAGE_SHIFT));
  64.165 -    /* Debugging aid. */
  64.166 -    hdr->freelist.next = hdr->freelist.prev = NULL;
  64.167 -
  64.168 -    return data_from_header(hdr);
  64.169 -}
  64.170 -
  64.171 -/* Return size, increased to alignment with align. */
  64.172 -static inline size_t align_up(size_t size, size_t align)
  64.173 -{
  64.174 -    return (size + align - 1) & ~(align - 1);
  64.175 -}
  64.176 -
  64.177 -void *_xmalloc(size_t size, size_t align)
  64.178 -{
  64.179 -    struct xmalloc_hdr *i;
  64.180 -
  64.181 -    ASSERT(!in_irq());
  64.182 -
  64.183 -    /* We currently always return cacheline aligned. */
  64.184 -    BUG_ON(align > SMP_CACHE_BYTES);
  64.185 -
  64.186 -#if XMALLOC_DEBUG
  64.187 -    /* Add room for canaries at start and end of data block. */
  64.188 -    size += 2 * SMP_CACHE_BYTES;
  64.189 -#endif
  64.190 -
  64.191 -    /* Add room for header, pad to align next header. */
  64.192 -    size += sizeof(struct xmalloc_hdr);
  64.193 -    size = align_up(size, __alignof__(struct xmalloc_hdr));
  64.194 -
  64.195 -    /* For big allocs, give them whole pages. */
  64.196 -    if ( size >= PAGE_SIZE )
  64.197 -        return xmalloc_whole_pages(size);
  64.198 -
  64.199 -    /* Search free list. */
  64.200 -    spin_lock(&freelist_lock);
  64.201 -    list_for_each_entry( i, &freelist, freelist )
  64.202 -    {
  64.203 -        if ( i->size < size )
  64.204 -            continue;
  64.205 -        del_from_freelist(i);
  64.206 -        maybe_split(i, size, i->size);
  64.207 -        spin_unlock(&freelist_lock);
  64.208 -        return data_from_header(i);
  64.209 -    }
  64.210 -    spin_unlock(&freelist_lock);
  64.211 -
  64.212 -    /* Alloc a new page and return from that. */
  64.213 -    return xmalloc_new_page(size);
  64.214 -}
  64.215 -
  64.216 -void xfree(void *p)
  64.217 -{
  64.218 -    struct xmalloc_hdr *i, *tmp, *hdr;
  64.219 -
  64.220 -    ASSERT(!in_irq());
  64.221 -
  64.222 -    if ( p == NULL )
  64.223 -        return;
  64.224 -
  64.225 -    hdr = header_from_data(p);
  64.226 -
  64.227 -    /* We know hdr will be on same page. */
  64.228 -    BUG_ON(((long)p & PAGE_MASK) != ((long)hdr & PAGE_MASK));
  64.229 -
  64.230 -    /* Not previously freed. */
  64.231 -    BUG_ON(hdr->freelist.next || hdr->freelist.prev);
  64.232 -
  64.233 -    /* Big allocs free directly. */
  64.234 -    if ( hdr->size >= PAGE_SIZE )
  64.235 -    {
  64.236 -        free_xenheap_pages(hdr, get_order_from_bytes(hdr->size));
  64.237 -        return;
  64.238 -    }
  64.239 -
  64.240 -    /* Merge with other free block, or put in list. */
  64.241 -    spin_lock(&freelist_lock);
  64.242 -    list_for_each_entry_safe( i, tmp, &freelist, freelist )
  64.243 -    {
  64.244 -        unsigned long _i   = (unsigned long)i;
  64.245 -        unsigned long _hdr = (unsigned long)hdr;
  64.246 -
  64.247 -        /* Do not merge across page boundaries. */
  64.248 -        if ( ((_i ^ _hdr) & PAGE_MASK) != 0 )
  64.249 -            continue;
  64.250 -
  64.251 -        /* We follow this block?  Swallow it. */
  64.252 -        if ( (_i + i->size) == _hdr )
  64.253 -        {
  64.254 -            del_from_freelist(i);
  64.255 -            i->size += hdr->size;
  64.256 -            hdr = i;
  64.257 -        }
  64.258 -
  64.259 -        /* We precede this block? Swallow it. */
  64.260 -        if ( (_hdr + hdr->size) == _i )
  64.261 -        {
  64.262 -            del_from_freelist(i);
  64.263 -            hdr->size += i->size;
  64.264 -        }
  64.265 -    }
  64.266 -
  64.267 -    /* Did we merge an entire page? */
  64.268 -    if ( hdr->size == PAGE_SIZE )
  64.269 -    {
  64.270 -        BUG_ON((((unsigned long)hdr) & (PAGE_SIZE-1)) != 0);
  64.271 -        free_xenheap_pages(hdr, 0);
  64.272 -    }
  64.273 -    else
  64.274 -    {
  64.275 -        add_to_freelist(hdr);
  64.276 -    }
  64.277 -
  64.278 -    spin_unlock(&freelist_lock);
  64.279 -}
  64.280 -
  64.281 -/*
  64.282 - * Local variables:
  64.283 - * mode: C
  64.284 - * c-set-style: "BSD"
  64.285 - * c-basic-offset: 4
  64.286 - * tab-width: 4
  64.287 - * indent-tabs-mode: nil
  64.288 - * End:
  64.289 - */
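The deleted allocator's size arithmetic relied on align_up(), the usual power-of-two rounding trick; a quick worked check of the removed helper:

    /* align_up(size, align) == (size + align - 1) & ~(align - 1) */
    align_up(13, 8) == (13 + 7) & ~7 == 16
    align_up(16, 8) == (16 + 7) & ~7 == 16   /* already aligned: unchanged */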
    65.1 --- a/xen/drivers/char/serial.c	Tue Nov 04 12:07:22 2008 +0900
    65.2 +++ b/xen/drivers/char/serial.c	Tue Nov 04 12:43:19 2008 +0900
    65.3 @@ -74,7 +74,7 @@ void serial_tx_interrupt(struct serial_p
    65.4      while ( !spin_trylock(&port->tx_lock) )
    65.5      {
    65.6          if ( !port->driver->tx_empty(port) )
    65.7 -            return;
    65.8 +            goto out;
    65.9          cpu_relax();
   65.10      }
   65.11  
   65.12 @@ -89,7 +89,10 @@ void serial_tx_interrupt(struct serial_p
   65.13          }
   65.14      }
   65.15  
   65.16 -    spin_unlock_irqrestore(&port->tx_lock, flags);
   65.17 +    spin_unlock(&port->tx_lock);
   65.18 +
   65.19 + out:
   65.20 +    local_irq_restore(flags);
   65.21  }
   65.22  
   65.23  static void __serial_putc(struct serial_port *port, char c)
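The bug fixed here: serial_tx_interrupt() evidently saves the IRQ state into flags earlier in the function (the old unlock path restored it), but the early return inside the trylock loop skipped that restore, leaving interrupts disabled. A condensed sketch of the corrected shape:

    local_irq_save(flags);                    /* earlier in the function    */
    while ( !spin_trylock(&port->tx_lock) )
    {
        if ( !port->driver->tx_empty(port) )
            goto out;                         /* was 'return': leaked flags */
        cpu_relax();
    }
    ...
    spin_unlock(&port->tx_lock);
 out:
    local_irq_restore(flags);                 /* restored on every path     */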
    66.1 --- a/xen/drivers/cpufreq/cpufreq.c	Tue Nov 04 12:07:22 2008 +0900
    66.2 +++ b/xen/drivers/cpufreq/cpufreq.c	Tue Nov 04 12:43:19 2008 +0900
    66.3 @@ -31,6 +31,7 @@
    66.4  #include <xen/errno.h>
    66.5  #include <xen/delay.h>
    66.6  #include <xen/cpumask.h>
    66.7 +#include <xen/list.h>
    66.8  #include <xen/sched.h>
    66.9  #include <xen/timer.h>
   66.10  #include <xen/xmalloc.h>
   66.11 @@ -44,8 +45,12 @@
   66.12  #include <acpi/acpi.h>
   66.13  #include <acpi/cpufreq/cpufreq.h>
   66.14  
   66.15 -/* TODO: change to link list later as domain number may be sparse */
   66.16 -static cpumask_t cpufreq_dom_map[NR_CPUS];
   66.17 +struct cpufreq_dom {
   66.18 +    unsigned int	dom;
   66.19 +    cpumask_t		map;
   66.20 +    struct list_head	node;
   66.21 +};
   66.22 +static LIST_HEAD(cpufreq_dom_list_head);
   66.23  
   66.24  int cpufreq_limit_change(unsigned int cpu)
   66.25  {
   66.26 @@ -72,48 +77,80 @@ int cpufreq_add_cpu(unsigned int cpu)
   66.27  {
   66.28      int ret = 0;
   66.29      unsigned int firstcpu;
   66.30 -    unsigned int dom;
   66.31 +    unsigned int dom, domexist = 0;
   66.32      unsigned int j;
   66.33 +    struct list_head *pos;
   66.34 +    struct cpufreq_dom *cpufreq_dom = NULL;
   66.35      struct cpufreq_policy new_policy;
   66.36      struct cpufreq_policy *policy;
   66.37      struct processor_performance *perf = &processor_pminfo[cpu]->perf;
   66.38  
    66.39      /* guard against the case where Px is not controlled by Xen */
   66.40 -    if (!processor_pminfo[cpu] || !(perf->init & XEN_PX_INIT))
   66.41 +    if (!processor_pminfo[cpu]      ||
   66.42 +        !(perf->init & XEN_PX_INIT) ||
   66.43 +        !cpu_online(cpu))
   66.44 +        return -EINVAL;
   66.45 +
   66.46 +    if (cpufreq_cpu_policy[cpu])
   66.47          return 0;
   66.48  
   66.49 -    if (!cpu_online(cpu) || cpufreq_cpu_policy[cpu])
   66.50 -        return -EINVAL;
   66.51 -
   66.52      ret = cpufreq_statistic_init(cpu);
   66.53      if (ret)
   66.54          return ret;
   66.55  
   66.56      dom = perf->domain_info.domain;
   66.57 -    if (cpus_weight(cpufreq_dom_map[dom])) {
   66.58 +
   66.59 +    list_for_each(pos, &cpufreq_dom_list_head) {
   66.60 +        cpufreq_dom = list_entry(pos, struct cpufreq_dom, node);
   66.61 +        if (dom == cpufreq_dom->dom) {
   66.62 +            domexist = 1;
   66.63 +            break;
   66.64 +        }
   66.65 +    }
   66.66 +
   66.67 +    if (domexist) {
    66.68          /* share policy with the first cpu since they are in the same boat */
   66.69 -        firstcpu = first_cpu(cpufreq_dom_map[dom]);
   66.70 +        firstcpu = first_cpu(cpufreq_dom->map);
   66.71          policy = cpufreq_cpu_policy[firstcpu];
   66.72  
   66.73          cpufreq_cpu_policy[cpu] = policy;
   66.74 -        cpu_set(cpu, cpufreq_dom_map[dom]);
   66.75 +        cpu_set(cpu, cpufreq_dom->map);
   66.76          cpu_set(cpu, policy->cpus);
   66.77  
   66.78 +        /* domain coordination sanity check */
   66.79 +        if ((perf->domain_info.coord_type !=
   66.80 +             processor_pminfo[firstcpu]->perf.domain_info.coord_type) ||
   66.81 +            (perf->domain_info.num_processors !=
   66.82 +             processor_pminfo[firstcpu]->perf.domain_info.num_processors)) {
   66.83 +            ret = -EINVAL;
   66.84 +            goto err2;
   66.85 +        }
   66.86 +
   66.87          printk(KERN_EMERG"adding CPU %u\n", cpu);
   66.88      } else {
   66.89 +        cpufreq_dom = xmalloc(struct cpufreq_dom);
   66.90 +        if (!cpufreq_dom) {
   66.91 +            cpufreq_statistic_exit(cpu);
   66.92 +            return -ENOMEM;
   66.93 +        }
   66.94 +        memset(cpufreq_dom, 0, sizeof(struct cpufreq_dom));
   66.95 +        cpufreq_dom->dom = dom;
   66.96 +        cpu_set(cpu, cpufreq_dom->map);
   66.97 +        list_add(&cpufreq_dom->node, &cpufreq_dom_list_head);
   66.98 +
   66.99          /* for the first cpu, setup policy and do init work */
  66.100          policy = xmalloc(struct cpufreq_policy);
  66.101          if (!policy) {
  66.102 +            list_del(&cpufreq_dom->node);
  66.103 +            xfree(cpufreq_dom);
  66.104              cpufreq_statistic_exit(cpu);
  66.105              return -ENOMEM;
  66.106          }
  66.107          memset(policy, 0, sizeof(struct cpufreq_policy));
  66.108 -
  66.109 +        policy->cpu = cpu;
  66.110 +        cpu_set(cpu, policy->cpus);
  66.111          cpufreq_cpu_policy[cpu] = policy;
  66.112 -        cpu_set(cpu, cpufreq_dom_map[dom]);
  66.113 -        cpu_set(cpu, policy->cpus);
  66.114  
  66.115 -        policy->cpu = cpu;
  66.116          ret = cpufreq_driver->init(policy);
  66.117          if (ret)
  66.118              goto err1;
  66.119 @@ -124,7 +161,7 @@ int cpufreq_add_cpu(unsigned int cpu)
  66.120       * After get full cpumap of the coordination domain,
  66.121       * we can safely start gov here.
  66.122       */
  66.123 -    if (cpus_weight(cpufreq_dom_map[dom]) ==
  66.124 +    if (cpus_weight(cpufreq_dom->map) ==
  66.125          perf->domain_info.num_processors) {
  66.126          memcpy(&new_policy, policy, sizeof(struct cpufreq_policy));
  66.127          policy->governor = NULL;
  66.128 @@ -138,51 +175,68 @@ int cpufreq_add_cpu(unsigned int cpu)
  66.129  err2:
  66.130      cpufreq_driver->exit(policy);
  66.131  err1:
  66.132 -    for_each_cpu_mask(j, cpufreq_dom_map[dom]) {
  66.133 +    for_each_cpu_mask(j, cpufreq_dom->map) {
  66.134          cpufreq_cpu_policy[j] = NULL;
  66.135          cpufreq_statistic_exit(j);
  66.136      }
  66.137  
  66.138 -    cpus_clear(cpufreq_dom_map[dom]);
  66.139 +    list_del(&cpufreq_dom->node);
  66.140 +    xfree(cpufreq_dom);
  66.141      xfree(policy);
  66.142      return ret;
  66.143  }
  66.144  
  66.145  int cpufreq_del_cpu(unsigned int cpu)
  66.146  {
  66.147 -    unsigned int dom;
  66.148 +    unsigned int dom, domexist = 0;
  66.149 +    struct list_head *pos;
  66.150 +    struct cpufreq_dom *cpufreq_dom = NULL;
  66.151      struct cpufreq_policy *policy;
  66.152      struct processor_performance *perf = &processor_pminfo[cpu]->perf;
  66.153  
   66.154      /* guard against the case where Px is not controlled by Xen */
  66.155 -    if (!processor_pminfo[cpu] || !(perf->init & XEN_PX_INIT))
  66.156 +    if (!processor_pminfo[cpu]      ||
  66.157 +        !(perf->init & XEN_PX_INIT) ||
  66.158 +        !cpu_online(cpu))
  66.159 +        return -EINVAL;
  66.160 +
  66.161 +    if (!cpufreq_cpu_policy[cpu])
  66.162          return 0;
  66.163  
  66.164 -    if (!cpu_online(cpu) || !cpufreq_cpu_policy[cpu])
  66.165 -        return -EINVAL;
  66.166 -
  66.167      dom = perf->domain_info.domain;
  66.168      policy = cpufreq_cpu_policy[cpu];
  66.169  
  66.170 -    printk(KERN_EMERG"deleting CPU %u\n", cpu);
  66.171 +    list_for_each(pos, &cpufreq_dom_list_head) {
  66.172 +        cpufreq_dom = list_entry(pos, struct cpufreq_dom, node);
  66.173 +        if (dom == cpufreq_dom->dom) {
  66.174 +            domexist = 1;
  66.175 +            break;
  66.176 +        }
  66.177 +    }
  66.178 +
  66.179 +    if (!domexist)
  66.180 +        return -EINVAL;
  66.181  
  66.182      /* for the first cpu of the domain, stop gov */
  66.183 -    if (cpus_weight(cpufreq_dom_map[dom]) ==
  66.184 +    if (cpus_weight(cpufreq_dom->map) ==
  66.185          perf->domain_info.num_processors)
  66.186          __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
  66.187  
  66.188      cpufreq_cpu_policy[cpu] = NULL;
  66.189      cpu_clear(cpu, policy->cpus);
  66.190 -    cpu_clear(cpu, cpufreq_dom_map[dom]);
  66.191 +    cpu_clear(cpu, cpufreq_dom->map);
  66.192      cpufreq_statistic_exit(cpu);
  66.193  
  66.194      /* for the last cpu of the domain, clean room */
  66.195      /* It's safe here to free freq_table, drv_data and policy */
  66.196 -    if (!cpus_weight(cpufreq_dom_map[dom])) {
  66.197 +    if (!cpus_weight(cpufreq_dom->map)) {
  66.198          cpufreq_driver->exit(policy);
  66.199 +        list_del(&cpufreq_dom->node);
  66.200 +        xfree(cpufreq_dom);
  66.201          xfree(policy);
  66.202      }
  66.203  
  66.204 +    printk(KERN_EMERG"deleting CPU %u\n", cpu);
  66.205      return 0;
  66.206  }
  66.207  
  66.208 @@ -258,6 +312,24 @@ int set_px_pminfo(uint32_t acpi_id, stru
  66.209  
  66.210      if ( dom0_px_info->flags & XEN_PX_PCT )
  66.211      {
  66.212 +        /* space_id check */
  66.213 +        if (dom0_px_info->control_register.space_id != 
  66.214 +            dom0_px_info->status_register.space_id)
  66.215 +        {
  66.216 +            ret = -EINVAL;
  66.217 +            goto out;
  66.218 +        }
  66.219 +
  66.220 +#ifdef CONFIG_IA64
   66.221 +        /* IA64 currently supports only FFH (functional fixed hardware) */
  66.222 +        if (dom0_px_info->control_register.space_id !=
  66.223 +            ACPI_ADR_SPACE_FIXED_HARDWARE)
  66.224 +        {
  66.225 +            ret = -EINVAL;
  66.226 +            goto out;
  66.227 +        }
  66.228 +#endif
  66.229 +
  66.230          memcpy ((void *)&pxpt->control_register,
  66.231                  (void *)&dom0_px_info->control_register,
  66.232                  sizeof(struct xen_pct_register));
  66.233 @@ -267,8 +339,16 @@ int set_px_pminfo(uint32_t acpi_id, stru
  66.234          print_PCT(&pxpt->control_register);
  66.235          print_PCT(&pxpt->status_register);
  66.236      }
  66.237 +
  66.238      if ( dom0_px_info->flags & XEN_PX_PSS ) 
  66.239      {
  66.240 +        /* capability check */
  66.241 +        if (dom0_px_info->state_count <= 1)
  66.242 +        {
  66.243 +            ret = -EINVAL;
  66.244 +            goto out;
  66.245 +        }
  66.246 +
  66.247          if ( !(pxpt->states = xmalloc_array(struct xen_processor_px,
  66.248                          dom0_px_info->state_count)) )
  66.249          {
  66.250 @@ -280,14 +360,28 @@ int set_px_pminfo(uint32_t acpi_id, stru
  66.251          pxpt->state_count = dom0_px_info->state_count;
  66.252          print_PSS(pxpt->states,pxpt->state_count);
  66.253      }
  66.254 +
  66.255      if ( dom0_px_info->flags & XEN_PX_PSD )
  66.256      {
  66.257 +#ifdef CONFIG_X86
  66.258 +        /* for X86, check domain coordination */
   66.259 +        /* for IA64, _PSD is optional for the current cpufreq algorithm */
  66.260 +        if (dom0_px_info->shared_type != CPUFREQ_SHARED_TYPE_ALL &&
  66.261 +            dom0_px_info->shared_type != CPUFREQ_SHARED_TYPE_ANY &&
  66.262 +            dom0_px_info->shared_type != CPUFREQ_SHARED_TYPE_HW)
  66.263 +        {
  66.264 +            ret = -EINVAL;
  66.265 +            goto out;
  66.266 +        }
  66.267 +#endif
  66.268 +
  66.269          pxpt->shared_type = dom0_px_info->shared_type;
  66.270          memcpy ((void *)&pxpt->domain_info,
  66.271                  (void *)&dom0_px_info->domain_info,
  66.272                  sizeof(struct xen_psd_package));
  66.273          print_PSD(&pxpt->domain_info);
  66.274      }
  66.275 +
  66.276      if ( dom0_px_info->flags & XEN_PX_PPC )
  66.277      {
  66.278          pxpt->platform_limit = dom0_px_info->platform_limit;
  66.279 @@ -295,7 +389,6 @@ int set_px_pminfo(uint32_t acpi_id, stru
  66.280  
  66.281          if ( pxpt->init == XEN_PX_INIT )
  66.282          {
  66.283 -
  66.284              ret = cpufreq_limit_change(cpuid); 
  66.285              goto out;
  66.286          }
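cpufreq_add_cpu() and cpufreq_del_cpu() now open-code the same list walk to find a domain entry. A sketch of the lookup as a helper (hypothetical refactor, built from the structures defined above):

    static struct cpufreq_dom *cpufreq_dom_find(unsigned int dom)
    {
        struct list_head *pos;
        list_for_each(pos, &cpufreq_dom_list_head)
        {
            struct cpufreq_dom *cd = list_entry(pos, struct cpufreq_dom, node);
            if ( cd->dom == dom )
                return cd;
        }
        return NULL;
    }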
    67.1 --- a/xen/include/asm-x86/config.h	Tue Nov 04 12:07:22 2008 +0900
    67.2 +++ b/xen/include/asm-x86/config.h	Tue Nov 04 12:43:19 2008 +0900
    67.3 @@ -41,14 +41,6 @@
    67.4  #define CONFIG_HOTPLUG 1
    67.5  #define CONFIG_HOTPLUG_CPU 1
    67.6  
    67.7 -/*
    67.8 - * Avoid deep recursion when tearing down pagetables during domain destruction,
    67.9 - * causing dom0 to become unresponsive and Xen to miss time-critical softirq
   67.10 - * deadlines. This will ultimately be replaced by built-in preemptibility of
   67.11 - * get_page_type().
   67.12 - */
   67.13 -#define DOMAIN_DESTRUCT_AVOID_RECURSION 1
   67.14 -
   67.15  #define HZ 100
   67.16  
   67.17  #define OPT_CONSOLE_STR "vga"
    68.1 --- a/xen/include/asm-x86/event.h	Tue Nov 04 12:07:22 2008 +0900
    68.2 +++ b/xen/include/asm-x86/event.h	Tue Nov 04 12:43:19 2008 +0900
    68.3 @@ -11,36 +11,8 @@
    68.4  
    68.5  #include <xen/shared.h>
    68.6  
    68.7 -static inline void vcpu_kick(struct vcpu *v)
    68.8 -{
    68.9 -    /*
   68.10 -     * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
   68.11 -     * pending flag. These values may fluctuate (after all, we hold no
   68.12 -     * locks) but the key insight is that each change will cause
   68.13 -     * evtchn_upcall_pending to be polled.
   68.14 -     * 
   68.15 -     * NB2. We save the running flag across the unblock to avoid a needless
   68.16 -     * IPI for domains that we IPI'd to unblock.
   68.17 -     */
   68.18 -    int running = v->is_running;
   68.19 -    vcpu_unblock(v);
   68.20 -    if ( running )
   68.21 -        smp_send_event_check_cpu(v->processor);
   68.22 -}
   68.23 -
   68.24 -static inline void vcpu_mark_events_pending(struct vcpu *v)
   68.25 -{
   68.26 -    int already_pending = test_and_set_bit(
   68.27 -        0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));
   68.28 -
   68.29 -    if ( already_pending )
   68.30 -        return;
   68.31 -
   68.32 -    if ( is_hvm_vcpu(v) )
   68.33 -        hvm_assert_evtchn_irq(v);
   68.34 -    else
   68.35 -        vcpu_kick(v);
   68.36 -}
   68.37 +void vcpu_kick(struct vcpu *v);
   68.38 +void vcpu_mark_events_pending(struct vcpu *v);
   68.39  
   68.40  int hvm_local_events_need_delivery(struct vcpu *v);
   68.41  static inline int local_events_need_delivery(void)
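Moving vcpu_kick() out of line pairs with the new VCPU_KICK_SOFTIRQ in asm-x86/softirq.h further down: a kick raised from interrupt context can no longer safely send the event-check IPI directly, so it is presumably deferred. A heavily hedged sketch of that shape; the real out-of-line body lives in arch code and is not part of this hunk:

    void vcpu_kick(struct vcpu *v)
    {
        int running = v->is_running;  /* sampled before unblock, as before */
        vcpu_unblock(v);
        if ( running )
        {
            if ( in_irq() )
                raise_softirq(VCPU_KICK_SOFTIRQ); /* defer IPI to softirq  */
            else
                smp_send_event_check_cpu(v->processor);
        }
    }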
    69.1 --- a/xen/include/asm-x86/fixmap.h	Tue Nov 04 12:07:22 2008 +0900
    69.2 +++ b/xen/include/asm-x86/fixmap.h	Tue Nov 04 12:43:19 2008 +0900
    69.3 @@ -29,6 +29,7 @@
    69.4   * from the end of virtual memory backwards.
    69.5   */
    69.6  enum fixed_addresses {
    69.7 +    FIX_RESERVED, /* Index 0 is reserved since fix_to_virt(0) > FIXADDR_TOP. */
    69.8  #ifdef __i386__
    69.9      FIX_PAE_HIGHMEM_0,
   69.10      FIX_PAE_HIGHMEM_END = FIX_PAE_HIGHMEM_0 + NR_CPUS-1,
    70.1 --- a/xen/include/asm-x86/hvm/vmx/vpmu.h	Tue Nov 04 12:07:22 2008 +0900
    70.2 +++ b/xen/include/asm-x86/hvm/vmx/vpmu.h	Tue Nov 04 12:43:19 2008 +0900
    70.3 @@ -67,7 +67,7 @@ struct vpmu_struct {
    70.4  #define VPMU_CONTEXT_ALLOCATED              0x1
    70.5  #define VPMU_CONTEXT_LOADED                 0x2
    70.6  #define VPMU_RUNNING                        0x4
    70.7 -
    70.8 +#define PASSIVE_DOMAIN_ALLOCATED	    0x8
    70.9  int vpmu_do_wrmsr(struct cpu_user_regs *regs);
   70.10  int vpmu_do_rdmsr(struct cpu_user_regs *regs);
   70.11  int vpmu_do_interrupt(struct cpu_user_regs *regs);
    71.1 --- a/xen/include/asm-x86/hvm/vmx/vpmu_core2.h	Tue Nov 04 12:07:22 2008 +0900
    71.2 +++ b/xen/include/asm-x86/hvm/vmx/vpmu_core2.h	Tue Nov 04 12:43:19 2008 +0900
    71.3 @@ -23,28 +23,6 @@
    71.4  #ifndef __ASM_X86_HVM_VPMU_CORE_H_
    71.5  #define __ASM_X86_HVM_VPMU_CORE_H_
    71.6  
    71.7 -/* Core 2 Non-architectual Performance Counter MSRs. */
    71.8 -u32 core2_counters_msr[] =   {
    71.9 -    MSR_CORE_PERF_FIXED_CTR0,
   71.10 -    MSR_CORE_PERF_FIXED_CTR1,
   71.11 -    MSR_CORE_PERF_FIXED_CTR2};
   71.12 -
   71.13 -/* Core 2 Non-architectual Performance Control MSRs. */
   71.14 -u32 core2_ctrls_msr[] = {
   71.15 -    MSR_CORE_PERF_FIXED_CTR_CTRL,
   71.16 -    MSR_IA32_PEBS_ENABLE,
   71.17 -    MSR_IA32_DS_AREA};
   71.18 -
   71.19 -struct pmumsr core2_counters = {
   71.20 -    3,
   71.21 -    core2_counters_msr
   71.22 -};
   71.23 -
   71.24 -struct pmumsr core2_ctrls = {
   71.25 -    3,
   71.26 -    core2_ctrls_msr
   71.27 -};
   71.28 -
   71.29  struct arch_msr_pair {
   71.30      u64 counter;
   71.31      u64 control;
    72.1 --- a/xen/include/asm-x86/hvm/vpt.h	Tue Nov 04 12:07:22 2008 +0900
    72.2 +++ b/xen/include/asm-x86/hvm/vpt.h	Tue Nov 04 12:43:19 2008 +0900
    72.3 @@ -32,41 +32,6 @@
    72.4  #include <asm/hvm/irq.h>
    72.5  #include <public/hvm/save.h>
    72.6  
    72.7 -struct HPETState;
    72.8 -struct HPET_timer_fn_info {
    72.9 -    struct HPETState *hs;
   72.10 -    unsigned int tn;
   72.11 -};
   72.12 -
   72.13 -struct hpet_registers {
   72.14 -    /* Memory-mapped, software visible registers */
   72.15 -    uint64_t capability;        /* capabilities */
   72.16 -    uint64_t config;            /* configuration */
   72.17 -    uint64_t isr;               /* interrupt status reg */
   72.18 -    uint64_t mc64;              /* main counter */
   72.19 -    struct {                    /* timers */
   72.20 -        uint64_t config;        /* configuration/cap */
   72.21 -        uint64_t cmp;           /* comparator */
   72.22 -        uint64_t fsb;           /* FSB route, not supported now */
   72.23 -    } timers[HPET_TIMER_NUM];
   72.24 -
   72.25 -    /* Hidden register state */
   72.26 -    uint64_t period[HPET_TIMER_NUM]; /* Last value written to comparator */
   72.27 -};
   72.28 -
   72.29 -typedef struct HPETState {
   72.30 -    struct hpet_registers hpet;
   72.31 -    struct vcpu *vcpu;
   72.32 -    uint64_t stime_freq;
   72.33 -    uint64_t hpet_to_ns_scale; /* hpet ticks to ns (multiplied by 2^10) */
   72.34 -    uint64_t hpet_to_ns_limit; /* max hpet ticks convertable to ns      */
   72.35 -    uint64_t mc_offset;
   72.36 -    struct timer timers[HPET_TIMER_NUM];
   72.37 -    struct HPET_timer_fn_info timer_fn_info[HPET_TIMER_NUM]; 
   72.38 -    spinlock_t lock;
   72.39 -} HPETState;
   72.40 -
   72.41 -
   72.42  /*
   72.43   * Abstract layer of periodic time, one short time.
   72.44   */
   72.45 @@ -108,6 +73,34 @@ typedef struct PITState {
   72.46      spinlock_t lock;
   72.47  } PITState;
   72.48  
   72.49 +struct hpet_registers {
   72.50 +    /* Memory-mapped, software visible registers */
   72.51 +    uint64_t capability;        /* capabilities */
   72.52 +    uint64_t config;            /* configuration */
   72.53 +    uint64_t isr;               /* interrupt status reg */
   72.54 +    uint64_t mc64;              /* main counter */
   72.55 +    struct {                    /* timers */
   72.56 +        uint64_t config;        /* configuration/cap */
   72.57 +        uint64_t cmp;           /* comparator */
    72.58 +        uint64_t fsb;           /* FSB route, not currently supported */
   72.59 +    } timers[HPET_TIMER_NUM];
   72.60 +
   72.61 +    /* Hidden register state */
   72.62 +    uint64_t period[HPET_TIMER_NUM]; /* Last value written to comparator */
   72.63 +    uint64_t comparator64[HPET_TIMER_NUM]; /* 64 bit running comparator */
   72.64 +};
   72.65 +
   72.66 +typedef struct HPETState {
   72.67 +    struct hpet_registers hpet;
   72.68 +    struct vcpu *vcpu;
   72.69 +    uint64_t stime_freq;
   72.70 +    uint64_t hpet_to_ns_scale; /* hpet ticks to ns (multiplied by 2^10) */
    72.71 +    uint64_t hpet_to_ns_limit; /* max hpet ticks convertible to ns      */
   72.72 +    uint64_t mc_offset;
   72.73 +    struct periodic_time pt[HPET_TIMER_NUM];
   72.74 +    spinlock_t lock;
   72.75 +} HPETState;
   72.76 +
   72.77  typedef struct RTCState {
   72.78      /* Hardware state */
   72.79      struct hvm_hw_rtc hw;
   72.80 @@ -160,13 +153,13 @@ void pt_migrate(struct vcpu *v);
   72.81   * The given periodic timer structure must be initialised with zero bytes,
   72.82   * except for the 'source' field which must be initialised with the
   72.83   * correct PTSRC_ value. The initialised timer structure can then be passed
   72.84 - * to {create,destroy}_periodic_time() and number of times and in any order.
   72.85 + * to {create,destroy}_periodic_time() any number of times and in any order.
   72.86   * Note that, for a given periodic timer, invocations of these functions MUST
   72.87   * be serialised.
   72.88   */
   72.89  void create_periodic_time(
   72.90 -    struct vcpu *v, struct periodic_time *pt, uint64_t period,
   72.91 -    uint8_t irq, char one_shot, time_cb *cb, void *data);
   72.92 +    struct vcpu *v, struct periodic_time *pt, uint64_t delta,
   72.93 +    uint64_t period, uint8_t irq, time_cb *cb, void *data);
   72.94  void destroy_periodic_time(struct periodic_time *pt);
   72.95  
   72.96  int pv_pit_handler(int port, int data, int write);
   72.97 @@ -185,7 +178,6 @@ void pmtimer_init(struct vcpu *v);
   72.98  void pmtimer_deinit(struct domain *d);
   72.99  void pmtimer_reset(struct domain *d);
  72.100  
  72.101 -void hpet_migrate_timers(struct vcpu *v);
  72.102  void hpet_init(struct vcpu *v);
  72.103  void hpet_deinit(struct domain *d);
  72.104  void hpet_reset(struct domain *d);
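create_periodic_time() now takes an explicit delta (time until the first expiry) separately from period, and the old one_shot flag is gone. A hedged usage sketch for one HPET timer slot, assuming period == 0 now encodes one-shot and with hpet_timer_fn as a hypothetical callback:

    create_periodic_time(v, &h->pt[tn],
                         delta,   /* ns until the first tick         */
                         period,  /* ns between ticks; 0 => one-shot */
                         irq, hpet_timer_fn, h);
    ...
    destroy_periodic_time(&h->pt[tn]);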
    73.1 --- a/xen/include/asm-x86/mm.h	Tue Nov 04 12:07:22 2008 +0900
    73.2 +++ b/xen/include/asm-x86/mm.h	Tue Nov 04 12:43:19 2008 +0900
    73.3 @@ -61,12 +61,36 @@ struct page_info
    73.4          /*
    73.5           * When PGT_partial is true then this field is valid and indicates
    73.6           * that PTEs in the range [0, @nr_validated_ptes) have been validated.
    73.7 -         * If @partial_pte is true then PTE at @nr_validated_ptes+1 has been
    73.8 -         * partially validated.
    73.9 +         * An extra page reference must be acquired (or not dropped) whenever
   73.10 +         * PGT_partial gets set, and it must be dropped when the flag gets
    73.11 +         * cleared. This is so that a get() leaving a page in a partially
    73.12 +         * validated state (where the caller would drop the reference acquired
   73.13 +         * due to the getting of the type [apparently] failing [-EAGAIN])
   73.14 +         * would not accidentally result in a page left with zero general
   73.15 +         * reference count, but non-zero type reference count (possible when
   73.16 +         * the partial get() is followed immediately by domain destruction).
   73.17 +         * Likewise, the ownership of the single type reference for partially
   73.18 +         * (in-)validated pages is tied to this flag, i.e. the instance
   73.19 +         * setting the flag must not drop that reference, whereas the instance
   73.20 +         * clearing it will have to.
   73.21 +         *
   73.22 +         * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has
   73.23 +         * been partially validated. This implies that the general reference
   73.24 +         * to the page (acquired from get_page_from_lNe()) would be dropped
   73.25 +         * (again due to the apparent failure) and hence must be re-acquired
   73.26 +         * when resuming the validation, but must not be dropped when picking
   73.27 +         * up the page for invalidation.
   73.28 +         *
   73.29 +         * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has
   73.30 +         * been partially invalidated. This is basically the opposite case of
   73.31 +         * above, i.e. the general reference to the page was not dropped in
   73.32 +         * put_page_from_lNe() (due to the apparent failure), and hence it
   73.33 +         * must be dropped when the put operation is resumed (and completes),
   73.34 +         * but it must not be acquired if picking up the page for validation.
   73.35           */
   73.36          struct {
   73.37              u16 nr_validated_ptes;
   73.38 -            bool_t partial_pte;
   73.39 +            s8 partial_pte;
   73.40          };
   73.41  
   73.42          /*
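A sketch, not part of this changeset, of how a resuming pin/unpin path might
consume the new tri-state field (the real consumers live in
xen/arch/x86/mm.c; the control flow below is illustrative only):

    static void resume_partial(struct page_info *page)
    {
        if ( page->partial_pte > 0 )
        {
            /* Partially validated: the general reference was dropped on
             * the earlier apparent failure, so re-acquire it before
             * resuming validation (but not when invalidating instead). */
        }
        else if ( page->partial_pte < 0 )
        {
            /* Partially invalidated: the general reference is still held
             * and must be dropped once the resumed put completes. */
        }
        /* partial_pte == 0: no partially handled PTE at all. */
    }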
    74.1 --- a/xen/include/asm-x86/page.h	Tue Nov 04 12:07:22 2008 +0900
    74.2 +++ b/xen/include/asm-x86/page.h	Tue Nov 04 12:43:19 2008 +0900
    74.3 @@ -314,6 +314,9 @@ unsigned long clone_idle_pagetable(struc
    74.4  #define __PAGE_HYPERVISOR_NOCACHE \
    74.5      (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED)
    74.6  
    74.7 +#define GRANT_PTE_FLAGS \
    74.8 +    (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_NX | _PAGE_GNTTAB)
    74.9 +
   74.10  #ifndef __ASSEMBLY__
   74.11  
   74.12  static inline int get_order_from_bytes(paddr_t size)
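Consolidating GRANT_PTE_FLAGS here (the per-subarch copies are deleted below)
lets grant-table code construct mapping PTEs uniformly; a one-line sketch
using the existing l1e_from_pfn() helper ('frame' is a hypothetical MFN):

    l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);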
    75.1 --- a/xen/include/asm-x86/softirq.h	Tue Nov 04 12:07:22 2008 +0900
    75.2 +++ b/xen/include/asm-x86/softirq.h	Tue Nov 04 12:43:19 2008 +0900
    75.3 @@ -3,7 +3,8 @@
    75.4  
    75.5  #define NMI_MCE_SOFTIRQ        (NR_COMMON_SOFTIRQS + 0)
    75.6  #define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1)
    75.7 +#define VCPU_KICK_SOFTIRQ      (NR_COMMON_SOFTIRQS + 2)
    75.8  
    75.9 -#define NR_ARCH_SOFTIRQS       2
   75.10 +#define NR_ARCH_SOFTIRQS       3
   75.11  
   75.12  #endif /* __ASM_SOFTIRQ_H__ */
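A sketch of wiring up the new softirq with the existing
open_softirq()/raise_softirq() primitives (the handler name is hypothetical):

    static void vcpu_kick_softirq(void)
    {
        /* Deferred vcpu-kick work would run here. */
    }

    /* At initialisation: */
    open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq);

    /* From interrupt context, to defer the kick: */
    raise_softirq(VCPU_KICK_SOFTIRQ);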
    76.1 --- a/xen/include/asm-x86/x86_32/page.h	Tue Nov 04 12:07:22 2008 +0900
    76.2 +++ b/xen/include/asm-x86/x86_32/page.h	Tue Nov 04 12:43:19 2008 +0900
    76.3 @@ -105,9 +105,6 @@ extern unsigned int PAGE_HYPERVISOR_NOCA
    76.4  #define get_pte_flags(x) (((int)((x) >> 32) & ~0xFFF) | ((int)(x) & 0xFFF))
    76.5  #define put_pte_flags(x) (((intpte_t)((x) & ~0xFFF) << 32) | ((x) & 0xFFF))
    76.6  
    76.7 -#define GRANT_PTE_FLAGS \
    76.8 -    (_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_GNTTAB)
    76.9 -
   76.10  /*
   76.11   * Disallow unused flag bits plus PAT/PSE, PCD, PWT and GLOBAL.
   76.12   * Permit the NX bit if the hardware supports it.
    77.1 --- a/xen/include/asm-x86/x86_64/page.h	Tue Nov 04 12:07:22 2008 +0900
    77.2 +++ b/xen/include/asm-x86/x86_64/page.h	Tue Nov 04 12:43:19 2008 +0900
    77.3 @@ -119,14 +119,11 @@ typedef l4_pgentry_t root_pgentry_t;
    77.4  #define L3_DISALLOW_MASK (BASE_DISALLOW_MASK)
    77.5  #define L4_DISALLOW_MASK (BASE_DISALLOW_MASK)
    77.6  
    77.7 -#define COMPAT_L3_DISALLOW_MASK 0xFFFFF1FEU
    77.8 +#define COMPAT_L3_DISALLOW_MASK 0xFFFFF198U
    77.9  
   77.10  #define PAGE_HYPERVISOR         (__PAGE_HYPERVISOR         | _PAGE_GLOBAL)
   77.11  #define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL)
   77.12  
   77.13 -#define GRANT_PTE_FLAGS \
   77.14 -    (_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_GNTTAB|_PAGE_USER)
   77.15 -
   77.16  #define USER_MAPPINGS_ARE_GLOBAL
   77.17  #ifdef USER_MAPPINGS_ARE_GLOBAL
   77.18  /*
    78.1 --- a/xen/include/asm-x86/xenoprof.h	Tue Nov 04 12:07:22 2008 +0900
    78.2 +++ b/xen/include/asm-x86/xenoprof.h	Tue Nov 04 12:43:19 2008 +0900
    78.3 @@ -64,6 +64,9 @@ void xenoprof_backtrace(
    78.4                   "xenoprof/x86 with autotranslated mode enabled"    \
    78.5                   "isn't supported yet\n");                          \
    78.6      } while (0)
    78.7 +int passive_domain_do_rdmsr(struct cpu_user_regs *regs);
    78.8 +int passive_domain_do_wrmsr(struct cpu_user_regs *regs);
    78.9 +void passive_domain_destroy(struct vcpu *v);
   78.10  
   78.11  #endif /* __ASM_X86_XENOPROF_H__ */
   78.12  
    79.1 --- a/xen/include/public/features.h	Tue Nov 04 12:07:22 2008 +0900
    79.2 +++ b/xen/include/public/features.h	Tue Nov 04 12:43:19 2008 +0900
    79.3 @@ -59,6 +59,9 @@
    79.4  /* x86: Does this Xen host support the MMU_PT_UPDATE_PRESERVE_AD hypercall? */
    79.5  #define XENFEAT_mmu_pt_update_preserve_ad  5
    79.6  
    79.7 +/* x86: Does this Xen host support the MMU_{CLEAR,COPY}_PAGE hypercall? */
    79.8 +#define XENFEAT_highmem_assist             6
    79.9 +
   79.10  #define XENFEAT_NR_SUBMAPS 1
   79.11  
   79.12  #endif /* __XEN_PUBLIC_FEATURES_H__ */
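A guest-side sketch for probing the new bit through the existing
XENVER_get_features interface (error handling elided; bit 6 lives in
submap 0):

    struct xen_feature_info fi = { .submap_idx = 0 };

    if ( HYPERVISOR_xen_version(XENVER_get_features, &fi) == 0 &&
         (fi.submap & (1u << XENFEAT_highmem_assist)) )
    {
        /* MMUEXT_CLEAR_PAGE and MMUEXT_COPY_PAGE are available. */
    }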
    80.1 --- a/xen/include/public/trace.h	Tue Nov 04 12:07:22 2008 +0900
    80.2 +++ b/xen/include/public/trace.h	Tue Nov 04 12:43:19 2008 +0900
    80.3 @@ -142,7 +142,9 @@
    80.4  #define TRC_HVM_INVLPG64        (TRC_HVM_HANDLER + TRC_64_FLAG + 0x14)
    80.5  #define TRC_HVM_MCE             (TRC_HVM_HANDLER + 0x15)
    80.6  #define TRC_HVM_IO_ASSIST       (TRC_HVM_HANDLER + 0x16)
    80.7 +#define TRC_HVM_IO_ASSIST64     (TRC_HVM_HANDLER + TRC_64_FLAG + 0x16)
    80.8  #define TRC_HVM_MMIO_ASSIST     (TRC_HVM_HANDLER + 0x17)
    80.9 +#define TRC_HVM_MMIO_ASSIST64   (TRC_HVM_HANDLER + TRC_64_FLAG + 0x17)
   80.10  #define TRC_HVM_CLTS            (TRC_HVM_HANDLER + 0x18)
   80.11  #define TRC_HVM_LMSW            (TRC_HVM_HANDLER + 0x19)
   80.12  #define TRC_HVM_LMSW64          (TRC_HVM_HANDLER + TRC_64_FLAG + 0x19)
    81.1 --- a/xen/include/public/xen.h	Tue Nov 04 12:07:22 2008 +0900
    81.2 +++ b/xen/include/public/xen.h	Tue Nov 04 12:43:19 2008 +0900
    81.3 @@ -231,6 +231,13 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
    81.4   * cmd: MMUEXT_SET_LDT
    81.5   * linear_addr: Linear address of LDT base (NB. must be page-aligned).
    81.6   * nr_ents: Number of entries in LDT.
    81.7 + *
    81.8 + * cmd: MMUEXT_CLEAR_PAGE
    81.9 + * mfn: Machine frame number to be cleared.
   81.10 + *
   81.11 + * cmd: MMUEXT_COPY_PAGE
   81.12 + * mfn: Machine frame number of the destination page.
   81.13 + * src_mfn: Machine frame number of the source page.
   81.14   */
   81.15  #define MMUEXT_PIN_L1_TABLE      0
   81.16  #define MMUEXT_PIN_L2_TABLE      1
   81.17 @@ -247,12 +254,15 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
   81.18  #define MMUEXT_FLUSH_CACHE      12
   81.19  #define MMUEXT_SET_LDT          13
   81.20  #define MMUEXT_NEW_USER_BASEPTR 15
   81.21 +#define MMUEXT_CLEAR_PAGE       16
   81.22 +#define MMUEXT_COPY_PAGE        17
   81.23  
   81.24  #ifndef __ASSEMBLY__
   81.25  struct mmuext_op {
   81.26      unsigned int cmd;
   81.27      union {
   81.28 -        /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
   81.29 +        /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR
   81.30 +         * CLEAR_PAGE, COPY_PAGE */
   81.31          xen_pfn_t     mfn;
   81.32          /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
   81.33          unsigned long linear_addr;
   81.34 @@ -266,6 +276,8 @@ struct mmuext_op {
   81.35  #else
   81.36          void *vcpumask;
   81.37  #endif
   81.38 +        /* COPY_PAGE */
   81.39 +        xen_pfn_t src_mfn;
   81.40      } arg2;
   81.41  };
   81.42  typedef struct mmuext_op mmuext_op_t;
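A sketch of a PV guest issuing the new commands through the usual
HYPERVISOR_mmuext_op() wrapper ('dst_mfn' and 'src_mfn' are hypothetical
machine frame numbers):

    struct mmuext_op op;

    /* Zero one machine page. */
    op.cmd = MMUEXT_CLEAR_PAGE;
    op.arg1.mfn = dst_mfn;
    (void)HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);

    /* Copy src_mfn onto dst_mfn. */
    op.cmd = MMUEXT_COPY_PAGE;
    op.arg1.mfn = dst_mfn;
    op.arg2.src_mfn = src_mfn;
    (void)HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);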
    82.1 --- a/xen/include/xen/cpuidle.h	Tue Nov 04 12:07:22 2008 +0900
    82.2 +++ b/xen/include/xen/cpuidle.h	Tue Nov 04 12:43:19 2008 +0900
    82.3 @@ -30,12 +30,18 @@
    82.4  #define ACPI_PROCESSOR_MAX_POWER        8
    82.5  #define CPUIDLE_NAME_LEN                16
    82.6  
    82.7 +#define ACPI_CSTATE_EM_NONE     0
    82.8 +#define ACPI_CSTATE_EM_SYSIO    1
    82.9 +#define ACPI_CSTATE_EM_FFH      2
   82.10 +#define ACPI_CSTATE_EM_HALT     3
   82.11 +
   82.12  struct acpi_processor_cx
   82.13  {
   82.14 +    u8 idx;
   82.15      u8 valid;
   82.16      u8 type;
   82.17      u32 address;
   82.18 -    u8 space_id;
   82.19 +    u8 entry_method; /* ACPI_CSTATE_EM_xxx */
   82.20      u32 latency;
   82.21      u32 latency_ticks;
   82.22      u32 power;
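A sketch of dispatching on the new entry_method field instead of re-deriving
the entry mechanism from the old space_id (control flow illustrative only;
inb() and safe_halt() are existing primitives):

    switch ( cx->entry_method )
    {
    case ACPI_CSTATE_EM_FFH:
        /* Fixed-function hardware, e.g. MWAIT-based entry. */
        break;
    case ACPI_CSTATE_EM_SYSIO:
        inb(cx->address);    /* I/O port read enters the C-state */
        break;
    case ACPI_CSTATE_EM_HALT:
    default:
        safe_halt();
        break;
    }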
    83.1 --- a/xen/include/xen/domain_page.h	Tue Nov 04 12:07:22 2008 +0900
    83.2 +++ b/xen/include/xen/domain_page.h	Tue Nov 04 12:43:19 2008 +0900
    83.3 @@ -24,7 +24,7 @@ void *map_domain_page(unsigned long mfn)
    83.4   * Pass a VA within a page previously mapped in the context of the
    83.5   * currently-executing VCPU via a call to map_domain_page().
    83.6   */
    83.7 -void unmap_domain_page(void *va);
    83.8 +void unmap_domain_page(const void *va);
    83.9  
   83.10  /*
   83.11   * Similar to the above calls, except the mapping is accessible in all
   83.12 @@ -32,7 +32,7 @@ void unmap_domain_page(void *va);
   83.13   * mappings can also be unmapped from any context.
   83.14   */
   83.15  void *map_domain_page_global(unsigned long mfn);
   83.16 -void unmap_domain_page_global(void *va);
   83.17 +void unmap_domain_page_global(const void *va);
   83.18  
   83.19  #define DMCACHE_ENTRY_VALID 1U
   83.20  #define DMCACHE_ENTRY_HELD  2U
   83.21 @@ -75,7 +75,7 @@ map_domain_page_with_cache(unsigned long
   83.22  }
   83.23  
   83.24  static inline void
   83.25 -unmap_domain_page_with_cache(void *va, struct domain_mmap_cache *cache)
   83.26 +unmap_domain_page_with_cache(const void *va, struct domain_mmap_cache *cache)
   83.27  {
   83.28      ASSERT(cache != NULL);
   83.29      cache->flags &= ~DMCACHE_ENTRY_HELD;
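The const-qualified prototypes let a read-only mapping stay const all the way
to the unmap; a minimal sketch ('src_mfn' and 'dst' are hypothetical):

    const char *src = map_domain_page(src_mfn);
    memcpy(dst, src, PAGE_SIZE);
    unmap_domain_page(src);   /* no cast needed now that it takes const void * */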
    84.1 --- a/xen/include/xen/spinlock.h	Tue Nov 04 12:07:22 2008 +0900
    84.2 +++ b/xen/include/xen/spinlock.h	Tue Nov 04 12:43:19 2008 +0900
    84.3 @@ -5,21 +5,38 @@
    84.4  #include <asm/system.h>
    84.5  #include <asm/spinlock.h>
    84.6  
    84.7 +#ifndef NDEBUG
    84.8 +struct lock_debug {
    84.9 +    int irq_safe; /* +1: IRQ-safe; 0: not IRQ-safe; -1: don't know yet */
   84.10 +};
   84.11 +#define _LOCK_DEBUG { -1 }
   84.12 +void spin_debug_enable(void);
   84.13 +void spin_debug_disable(void);
   84.14 +#else
   84.15 +struct lock_debug { };
   84.16 +#define _LOCK_DEBUG { }
   84.17 +#define spin_debug_enable() ((void)0)
   84.18 +#define spin_debug_disable() ((void)0)
   84.19 +#endif
   84.20 +
   84.21  typedef struct {
   84.22      raw_spinlock_t raw;
   84.23      u16 recurse_cpu:12;
   84.24      u16 recurse_cnt:4;
   84.25 +    struct lock_debug debug;
   84.26  } spinlock_t;
   84.27  
   84.28 -#define SPIN_LOCK_UNLOCKED { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0 }
   84.29 +
   84.30 +#define SPIN_LOCK_UNLOCKED { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0, _LOCK_DEBUG }
   84.31  #define DEFINE_SPINLOCK(l) spinlock_t l = SPIN_LOCK_UNLOCKED
   84.32  #define spin_lock_init(l) (*(l) = (spinlock_t)SPIN_LOCK_UNLOCKED)
   84.33  
   84.34  typedef struct {
   84.35      raw_rwlock_t raw;
   84.36 +    struct lock_debug debug;
   84.37  } rwlock_t;
   84.38  
   84.39 -#define RW_LOCK_UNLOCKED { _RAW_RW_LOCK_UNLOCKED }
   84.40 +#define RW_LOCK_UNLOCKED { _RAW_RW_LOCK_UNLOCKED, _LOCK_DEBUG }
   84.41  #define DEFINE_RWLOCK(l) rwlock_t l = RW_LOCK_UNLOCKED
   84.42  #define rwlock_init(l) (*(l) = (rwlock_t)RW_LOCK_UNLOCKED)
   84.43  
   84.44 @@ -34,6 +51,7 @@ void _spin_unlock_irqrestore(spinlock_t 
   84.45  int _spin_is_locked(spinlock_t *lock);
   84.46  int _spin_trylock(spinlock_t *lock);
   84.47  void _spin_barrier(spinlock_t *lock);
   84.48 +void _spin_barrier_irq(spinlock_t *lock);
   84.49  
   84.50  void _spin_lock_recursive(spinlock_t *lock);
   84.51  void _spin_unlock_recursive(spinlock_t *lock);
   84.52 @@ -67,6 +85,7 @@ void _write_unlock_irqrestore(rwlock_t *
   84.53  
   84.54  /* Ensure a lock is quiescent between two critical operations. */
   84.55  #define spin_barrier(l)               _spin_barrier(l)
   84.56 +#define spin_barrier_irq(l)           _spin_barrier_irq(l)
   84.57  
   84.58  /*
   84.59   * spin_[un]lock_recursive(): Use these forms when the lock can (safely!) be
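A sketch of the new barrier variant in use, assuming spin_barrier_irq() is
the IRQ-safe analogue of spin_barrier(), waiting until no CPU holds the lock
before the protected object is torn down ('s' is a hypothetical structure):

    spin_lock_init(&s->lock);   /* debug.irq_safe starts at -1, "don't know yet" */

    /* ... on teardown, wait for the lock to go quiescent: */
    spin_barrier_irq(&s->lock);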
    85.1 --- a/xen/include/xen/time.h	Tue Nov 04 12:07:22 2008 +0900
    85.2 +++ b/xen/include/xen/time.h	Tue Nov 04 12:43:19 2008 +0900
    85.3 @@ -52,6 +52,7 @@ struct tm gmtime(unsigned long t);
    85.4  #define SECONDS(_s)     ((s_time_t)((_s)  * 1000000000ULL))
    85.5  #define MILLISECS(_ms)  ((s_time_t)((_ms) * 1000000ULL))
    85.6  #define MICROSECS(_us)  ((s_time_t)((_us) * 1000ULL))
    85.7 +#define STIME_MAX ((s_time_t)((uint64_t)~0ull>>1))
    85.8  
    85.9  extern void update_vcpu_system_time(struct vcpu *v);
   85.10  extern void update_domain_wallclock_time(struct domain *d);
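STIME_MAX evaluates to the largest positive s_time_t (2^63 - 1 ns, roughly
292 years), making it usable as a "never" expiry; an illustrative one-liner:

    s_time_t deadline = timeout_ns ? NOW() + timeout_ns : STIME_MAX;  /* 0 == wait forever */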
    86.1 --- a/xen/include/xen/timer.h	Tue Nov 04 12:07:22 2008 +0900
    86.2 +++ b/xen/include/xen/timer.h	Tue Nov 04 12:43:19 2008 +0900
    86.3 @@ -15,12 +15,13 @@
    86.4  struct timer {
    86.5      /* System time expiry value (nanoseconds since boot). */
    86.6      s_time_t expires;
    86.7 +    s_time_t expires_end;
    86.8  
    86.9      /* Position in active-timer data structure. */
   86.10      union {
   86.11          /* Timer-heap offset. */
   86.12          unsigned int heap_offset;
   86.13 -        /* Overflow linked list. */
   86.14 +        /* Linked list. */
   86.15          struct timer *list_next;
   86.16      };
   86.17  
    87.1 --- a/xen/include/xlat.lst	Tue Nov 04 12:07:22 2008 +0900
    87.2 +++ b/xen/include/xlat.lst	Tue Nov 04 12:43:19 2008 +0900
    87.3 @@ -56,6 +56,6 @@
    87.4  !	processor_flags			platform.h
    87.5  !	processor_power			platform.h
    87.6  !	pct_register			platform.h
    87.7 -!	processor_px			platform.h
    87.8 +?	processor_px			platform.h
    87.9  !	psd_package			platform.h
   87.10  !	processor_performance		platform.h
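For context: in xlat.lst a leading '!' requests generation of XLAT_*
translation macros (the structure's layout differs between native and compat
ABIs), while '?' requests only CHECK_* layout-verification macros; switching
processor_px to '?' therefore records that its native and compat layouts now
agree, so no translation is needed.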