ia64/xen-unstable

changeset 9076:673f62edbfbe

merge
author awilliam@xenbuild.aw
date Wed Mar 01 12:47:25 2006 -0700 (2006-03-01)
parents 88f97bb8f3ae d8451bb6278c
children c1daa52dd0bf
files patches/linux-2.6.16-rc4/i386-mach-io-check-nmi.patch patches/linux-2.6.16-rc4/net-csum.patch patches/linux-2.6.16-rc4/pmd-shared.patch patches/linux-2.6.16-rc4/smp-alts.patch xen/arch/ia64/vmx/vmx_hypercall.c xen/arch/ia64/xen/process.c xen/include/asm-ia64/config.h xen/include/asm-ia64/linux-xen/asm/README.origin xen/include/asm-ia64/linux-xen/asm/uaccess.h
     1.1 --- a/buildconfigs/linux-defconfig_xen0_x86_32	Wed Mar 01 10:01:54 2006 -0700
     1.2 +++ b/buildconfigs/linux-defconfig_xen0_x86_32	Wed Mar 01 12:47:25 2006 -0700
     1.3 @@ -1320,6 +1320,7 @@ CONFIG_XEN_BLKDEV_BACKEND=y
     1.4  # CONFIG_XEN_BLKDEV_TAP_BE is not set
     1.5  CONFIG_XEN_NETDEV_BACKEND=y
     1.6  # CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set
     1.7 +CONFIG_XEN_NETDEV_LOOPBACK=y
     1.8  # CONFIG_XEN_TPMDEV_BACKEND is not set
     1.9  CONFIG_XEN_BLKDEV_FRONTEND=y
    1.10  CONFIG_XEN_NETDEV_FRONTEND=y
     2.1 --- a/buildconfigs/linux-defconfig_xen0_x86_64	Wed Mar 01 10:01:54 2006 -0700
     2.2 +++ b/buildconfigs/linux-defconfig_xen0_x86_64	Wed Mar 01 12:47:25 2006 -0700
     2.3 @@ -1244,6 +1244,7 @@ CONFIG_XEN_BLKDEV_BACKEND=y
     2.4  # CONFIG_XEN_BLKDEV_TAP_BE is not set
     2.5  CONFIG_XEN_NETDEV_BACKEND=y
     2.6  # CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set
     2.7 +CONFIG_XEN_NETDEV_LOOPBACK=y
     2.8  # CONFIG_XEN_TPMDEV_BACKEND is not set
     2.9  CONFIG_XEN_BLKDEV_FRONTEND=y
    2.10  CONFIG_XEN_NETDEV_FRONTEND=y
     3.1 --- a/buildconfigs/linux-defconfig_xen_x86_32	Wed Mar 01 10:01:54 2006 -0700
     3.2 +++ b/buildconfigs/linux-defconfig_xen_x86_32	Wed Mar 01 12:47:25 2006 -0700
     3.3 @@ -2986,6 +2986,7 @@ CONFIG_XEN_BLKDEV_BACKEND=y
     3.4  # CONFIG_XEN_BLKDEV_TAP_BE is not set
     3.5  CONFIG_XEN_NETDEV_BACKEND=y
     3.6  # CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set
     3.7 +CONFIG_XEN_NETDEV_LOOPBACK=y
     3.8  # CONFIG_XEN_TPMDEV_BACKEND is not set
     3.9  CONFIG_XEN_BLKDEV_FRONTEND=y
    3.10  CONFIG_XEN_NETDEV_FRONTEND=y
     4.1 --- a/buildconfigs/linux-defconfig_xen_x86_64	Wed Mar 01 10:01:54 2006 -0700
     4.2 +++ b/buildconfigs/linux-defconfig_xen_x86_64	Wed Mar 01 12:47:25 2006 -0700
     4.3 @@ -2656,6 +2656,7 @@ CONFIG_XEN_BLKDEV_BACKEND=y
     4.4  # CONFIG_XEN_BLKDEV_TAP_BE is not set
     4.5  CONFIG_XEN_NETDEV_BACKEND=y
     4.6  # CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set
     4.7 +CONFIG_XEN_NETDEV_LOOPBACK=y
     4.8  # CONFIG_XEN_TPMDEV_BACKEND is not set
     4.9  CONFIG_XEN_BLKDEV_FRONTEND=y
    4.10  CONFIG_XEN_NETDEV_FRONTEND=y
     5.1 --- a/buildconfigs/mk.linux-2.6-xen	Wed Mar 01 10:01:54 2006 -0700
     5.2 +++ b/buildconfigs/mk.linux-2.6-xen	Wed Mar 01 12:47:25 2006 -0700
     5.3 @@ -2,8 +2,8 @@
     5.4  OS           = linux
     5.5  
     5.6  LINUX_SERIES = 2.6
     5.7 -LINUX_VER    = 2.6.16-rc4
     5.8 -LINUX_SRCS = linux-2.6.15.tar.bz2 patch-2.6.16-rc4.bz2
     5.9 +LINUX_VER    = 2.6.16-rc5
    5.10 +LINUX_SRCS = linux-2.6.15.tar.bz2 patch-2.6.16-rc5.bz2
    5.11  LINUX_PDIR = linux-$(LINUX_VER)
    5.12  
    5.13  EXTRAVERSION ?= xen
    5.14 @@ -34,7 +34,7 @@ pristine-$(LINUX_PDIR)/.valid-srcs: $(LI
    5.15  	touch $(@D)/.hgskip
    5.16  	touch $@
    5.17  
    5.18 -pristine-linux-%.16-rc4/.valid-pristine: pristine-$(LINUX_PDIR)/.valid-srcs
    5.19 +pristine-linux-%.16-rc5/.valid-pristine: pristine-$(LINUX_PDIR)/.valid-srcs
    5.20  	touch $@ # update timestamp to avoid rebuild
    5.21  
    5.22  $(LINUX_DIR)/include/linux/autoconf.h: ref-$(OS)-$(LINUX_VER)/.valid-ref
     6.1 --- a/docs/src/user.tex	Wed Mar 01 10:01:54 2006 -0700
     6.2 +++ b/docs/src/user.tex	Wed Mar 01 12:47:25 2006 -0700
     6.3 @@ -626,7 +626,7 @@ kernel output or logging in to Linux ove
     6.4  allow you to monitor and log the Xen boot process via serial console and
     6.5  can be very useful in debugging.
     6.6  
     6.7 -%% kernel /boot/xen-2.0.gz dom0_mem=131072 com1=115200,8n1
     6.8 +%% kernel /boot/xen-2.0.gz dom0_mem=131072 console=com1,vga com1=115200,8n1
     6.9  %% module /boot/vmlinuz-2.6-xen0 root=/dev/sda4 ro
    6.10  
    6.11  In order to configure Xen serial console output, it is necessary to
    6.12 @@ -637,8 +637,9 @@ example kernel line with:
    6.13  \end{verbatim}}
    6.14  \end{quote}
    6.15  
    6.16 -This configures Xen to output on COM1 at 115,200 baud, 8 data bits, 1
    6.17 -stop bit and no parity. Modify these parameters for your environment.
    6.18 +This configures Xen to output on COM1 at 115,200 baud, 8 data bits, no
    6.19 +parity and 1 stop bit. Modify these parameters for your environment.
    6.20 +See Section~\ref{s:xboot} for an explanation of all boot parameters.
    6.21  
    6.22  One can also configure XenLinux to share the serial console; to achieve
    6.23  this append ``\path{console=ttyS0}'' to your module line.
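
A note on the framing fixed in this hunk: "115200,8n1" means 115,200 baud, 8 data bits, no parity, 1 stop bit, and the terminal at the far end of the serial cable must be configured to match. As a minimal illustration only, a userspace sketch using POSIX termios (assuming a host-side port such as /dev/ttyS0, opened elsewhere):

    #include <termios.h>

    /* Put an already-opened serial fd into raw 115200 8n1 mode,
     * matching Xen's com1=115200,8n1 framing. */
    static int set_115200_8n1(int fd)
    {
        struct termios t;

        if (tcgetattr(fd, &t) < 0)
            return -1;
        cfmakeraw(&t);
        cfsetispeed(&t, B115200);
        cfsetospeed(&t, B115200);
        t.c_cflag &= ~(PARENB | CSTOPB | CSIZE); /* no parity, one stop bit */
        t.c_cflag |= CS8 | CLOCAL | CREAD;       /* eight data bits */
        return tcsetattr(fd, TCSANOW, &t);
    }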
     7.1 --- a/linux-2.6-xen-sparse/arch/i386/Kconfig	Wed Mar 01 10:01:54 2006 -0700
     7.2 +++ b/linux-2.6-xen-sparse/arch/i386/Kconfig	Wed Mar 01 12:47:25 2006 -0700
     7.3 @@ -770,7 +770,7 @@ config PHYSICAL_START
     7.4  
     7.5  config HOTPLUG_CPU
     7.6  	bool "Support for hot-pluggable CPUs (EXPERIMENTAL)"
     7.7 -	depends on SMP && HOTPLUG && EXPERIMENTAL
     7.8 +	depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER
     7.9  	---help---
    7.10  	  Say Y here to experiment with turning CPUs off and on.  CPUs
    7.11  	  can be controlled through /sys/devices/system/cpu.
    7.12 @@ -1122,6 +1122,7 @@ endif
    7.13  
    7.14  config KPROBES
    7.15  	bool "Kprobes (EXPERIMENTAL)"
    7.16 +	depends on EXPERIMENTAL && MODULES
    7.17  	help
    7.18  	  Kprobes allows you to trap at almost any kernel address and
    7.19  	  execute a callback function.  register_kprobe() establishes
     8.1 --- a/linux-2.6-xen-sparse/arch/i386/kernel/Makefile	Wed Mar 01 10:01:54 2006 -0700
     8.2 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/Makefile	Wed Mar 01 12:47:25 2006 -0700
     8.3 @@ -7,7 +7,7 @@ extra-y := head.o init_task.o vmlinux.ld
     8.4  obj-y	:= process.o semaphore.o signal.o entry.o traps.o irq.o \
     8.5  		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
     8.6  		pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \
     8.7 -		quirks.o i8237.o
     8.8 +		quirks.o i8237.o topology.o
     8.9  
    8.10  obj-y				+= cpu/
    8.11  obj-y				+= timers/
     9.1 --- a/linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot-xen.c	Wed Mar 01 10:01:54 2006 -0700
     9.2 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot-xen.c	Wed Mar 01 12:47:25 2006 -0700
     9.3 @@ -44,9 +44,6 @@ extern void __init clustered_apic_check(
     9.4  extern int gsi_irq_sharing(int gsi);
     9.5  #include <asm/proto.h>
     9.6  
     9.7 -static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; }
     9.8 -
     9.9 -
    9.10  #else				/* X86 */
    9.11  
    9.12  #ifdef	CONFIG_X86_LOCAL_APIC
    10.1 --- a/linux-2.6-xen-sparse/arch/i386/kernel/cpu/common-xen.c	Wed Mar 01 10:01:54 2006 -0700
    10.2 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/cpu/common-xen.c	Wed Mar 01 12:47:25 2006 -0700
    10.3 @@ -4,6 +4,7 @@
    10.4  #include <linux/smp.h>
    10.5  #include <linux/module.h>
    10.6  #include <linux/percpu.h>
    10.7 +#include <linux/bootmem.h>
    10.8  #include <asm/semaphore.h>
    10.9  #include <asm/processor.h>
   10.10  #include <asm/i387.h>
   10.11 @@ -19,6 +20,9 @@
   10.12  
   10.13  #include "cpu.h"
   10.14  
   10.15 +DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
   10.16 +EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
   10.17 +
   10.18  #ifndef CONFIG_XEN
   10.19  DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
   10.20  EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
   10.21 @@ -598,6 +602,8 @@ void __cpuinit cpu_init(void)
   10.22  	struct tss_struct * t = &per_cpu(init_tss, cpu);
   10.23  #endif
   10.24  	struct thread_struct *thread = &current->thread;
   10.25 +	struct desc_struct *gdt;
   10.26 +	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
   10.27  
   10.28  	if (cpu_test_and_set(cpu, cpu_initialized)) {
   10.29  		printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
   10.30 @@ -614,7 +620,54 @@ void __cpuinit cpu_init(void)
   10.31  		set_in_cr4(X86_CR4_TSD);
   10.32  	}
   10.33  
   10.34 -	cpu_gdt_init(&cpu_gdt_descr[cpu]);
   10.35 +#ifndef CONFIG_XEN
   10.36 +	/*
   10.37 +	 * This is a horrible hack to allocate the GDT.  The problem
   10.38 +	 * is that cpu_init() is called really early for the boot CPU
   10.39 +	 * (and hence needs bootmem) but much later for the secondary
   10.40 +	 * CPUs, when bootmem will have gone away
   10.41 +	 */
   10.42 +	if (NODE_DATA(0)->bdata->node_bootmem_map) {
   10.43 +		gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
   10.44 +		/* alloc_bootmem_pages panics on failure, so no check */
   10.45 +		memset(gdt, 0, PAGE_SIZE);
   10.46 +	} else {
   10.47 +		gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
   10.48 +		if (unlikely(!gdt)) {
   10.49 +			printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
   10.50 +			for (;;)
   10.51 +				local_irq_enable();
   10.52 +		}
   10.53 +	}
   10.54 +
   10.55 +	/*
   10.56 +	 * Initialize the per-CPU GDT with the boot GDT,
   10.57 +	 * and set up the GDT descriptor:
   10.58 +	 */
   10.59 + 	memcpy(gdt, cpu_gdt_table, GDT_SIZE);
   10.60 +
   10.61 +	/* Set up GDT entry for 16bit stack */
   10.62 + 	*(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
   10.63 +		((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
   10.64 +		((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
   10.65 +		(CPU_16BIT_STACK_SIZE - 1);
   10.66 +
   10.67 +	cpu_gdt_descr->size = GDT_SIZE - 1;
   10.68 + 	cpu_gdt_descr->address = (unsigned long)gdt;
   10.69 +#else
   10.70 +	if (cpu == 0 && cpu_gdt_descr->address == 0) {
   10.71 +		gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
   10.72 +		/* alloc_bootmem_pages panics on failure, so no check */
   10.73 +		memset(gdt, 0, PAGE_SIZE);
   10.74 +
   10.75 +		memcpy(gdt, cpu_gdt_table, GDT_SIZE);
   10.76 +		
   10.77 +		cpu_gdt_descr->size = GDT_SIZE;
   10.78 +		cpu_gdt_descr->address = (unsigned long)gdt;
   10.79 +	}
   10.80 +#endif
   10.81 +
   10.82 +	cpu_gdt_init(cpu_gdt_descr);
   10.83  
   10.84  	/*
   10.85  	 * Set up and load the per-CPU TSS and LDT
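
The ESPFIX patch-up in the hunk above is dense bit manipulation: an x86 segment descriptor scatters its 32-bit base across bits 16..39 and 56..63 of the quadword, with the 16-bit limit in bits 0..15. A standalone sketch of the same packing, using hypothetical base and limit values:

    #include <stdint.h>
    #include <inttypes.h>
    #include <stdio.h>

    /* Merge base[23:0] into bits 16..39, base[31:24] into bits 56..63,
     * and limit[15:0] into bits 0..15 -- the same masks applied to the
     * GDT_ENTRY_ESPFIX_SS entry above. */
    static uint64_t patch_base_limit(uint64_t desc, uint32_t base, uint32_t limit)
    {
        return desc
            | (((uint64_t)base << 16) & 0x000000ffffff0000ULL)
            | (((uint64_t)base << 32) & 0xff00000000000000ULL)
            | (limit & 0xffffULL);
    }

    int main(void)
    {
        /* hypothetical stack base address and a 4KB-1 limit */
        printf("%016" PRIx64 "\n",
               patch_base_limit(0x0000920000000000ULL, 0x12345678, 0xfff));
        return 0;
    }

In the kernel code the limit is CPU_16BIT_STACK_SIZE - 1 and the base is the per-CPU 16-bit stack offset.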
    11.1 --- a/linux-2.6-xen-sparse/arch/i386/kernel/head-xen.S	Wed Mar 01 10:01:54 2006 -0700
    11.2 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/head-xen.S	Wed Mar 01 12:47:25 2006 -0700
    11.3 @@ -87,19 +87,9 @@ ENTRY(empty_zero_page)
    11.4   */
    11.5  .data
    11.6  
    11.7 -	ALIGN
    11.8 -	.word 0				# 32 bit align gdt_desc.address
    11.9 -	.globl cpu_gdt_descr
   11.10 -cpu_gdt_descr:
   11.11 -	.word GDT_SIZE
   11.12 -	.long cpu_gdt_table
   11.13 -
   11.14 -	.fill NR_CPUS-1,8,0		# space for the other GDT descriptors
   11.15 -
   11.16  /*
   11.17   * The Global Descriptor Table contains 28 quadwords, per-CPU.
   11.18   */
   11.19 -	.align PAGE_SIZE_asm
   11.20  ENTRY(cpu_gdt_table)
   11.21  	.quad 0x0000000000000000	/* NULL descriptor */
   11.22  	.quad 0x0000000000000000	/* 0x0b reserved */
   11.23 @@ -148,10 +138,6 @@ ENTRY(cpu_gdt_table)
   11.24  	.quad 0x0000000000000000	/* 0xf0 - unused */
   11.25  	.quad 0x0000000000000000	/* 0xf8 - GDT entry 31: double-fault TSS */
   11.26  
   11.27 -	/* Be sure this is zeroed to avoid false validations in Xen */
   11.28 -	.fill PAGE_SIZE_asm / 8 - GDT_ENTRIES,8,0
   11.29 -
   11.30 -
   11.31  /*
   11.32   * __xen_guest information
   11.33   */
   11.34 @@ -176,6 +162,7 @@ ENTRY(cpu_gdt_table)
   11.35  	.ascii  ",FEATURES=writable_page_tables"
   11.36  	.ascii	         "|writable_descriptor_tables"
   11.37  	.ascii	         "|auto_translated_physmap"
   11.38 +	.ascii	         "|pae_pgdir_above_4gb"
   11.39  	.ascii	         "|supervisor_mode_kernel"
   11.40  #ifdef CONFIG_X86_PAE
   11.41  	.ascii	",PAE=yes"
    12.1 --- a/linux-2.6-xen-sparse/arch/i386/kernel/io_apic-xen.c	Wed Mar 01 10:01:54 2006 -0700
    12.2 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/io_apic-xen.c	Wed Mar 01 12:47:25 2006 -0700
    12.3 @@ -2634,8 +2634,10 @@ int __init io_apic_get_unique_id (int io
    12.4  		spin_unlock_irqrestore(&ioapic_lock, flags);
    12.5  
    12.6  		/* Sanity check */
    12.7 -		if (reg_00.bits.ID != apic_id)
    12.8 -			panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic);
    12.9 +		if (reg_00.bits.ID != apic_id) {
   12.10 +			printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
   12.11 +			return -1;
   12.12 +		}
   12.13  	}
   12.14  
   12.15  	apic_printk(APIC_VERBOSE, KERN_INFO
    13.1 --- a/linux-2.6-xen-sparse/arch/i386/kernel/mpparse-xen.c	Wed Mar 01 10:01:54 2006 -0700
    13.2 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/mpparse-xen.c	Wed Mar 01 12:47:25 2006 -0700
    13.3 @@ -935,6 +935,7 @@ void __init mp_register_ioapic (
    13.4  	u32			gsi_base)
    13.5  {
    13.6  	int			idx = 0;
    13.7 +	int			tmpid;
    13.8  
    13.9  	if (nr_ioapics >= MAX_IO_APICS) {
   13.10  		printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
   13.11 @@ -957,9 +958,14 @@ void __init mp_register_ioapic (
   13.12  	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
   13.13  #endif
   13.14  	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 < 15))
   13.15 -		mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id);
   13.16 +		tmpid = io_apic_get_unique_id(idx, id);
   13.17  	else
   13.18 -		mp_ioapics[idx].mpc_apicid = id;
   13.19 +		tmpid = id;
   13.20 +	if (tmpid == -1) {
   13.21 +		nr_ioapics--;
   13.22 +		return;
   13.23 +	}
   13.24 +	mp_ioapics[idx].mpc_apicid = tmpid;
   13.25  	mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
   13.26  	
   13.27  	/* 
    14.1 --- a/linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c	Wed Mar 01 10:01:54 2006 -0700
    14.2 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c	Wed Mar 01 12:47:25 2006 -0700
    14.3 @@ -898,12 +898,6 @@ static int __devinit do_boot_cpu(int api
    14.4  	unsigned long start_eip;
    14.5  	unsigned short nmi_high = 0, nmi_low = 0;
    14.6  
    14.7 -	if (!cpu_gdt_descr[cpu].address &&
    14.8 -	    !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
    14.9 -		printk("Failed to allocate GDT for CPU %d\n", cpu);
   14.10 -		return 1;
   14.11 -	}
   14.12 -
   14.13  	++cpucount;
   14.14  
   14.15  	/*
    15.1 --- a/linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c	Wed Mar 01 10:01:54 2006 -0700
    15.2 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c	Wed Mar 01 12:47:25 2006 -0700
    15.3 @@ -48,6 +48,8 @@
    15.4  #include <linux/mca.h>
    15.5  #include <linux/sysctl.h>
    15.6  #include <linux/percpu.h>
    15.7 +#include <linux/kernel_stat.h>
    15.8 +#include <linux/posix-timers.h>
    15.9  
   15.10  #include <asm/io.h>
   15.11  #include <asm/smp.h>
   15.12 @@ -70,6 +72,7 @@
   15.13  #include <asm/arch_hooks.h>
   15.14  
   15.15  #include <xen/evtchn.h>
   15.16 +#include <xen/interface/vcpu.h>
   15.17  
   15.18  #if defined (__i386__)
   15.19  #include <asm/i8259.h>
   15.20 @@ -123,6 +126,13 @@ static u32 shadow_tv_version;
   15.21  static u64 processed_system_time;   /* System time (ns) at last processing. */
   15.22  static DEFINE_PER_CPU(u64, processed_system_time);
   15.23  
   15.24 +/* How much CPU time was spent blocked and how much was 'stolen'? */
   15.25 +static DEFINE_PER_CPU(u64, processed_stolen_time);
   15.26 +static DEFINE_PER_CPU(u64, processed_blocked_time);
   15.27 +
   15.28 +/* Current runstate of each CPU (updated automatically by the hypervisor). */
   15.29 +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
   15.30 +
   15.31  /* Must be signed, as it's compared with s64 quantities which can be -ve. */
   15.32  #define NS_PER_TICK (1000000000LL/HZ)
   15.33  
   15.34 @@ -477,14 +487,45 @@ int do_settimeofday(struct timespec *tv)
   15.35  
   15.36  EXPORT_SYMBOL(do_settimeofday);
   15.37  
   15.38 -#ifdef CONFIG_XEN_PRIVILEGED_GUEST
   15.39 +static void sync_xen_wallclock(unsigned long dummy);
   15.40 +static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
   15.41 +static void sync_xen_wallclock(unsigned long dummy)
   15.42 +{
   15.43 +	time_t sec;
   15.44 +	s64 nsec;
   15.45 +	dom0_op_t op;
   15.46 +
   15.47 +	if (!ntp_synced() || independent_wallclock ||
   15.48 +	    !(xen_start_info->flags & SIF_INITDOMAIN))
   15.49 +		return;
   15.50 +
   15.51 +	write_seqlock_irq(&xtime_lock);
   15.52 +
   15.53 +	sec  = xtime.tv_sec;
   15.54 +	nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
   15.55 +	__normalize_time(&sec, &nsec);
   15.56 +
   15.57 +	op.cmd = DOM0_SETTIME;
   15.58 +	op.u.settime.secs        = sec;
   15.59 +	op.u.settime.nsecs       = nsec;
   15.60 +	op.u.settime.system_time = processed_system_time;
   15.61 +	HYPERVISOR_dom0_op(&op);
   15.62 +
   15.63 +	update_wallclock();
   15.64 +
   15.65 +	write_sequnlock_irq(&xtime_lock);
   15.66 +
   15.67 +	/* Once per minute. */
   15.68 +	mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
   15.69 +}
   15.70 +
   15.71  static int set_rtc_mmss(unsigned long nowtime)
   15.72  {
   15.73  	int retval;
   15.74  
   15.75  	WARN_ON(irqs_disabled());
   15.76  
   15.77 -	if (!(xen_start_info->flags & SIF_INITDOMAIN))
   15.78 +	if (independent_wallclock || !(xen_start_info->flags & SIF_INITDOMAIN))
   15.79  		return 0;
   15.80  
   15.81  	/* gets recalled with irq locally disabled */
   15.82 @@ -497,12 +538,6 @@ static int set_rtc_mmss(unsigned long no
   15.83  
   15.84  	return retval;
   15.85  }
   15.86 -#else
   15.87 -static int set_rtc_mmss(unsigned long nowtime)
   15.88 -{
   15.89 -	return 0;
   15.90 -}
   15.91 -#endif
   15.92  
   15.93  /* monotonic_clock(): returns # of nanoseconds passed since time_init()
   15.94   *		Note: This function is required to return accurate
   15.95 @@ -567,19 +602,37 @@ EXPORT_SYMBOL(profile_pc);
   15.96  
   15.97  irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
   15.98  {
   15.99 -	s64 delta, delta_cpu;
  15.100 +	s64 delta, delta_cpu, stolen, blocked;
  15.101 +	u64 sched_time;
  15.102  	int i, cpu = smp_processor_id();
  15.103  	struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
  15.104 +	struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
  15.105  
  15.106  	write_seqlock(&xtime_lock);
  15.107  
  15.108  	do {
  15.109  		get_time_values_from_xen();
  15.110  
  15.111 +		/* Obtain a consistent snapshot of elapsed wallclock cycles. */
  15.112  		delta = delta_cpu = 
  15.113  			shadow->system_timestamp + get_nsec_offset(shadow);
  15.114  		delta     -= processed_system_time;
  15.115  		delta_cpu -= per_cpu(processed_system_time, cpu);
  15.116 +
  15.117 +		/*
  15.118 +		 * Obtain a consistent snapshot of stolen/blocked cycles. We
  15.119 +		 * can use state_entry_time to detect if we get preempted here.
  15.120 +		 */
  15.121 +		do {
  15.122 +			sched_time = runstate->state_entry_time;
  15.123 +			barrier();
  15.124 +			stolen = runstate->time[RUNSTATE_runnable] +
  15.125 +				runstate->time[RUNSTATE_offline] -
  15.126 +				per_cpu(processed_stolen_time, cpu);
  15.127 +			blocked = runstate->time[RUNSTATE_blocked] -
  15.128 +				per_cpu(processed_blocked_time, cpu);
  15.129 +			barrier();
  15.130 +		} while (sched_time != runstate->state_entry_time);
  15.131  	}
  15.132  	while (!time_values_up_to_date(cpu));
  15.133  
  15.134 @@ -612,20 +665,69 @@ irqreturn_t timer_interrupt(int irq, voi
  15.135  	write_sequnlock(&xtime_lock);
  15.136  
  15.137  	/*
  15.138 -         * Local CPU jiffy work. No need to hold xtime_lock, and I'm not sure
  15.139 -         * if there is risk of deadlock if we do (since update_process_times
  15.140 -         * may do scheduler rebalancing work and thus acquire runqueue locks).
  15.141 -         */
  15.142 -	while (delta_cpu >= NS_PER_TICK) {
  15.143 -		delta_cpu -= NS_PER_TICK;
  15.144 -		per_cpu(processed_system_time, cpu) += NS_PER_TICK;
  15.145 -		update_process_times(user_mode(regs));
  15.146 -		profile_tick(CPU_PROFILING, regs);
  15.147 +	 * Account stolen ticks.
  15.148 +	 * HACK: Passing NULL to account_steal_time()
  15.149 +	 * ensures that the ticks are accounted as stolen.
  15.150 +	 */
  15.151 +	if (stolen > 0) {
  15.152 +		delta_cpu -= stolen;
  15.153 +		do_div(stolen, NS_PER_TICK);
  15.154 +		per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
  15.155 +		per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
  15.156 +		account_steal_time(NULL, (cputime_t)stolen);
  15.157  	}
  15.158  
  15.159 +	/*
  15.160 +	 * Account blocked ticks.
  15.161 +	 * HACK: Passing idle_task to account_steal_time()
  15.162 +	 * ensures that the ticks are accounted as idle/wait.
  15.163 +	 */
  15.164 +	if (blocked > 0) {
  15.165 +		delta_cpu -= blocked;
  15.166 +		do_div(blocked, NS_PER_TICK);
  15.167 +		per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
  15.168 +		per_cpu(processed_system_time, cpu)  += blocked * NS_PER_TICK;
  15.169 +		account_steal_time(idle_task(cpu), (cputime_t)blocked);
  15.170 +	}
  15.171 +
  15.172 +	/* Account user/system ticks. */
  15.173 +	if (delta_cpu > 0) {
  15.174 +		do_div(delta_cpu, NS_PER_TICK);
  15.175 +		per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
  15.176 +		if (user_mode(regs))
  15.177 +			account_user_time(current, (cputime_t)delta_cpu);
  15.178 +		else
  15.179 +			account_system_time(current, HARDIRQ_OFFSET,
  15.180 +					    (cputime_t)delta_cpu);
  15.181 +	}
  15.182 +
  15.183 +	/* Local timer processing (see update_process_times()). */
  15.184 +	run_local_timers();
  15.185 +	if (rcu_pending(cpu))
  15.186 +		rcu_check_callbacks(cpu, user_mode(regs));
  15.187 +	scheduler_tick();
  15.188 +	run_posix_cpu_timers(current);
  15.189 +
  15.190  	return IRQ_HANDLED;
  15.191  }
  15.192  
  15.193 +static void init_missing_ticks_accounting(int cpu)
  15.194 +{
  15.195 +	struct vcpu_register_runstate_memory_area area;
  15.196 +	struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
  15.197 +
  15.198 +	memset(runstate, 0, sizeof(*runstate));
  15.199 +
  15.200 +	area.addr.v = runstate;
  15.201 +	HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
  15.202 +
  15.203 +	per_cpu(processed_blocked_time, cpu) =
  15.204 +		runstate->time[RUNSTATE_blocked];
  15.205 +	per_cpu(processed_stolen_time, cpu) =
  15.206 +		runstate->time[RUNSTATE_runnable] +
  15.207 +		runstate->time[RUNSTATE_offline];
  15.208 +}
  15.209 +
  15.210  /* not static: needed by APM */
  15.211  unsigned long get_cmos_time(void)
  15.212  {
  15.213 @@ -691,6 +793,7 @@ static void sync_cmos_clock(unsigned lon
  15.214  void notify_arch_cmos_timer(void)
  15.215  {
  15.216  	mod_timer(&sync_cmos_timer, jiffies + 1);
  15.217 +	mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
  15.218  }
  15.219  
  15.220  static long clock_cmos_diff, sleep_start;
  15.221 @@ -814,6 +917,7 @@ void __init time_init(void)
  15.222  
  15.223  	processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
  15.224  	per_cpu(processed_system_time, 0) = processed_system_time;
  15.225 +	init_missing_ticks_accounting(0);
  15.226  
  15.227  	update_wallclock();
  15.228  
  15.229 @@ -891,6 +995,7 @@ void time_resume(void)
  15.230  
  15.231  	processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
  15.232  	per_cpu(processed_system_time, 0) = processed_system_time;
  15.233 +	init_missing_ticks_accounting(0);
  15.234  
  15.235  	update_wallclock();
  15.236  }
  15.237 @@ -909,6 +1014,7 @@ void local_setup_timer(unsigned int cpu)
  15.238  		/* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
  15.239  		per_cpu(processed_system_time, cpu) = 
  15.240  			per_cpu(shadow_time, 0).system_timestamp;
  15.241 +		init_missing_ticks_accounting(cpu);
  15.242  	} while (read_seqretry(&xtime_lock, seq));
  15.243  
  15.244  	sprintf(timer_name[cpu], "timer%d", cpu);
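
The retry loop added to timer_interrupt() above is a lock-free snapshot: the hypervisor rewrites state_entry_time whenever it updates the runstate area, so re-reading it after the data reads detects a concurrent update, much like the read side of a seqlock. A standalone sketch of the pattern (struct layout and state indices assumed from the vcpu interface):

    #include <stdint.h>

    #define barrier() __asm__ __volatile__("" ::: "memory")

    struct runstate_info {
        volatile uint64_t state_entry_time;  /* rewritten on every update */
        volatile uint64_t time[4];           /* ns spent in each state */
    };

    enum { RS_running, RS_runnable, RS_blocked, RS_offline };

    /* Read "stolen" time (runnable + offline) consistently: retry if
     * the writer updated the area while we were reading it. */
    static uint64_t snapshot_stolen(const struct runstate_info *rs)
    {
        uint64_t version, stolen;

        do {
            version = rs->state_entry_time;
            barrier();
            stolen = rs->time[RS_runnable] + rs->time[RS_offline];
            barrier();
        } while (version != rs->state_entry_time);
        return stolen;
    }

Each delta is then converted to whole ticks with do_div(), and processed_stolen_time advances only by stolen * NS_PER_TICK, so any sub-tick remainder carries over into the next interrupt.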
    16.1 --- a/linux-2.6-xen-sparse/arch/i386/mach-xen/Makefile	Wed Mar 01 10:01:54 2006 -0700
    16.2 +++ b/linux-2.6-xen-sparse/arch/i386/mach-xen/Makefile	Wed Mar 01 12:47:25 2006 -0700
    16.3 @@ -2,6 +2,4 @@
    16.4  # Makefile for the linux kernel.
    16.5  #
    16.6  
    16.7 -obj-y				:= setup.o topology.o
    16.8 -  
    16.9 -topology-y			:= ../mach-default/topology.o
   16.10 +obj-y				:= setup.o
    17.1 --- a/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c	Wed Mar 01 10:01:54 2006 -0700
    17.2 +++ b/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c	Wed Mar 01 12:47:25 2006 -0700
    17.3 @@ -454,6 +454,7 @@ void zap_low_mappings (void)
    17.4  
    17.5  static int disable_nx __initdata = 0;
    17.6  u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
    17.7 +EXPORT_SYMBOL(__supported_pte_mask);
    17.8  
    17.9  /*
   17.10   * noexec = on|off
    18.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    18.2 +++ b/linux-2.6-xen-sparse/arch/i386/mm/pgtable.c	Wed Mar 01 12:47:25 2006 -0700
    18.3 @@ -0,0 +1,283 @@
    18.4 +/*
    18.5 + *  linux/arch/i386/mm/pgtable.c
    18.6 + */
    18.7 +
    18.8 +#include <linux/config.h>
    18.9 +#include <linux/sched.h>
   18.10 +#include <linux/kernel.h>
   18.11 +#include <linux/errno.h>
   18.12 +#include <linux/mm.h>
   18.13 +#include <linux/swap.h>
   18.14 +#include <linux/smp.h>
   18.15 +#include <linux/highmem.h>
   18.16 +#include <linux/slab.h>
   18.17 +#include <linux/pagemap.h>
   18.18 +#include <linux/spinlock.h>
   18.19 +#include <linux/module.h>
   18.20 +
   18.21 +#include <asm/system.h>
   18.22 +#include <asm/pgtable.h>
   18.23 +#include <asm/pgalloc.h>
   18.24 +#include <asm/fixmap.h>
   18.25 +#include <asm/e820.h>
   18.26 +#include <asm/tlb.h>
   18.27 +#include <asm/tlbflush.h>
   18.28 +
   18.29 +void show_mem(void)
   18.30 +{
   18.31 +	int total = 0, reserved = 0;
   18.32 +	int shared = 0, cached = 0;
   18.33 +	int highmem = 0;
   18.34 +	struct page *page;
   18.35 +	pg_data_t *pgdat;
   18.36 +	unsigned long i;
   18.37 +	struct page_state ps;
   18.38 +	unsigned long flags;
   18.39 +
   18.40 +	printk(KERN_INFO "Mem-info:\n");
   18.41 +	show_free_areas();
   18.42 +	printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
   18.43 +	for_each_pgdat(pgdat) {
   18.44 +		pgdat_resize_lock(pgdat, &flags);
   18.45 +		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
   18.46 +			page = pgdat_page_nr(pgdat, i);
   18.47 +			total++;
   18.48 +			if (PageHighMem(page))
   18.49 +				highmem++;
   18.50 +			if (PageReserved(page))
   18.51 +				reserved++;
   18.52 +			else if (PageSwapCache(page))
   18.53 +				cached++;
   18.54 +			else if (page_count(page))
   18.55 +				shared += page_count(page) - 1;
   18.56 +		}
   18.57 +		pgdat_resize_unlock(pgdat, &flags);
   18.58 +	}
   18.59 +	printk(KERN_INFO "%d pages of RAM\n", total);
   18.60 +	printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
   18.61 +	printk(KERN_INFO "%d reserved pages\n", reserved);
   18.62 +	printk(KERN_INFO "%d pages shared\n", shared);
   18.63 +	printk(KERN_INFO "%d pages swap cached\n", cached);
   18.64 +
   18.65 +	get_page_state(&ps);
   18.66 +	printk(KERN_INFO "%lu pages dirty\n", ps.nr_dirty);
   18.67 +	printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback);
   18.68 +	printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped);
   18.69 +	printk(KERN_INFO "%lu pages slab\n", ps.nr_slab);
   18.70 +	printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages);
   18.71 +}
   18.72 +
   18.73 +/*
   18.74 + * Associate a virtual page frame with a given physical page frame 
   18.75 + * and protection flags for that frame.
   18.76 + */ 
   18.77 +static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
   18.78 +{
   18.79 +	pgd_t *pgd;
   18.80 +	pud_t *pud;
   18.81 +	pmd_t *pmd;
   18.82 +	pte_t *pte;
   18.83 +
   18.84 +	pgd = swapper_pg_dir + pgd_index(vaddr);
   18.85 +	if (pgd_none(*pgd)) {
   18.86 +		BUG();
   18.87 +		return;
   18.88 +	}
   18.89 +	pud = pud_offset(pgd, vaddr);
   18.90 +	if (pud_none(*pud)) {
   18.91 +		BUG();
   18.92 +		return;
   18.93 +	}
   18.94 +	pmd = pmd_offset(pud, vaddr);
   18.95 +	if (pmd_none(*pmd)) {
   18.96 +		BUG();
   18.97 +		return;
   18.98 +	}
   18.99 +	pte = pte_offset_kernel(pmd, vaddr);
  18.100 +	/* <pfn,flags> stored as-is, to permit clearing entries */
  18.101 +	set_pte(pte, pfn_pte(pfn, flags));
  18.102 +
  18.103 +	/*
  18.104 +	 * It's enough to flush this one mapping.
  18.105 +	 * (PGE mappings get flushed as well)
  18.106 +	 */
  18.107 +	__flush_tlb_one(vaddr);
  18.108 +}
  18.109 +
  18.110 +/*
  18.111 + * Associate a large virtual page frame with a given physical page frame 
  18.112 + * and protection flags for that frame. pfn is for the base of the page,
  18.113 + * vaddr is what the page gets mapped to - both must be properly aligned. 
  18.114 + * The pmd must already be instantiated. Assumes PAE mode.
  18.115 + */ 
  18.116 +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
  18.117 +{
  18.118 +	pgd_t *pgd;
  18.119 +	pud_t *pud;
  18.120 +	pmd_t *pmd;
  18.121 +
  18.122 +	if (vaddr & (PMD_SIZE-1)) {		/* vaddr is misaligned */
  18.123 +		printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
  18.124 +		return; /* BUG(); */
  18.125 +	}
  18.126 +	if (pfn & (PTRS_PER_PTE-1)) {		/* pfn is misaligned */
  18.127 +		printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
  18.128 +		return; /* BUG(); */
  18.129 +	}
  18.130 +	pgd = swapper_pg_dir + pgd_index(vaddr);
  18.131 +	if (pgd_none(*pgd)) {
  18.132 +		printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
  18.133 +		return; /* BUG(); */
  18.134 +	}
  18.135 +	pud = pud_offset(pgd, vaddr);
  18.136 +	pmd = pmd_offset(pud, vaddr);
  18.137 +	set_pmd(pmd, pfn_pmd(pfn, flags));
  18.138 +	/*
  18.139 +	 * It's enough to flush this one mapping.
  18.140 +	 * (PGE mappings get flushed as well)
  18.141 +	 */
  18.142 +	__flush_tlb_one(vaddr);
  18.143 +}
  18.144 +
  18.145 +static int nr_fixmaps = 0;
  18.146 +unsigned long __FIXADDR_TOP = 0xfffff000;
  18.147 +EXPORT_SYMBOL(__FIXADDR_TOP);
  18.148 +
  18.149 +void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
  18.150 +{
  18.151 +	unsigned long address = __fix_to_virt(idx);
  18.152 +
  18.153 +	if (idx >= __end_of_fixed_addresses) {
  18.154 +		BUG();
  18.155 +		return;
  18.156 +	}
  18.157 +	set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
  18.158 +	nr_fixmaps++;
  18.159 +}
  18.160 +
  18.161 +void set_fixaddr_top(unsigned long top)
  18.162 +{
  18.163 +	BUG_ON(nr_fixmaps > 0);
  18.164 +	__FIXADDR_TOP = top - PAGE_SIZE;
  18.165 +}
  18.166 +
  18.167 +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
  18.168 +{
  18.169 +	return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
  18.170 +}
  18.171 +
  18.172 +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
  18.173 +{
  18.174 +	struct page *pte;
  18.175 +
  18.176 +#ifdef CONFIG_HIGHPTE
  18.177 +	pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
  18.178 +#else
  18.179 +	pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
  18.180 +#endif
  18.181 +	return pte;
  18.182 +}
  18.183 +
  18.184 +void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
  18.185 +{
  18.186 +	memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
  18.187 +}
  18.188 +
  18.189 +/*
  18.190 + * List of all pgd's needed for non-PAE so it can invalidate entries
  18.191 + * in both cached and uncached pgd's; not needed for PAE since the
  18.192 + * kernel pmd is shared. If PAE were not to share the pmd a similar
  18.193 + * tactic would be needed. This is essentially codepath-based locking
  18.194 + * against pageattr.c; it is the unique case in which a valid change
  18.195 + * of kernel pagetables can't be lazily synchronized by vmalloc faults.
  18.196 + * vmalloc faults work because attached pagetables are never freed.
  18.197 + * The locking scheme was chosen on the basis of manfred's
  18.198 + * recommendations and having no core impact whatsoever.
  18.199 + * -- wli
  18.200 + */
  18.201 +DEFINE_SPINLOCK(pgd_lock);
  18.202 +struct page *pgd_list;
  18.203 +
  18.204 +static inline void pgd_list_add(pgd_t *pgd)
  18.205 +{
  18.206 +	struct page *page = virt_to_page(pgd);
  18.207 +	page->index = (unsigned long)pgd_list;
  18.208 +	if (pgd_list)
  18.209 +		set_page_private(pgd_list, (unsigned long)&page->index);
  18.210 +	pgd_list = page;
  18.211 +	set_page_private(page, (unsigned long)&pgd_list);
  18.212 +}
  18.213 +
  18.214 +static inline void pgd_list_del(pgd_t *pgd)
  18.215 +{
  18.216 +	struct page *next, **pprev, *page = virt_to_page(pgd);
  18.217 +	next = (struct page *)page->index;
  18.218 +	pprev = (struct page **)page_private(page);
  18.219 +	*pprev = next;
  18.220 +	if (next)
  18.221 +		set_page_private(next, (unsigned long)pprev);
  18.222 +}
  18.223 +
  18.224 +void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
  18.225 +{
  18.226 +	unsigned long flags;
  18.227 +
  18.228 +	if (PTRS_PER_PMD == 1) {
  18.229 +		memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
  18.230 +		spin_lock_irqsave(&pgd_lock, flags);
  18.231 +	}
  18.232 +
  18.233 +	clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
  18.234 +			swapper_pg_dir + USER_PTRS_PER_PGD,
  18.235 +			KERNEL_PGD_PTRS);
  18.236 +	if (PTRS_PER_PMD > 1)
  18.237 +		return;
  18.238 +
  18.239 +	pgd_list_add(pgd);
  18.240 +	spin_unlock_irqrestore(&pgd_lock, flags);
  18.241 +}
  18.242 +
  18.243 +/* never called when PTRS_PER_PMD > 1 */
  18.244 +void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
  18.245 +{
  18.246 +	unsigned long flags; /* can be called from interrupt context */
  18.247 +
  18.248 +	spin_lock_irqsave(&pgd_lock, flags);
  18.249 +	pgd_list_del(pgd);
  18.250 +	spin_unlock_irqrestore(&pgd_lock, flags);
  18.251 +}
  18.252 +
  18.253 +pgd_t *pgd_alloc(struct mm_struct *mm)
  18.254 +{
  18.255 +	int i;
  18.256 +	pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
  18.257 +
  18.258 +	if (PTRS_PER_PMD == 1 || !pgd)
  18.259 +		return pgd;
  18.260 +
  18.261 +	for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
  18.262 +		pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
  18.263 +		if (!pmd)
  18.264 +			goto out_oom;
  18.265 +		set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
  18.266 +	}
  18.267 +	return pgd;
  18.268 +
  18.269 +out_oom:
  18.270 +	for (i--; i >= 0; i--)
  18.271 +		kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
  18.272 +	kmem_cache_free(pgd_cache, pgd);
  18.273 +	return NULL;
  18.274 +}
  18.275 +
  18.276 +void pgd_free(pgd_t *pgd)
  18.277 +{
  18.278 +	int i;
  18.279 +
  18.280 +	/* in the PAE case user pgd entries are overwritten before usage */
  18.281 +	if (PTRS_PER_PMD > 1)
  18.282 +		for (i = 0; i < USER_PTRS_PER_PGD; ++i)
  18.283 +			kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
  18.284 +	/* in the non-PAE case, free_pgtables() clears user pgd entries */
  18.285 +	kmem_cache_free(pgd_cache, pgd);
  18.286 +}
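
In the PAE paths of pgd_alloc() and pgd_free() above, a pgd entry is built with __pgd(1 + __pa(pmd)) and unwound with __va(pgd_val(pgd[i]) - 1): because the pmd page is page-aligned, adding 1 simply sets the low "present" bit. A standalone sketch of that encode/decode:

    #include <stdint.h>
    #include <assert.h>

    #define PAGE_PRESENT 0x1ULL

    /* Encode: physical address of a page-aligned pmd, plus the present
     * bit.  "1 + phys" and "phys | 1" are identical here because the
     * low bits of a page-aligned address are zero. */
    static uint64_t pgd_entry(uint64_t pmd_phys)
    {
        assert((pmd_phys & 0xfff) == 0);  /* must be page-aligned */
        return pmd_phys + PAGE_PRESENT;
    }

    /* Decode: strip the present bit to recover the pmd address. */
    static uint64_t pgd_entry_to_phys(uint64_t e)
    {
        return e - PAGE_PRESENT;
    }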
    19.1 --- a/linux-2.6-xen-sparse/arch/x86_64/Kconfig	Wed Mar 01 10:01:54 2006 -0700
    19.2 +++ b/linux-2.6-xen-sparse/arch/x86_64/Kconfig	Wed Mar 01 12:47:25 2006 -0700
    19.3 @@ -381,21 +381,6 @@ config HPET_TIMER
    19.4  	  as it is off-chip.  You can find the HPET spec at
    19.5  	  <http://www.intel.com/hardwaredesign/hpetspec.htm>.
    19.6  
    19.7 -config X86_PM_TIMER
    19.8 -	bool "PM timer" if EMBEDDED
    19.9 -	depends on ACPI && !X86_64_XEN
   19.10 -	default y
   19.11 -	help
   19.12 -	  Support the ACPI PM timer for time keeping. This is slow,
   19.13 -	  but is useful on some chipsets without HPET on systems with more
   19.14 -	  than one CPU. On a single processor or single socket multi core
   19.15 -	  system it is normally not required.
   19.16 -	  When the PM timer is active 64bit vsyscalls are disabled
   19.17 -	  and should not be enabled (/proc/sys/kernel/vsyscall64 should
   19.18 -	  not be changed).
   19.19 -	  The kernel selects the PM timer only as a last resort, so it is
   19.20 -	  useful to enable just in case.
   19.21 -
   19.22  config HPET_EMULATE_RTC
   19.23  	bool "Provide RTC interrupt"
   19.24  	depends on HPET_TIMER && RTC=y
   19.25 @@ -640,6 +625,7 @@ source "arch/x86_64/oprofile/Kconfig"
   19.26  
   19.27  config KPROBES
   19.28  	bool "Kprobes (EXPERIMENTAL)"
   19.29 +	depends on EXPERIMENTAL && MODULES
   19.30  	help
   19.31  	  Kprobes allows you to trap at almost any kernel address and
   19.32  	  execute a callback function.  register_kprobe() establishes
    20.1 --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile	Wed Mar 01 10:01:54 2006 -0700
    20.2 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile	Wed Mar 01 12:47:25 2006 -0700
    20.3 @@ -45,7 +45,7 @@ CFLAGS_vsyscall.o		:= $(PROFILING) -g0
    20.4  
    20.5  bootflag-y			+= ../../i386/kernel/bootflag.o
    20.6  cpuid-$(subst m,y,$(CONFIG_X86_CPUID))  += ../../i386/kernel/cpuid.o
    20.7 -topology-y                     += ../../i386/mach-default/topology.o
    20.8 +topology-y                     += ../../i386/kernel/topology.o
    20.9  microcode-$(subst m,y,$(CONFIG_MICROCODE))  += ../../i386/kernel/microcode.o
   20.10  intel_cacheinfo-y		+= ../../i386/kernel/cpu/intel_cacheinfo.o
   20.11  quirks-y			+= ../../i386/kernel/quirks.o
    21.1 --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/apic-xen.c	Wed Mar 01 10:01:54 2006 -0700
    21.2 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/apic-xen.c	Wed Mar 01 12:47:25 2006 -0700
    21.3 @@ -114,6 +114,8 @@ void smp_apic_timer_interrupt(struct pt_
    21.4  	irq_exit();
    21.5  }
    21.6  
    21.7 +int __initdata unsync_tsc_on_multicluster;
    21.8 +
    21.9  /*
   21.10   * This interrupt should _never_ happen with our APIC/SMP architecture
   21.11   */
    22.1 --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/entry-xen.S	Wed Mar 01 10:01:54 2006 -0700
    22.2 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/entry-xen.S	Wed Mar 01 12:47:25 2006 -0700
    22.3 @@ -51,6 +51,7 @@
    22.4  #include <asm/page.h>
    22.5  #include <asm/errno.h>
    22.6  #include <xen/interface/arch-x86_64.h>
    22.7 +#include <xen/interface/features.h>
    22.8  
    22.9  #include "irq_vectors.h"
   22.10  
   22.11 @@ -146,16 +147,19 @@ NMI_MASK = 0x80000000
   22.12           */
   22.13  	.macro HYPERVISOR_IRET flag
   22.14  	testb $3,1*8(%rsp)
   22.15 -	jnz   1f
   22.16 +	jnz   2f
   22.17  	testl $NMI_MASK,2*8(%rsp)
   22.18 +	jnz   2f
   22.19 +
   22.20 +	testb $1,(xen_features+XENFEAT_supervisor_mode_kernel)
   22.21  	jnz   1f
   22.22  
   22.23  	/* Direct iret to kernel space. Correct CS and SS. */
   22.24  	orb   $3,1*8(%rsp)
   22.25  	orb   $3,4*8(%rsp)
   22.26 -	iretq
   22.27 +1:	iretq
   22.28  
   22.29 -1:	/* Slow iret via hypervisor. */
   22.30 +2:	/* Slow iret via hypervisor. */
   22.31  	andl  $~NMI_MASK, 16(%rsp)
   22.32  	pushq $\flag
   22.33  	jmp  hypercall_page + (__HYPERVISOR_iret * 32)
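
The new testb in HYPERVISOR_IRET relies on xen_features being laid out as one byte per feature flag, so assembly can test a flag with a single instruction and no shifting or masking. A sketch of how such a byte map would be filled from a 32-bit feature bitmap (array size and unpacking assumed from xen/interface/features.h):

    #include <stdint.h>

    /* One byte per feature: xen_features[XENFEAT_foo] is 0 or 1, which
     * is what lets the asm above do
     *     testb $1, (xen_features + XENFEAT_supervisor_mode_kernel)
     */
    uint8_t xen_features[32];

    static void unpack_feature_submap(uint32_t submap)
    {
        int i;

        for (i = 0; i < 32; i++)
            xen_features[i] = (submap >> i) & 1;
    }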
    23.1 --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c	Wed Mar 01 10:01:54 2006 -0700
    23.2 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c	Wed Mar 01 12:47:25 2006 -0700
    23.3 @@ -51,6 +51,8 @@ static int no_timer_check;
    23.4  int disable_timer_pin_1 __initdata;
    23.5  
    23.6  #ifndef CONFIG_XEN
    23.7 +int timer_over_8254 __initdata = 1;
    23.8 +
    23.9  /* Where if anywhere is the i8259 connect in external int mode */
   23.10  static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
   23.11  #endif
   23.12 @@ -301,6 +303,22 @@ static int __init enable_ioapic_setup(ch
   23.13  __setup("noapic", disable_ioapic_setup);
   23.14  __setup("apic", enable_ioapic_setup);
   23.15  
   23.16 +#ifndef CONFIG_XEN
   23.17 +static int __init setup_disable_8254_timer(char *s)
   23.18 +{
   23.19 +	timer_over_8254 = -1;
   23.20 +	return 1;
   23.21 +}
   23.22 +static int __init setup_enable_8254_timer(char *s)
   23.23 +{
   23.24 +	timer_over_8254 = 2;
   23.25 +	return 1;
   23.26 +}
   23.27 +
   23.28 +__setup("disable_8254_timer", setup_disable_8254_timer);
   23.29 +__setup("enable_8254_timer", setup_enable_8254_timer);
   23.30 +#endif /* !CONFIG_XEN */
   23.31 +
   23.32  #include <asm/pci-direct.h>
   23.33  #include <linux/pci_ids.h>
   23.34  #include <linux/pci.h>
   23.35 @@ -360,28 +378,21 @@ void __init check_ioapic(void)
   23.36  					/* RED-PEN skip them on mptables too? */
   23.37  					return;
   23.38  				case PCI_VENDOR_ID_ATI:
   23.39 +
   23.40 +				/* This should be actually default, but
   23.41 +				   for 2.6.16 let's do it for ATI only where
   23.42 +				   it's really needed. */
   23.43  #ifndef CONFIG_XEN
   23.44 -					if (apic_runs_main_timer != 0)
   23.45 -						break;
   23.46 -#ifdef CONFIG_ACPI
   23.47 -					/* Don't do this for laptops right
   23.48 -					   right now because their timer
   23.49 -					   doesn't necessarily tick in C2/3 */
   23.50 -					if (acpi_fadt.revision >= 3 &&
   23.51 -			(acpi_fadt.plvl2_lat + acpi_fadt.plvl3_lat) < 1100) {
   23.52 -						printk(KERN_INFO
   23.53 -"ATI board detected, but seems to be a laptop. Timer might be shakey, sorry\n");
   23.54 -						break;
   23.55 -					}
   23.56 -#endif					
   23.57 +					if (timer_over_8254 == 1) {	
   23.58 +						timer_over_8254 = 0;	
   23.59  					printk(KERN_INFO
   23.60 -	     "ATI board detected. Using APIC/PM timer.\n");
   23.61 -					apic_runs_main_timer = 1;
   23.62 -					nohpet = 1;
   23.63 +		"ATI board detected. Disabling timer routing over 8254.\n");
   23.64 +					}	
   23.65  #endif
   23.66  					return;
   23.67  				} 
   23.68  
   23.69 +
   23.70  				/* No multi-function device? */
   23.71  				type = read_pci_config_byte(num,slot,func,
   23.72  							    PCI_HEADER_TYPE);
   23.73 @@ -1848,6 +1859,8 @@ static inline void unlock_ExtINT_logic(v
   23.74   * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
   23.75   * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
   23.76   * fanatically on his truly buggy board.
   23.77 + *
   23.78 + * FIXME: really need to revamp this for modern platforms only.
   23.79   */
   23.80  static inline void check_timer(void)
   23.81  {
   23.82 @@ -1870,7 +1883,8 @@ static inline void check_timer(void)
   23.83  	 */
   23.84  	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
   23.85  	init_8259A(1);
   23.86 -	enable_8259A_irq(0);
   23.87 +	if (timer_over_8254 > 0)
   23.88 +		enable_8259A_irq(0);
   23.89  
   23.90  	pin1  = find_isa_irq_pin(0, mp_INT);
   23.91  	apic1 = find_isa_irq_apic(0, mp_INT);
   23.92 @@ -1925,7 +1939,7 @@ static inline void check_timer(void)
   23.93  	}
   23.94  	printk(" failed.\n");
   23.95  
   23.96 -	if (nmi_watchdog) {
   23.97 +	if (nmi_watchdog == NMI_IO_APIC) {
   23.98  		printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
   23.99  		nmi_watchdog = 0;
  23.100  	}
    24.1 --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c	Wed Mar 01 10:01:54 2006 -0700
    24.2 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c	Wed Mar 01 12:47:25 2006 -0700
    24.3 @@ -462,6 +462,12 @@ static __init void parse_cmdline_early (
    24.4  		else if(!memcmp(from, "elfcorehdr=", 11))
    24.5  			elfcorehdr_addr = memparse(from+11, &from);
    24.6  #endif
    24.7 +
    24.8 +#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN)
    24.9 +		else if (!memcmp(from, "additional_cpus=", 16))
   24.10 +			setup_additional_cpus(from+16);
   24.11 +#endif
   24.12 +
   24.13  	next_char:
   24.14  		c = *(from++);
   24.15  		if (!c)
    25.1 --- a/linux-2.6-xen-sparse/drivers/acpi/Kconfig	Wed Mar 01 10:01:54 2006 -0700
    25.2 +++ b/linux-2.6-xen-sparse/drivers/acpi/Kconfig	Wed Mar 01 12:47:25 2006 -0700
    25.3 @@ -247,7 +247,7 @@ config ACPI_CUSTOM_DSDT_FILE
    25.4  	  Enter the full path name to the file wich includes the AmlCode declaration.
    25.5  
    25.6  config ACPI_BLACKLIST_YEAR
    25.7 -	int "Disable ACPI for systems before Jan 1st this year" if X86
    25.8 +	int "Disable ACPI for systems before Jan 1st this year" if X86_32
    25.9  	default 0
   25.10  	help
   25.11  	  enter a 4-digit year, eg. 2001 to disable ACPI by default
   25.12 @@ -285,9 +285,9 @@ config ACPI_SYSTEM
   25.13  	  dump your ACPI DSDT table using /proc/acpi/dsdt.
   25.14  
   25.15  config X86_PM_TIMER
   25.16 -	bool "Power Management Timer Support"
   25.17 +	bool "Power Management Timer Support" if EMBEDDED
   25.18  	depends on X86
   25.19 -	depends on !X86_64
   25.20 +	depends on !XEN
   25.21  	default y
   25.22  	help
   25.23  	  The Power Management Timer is available on all ACPI-capable,
   25.24 @@ -298,9 +298,8 @@ config X86_PM_TIMER
   25.25  	  voltage scaling, unlike the commonly used Time Stamp Counter
   25.26  	  (TSC) timing source.
   25.27  
   25.28 -	  So, if you see messages like 'Losing too many ticks!' in the
   25.29 -	  kernel logs, and/or you are using this on a notebook which
   25.30 -	  does not yet have an HPET, you should say "Y" here.
   25.31 +	  You should nearly always say Y here because many modern
   25.32 +	  systems require this timer. 
   25.33  
   25.34  config ACPI_CONTAINER
   25.35  	tristate "ACPI0004,PNP0A05 and PNP0A06 Container Driver (EXPERIMENTAL)"
    26.1 --- a/linux-2.6-xen-sparse/drivers/video/Kconfig	Wed Mar 01 10:01:54 2006 -0700
    26.2 +++ b/linux-2.6-xen-sparse/drivers/video/Kconfig	Wed Mar 01 12:47:25 2006 -0700
    26.3 @@ -520,7 +520,7 @@ config FB_GBE
    26.4  config FB_GBE_MEM
    26.5  	int "Video memory size in MB"
    26.6  	depends on FB_GBE
    26.7 -	default 8
    26.8 +	default 4
    26.9  	help
   26.10  	  This is the amount of memory reserved for the framebuffer,
   26.11  	  which can be any value between 1MB and 8MB.
    27.1 --- a/linux-2.6-xen-sparse/drivers/xen/Kconfig	Wed Mar 01 10:01:54 2006 -0700
    27.2 +++ b/linux-2.6-xen-sparse/drivers/xen/Kconfig	Wed Mar 01 12:47:25 2006 -0700
    27.3 @@ -68,7 +68,7 @@ config XEN_PCIDEV_BE_DEBUG
    27.4  	default n
    27.5  
    27.6  config XEN_BLKDEV_BACKEND
    27.7 -	bool "Block-device backend driver"
    27.8 +	tristate "Block-device backend driver"
    27.9  	default y
   27.10  	help
   27.11  	  The block-device backend driver allows the kernel to export its
   27.12 @@ -76,7 +76,7 @@ config XEN_BLKDEV_BACKEND
   27.13  	  interface.
   27.14  
   27.15  config XEN_BLKDEV_TAP_BE
   27.16 -        bool "Block Tap support for backend driver (DANGEROUS)"
   27.17 +        tristate "Block Tap support for backend driver (DANGEROUS)"
   27.18          depends on XEN_BLKDEV_BACKEND
   27.19          default n
   27.20          help
   27.21 @@ -89,7 +89,7 @@ config XEN_BLKDEV_TAP_BE
   27.22            modified to use grant tables.
   27.23  
   27.24  config XEN_NETDEV_BACKEND
   27.25 -	bool "Network-device backend driver"
   27.26 +	tristate "Network-device backend driver"
   27.27  	default y
   27.28  	help
   27.29  	  The network-device backend driver allows the kernel to export its
   27.30 @@ -109,8 +109,16 @@ config XEN_NETDEV_PIPELINED_TRANSMITTER
   27.31  	  are unsure; or if you experience network hangs when this option is
   27.32  	  enabled; then you must say N here.
   27.33  
   27.34 +config XEN_NETDEV_LOOPBACK
   27.35 +	tristate "Network-device loopback driver"
   27.36 +	depends on XEN_NETDEV_BACKEND
   27.37 +	default y
   27.38 +	help
   27.39 +	  A two-interface loopback device to emulate a local netfront-netback
   27.40 +	  connection.
   27.41 +
   27.42  config XEN_TPMDEV_BACKEND
   27.43 -	bool "TPM-device backend driver"
   27.44 +	tristate "TPM-device backend driver"
   27.45  	default n
   27.46  	help
   27.47  	  The TPM-device backend driver
   27.48 @@ -145,7 +153,7 @@ config XEN_NETDEV_FRONTEND
   27.49  	  (domain 0), then you almost certainly want to say Y here.
   27.50  
   27.51  config XEN_BLKDEV_TAP
   27.52 -	bool "Block device tap driver"
   27.53 +	tristate "Block device tap driver"
   27.54  	default n
   27.55  	help
   27.56  	  This driver allows a VM to interact on block device channels
   27.57 @@ -154,7 +162,7 @@ config XEN_BLKDEV_TAP
   27.58  	  space.  Odds are that you want to say N here.
   27.59  
   27.60  config XEN_TPMDEV_FRONTEND
   27.61 -	bool "TPM-device frontend driver"
   27.62 +	tristate "TPM-device frontend driver"
   27.63  	default n
   27.64  	select TCG_TPM
   27.65  	select TCG_XEN
    28.1 --- a/linux-2.6-xen-sparse/drivers/xen/blkback/Makefile	Wed Mar 01 10:01:54 2006 -0700
    28.2 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/Makefile	Wed Mar 01 12:47:25 2006 -0700
    28.3 @@ -1,2 +1,3 @@
    28.4 +obj-$(CONFIG_XEN_BLKDEV_BACKEND) := blkbk.o
    28.5  
    28.6 -obj-y	:= blkback.o xenbus.o interface.o vbd.o
    28.7 +blkbk-y	:= blkback.o xenbus.o interface.o vbd.o
    29.1 --- a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c	Wed Mar 01 10:01:54 2006 -0700
    29.2 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c	Wed Mar 01 12:47:25 2006 -0700
    29.3 @@ -29,14 +29,10 @@
    29.4   * 64 should be enough to keep us competitive with Linux.
    29.5   */
    29.6  static int blkif_reqs = 64;
    29.7 -static int mmap_pages;
    29.8 +module_param_named(reqs, blkif_reqs, int, 0);
    29.9 +MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
   29.10  
   29.11 -static int __init set_blkif_reqs(char *str)
   29.12 -{
   29.13 -	get_option(&str, &blkif_reqs);
   29.14 -	return 1;
   29.15 -}
   29.16 -__setup("blkif_reqs=", set_blkif_reqs);
   29.17 +static int mmap_pages;
   29.18  
   29.19  /* Run-time switchable: /sys/module/blkback/parameters/ */
   29.20  static unsigned int log_stats = 0;
   29.21 @@ -574,10 +570,20 @@ static int __init blkif_init(void)
   29.22  		list_add_tail(&pending_reqs[i].free_list, &pending_free);
   29.23      
   29.24  	blkif_xenbus_init();
   29.25 +	__unsafe(THIS_MODULE);
   29.26  	return 0;
   29.27  }
   29.28  
   29.29 -__initcall(blkif_init);
   29.30 +module_init(blkif_init);
   29.31 +
   29.32 +static void blkif_exit(void)
   29.33 +{
   29.34 +	BUG();
   29.35 +}
   29.36 +
   29.37 +module_exit(blkif_exit);
   29.38 +
   29.39 +MODULE_LICENSE("Dual BSD/GPL");
   29.40  
   29.41  /*
   29.42   * Local variables:
    30.1 --- a/linux-2.6-xen-sparse/drivers/xen/core/skbuff.c	Wed Mar 01 10:01:54 2006 -0700
    30.2 +++ b/linux-2.6-xen-sparse/drivers/xen/core/skbuff.c	Wed Mar 01 12:47:25 2006 -0700
    30.3 @@ -16,6 +16,7 @@
    30.4  
    30.5  /* Referenced in netback.c. */
    30.6  /*static*/ kmem_cache_t *skbuff_cachep;
    30.7 +EXPORT_SYMBOL(skbuff_cachep);
    30.8  
    30.9  #define MAX_SKBUFF_ORDER 4
   30.10  static kmem_cache_t *skbuff_order_cachep[MAX_SKBUFF_ORDER + 1];
    31.1 --- a/linux-2.6-xen-sparse/drivers/xen/core/smpboot.c	Wed Mar 01 10:01:54 2006 -0700
    31.2 +++ b/linux-2.6-xen-sparse/drivers/xen/core/smpboot.c	Wed Mar 01 12:47:25 2006 -0700
    31.3 @@ -150,6 +150,11 @@ void vcpu_prepare(int vcpu)
    31.4  {
    31.5  	vcpu_guest_context_t ctxt;
    31.6  	struct task_struct *idle = idle_task(vcpu);
    31.7 +#ifdef __x86_64__
    31.8 +	struct desc_ptr *gdt_descr = &cpu_gdt_descr[vcpu];
    31.9 +#else
   31.10 +	struct Xgt_desc_struct *gdt_descr = &per_cpu(cpu_gdt_descr, vcpu);
   31.11 +#endif
   31.12  
   31.13  	if (vcpu == 0)
   31.14  		return;
   31.15 @@ -171,8 +176,8 @@ void vcpu_prepare(int vcpu)
   31.16  
   31.17  	ctxt.ldt_ents = 0;
   31.18  
   31.19 -	ctxt.gdt_frames[0] = virt_to_mfn(cpu_gdt_descr[vcpu].address);
   31.20 -	ctxt.gdt_ents      = cpu_gdt_descr[vcpu].size / 8;
   31.21 +	ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address);
   31.22 +	ctxt.gdt_ents      = gdt_descr->size / 8;
   31.23  
   31.24  #ifdef __i386__
   31.25  	ctxt.user_regs.cs = __KERNEL_CS;
   31.26 @@ -210,6 +215,11 @@ void __init smp_prepare_cpus(unsigned in
   31.27  {
   31.28  	int cpu;
   31.29  	struct task_struct *idle;
   31.30 +#ifdef __x86_64__
   31.31 +	struct desc_ptr *gdt_descr;
   31.32 +#else
   31.33 +	struct Xgt_desc_struct *gdt_descr;
   31.34 +#endif
   31.35  
   31.36  	cpu_data[0] = boot_cpu_data;
   31.37  
   31.38 @@ -226,6 +236,22 @@ void __init smp_prepare_cpus(unsigned in
   31.39  		if (cpu == 0)
   31.40  			continue;
   31.41  
   31.42 +#ifdef __x86_64__
   31.43 +		gdt_descr = &cpu_gdt_descr[cpu];
   31.44 +#else
   31.45 +		gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
   31.46 +#endif
   31.47 +		gdt_descr->address = get_zeroed_page(GFP_KERNEL);
   31.48 +		if (unlikely(!gdt_descr->address)) {
   31.49 +			printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
   31.50 +			continue;
   31.51 +		}
   31.52 +		gdt_descr->size = GDT_SIZE;
   31.53 +		memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
   31.54 +		make_page_readonly(
   31.55 +			(void *)gdt_descr->address,
   31.56 +			XENFEAT_writable_descriptor_tables);
   31.57 +
   31.58  		cpu_data[cpu] = boot_cpu_data;
   31.59  		cpu_2_logical_apicid[cpu] = cpu;
   31.60  		x86_cpu_to_apicid[cpu] = cpu;
   31.61 @@ -242,17 +268,6 @@ void __init smp_prepare_cpus(unsigned in
   31.62  
   31.63  		irq_ctx_init(cpu);
   31.64  
   31.65 -		cpu_gdt_descr[cpu].address =
   31.66 -			__get_free_page(GFP_KERNEL|__GFP_ZERO);
   31.67 -		BUG_ON(cpu_gdt_descr[0].size > PAGE_SIZE);
   31.68 -		cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size;
   31.69 -		memcpy((void *)cpu_gdt_descr[cpu].address,
   31.70 -		       (void *)cpu_gdt_descr[0].address,
   31.71 -		       cpu_gdt_descr[0].size);
   31.72 -		make_page_readonly(
   31.73 -			(void *)cpu_gdt_descr[cpu].address,
   31.74 -			XENFEAT_writable_descriptor_tables);
   31.75 -
   31.76  #ifdef CONFIG_HOTPLUG_CPU
   31.77  		if (xen_start_info->flags & SIF_INITDOMAIN)
   31.78  			cpu_set(cpu, cpu_present_map);
    32.1 --- a/linux-2.6-xen-sparse/drivers/xen/net_driver_util.c	Wed Mar 01 10:01:54 2006 -0700
    32.2 +++ b/linux-2.6-xen-sparse/drivers/xen/net_driver_util.c	Wed Mar 01 12:47:25 2006 -0700
    32.3 @@ -30,6 +30,7 @@
    32.4  
    32.5  #include <linux/if_ether.h>
    32.6  #include <linux/err.h>
    32.7 +#include <linux/module.h>
    32.8  #include <xen/net_driver_util.h>
    32.9  
   32.10  
   32.11 @@ -54,7 +55,7 @@ int xen_net_read_mac(struct xenbus_devic
   32.12  	kfree(macstr);
   32.13  	return 0;
   32.14  }
   32.15 -
   32.16 +EXPORT_SYMBOL(xen_net_read_mac);
   32.17  
   32.18  /*
   32.19   * Local variables:
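
xen_net_read_mac(), newly exported above, reads the device's "mac" node from xenstore as a string (the macstr freed at the end of the hunk) and fills in the six address bytes. The xenbus plumbing is not shown in this hunk; a standalone sketch of just the parse step, assuming the usual colon-separated format:

    #include <stdio.h>
    #include <stdint.h>

    /* Parse "aa:bb:cc:dd:ee:ff" into six bytes; returns 0 on success. */
    static int parse_mac(const char *s, uint8_t mac[6])
    {
        unsigned v[6];
        int i;

        if (sscanf(s, "%x:%x:%x:%x:%x:%x",
                   &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]) != 6)
            return -1;
        for (i = 0; i < 6; i++)
            mac[i] = (uint8_t)v[i];
        return 0;
    }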
    33.1 --- a/linux-2.6-xen-sparse/drivers/xen/netback/Makefile	Wed Mar 01 10:01:54 2006 -0700
    33.2 +++ b/linux-2.6-xen-sparse/drivers/xen/netback/Makefile	Wed Mar 01 12:47:25 2006 -0700
    33.3 @@ -1,2 +1,5 @@
    33.4 +obj-$(CONFIG_XEN_NETDEV_BACKEND) := netbk.o
    33.5 +obj-$(CONFIG_XEN_NETDEV_LOOPBACK) += netloop.o
    33.6  
    33.7 -obj-y	:= netback.o xenbus.o interface.o loopback.o
    33.8 +netbk-y   := netback.o xenbus.o interface.o
    33.9 +netloop-y := loopback.o
    34.1 --- a/linux-2.6-xen-sparse/drivers/xen/netback/loopback.c	Wed Mar 01 10:01:54 2006 -0700
    34.2 +++ b/linux-2.6-xen-sparse/drivers/xen/netback/loopback.c	Wed Mar 01 12:47:25 2006 -0700
    34.3 @@ -178,6 +178,23 @@ static int __init make_loopback(int i)
    34.4  	return err;
    34.5  }
    34.6  
    34.7 +static void __init clean_loopback(int i)
    34.8 +{
    34.9 +	struct net_device *dev1, *dev2;
   34.10 +	char dev_name[IFNAMSIZ];
   34.11 +
   34.12 +	sprintf(dev_name, "vif0.%d", i);
   34.13 +	dev1 = dev_get_by_name(dev_name);
   34.14 +	sprintf(dev_name, "veth%d", i);
   34.15 +	dev2 = dev_get_by_name(dev_name);
   34.16 +	if (dev1 && dev2) {
   34.17 +		unregister_netdev(dev2);
   34.18 +		unregister_netdev(dev1);
   34.19 +		free_netdev(dev2);
   34.20 +		free_netdev(dev1);
   34.21 +	}
   34.22 +}
   34.23 +
   34.24  static int __init loopback_init(void)
   34.25  {
   34.26  	int i, err = 0;
   34.27 @@ -191,6 +208,18 @@ static int __init loopback_init(void)
   34.28  
   34.29  module_init(loopback_init);
   34.30  
   34.31 +static void __exit loopback_exit(void)
   34.32 +{
   34.33 +	int i;
   34.34 +
   34.35 +	for (i = nloopbacks; i-- > 0; )
   34.36 +		clean_loopback(i);
   34.37 +}
   34.38 +
   34.39 +module_exit(loopback_exit);
   34.40 +
   34.41 +MODULE_LICENSE("Dual BSD/GPL");
   34.42 +
   34.43  /*
   34.44   * Local variables:
   34.45   *  c-file-style: "linux"
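
Editor's note: with the loopback driver split out as its own netloop module (see the Makefile change above), it gains a teardown path: clean_loopback() unwinds one vif0.N/vethN pair, and loopback_exit() iterates with the `i-- > 0` countdown so pairs are destroyed in the reverse of their creation order. One point a reviewer might double-check: dev_get_by_name() returns the device with its reference count raised, and no matching dev_put() appears above. The countdown idiom itself, in standalone form:

#include <stdio.h>

int main(void)
{
	int n = 4, i;

	/* Visits n-1, n-2, ..., 0 -- the reverse of creation order -- and
	 * is safe for n == 0, since the body then never runs. */
	for (i = n; i-- > 0; )
		printf("cleaning pair %d\n", i);
	return 0;
}
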
    35.1 --- a/linux-2.6-xen-sparse/drivers/xen/netback/netback.c	Wed Mar 01 10:01:54 2006 -0700
    35.2 +++ b/linux-2.6-xen-sparse/drivers/xen/netback/netback.c	Wed Mar 01 12:47:25 2006 -0700
    35.3 @@ -505,14 +505,12 @@ static void net_tx_action(unsigned long 
    35.4  			/* Still too big to send right now? Set a callback. */
    35.5  			if (txreq.size > netif->remaining_credit) {
    35.6  				netif->remaining_credit = 0;
    35.7 -				netif->credit_timeout.expires  = 
    35.8 -					next_credit;
    35.9  				netif->credit_timeout.data     =
   35.10  					(unsigned long)netif;
   35.11  				netif->credit_timeout.function =
   35.12  					tx_credit_callback;
   35.13 -				add_timer_on(&netif->credit_timeout,
   35.14 -					     smp_processor_id());
   35.15 +				__mod_timer(&netif->credit_timeout,
   35.16 +					    next_credit);
   35.17  				break;
   35.18  			}
   35.19  		}
   35.20 @@ -811,6 +809,8 @@ static int __init netback_init(void)
   35.21  		&netif_be_dbg);
   35.22  #endif
   35.23  
   35.24 +	__unsafe(THIS_MODULE);
   35.25 +
   35.26  	return 0;
   35.27  }
   35.28  
   35.29 @@ -822,6 +822,8 @@ static void netback_cleanup(void)
   35.30  module_init(netback_init);
   35.31  module_exit(netback_cleanup);
   35.32  
   35.33 +MODULE_LICENSE("Dual BSD/GPL");
   35.34 +
   35.35  /*
   35.36   * Local variables:
   35.37   *  c-file-style: "linux"
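
Editor's note: two things happen in netback.c. First, the credit-limit path stops pairing add_timer_on() with a manually set expiry and reschedules through __mod_timer() instead, apparently so the callback can be re-armed even when the timer is still pending, and without pinning it to the current CPU. Second, netback_init() adds __unsafe(THIS_MODULE), flagging the module as not cleanly unloadable. A sketch of the re-arming pattern using the public mod_timer() (kernel context assumed; the wrapper is illustrative):

#include <linux/timer.h>

/* Re-arm a possibly pending one-shot timer; mod_timer() handles both
 * the pending and non-pending cases, unlike add_timer(). */
static void rearm_credit_timer(struct timer_list *timer,
			       void (*fn)(unsigned long),
			       unsigned long data, unsigned long expires)
{
	timer->data     = data;
	timer->function = fn;
	mod_timer(timer, expires);
}
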
    36.1 --- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c	Wed Mar 01 10:01:54 2006 -0700
    36.2 +++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c	Wed Mar 01 12:47:25 2006 -0700
    36.3 @@ -114,6 +114,7 @@ struct netfront_info
    36.4  
    36.5  	/* Receive-ring batched refills. */
    36.6  #define RX_MIN_TARGET 8
    36.7 +#define RX_DFL_MIN_TARGET 64
    36.8  #define RX_MAX_TARGET NET_RX_RING_SIZE
    36.9  	int rx_min_target, rx_max_target, rx_target;
   36.10  	struct sk_buff_head rx_batch;
   36.11 @@ -1102,8 +1103,8 @@ static int create_netdev(int handle, str
   36.12  	spin_lock_init(&np->rx_lock);
   36.13  
   36.14  	skb_queue_head_init(&np->rx_batch);
   36.15 -	np->rx_target     = RX_MIN_TARGET;
   36.16 -	np->rx_min_target = RX_MIN_TARGET;
   36.17 +	np->rx_target     = RX_DFL_MIN_TARGET;
   36.18 +	np->rx_min_target = RX_DFL_MIN_TARGET;
   36.19  	np->rx_max_target = RX_MAX_TARGET;
   36.20  
   36.21  	init_timer(&np->rx_refill_timer);
    37.1 --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/common.h	Wed Mar 01 10:01:54 2006 -0700
    37.2 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/common.h	Wed Mar 01 12:47:25 2006 -0700
    37.3 @@ -54,9 +54,11 @@ typedef struct tpmif_st {
    37.4  void tpmif_disconnect_complete(tpmif_t * tpmif);
    37.5  tpmif_t *tpmif_find(domid_t domid, long int instance);
    37.6  void tpmif_interface_init(void);
    37.7 +void tpmif_interface_exit(void);
    37.8  void tpmif_schedule_work(tpmif_t * tpmif);
    37.9  void tpmif_deschedule_work(tpmif_t * tpmif);
   37.10  void tpmif_xenbus_init(void);
   37.11 +void tpmif_xenbus_exit(void);
   37.12  int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn);
   37.13  irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs);
   37.14  int tpmif_vtpm_open(tpmif_t *tpmif, domid_t domain, u32 instance);
    38.1 --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c	Wed Mar 01 10:01:54 2006 -0700
    38.2 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c	Wed Mar 01 12:47:25 2006 -0700
    38.3 @@ -186,6 +186,12 @@ tpmif_interface_init(void)
    38.4  					 0, 0, NULL, NULL);
    38.5  }
    38.6  
     38.7 +void __exit
    38.8 +tpmif_interface_exit(void)
    38.9 +{
   38.10 +	kmem_cache_destroy(tpmif_cachep);
   38.11 +}
   38.12 +
   38.13  /*
   38.14   * Local variables:
   38.15   *  c-file-style: "linux"
    39.1 --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c	Wed Mar 01 10:01:54 2006 -0700
    39.2 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c	Wed Mar 01 12:47:25 2006 -0700
     39.3 @@ -1092,7 +1092,19 @@ tpmback_init(void)
    39.4  	return 0;
    39.5  }
    39.6  
    39.7 -__initcall(tpmback_init);
    39.8 +module_init(tpmback_init);
    39.9 +
   39.10 +static void __exit
   39.11 +tpmback_exit(void)
   39.12 +{
   39.14 +	tpmif_xenbus_exit();
   39.15 +	tpmif_interface_exit();
   39.16 +	misc_deregister(&ibmvtpms_miscdevice);
   39.17 +}
   39.18 +
   39.19 +module_exit(tpmback_exit);
   39.20 +
   39.21 +MODULE_LICENSE("Dual BSD/GPL");
   39.22  
   39.23  /*
   39.24   * Local variables:
    40.1 --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c	Wed Mar 01 10:01:54 2006 -0700
    40.2 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c	Wed Mar 01 12:47:25 2006 -0700
    40.3 @@ -317,6 +317,11 @@ void tpmif_xenbus_init(void)
    40.4  	xenbus_register_backend(&tpmback);
    40.5  }
    40.6  
    40.7 +void tpmif_xenbus_exit(void)
    40.8 +{
    40.9 +	xenbus_unregister_driver(&tpmback);
   40.10 +}
   40.11 +
   40.12  /*
   40.13   * Local variables:
   40.14   *  c-file-style: "linux"
    41.1 --- a/linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c	Wed Mar 01 10:01:54 2006 -0700
    41.2 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c	Wed Mar 01 12:47:25 2006 -0700
    41.3 @@ -480,6 +480,11 @@ static void __init init_tpm_xenbus(void)
    41.4  	xenbus_register_frontend(&tpmfront);
    41.5  }
    41.6  
    41.7 +static void __exit exit_tpm_xenbus(void)
    41.8 +{
    41.9 +	xenbus_unregister_driver(&tpmfront);
   41.10 +}
   41.11 +
   41.12  
   41.13  static int
   41.14  tpm_allocate_buffers(struct tpm_private *tp)
   41.15 @@ -700,7 +705,18 @@ tpmif_init(void)
   41.16  	return 0;
   41.17  }
   41.18  
   41.19 -__initcall(tpmif_init);
   41.20 +module_init(tpmif_init);
   41.21 +
   41.22 +static void __exit
   41.23 +tpmif_exit(void)
   41.24 +{
   41.25 +	exit_tpm_xenbus();
   41.26 +	gnttab_free_grant_references(gref_head);
   41.27 +}
   41.28 +
   41.29 +module_exit(tpmif_exit);
   41.30 +
   41.31 +MODULE_LICENSE("Dual BSD/GPL");
   41.32  
   41.33  /*
   41.34   * Local variables:
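
Editor's note: tpmback and tpmfront follow the same conversion recipe as the network drivers above: __initcall() only registers a boot-time hook, so replacing it with module_init(), adding a module_exit() that unregisters everything in reverse order, and declaring a MODULE_LICENSE() is what makes the drivers buildable and unloadable as modules. The skeleton of that recipe, with illustrative names:

#include <linux/init.h>
#include <linux/module.h>

static int __init example_init(void)
{
	/* register xenbus driver, allocate caches, ... */
	return 0;
}

static void __exit example_exit(void)
{
	/* unregister and free in the reverse order of example_init() */
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("Dual BSD/GPL");
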
    42.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    42.2 +++ b/linux-2.6-xen-sparse/include/asm-i386/fixmap.h	Wed Mar 01 12:47:25 2006 -0700
    42.3 @@ -0,0 +1,151 @@
    42.4 +/*
    42.5 + * fixmap.h: compile-time virtual memory allocation
    42.6 + *
    42.7 + * This file is subject to the terms and conditions of the GNU General Public
    42.8 + * License.  See the file "COPYING" in the main directory of this archive
    42.9 + * for more details.
   42.10 + *
   42.11 + * Copyright (C) 1998 Ingo Molnar
   42.12 + *
   42.13 + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
   42.14 + */
   42.15 +
   42.16 +#ifndef _ASM_FIXMAP_H
   42.17 +#define _ASM_FIXMAP_H
   42.18 +
   42.19 +#include <linux/config.h>
   42.20 +
   42.21 +/* used by vmalloc.c, vsyscall.lds.S.
   42.22 + *
   42.23 + * Leave one empty page between vmalloc'ed areas and
   42.24 + * the start of the fixmap.
   42.25 + */
   42.26 +extern unsigned long __FIXADDR_TOP;
   42.27 +
   42.28 +#ifndef __ASSEMBLY__
   42.29 +#include <linux/kernel.h>
   42.30 +#include <asm/acpi.h>
   42.31 +#include <asm/apicdef.h>
   42.32 +#include <asm/page.h>
   42.33 +#ifdef CONFIG_HIGHMEM
   42.34 +#include <linux/threads.h>
   42.35 +#include <asm/kmap_types.h>
   42.36 +#endif
   42.37 +
   42.38 +/*
   42.39 + * Here we define all the compile-time 'special' virtual
   42.40 + * addresses. The point is to have a constant address at
   42.41 + * compile time, but to set the physical address only
   42.42 + * in the boot process. We allocate these special addresses
   42.43 + * from the end of virtual memory (0xfffff000) backwards.
   42.44 + * Also this lets us do fail-safe vmalloc(), we
   42.45 + * can guarantee that these special addresses and
   42.46 + * vmalloc()-ed addresses never overlap.
   42.47 + *
   42.48 + * these 'compile-time allocated' memory buffers are
   42.49 + * fixed-size 4k pages. (or larger if used with an increment
    42.50 + * higher than 1) use fixmap_set(idx,phys) to associate
   42.51 + * physical memory with fixmap indices.
   42.52 + *
   42.53 + * TLB entries of such buffers will not be flushed across
   42.54 + * task switches.
   42.55 + */
   42.56 +enum fixed_addresses {
   42.57 +	FIX_HOLE,
   42.58 +#ifdef CONFIG_X86_LOCAL_APIC
    42.59 +	FIX_APIC_BASE,	/* local (CPU) APIC -- required for SMP or not */
   42.60 +#endif
   42.61 +#ifdef CONFIG_X86_IO_APIC
   42.62 +	FIX_IO_APIC_BASE_0,
   42.63 +	FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
   42.64 +#endif
   42.65 +#ifdef CONFIG_X86_VISWS_APIC
   42.66 +	FIX_CO_CPU,	/* Cobalt timer */
   42.67 +	FIX_CO_APIC,	/* Cobalt APIC Redirection Table */ 
   42.68 +	FIX_LI_PCIA,	/* Lithium PCI Bridge A */
   42.69 +	FIX_LI_PCIB,	/* Lithium PCI Bridge B */
   42.70 +#endif
   42.71 +#ifdef CONFIG_X86_F00F_BUG
   42.72 +	FIX_F00F_IDT,	/* Virtual mapping for IDT */
   42.73 +#endif
   42.74 +#ifdef CONFIG_X86_CYCLONE_TIMER
   42.75 +	FIX_CYCLONE_TIMER, /*cyclone timer register*/
   42.76 +#endif 
   42.77 +#ifdef CONFIG_HIGHMEM
   42.78 +	FIX_KMAP_BEGIN,	/* reserved pte's for temporary kernel mappings */
   42.79 +	FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
   42.80 +#endif
   42.81 +#ifdef CONFIG_ACPI
   42.82 +	FIX_ACPI_BEGIN,
   42.83 +	FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
   42.84 +#endif
   42.85 +#ifdef CONFIG_PCI_MMCONFIG
   42.86 +	FIX_PCIE_MCFG,
   42.87 +#endif
   42.88 +	__end_of_permanent_fixed_addresses,
   42.89 +	/* temporary boot-time mappings, used before ioremap() is functional */
   42.90 +#define NR_FIX_BTMAPS	16
   42.91 +	FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
   42.92 +	FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
   42.93 +	FIX_WP_TEST,
   42.94 +	__end_of_fixed_addresses
   42.95 +};
   42.96 +
   42.97 +extern void __set_fixmap (enum fixed_addresses idx,
   42.98 +					unsigned long phys, pgprot_t flags);
   42.99 +
  42.100 +extern void set_fixaddr_top(unsigned long top);
  42.101 +
  42.102 +#define set_fixmap(idx, phys) \
  42.103 +		__set_fixmap(idx, phys, PAGE_KERNEL)
  42.104 +/*
  42.105 + * Some hardware wants to get fixmapped without caching.
  42.106 + */
  42.107 +#define set_fixmap_nocache(idx, phys) \
  42.108 +		__set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
  42.109 +
  42.110 +#define clear_fixmap(idx) \
  42.111 +		__set_fixmap(idx, 0, __pgprot(0))
  42.112 +
  42.113 +#define FIXADDR_TOP	((unsigned long)__FIXADDR_TOP)
  42.114 +
  42.115 +#define __FIXADDR_SIZE	(__end_of_permanent_fixed_addresses << PAGE_SHIFT)
  42.116 +#define __FIXADDR_BOOT_SIZE	(__end_of_fixed_addresses << PAGE_SHIFT)
  42.117 +#define FIXADDR_START		(FIXADDR_TOP - __FIXADDR_SIZE)
  42.118 +#define FIXADDR_BOOT_START	(FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
  42.119 +
  42.120 +#define __fix_to_virt(x)	(FIXADDR_TOP - ((x) << PAGE_SHIFT))
  42.121 +#define __virt_to_fix(x)	((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
  42.122 +
  42.123 +extern void __this_fixmap_does_not_exist(void);
  42.124 +
  42.125 +/*
  42.126 + * 'index to address' translation. If anyone tries to use the idx
   42.127 + * directly without translation, we catch the bug with a NULL-dereference
  42.128 + * kernel oops. Illegal ranges of incoming indices are caught too.
  42.129 + */
  42.130 +static __always_inline unsigned long fix_to_virt(const unsigned int idx)
  42.131 +{
  42.132 +	/*
  42.133 +	 * this branch gets completely eliminated after inlining,
  42.134 +	 * except when someone tries to use fixaddr indices in an
  42.135 +	 * illegal way. (such as mixing up address types or using
  42.136 +	 * out-of-range indices).
  42.137 +	 *
  42.138 +	 * If it doesn't get removed, the linker will complain
   42.139 +	 * loudly with a reasonably clear error message.
  42.140 +	 */
  42.141 +	if (idx >= __end_of_fixed_addresses)
  42.142 +		__this_fixmap_does_not_exist();
  42.143 +
   42.144 +	return __fix_to_virt(idx);
  42.145 +}
  42.146 +
  42.147 +static inline unsigned long virt_to_fix(const unsigned long vaddr)
  42.148 +{
  42.149 +	BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
  42.150 +	return __virt_to_fix(vaddr);
  42.151 +}
  42.152 +
  42.153 +#endif /* !__ASSEMBLY__ */
  42.154 +#endif
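
Editor's note: the trick fix_to_virt() documents above is worth spelling out. __this_fixmap_does_not_exist() is declared but never defined, so the kernel only links if every call to it is optimized away, i.e. if every fix_to_virt() argument is a compile-time constant below __end_of_fixed_addresses. A standalone illustration of the idiom (compile with optimization; all names here are illustrative):

/* Link-time bounds check: the undefined function is referenced only on
 * the out-of-range path, so a bad constant index becomes a link error. */
extern void __this_example_does_not_exist(void);

#define TABLE_SIZE 8
static unsigned long table_base = 0x1000;

static inline unsigned long entry_addr(unsigned int idx)
{
	if (idx >= TABLE_SIZE)
		__this_example_does_not_exist();
	return table_base + idx * 16;
}

int main(void)
{
	return entry_addr(3) != 0x1030;	/* in range: branch folds away */
}
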
    43.1 --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/desc.h	Wed Mar 01 10:01:54 2006 -0700
    43.2 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/desc.h	Wed Mar 01 12:47:25 2006 -0700
    43.3 @@ -23,11 +23,13 @@ struct Xgt_desc_struct {
    43.4  	unsigned short pad;
    43.5  } __attribute__ ((packed));
    43.6  
    43.7 -extern struct Xgt_desc_struct idt_descr, cpu_gdt_descr[NR_CPUS];
    43.8 +extern struct Xgt_desc_struct idt_descr;
    43.9 +DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
   43.10 +
   43.11  
   43.12  static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
   43.13  {
   43.14 -	return ((struct desc_struct *)cpu_gdt_descr[cpu].address);
   43.15 +	return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
   43.16  }
   43.17  
   43.18  #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
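
Editor's note: the GDT descriptor moves from a static NR_CPUS array to the per-CPU infrastructure: the header DECLAREs the variable, exactly one .c file DEFINEs it, and per_cpu(var, cpu) names a given CPU's instance. A sketch of the pattern in kernel context, mirroring the accessor above:

#include <linux/percpu.h>

/* One instance per possible CPU; pairs with the DECLARE_PER_CPU above. */
DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);

static struct desc_struct *gdt_of(unsigned int cpu)
{
	return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
}
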
    44.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    44.2 +++ b/linux-2.6-xen-sparse/include/asm-i386/page.h	Wed Mar 01 12:47:25 2006 -0700
    44.3 @@ -0,0 +1,148 @@
    44.4 +#ifndef _I386_PAGE_H
    44.5 +#define _I386_PAGE_H
    44.6 +
    44.7 +/* PAGE_SHIFT determines the page size */
    44.8 +#define PAGE_SHIFT	12
    44.9 +#define PAGE_SIZE	(1UL << PAGE_SHIFT)
   44.10 +#define PAGE_MASK	(~(PAGE_SIZE-1))
   44.11 +
   44.12 +#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
   44.13 +#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
   44.14 +
   44.15 +#ifdef __KERNEL__
   44.16 +#ifndef __ASSEMBLY__
   44.17 +
   44.18 +#include <linux/config.h>
   44.19 +
   44.20 +#ifdef CONFIG_X86_USE_3DNOW
   44.21 +
   44.22 +#include <asm/mmx.h>
   44.23 +
   44.24 +#define clear_page(page)	mmx_clear_page((void *)(page))
   44.25 +#define copy_page(to,from)	mmx_copy_page(to,from)
   44.26 +
   44.27 +#else
   44.28 +
   44.29 +/*
   44.30 + *	On older X86 processors it's not a win to use MMX here it seems.
   44.31 + *	Maybe the K6-III ?
   44.32 + */
   44.33 + 
   44.34 +#define clear_page(page)	memset((void *)(page), 0, PAGE_SIZE)
   44.35 +#define copy_page(to,from)	memcpy((void *)(to), (void *)(from), PAGE_SIZE)
   44.36 +
   44.37 +#endif
   44.38 +
   44.39 +#define clear_user_page(page, vaddr, pg)	clear_page(page)
   44.40 +#define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
   44.41 +
   44.42 +#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
   44.43 +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
   44.44 +
   44.45 +/*
   44.46 + * These are used to make use of C type-checking..
   44.47 + */
   44.48 +extern int nx_enabled;
   44.49 +#ifdef CONFIG_X86_PAE
   44.50 +extern unsigned long long __supported_pte_mask;
   44.51 +typedef struct { unsigned long pte_low, pte_high; } pte_t;
   44.52 +typedef struct { unsigned long long pmd; } pmd_t;
   44.53 +typedef struct { unsigned long long pgd; } pgd_t;
   44.54 +typedef struct { unsigned long long pgprot; } pgprot_t;
   44.55 +#define pmd_val(x)	((x).pmd)
   44.56 +#define pte_val(x)	((x).pte_low | ((unsigned long long)(x).pte_high << 32))
   44.57 +#define __pmd(x) ((pmd_t) { (x) } )
   44.58 +#define HPAGE_SHIFT	21
   44.59 +#else
   44.60 +typedef struct { unsigned long pte_low; } pte_t;
   44.61 +typedef struct { unsigned long pgd; } pgd_t;
   44.62 +typedef struct { unsigned long pgprot; } pgprot_t;
   44.63 +#define boot_pte_t pte_t /* or would you rather have a typedef */
   44.64 +#define pte_val(x)	((x).pte_low)
   44.65 +#define HPAGE_SHIFT	22
   44.66 +#endif
   44.67 +#define PTE_MASK	PAGE_MASK
   44.68 +
   44.69 +#ifdef CONFIG_HUGETLB_PAGE
   44.70 +#define HPAGE_SIZE	((1UL) << HPAGE_SHIFT)
   44.71 +#define HPAGE_MASK	(~(HPAGE_SIZE - 1))
   44.72 +#define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
   44.73 +#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
   44.74 +#endif
   44.75 +
   44.76 +#define pgd_val(x)	((x).pgd)
   44.77 +#define pgprot_val(x)	((x).pgprot)
   44.78 +
   44.79 +#define __pte(x) ((pte_t) { (x) } )
   44.80 +#define __pgd(x) ((pgd_t) { (x) } )
   44.81 +#define __pgprot(x)	((pgprot_t) { (x) } )
   44.82 +
   44.83 +#endif /* !__ASSEMBLY__ */
   44.84 +
   44.85 +/* to align the pointer to the (next) page boundary */
   44.86 +#define PAGE_ALIGN(addr)	(((addr)+PAGE_SIZE-1)&PAGE_MASK)
   44.87 +
   44.88 +/*
   44.89 + * This handles the memory map.. We could make this a config
   44.90 + * option, but too many people screw it up, and too few need
   44.91 + * it.
   44.92 + *
   44.93 + * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
   44.94 + * a virtual address space of one gigabyte, which limits the
   44.95 + * amount of physical memory you can use to about 950MB. 
   44.96 + *
   44.97 + * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
   44.98 + * and CONFIG_HIGHMEM64G options in the kernel configuration.
   44.99 + */
  44.100 +
  44.101 +#ifndef __ASSEMBLY__
  44.102 +
  44.103 +/*
  44.104 + * This much address space is reserved for vmalloc() and iomap()
  44.105 + * as well as fixmap mappings.
  44.106 + */
  44.107 +extern unsigned int __VMALLOC_RESERVE;
  44.108 +
  44.109 +extern int sysctl_legacy_va_layout;
  44.110 +
  44.111 +extern int page_is_ram(unsigned long pagenr);
  44.112 +
  44.113 +#endif /* __ASSEMBLY__ */
  44.114 +
  44.115 +#ifdef __ASSEMBLY__
  44.116 +#define __PAGE_OFFSET		CONFIG_PAGE_OFFSET
  44.117 +#define __PHYSICAL_START	CONFIG_PHYSICAL_START
  44.118 +#else
  44.119 +#define __PAGE_OFFSET		((unsigned long)CONFIG_PAGE_OFFSET)
  44.120 +#define __PHYSICAL_START	((unsigned long)CONFIG_PHYSICAL_START)
  44.121 +#endif
  44.122 +#define __KERNEL_START		(__PAGE_OFFSET + __PHYSICAL_START)
  44.123 +
  44.124 +
  44.125 +#define PAGE_OFFSET		((unsigned long)__PAGE_OFFSET)
  44.126 +#define VMALLOC_RESERVE		((unsigned long)__VMALLOC_RESERVE)
  44.127 +#define MAXMEM			(__FIXADDR_TOP-__PAGE_OFFSET-__VMALLOC_RESERVE)
  44.128 +#define __pa(x)			((unsigned long)(x)-PAGE_OFFSET)
  44.129 +#define __va(x)			((void *)((unsigned long)(x)+PAGE_OFFSET))
  44.130 +#define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
  44.131 +#ifdef CONFIG_FLATMEM
  44.132 +#define pfn_to_page(pfn)	(mem_map + (pfn))
  44.133 +#define page_to_pfn(page)	((unsigned long)((page) - mem_map))
  44.134 +#define pfn_valid(pfn)		((pfn) < max_mapnr)
  44.135 +#endif /* CONFIG_FLATMEM */
  44.136 +#define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
  44.137 +
  44.138 +#define virt_addr_valid(kaddr)	pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
  44.139 +
  44.140 +#define VM_DATA_DEFAULT_FLAGS \
  44.141 +	(VM_READ | VM_WRITE | \
  44.142 +	((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
  44.143 +		 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
  44.144 +
  44.145 +#define __HAVE_ARCH_GATE_AREA 1
  44.146 +
  44.147 +#endif /* __KERNEL__ */
  44.148 +
  44.149 +#include <asm-generic/page.h>
  44.150 +
  44.151 +#endif /* _I386_PAGE_H */
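
Editor's note: the __pa()/__va() pair above is plain offset arithmetic against PAGE_OFFSET, which is where the roughly 950MB limit in the comment comes from. A standalone check using the conventional 3G/1G split value (illustrative, not pulled from a real config):

#include <assert.h>

#define PAGE_OFFSET	0xC0000000UL
#define pa(x)		((unsigned long)(x) - PAGE_OFFSET)
#define va(x)		((void *)((unsigned long)(x) + PAGE_OFFSET))

int main(void)
{
	/* Kernel virtual 0xC0100000 maps to physical 0x00100000 and back. */
	assert(pa(0xC0100000UL) == 0x00100000UL);
	assert((unsigned long)va(0x00100000UL) == 0xC0100000UL);
	return 0;
}
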
    45.1 --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pci.h	Wed Mar 01 10:01:54 2006 -0700
    45.2 +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pci.h	Wed Mar 01 12:47:25 2006 -0700
    45.3 @@ -19,8 +19,6 @@ extern unsigned int pcibios_assign_all_b
    45.4  #endif
    45.5  #define pcibios_scan_all_fns(a, b)	0
    45.6  
    45.7 -extern int no_iommu, force_iommu;
    45.8 -
    45.9  extern unsigned long pci_mem_start;
   45.10  #define PCIBIOS_MIN_IO		0x1000
   45.11  #define PCIBIOS_MIN_MEM		(pci_mem_start)
    46.1 --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h	Wed Mar 01 10:01:54 2006 -0700
    46.2 +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h	Wed Mar 01 12:47:25 2006 -0700
    46.3 @@ -169,7 +169,7 @@ static inline pte_t ptep_get_and_clear_f
    46.4  #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
    46.5  #define PGDIR_MASK	(~(PGDIR_SIZE-1))
    46.6  
    46.7 -#define USER_PTRS_PER_PGD	(TASK_SIZE/PGDIR_SIZE)
    46.8 +#define USER_PTRS_PER_PGD	((TASK_SIZE-1)/PGDIR_SIZE+1)
    46.9  #define FIRST_USER_ADDRESS	0
   46.10  
   46.11  #ifndef __ASSEMBLY__
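
Editor's note: the USER_PTRS_PER_PGD change swaps floor division for ceiling division, so a TASK_SIZE that is not an exact multiple of PGDIR_SIZE still counts the final, partially covered pgd slot. With small illustrative numbers:

#include <assert.h>

int main(void)
{
	unsigned long task_size = 10, pgdir_size = 4;	/* illustrative only */

	/* floor: 10/4 == 2, missing the slot that covers bytes 8..9 */
	assert(task_size / pgdir_size == 2);
	/* ceiling: (10-1)/4 + 1 == 3, counting the partial slot too */
	assert((task_size - 1) / pgdir_size + 1 == 3);
	return 0;
}
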
    47.1 --- a/linux-2.6-xen-sparse/include/linux/mm.h	Wed Mar 01 10:01:54 2006 -0700
    47.2 +++ b/linux-2.6-xen-sparse/include/linux/mm.h	Wed Mar 01 12:47:25 2006 -0700
    47.3 @@ -1064,7 +1064,11 @@ int shrink_slab(unsigned long scanned, g
    47.4  void drop_pagecache(void);
    47.5  void drop_slab(void);
    47.6  
    47.7 +#ifndef CONFIG_MMU
    47.8 +#define randomize_va_space 0
    47.9 +#else
   47.10  extern int randomize_va_space;
   47.11 +#endif
   47.12  
   47.13  #endif /* __KERNEL__ */
   47.14  #endif /* _LINUX_MM_H */
    48.1 --- a/linux-2.6-xen-sparse/mm/page_alloc.c	Wed Mar 01 10:01:54 2006 -0700
    48.2 +++ b/linux-2.6-xen-sparse/mm/page_alloc.c	Wed Mar 01 12:47:25 2006 -0700
    48.3 @@ -1017,7 +1017,7 @@ rebalance:
    48.4  		if (page)
    48.5  			goto got_pg;
    48.6  
    48.7 -		out_of_memory(gfp_mask, order);
    48.8 +		out_of_memory(zonelist, gfp_mask, order);
    48.9  		goto restart;
   48.10  	}
   48.11  
    49.1 --- a/linux-2.6-xen-sparse/net/core/skbuff.c	Wed Mar 01 10:01:54 2006 -0700
    49.2 +++ b/linux-2.6-xen-sparse/net/core/skbuff.c	Wed Mar 01 12:47:25 2006 -0700
    49.3 @@ -434,6 +434,9 @@ struct sk_buff *skb_clone(struct sk_buff
    49.4  	C(pkt_type);
    49.5  	C(ip_summed);
    49.6  	C(priority);
    49.7 +#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
    49.8 +	C(ipvs_property);
    49.9 +#endif
   49.10  	C(protocol);
   49.11  	n->destructor = NULL;
   49.12  #ifdef CONFIG_NETFILTER
   49.13 @@ -445,13 +448,6 @@ struct sk_buff *skb_clone(struct sk_buff
   49.14  	C(nfct_reasm);
   49.15  	nf_conntrack_get_reasm(skb->nfct_reasm);
   49.16  #endif
   49.17 -#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
   49.18 -	C(ipvs_property);
   49.19 -#endif
   49.20 -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
   49.21 -	C(nfct_reasm);
   49.22 -	nf_conntrack_get_reasm(skb->nfct_reasm);
   49.23 -#endif
   49.24  #ifdef CONFIG_BRIDGE_NETFILTER
   49.25  	C(nf_bridge);
   49.26  	nf_bridge_get(skb->nf_bridge);
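
Editor's note: this hunk is a deduplication fix. The clone path carried the ipvs_property and nfct_reasm copies twice, so nf_conntrack_get_reasm() ran twice per clone, taking a reference that was never dropped; moving C(ipvs_property) up beside the other field copies and deleting the duplicated block leaves each get balanced by one put. A self-contained illustration, assuming C() is a field-copy shorthand along the lines skb_clone() uses:

#include <assert.h>

struct skb { int priority; int refs; };

#define C(x) (n->x = skb->x)	/* field-copy shorthand, as in skb_clone() */

int main(void)
{
	struct skb orig = { .priority = 7, .refs = 1 };
	struct skb clone, *skb = &orig, *n = &clone;

	C(priority);
	assert(n->priority == 7);

	/* The bug: two get()s per clone, one put() on free -> a leak. */
	skb->refs += 2;		/* duplicated nf_conntrack_get_reasm() */
	skb->refs -= 1;		/* single release when the clone is freed */
	assert(skb->refs == 2);	/* one reference leaked */
	return 0;
}
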
    50.1 --- a/patches/linux-2.6.16-rc4/i386-mach-io-check-nmi.patch	Wed Mar 01 10:01:54 2006 -0700
    50.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    50.3 @@ -1,45 +0,0 @@
    50.4 -diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/traps.c ./arch/i386/kernel/traps.c
    50.5 ---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/traps.c	2006-02-15 20:38:51.000000000 +0000
    50.6 -+++ ./arch/i386/kernel/traps.c	2006-02-15 20:40:43.000000000 +0000
    50.7 -@@ -567,18 +567,11 @@ static void mem_parity_error(unsigned ch
    50.8 - 
    50.9 - static void io_check_error(unsigned char reason, struct pt_regs * regs)
   50.10 - {
   50.11 --	unsigned long i;
   50.12 --
   50.13 - 	printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
   50.14 - 	show_registers(regs);
   50.15 - 
   50.16 - 	/* Re-enable the IOCK line, wait for a few seconds */
   50.17 --	reason = (reason & 0xf) | 8;
   50.18 --	outb(reason, 0x61);
   50.19 --	i = 2000;
   50.20 --	while (--i) udelay(1000);
   50.21 --	reason &= ~8;
   50.22 --	outb(reason, 0x61);
   50.23 -+	clear_io_check_error(reason);
   50.24 - }
   50.25 - 
   50.26 - static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
   50.27 -diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/mach-default/mach_traps.h ./include/asm-i386/mach-default/mach_traps.h
   50.28 ---- ../pristine-linux-2.6.16-rc3/include/asm-i386/mach-default/mach_traps.h	2006-01-03 03:21:10.000000000 +0000
   50.29 -+++ ./include/asm-i386/mach-default/mach_traps.h	2006-02-15 20:40:43.000000000 +0000
   50.30 -@@ -15,6 +15,18 @@ static inline void clear_mem_error(unsig
   50.31 - 	outb(reason, 0x61);
   50.32 - }
   50.33 - 
   50.34 -+static inline void clear_io_check_error(unsigned char reason)
   50.35 -+{
   50.36 -+	unsigned long i;
   50.37 -+
   50.38 -+	reason = (reason & 0xf) | 8;
   50.39 -+	outb(reason, 0x61);
   50.40 -+	i = 2000;
   50.41 -+	while (--i) udelay(1000);
   50.42 -+	reason &= ~8;
   50.43 -+	outb(reason, 0x61);
   50.44 -+}
   50.45 -+
   50.46 - static inline unsigned char get_nmi_reason(void)
   50.47 - {
   50.48 - 	return inb(0x61);
    51.1 --- a/patches/linux-2.6.16-rc4/net-csum.patch	Wed Mar 01 10:01:54 2006 -0700
    51.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    51.3 @@ -1,41 +0,0 @@
    51.4 -diff -pruN ../pristine-linux-2.6.16-rc1-git4/net/ipv4/netfilter/ip_nat_proto_tcp.c ./net/ipv4/netfilter/ip_nat_proto_tcp.c
    51.5 ---- ../pristine-linux-2.6.16-rc1-git4/net/ipv4/netfilter/ip_nat_proto_tcp.c	2006-02-02 17:39:51.000000000 +0000
    51.6 -+++ ./net/ipv4/netfilter/ip_nat_proto_tcp.c	2006-02-02 17:44:18.000000000 +0000
    51.7 -@@ -129,10 +129,14 @@ tcp_manip_pkt(struct sk_buff **pskb,
    51.8 - 	if (hdrsize < sizeof(*hdr))
    51.9 - 		return 1;
   51.10 - 
   51.11 --	hdr->check = ip_nat_cheat_check(~oldip, newip,
   51.12 -+	if ((*pskb)->proto_csum_blank) {
   51.13 -+		hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check);
   51.14 -+	} else {
   51.15 -+		hdr->check = ip_nat_cheat_check(~oldip, newip,
   51.16 - 					ip_nat_cheat_check(oldport ^ 0xFFFF,
   51.17 - 							   newport,
   51.18 - 							   hdr->check));
   51.19 -+	}
   51.20 - 	return 1;
   51.21 - }
   51.22 -
   51.23 -diff -pruN ../pristine-linux-2.6.16-rc1-git4/net/ipv4/netfilter/ip_nat_proto_udp.c ./net/ipv4/netfilter/ip_nat_proto_udp.c
   51.24 ---- ../pristine-linux-2.6.16-rc1-git4/net/ipv4/netfilter/ip_nat_proto_udp.c	2006-02-02 17:39:51.000000000 +0000
   51.25 -+++ ./net/ipv4/netfilter/ip_nat_proto_udp.c	2006-02-02 17:44:18.000000000 +0000
   51.26 -@@ -113,11 +113,16 @@ udp_manip_pkt(struct sk_buff **pskb,
   51.27 - 		newport = tuple->dst.u.udp.port;
   51.28 - 		portptr = &hdr->dest;
   51.29 - 	}
   51.30 --	if (hdr->check) /* 0 is a special case meaning no checksum */
   51.31 --		hdr->check = ip_nat_cheat_check(~oldip, newip,
   51.32 -+	if (hdr->check) { /* 0 is a special case meaning no checksum */
   51.33 -+		if ((*pskb)->proto_csum_blank) {
   51.34 -+			hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check);
   51.35 -+		} else {
   51.36 -+			hdr->check = ip_nat_cheat_check(~oldip, newip,
   51.37 - 					ip_nat_cheat_check(*portptr ^ 0xFFFF,
   51.38 - 							   newport,
   51.39 - 							   hdr->check));
   51.40 -+		}
   51.41 -+	}
   51.42 - 	*portptr = newport;
   51.43 - 	return 1;
   51.44 - }
    52.1 --- a/patches/linux-2.6.16-rc4/pmd-shared.patch	Wed Mar 01 10:01:54 2006 -0700
    52.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    52.3 @@ -1,111 +0,0 @@
    52.4 -diff -pruN ../pristine-linux-2.6.16-rc1-git4/arch/i386/mm/pageattr.c ./arch/i386/mm/pageattr.c
    52.5 ---- ../pristine-linux-2.6.16-rc1-git4/arch/i386/mm/pageattr.c	2006-02-02 17:39:29.000000000 +0000
    52.6 -+++ ./arch/i386/mm/pageattr.c	2006-02-02 17:45:14.000000000 +0000
    52.7 -@@ -78,7 +78,7 @@ static void set_pmd_pte(pte_t *kpte, uns
    52.8 - 	unsigned long flags;
    52.9 - 
   52.10 - 	set_pte_atomic(kpte, pte); 	/* change init_mm */
   52.11 --	if (PTRS_PER_PMD > 1)
   52.12 -+	if (HAVE_SHARED_KERNEL_PMD)
   52.13 - 		return;
   52.14 - 
   52.15 - 	spin_lock_irqsave(&pgd_lock, flags);
   52.16 -diff -pruN ../pristine-linux-2.6.16-rc1-git4/arch/i386/mm/pgtable.c ./arch/i386/mm/pgtable.c
   52.17 ---- ../pristine-linux-2.6.16-rc1-git4/arch/i386/mm/pgtable.c	2006-01-03 03:21:10.000000000 +0000
   52.18 -+++ ./arch/i386/mm/pgtable.c	2006-02-02 17:45:14.000000000 +0000
   52.19 -@@ -215,9 +215,10 @@ void pgd_ctor(void *pgd, kmem_cache_t *c
   52.20 - 		spin_lock_irqsave(&pgd_lock, flags);
   52.21 - 	}
   52.22 - 
   52.23 --	clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
   52.24 --			swapper_pg_dir + USER_PTRS_PER_PGD,
   52.25 --			KERNEL_PGD_PTRS);
   52.26 -+	if (PTRS_PER_PMD == 1 || HAVE_SHARED_KERNEL_PMD)
   52.27 -+		clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
   52.28 -+				swapper_pg_dir + USER_PTRS_PER_PGD,
   52.29 -+				KERNEL_PGD_PTRS);
   52.30 - 	if (PTRS_PER_PMD > 1)
   52.31 - 		return;
   52.32 - 
   52.33 -@@ -249,6 +250,30 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
   52.34 - 			goto out_oom;
   52.35 - 		set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
   52.36 - 	}
   52.37 -+
   52.38 -+	if (!HAVE_SHARED_KERNEL_PMD) {
   52.39 -+		unsigned long flags;
   52.40 -+
   52.41 -+		for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
   52.42 -+			pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
   52.43 -+			if (!pmd)
   52.44 -+				goto out_oom;
   52.45 -+			set_pgd(&pgd[USER_PTRS_PER_PGD], __pgd(1 + __pa(pmd)));
   52.46 -+		}
   52.47 -+
   52.48 -+		spin_lock_irqsave(&pgd_lock, flags);
   52.49 -+		for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
   52.50 -+			unsigned long v = (unsigned long)i << PGDIR_SHIFT;
   52.51 -+			pgd_t *kpgd = pgd_offset_k(v);
   52.52 -+			pud_t *kpud = pud_offset(kpgd, v);
   52.53 -+			pmd_t *kpmd = pmd_offset(kpud, v);
   52.54 -+			pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
   52.55 -+			memcpy(pmd, kpmd, PAGE_SIZE);
   52.56 -+		}
   52.57 -+		pgd_list_add(pgd);
   52.58 -+		spin_unlock_irqrestore(&pgd_lock, flags);
   52.59 -+	}
   52.60 -+
   52.61 - 	return pgd;
   52.62 - 
   52.63 - out_oom:
   52.64 -@@ -263,9 +288,23 @@ void pgd_free(pgd_t *pgd)
   52.65 - 	int i;
   52.66 - 
   52.67 - 	/* in the PAE case user pgd entries are overwritten before usage */
   52.68 --	if (PTRS_PER_PMD > 1)
   52.69 --		for (i = 0; i < USER_PTRS_PER_PGD; ++i)
   52.70 --			kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
   52.71 -+	if (PTRS_PER_PMD > 1) {
   52.72 -+		for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
   52.73 -+			pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
   52.74 -+			kmem_cache_free(pmd_cache, pmd);
   52.75 -+		}
   52.76 -+		if (!HAVE_SHARED_KERNEL_PMD) {
   52.77 -+			unsigned long flags;
   52.78 -+			spin_lock_irqsave(&pgd_lock, flags);
   52.79 -+			pgd_list_del(pgd);
   52.80 -+			spin_unlock_irqrestore(&pgd_lock, flags);
   52.81 -+			for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
   52.82 -+				pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
   52.83 -+				memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
   52.84 -+				kmem_cache_free(pmd_cache, pmd);
   52.85 -+			}
   52.86 -+		}
   52.87 -+	}
   52.88 - 	/* in the non-PAE case, free_pgtables() clears user pgd entries */
   52.89 - 	kmem_cache_free(pgd_cache, pgd);
   52.90 - }
   52.91 -diff -pruN ../pristine-linux-2.6.16-rc1-git4/include/asm-i386/pgtable-2level-defs.h ./include/asm-i386/pgtable-2level-defs.h
   52.92 ---- ../pristine-linux-2.6.16-rc1-git4/include/asm-i386/pgtable-2level-defs.h	2006-01-03 03:21:10.000000000 +0000
   52.93 -+++ ./include/asm-i386/pgtable-2level-defs.h	2006-02-02 17:45:14.000000000 +0000
   52.94 -@@ -1,6 +1,8 @@
   52.95 - #ifndef _I386_PGTABLE_2LEVEL_DEFS_H
   52.96 - #define _I386_PGTABLE_2LEVEL_DEFS_H
   52.97 - 
   52.98 -+#define HAVE_SHARED_KERNEL_PMD 0
   52.99 -+
  52.100 - /*
  52.101 -  * traditional i386 two-level paging structure:
  52.102 -  */
  52.103 -diff -pruN ../pristine-linux-2.6.16-rc1-git4/include/asm-i386/pgtable-3level-defs.h ./include/asm-i386/pgtable-3level-defs.h
  52.104 ---- ../pristine-linux-2.6.16-rc1-git4/include/asm-i386/pgtable-3level-defs.h	2006-01-03 03:21:10.000000000 +0000
  52.105 -+++ ./include/asm-i386/pgtable-3level-defs.h	2006-02-02 17:45:14.000000000 +0000
  52.106 -@@ -1,6 +1,8 @@
  52.107 - #ifndef _I386_PGTABLE_3LEVEL_DEFS_H
  52.108 - #define _I386_PGTABLE_3LEVEL_DEFS_H
  52.109 - 
  52.110 -+#define HAVE_SHARED_KERNEL_PMD 1
  52.111 -+
  52.112 - /*
  52.113 -  * PGDIR_SHIFT determines what a top-level page table entry can map
  52.114 -  */
    53.1 --- a/patches/linux-2.6.16-rc4/smp-alts.patch	Wed Mar 01 10:01:54 2006 -0700
    53.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    53.3 @@ -1,591 +0,0 @@
    53.4 -diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/Kconfig ./arch/i386/Kconfig
    53.5 ---- ../pristine-linux-2.6.16-rc3/arch/i386/Kconfig	2006-02-15 20:38:51.000000000 +0000
    53.6 -+++ ./arch/i386/Kconfig	2006-02-15 20:45:57.000000000 +0000
    53.7 -@@ -202,6 +202,19 @@ config SMP
    53.8 - 
    53.9 - 	  If you don't know what to do here, say N.
   53.10 - 
   53.11 -+config SMP_ALTERNATIVES
   53.12 -+	bool "SMP alternatives support (EXPERIMENTAL)"
   53.13 -+	depends on SMP && EXPERIMENTAL
   53.14 -+	help
   53.15 -+	  Try to reduce the overhead of running an SMP kernel on a uniprocessor
   53.16 -+	  host slightly by replacing certain key instruction sequences
   53.17 -+	  according to whether we currently have more than one CPU available.
   53.18 -+	  This should provide a noticeable boost to performance when
   53.19 -+	  running SMP kernels on UP machines, and have negligible impact
   53.20 -+	  when running on an true SMP host.
   53.21 -+
   53.22 -+          If unsure, say N.
   53.23 -+	  
   53.24 - config NR_CPUS
   53.25 - 	int "Maximum number of CPUs (2-255)"
   53.26 - 	range 2 255
   53.27 -diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/Makefile ./arch/i386/kernel/Makefile
   53.28 ---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/Makefile	2006-02-15 20:38:51.000000000 +0000
   53.29 -+++ ./arch/i386/kernel/Makefile	2006-02-15 20:45:57.000000000 +0000
   53.30 -@@ -37,6 +37,7 @@ obj-$(CONFIG_EFI) 		+= efi.o efi_stub.o
   53.31 - obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault.o
   53.32 - obj-$(CONFIG_VM86)		+= vm86.o
   53.33 - obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
   53.34 -+obj-$(CONFIG_SMP_ALTERNATIVES)  += smpalts.o
   53.35 - 
   53.36 - EXTRA_AFLAGS   := -traditional
   53.37 - 
   53.38 -diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/smpalts.c ./arch/i386/kernel/smpalts.c
   53.39 ---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/smpalts.c	1970-01-01 01:00:00.000000000 +0100
   53.40 -+++ ./arch/i386/kernel/smpalts.c	2006-02-15 20:45:57.000000000 +0000
   53.41 -@@ -0,0 +1,85 @@
   53.42 -+#include <linux/kernel.h>
   53.43 -+#include <asm/system.h>
   53.44 -+#include <asm/smp_alt.h>
   53.45 -+#include <asm/processor.h>
   53.46 -+#include <asm/string.h>
   53.47 -+
   53.48 -+struct smp_replacement_record {
   53.49 -+	unsigned char targ_size;
   53.50 -+	unsigned char smp1_size;
   53.51 -+	unsigned char smp2_size;
   53.52 -+	unsigned char up_size;
   53.53 -+	unsigned char feature;
   53.54 -+	unsigned char data[0];
   53.55 -+};
   53.56 -+
   53.57 -+struct smp_alternative_record {
   53.58 -+	void *targ_start;
   53.59 -+	struct smp_replacement_record *repl;
   53.60 -+};
   53.61 -+
   53.62 -+extern struct smp_alternative_record __start_smp_alternatives_table,
   53.63 -+  __stop_smp_alternatives_table;
   53.64 -+extern unsigned long __init_begin, __init_end;
   53.65 -+
   53.66 -+void prepare_for_smp(void)
   53.67 -+{
   53.68 -+	struct smp_alternative_record *r;
   53.69 -+	printk(KERN_INFO "Enabling SMP...\n");
   53.70 -+	for (r = &__start_smp_alternatives_table;
   53.71 -+	     r != &__stop_smp_alternatives_table;
   53.72 -+	     r++) {
   53.73 -+		BUG_ON(r->repl->targ_size < r->repl->smp1_size);
   53.74 -+		BUG_ON(r->repl->targ_size < r->repl->smp2_size);
   53.75 -+		BUG_ON(r->repl->targ_size < r->repl->up_size);
   53.76 -+               if (system_state == SYSTEM_RUNNING &&
   53.77 -+                   r->targ_start >= (void *)&__init_begin &&
   53.78 -+                   r->targ_start < (void *)&__init_end)
   53.79 -+                       continue;
   53.80 -+		if (r->repl->feature != (unsigned char)-1 &&
   53.81 -+		    boot_cpu_has(r->repl->feature)) {
   53.82 -+			memcpy(r->targ_start,
   53.83 -+			       r->repl->data + r->repl->smp1_size,
   53.84 -+			       r->repl->smp2_size);
   53.85 -+			memset(r->targ_start + r->repl->smp2_size,
   53.86 -+			       0x90,
   53.87 -+			       r->repl->targ_size - r->repl->smp2_size);
   53.88 -+		} else {
   53.89 -+			memcpy(r->targ_start,
   53.90 -+			       r->repl->data,
   53.91 -+			       r->repl->smp1_size);
   53.92 -+			memset(r->targ_start + r->repl->smp1_size,
   53.93 -+			       0x90,
   53.94 -+			       r->repl->targ_size - r->repl->smp1_size);
   53.95 -+		}
   53.96 -+	}
   53.97 -+	/* Paranoia */
   53.98 -+	asm volatile ("jmp 1f\n1:");
   53.99 -+	mb();
  53.100 -+}
  53.101 -+
  53.102 -+void unprepare_for_smp(void)
  53.103 -+{
  53.104 -+	struct smp_alternative_record *r;
  53.105 -+	printk(KERN_INFO "Disabling SMP...\n");
  53.106 -+	for (r = &__start_smp_alternatives_table;
  53.107 -+	     r != &__stop_smp_alternatives_table;
  53.108 -+	     r++) {
  53.109 -+		BUG_ON(r->repl->targ_size < r->repl->smp1_size);
  53.110 -+		BUG_ON(r->repl->targ_size < r->repl->smp2_size);
  53.111 -+		BUG_ON(r->repl->targ_size < r->repl->up_size);
  53.112 -+               if (system_state == SYSTEM_RUNNING &&
  53.113 -+                   r->targ_start >= (void *)&__init_begin &&
  53.114 -+                   r->targ_start < (void *)&__init_end)
  53.115 -+                       continue;
  53.116 -+		memcpy(r->targ_start,
  53.117 -+		       r->repl->data + r->repl->smp1_size + r->repl->smp2_size,
  53.118 -+		       r->repl->up_size);
  53.119 -+		memset(r->targ_start + r->repl->up_size,
  53.120 -+		       0x90,
  53.121 -+		       r->repl->targ_size - r->repl->up_size);
  53.122 -+	}
  53.123 -+	/* Paranoia */
  53.124 -+	asm volatile ("jmp 1f\n1:");
  53.125 -+	mb();
  53.126 -+}
  53.127 -diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/smpboot.c ./arch/i386/kernel/smpboot.c
  53.128 ---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/smpboot.c	2006-02-15 20:38:51.000000000 +0000
  53.129 -+++ ./arch/i386/kernel/smpboot.c	2006-02-15 20:45:57.000000000 +0000
  53.130 -@@ -1214,6 +1214,11 @@ static void __init smp_boot_cpus(unsigne
  53.131 - 		if (max_cpus <= cpucount+1)
  53.132 - 			continue;
  53.133 - 
  53.134 -+#ifdef CONFIG_SMP_ALTERNATIVES
  53.135 -+		if (kicked == 1)
  53.136 -+			prepare_for_smp();
  53.137 -+#endif
  53.138 -+
  53.139 - 		if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
  53.140 - 			printk("CPU #%d not responding - cannot use it.\n",
  53.141 - 								apicid);
  53.142 -@@ -1392,6 +1397,11 @@ int __devinit __cpu_up(unsigned int cpu)
  53.143 - 		return -EIO;
  53.144 - 	}
  53.145 - 
  53.146 -+#ifdef CONFIG_SMP_ALTERNATIVES
  53.147 -+	if (num_online_cpus() == 1)
  53.148 -+		prepare_for_smp();
  53.149 -+#endif
  53.150 -+
  53.151 - 	local_irq_enable();
  53.152 - 	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
  53.153 - 	/* Unleash the CPU! */
  53.154 -diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/vmlinux.lds.S ./arch/i386/kernel/vmlinux.lds.S
  53.155 ---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/vmlinux.lds.S	2006-01-03 03:21:10.000000000 +0000
  53.156 -+++ ./arch/i386/kernel/vmlinux.lds.S	2006-02-15 20:45:57.000000000 +0000
  53.157 -@@ -34,6 +34,13 @@ SECTIONS
  53.158 -   __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
  53.159 -   __stop___ex_table = .;
  53.160 - 
  53.161 -+  . = ALIGN(16);
  53.162 -+  __start_smp_alternatives_table = .;
  53.163 -+  __smp_alternatives : { *(__smp_alternatives) }
  53.164 -+  __stop_smp_alternatives_table = .;
  53.165 -+
  53.166 -+  __smp_replacements : { *(__smp_replacements) }
  53.167 -+
  53.168 -   RODATA
  53.169 - 
  53.170 -   /* writeable */
  53.171 -diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/atomic.h ./include/asm-i386/atomic.h
  53.172 ---- ../pristine-linux-2.6.16-rc3/include/asm-i386/atomic.h	2006-02-15 20:38:57.000000000 +0000
  53.173 -+++ ./include/asm-i386/atomic.h	2006-02-15 20:45:57.000000000 +0000
  53.174 -@@ -4,18 +4,13 @@
  53.175 - #include <linux/config.h>
  53.176 - #include <linux/compiler.h>
  53.177 - #include <asm/processor.h>
  53.178 -+#include <asm/smp_alt.h>
  53.179 - 
  53.180 - /*
  53.181 -  * Atomic operations that C can't guarantee us.  Useful for
  53.182 -  * resource counting etc..
  53.183 -  */
  53.184 - 
  53.185 --#ifdef CONFIG_SMP
  53.186 --#define LOCK "lock ; "
  53.187 --#else
  53.188 --#define LOCK ""
  53.189 --#endif
  53.190 --
  53.191 - /*
  53.192 -  * Make sure gcc doesn't try to be clever and move things around
  53.193 -  * on us. We need to use _exactly_ the address the user gave us,
  53.194 -diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/bitops.h ./include/asm-i386/bitops.h
  53.195 ---- ../pristine-linux-2.6.16-rc3/include/asm-i386/bitops.h	2006-02-15 20:38:57.000000000 +0000
  53.196 -+++ ./include/asm-i386/bitops.h	2006-02-15 20:45:57.000000000 +0000
  53.197 -@@ -7,6 +7,7 @@
  53.198 - 
  53.199 - #include <linux/config.h>
  53.200 - #include <linux/compiler.h>
  53.201 -+#include <asm/smp_alt.h>
  53.202 - 
  53.203 - /*
  53.204 -  * These have to be done with inline assembly: that way the bit-setting
  53.205 -@@ -16,12 +17,6 @@
  53.206 -  * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
  53.207 -  */
  53.208 - 
  53.209 --#ifdef CONFIG_SMP
  53.210 --#define LOCK_PREFIX "lock ; "
  53.211 --#else
  53.212 --#define LOCK_PREFIX ""
  53.213 --#endif
  53.214 --
  53.215 - #define ADDR (*(volatile long *) addr)
  53.216 - 
  53.217 - /**
  53.218 -@@ -41,7 +36,7 @@
  53.219 -  */
  53.220 - static inline void set_bit(int nr, volatile unsigned long * addr)
  53.221 - {
  53.222 --	__asm__ __volatile__( LOCK_PREFIX
  53.223 -+	__asm__ __volatile__( LOCK
  53.224 - 		"btsl %1,%0"
  53.225 - 		:"+m" (ADDR)
  53.226 - 		:"Ir" (nr));
  53.227 -@@ -76,7 +71,7 @@ static inline void __set_bit(int nr, vol
  53.228 -  */
  53.229 - static inline void clear_bit(int nr, volatile unsigned long * addr)
  53.230 - {
  53.231 --	__asm__ __volatile__( LOCK_PREFIX
  53.232 -+	__asm__ __volatile__( LOCK
  53.233 - 		"btrl %1,%0"
  53.234 - 		:"+m" (ADDR)
  53.235 - 		:"Ir" (nr));
  53.236 -@@ -121,7 +116,7 @@ static inline void __change_bit(int nr, 
  53.237 -  */
  53.238 - static inline void change_bit(int nr, volatile unsigned long * addr)
  53.239 - {
  53.240 --	__asm__ __volatile__( LOCK_PREFIX
  53.241 -+	__asm__ __volatile__( LOCK
  53.242 - 		"btcl %1,%0"
  53.243 - 		:"+m" (ADDR)
  53.244 - 		:"Ir" (nr));
  53.245 -@@ -140,7 +135,7 @@ static inline int test_and_set_bit(int n
  53.246 - {
  53.247 - 	int oldbit;
  53.248 - 
  53.249 --	__asm__ __volatile__( LOCK_PREFIX
  53.250 -+	__asm__ __volatile__( LOCK
  53.251 - 		"btsl %2,%1\n\tsbbl %0,%0"
  53.252 - 		:"=r" (oldbit),"+m" (ADDR)
  53.253 - 		:"Ir" (nr) : "memory");
  53.254 -@@ -180,7 +175,7 @@ static inline int test_and_clear_bit(int
  53.255 - {
  53.256 - 	int oldbit;
  53.257 - 
  53.258 --	__asm__ __volatile__( LOCK_PREFIX
  53.259 -+	__asm__ __volatile__( LOCK
  53.260 - 		"btrl %2,%1\n\tsbbl %0,%0"
  53.261 - 		:"=r" (oldbit),"+m" (ADDR)
  53.262 - 		:"Ir" (nr) : "memory");
  53.263 -@@ -231,7 +226,7 @@ static inline int test_and_change_bit(in
  53.264 - {
  53.265 - 	int oldbit;
  53.266 - 
  53.267 --	__asm__ __volatile__( LOCK_PREFIX
  53.268 -+	__asm__ __volatile__( LOCK
  53.269 - 		"btcl %2,%1\n\tsbbl %0,%0"
  53.270 - 		:"=r" (oldbit),"+m" (ADDR)
  53.271 - 		:"Ir" (nr) : "memory");
  53.272 -diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/futex.h ./include/asm-i386/futex.h
  53.273 ---- ../pristine-linux-2.6.16-rc3/include/asm-i386/futex.h	2006-02-15 20:38:57.000000000 +0000
  53.274 -+++ ./include/asm-i386/futex.h	2006-02-15 20:45:57.000000000 +0000
  53.275 -@@ -28,7 +28,7 @@
  53.276 - "1:	movl	%2, %0\n\
  53.277 - 	movl	%0, %3\n"					\
  53.278 - 	insn "\n"						\
  53.279 --"2:	" LOCK_PREFIX "cmpxchgl %3, %2\n\
  53.280 -+"2:	" LOCK "cmpxchgl %3, %2\n\
  53.281 - 	jnz	1b\n\
  53.282 - 3:	.section .fixup,\"ax\"\n\
  53.283 - 4:	mov	%5, %1\n\
  53.284 -@@ -68,7 +68,7 @@ futex_atomic_op_inuser (int encoded_op, 
  53.285 - #endif
  53.286 - 		switch (op) {
  53.287 - 		case FUTEX_OP_ADD:
  53.288 --			__futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret,
  53.289 -+			__futex_atomic_op1(LOCK "xaddl %0, %2", ret,
  53.290 - 					   oldval, uaddr, oparg);
  53.291 - 			break;
  53.292 - 		case FUTEX_OP_OR:
  53.293 -diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/rwsem.h ./include/asm-i386/rwsem.h
  53.294 ---- ../pristine-linux-2.6.16-rc3/include/asm-i386/rwsem.h	2006-01-03 03:21:10.000000000 +0000
  53.295 -+++ ./include/asm-i386/rwsem.h	2006-02-15 20:45:57.000000000 +0000
  53.296 -@@ -40,6 +40,7 @@
  53.297 - 
  53.298 - #include <linux/list.h>
  53.299 - #include <linux/spinlock.h>
  53.300 -+#include <asm/smp_alt.h>
  53.301 - 
  53.302 - struct rwsem_waiter;
  53.303 - 
  53.304 -@@ -99,7 +100,7 @@ static inline void __down_read(struct rw
  53.305 - {
  53.306 - 	__asm__ __volatile__(
  53.307 - 		"# beginning down_read\n\t"
  53.308 --LOCK_PREFIX	"  incl      (%%eax)\n\t" /* adds 0x00000001, returns the old value */
  53.309 -+LOCK	        "  incl      (%%eax)\n\t" /* adds 0x00000001, returns the old value */
  53.310 - 		"  js        2f\n\t" /* jump if we weren't granted the lock */
  53.311 - 		"1:\n\t"
  53.312 - 		LOCK_SECTION_START("")
  53.313 -@@ -130,7 +131,7 @@ static inline int __down_read_trylock(st
  53.314 - 		"  movl	     %1,%2\n\t"
  53.315 - 		"  addl      %3,%2\n\t"
  53.316 - 		"  jle	     2f\n\t"
  53.317 --LOCK_PREFIX	"  cmpxchgl  %2,%0\n\t"
  53.318 -+LOCK	        "  cmpxchgl  %2,%0\n\t"
  53.319 - 		"  jnz	     1b\n\t"
  53.320 - 		"2:\n\t"
  53.321 - 		"# ending __down_read_trylock\n\t"
  53.322 -@@ -150,7 +151,7 @@ static inline void __down_write(struct r
  53.323 - 	tmp = RWSEM_ACTIVE_WRITE_BIAS;
  53.324 - 	__asm__ __volatile__(
  53.325 - 		"# beginning down_write\n\t"
  53.326 --LOCK_PREFIX	"  xadd      %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
  53.327 -+LOCK	        "  xadd      %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
  53.328 - 		"  testl     %%edx,%%edx\n\t" /* was the count 0 before? */
  53.329 - 		"  jnz       2f\n\t" /* jump if we weren't granted the lock */
  53.330 - 		"1:\n\t"
  53.331 -@@ -188,7 +189,7 @@ static inline void __up_read(struct rw_s
  53.332 - 	__s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
  53.333 - 	__asm__ __volatile__(
  53.334 - 		"# beginning __up_read\n\t"
  53.335 --LOCK_PREFIX	"  xadd      %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
  53.336 -+LOCK	        "  xadd      %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
  53.337 - 		"  js        2f\n\t" /* jump if the lock is being waited upon */
  53.338 - 		"1:\n\t"
  53.339 - 		LOCK_SECTION_START("")
  53.340 -@@ -214,7 +215,7 @@ static inline void __up_write(struct rw_
  53.341 - 	__asm__ __volatile__(
  53.342 - 		"# beginning __up_write\n\t"
  53.343 - 		"  movl      %2,%%edx\n\t"
  53.344 --LOCK_PREFIX	"  xaddl     %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
  53.345 -+LOCK	        "  xaddl     %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
  53.346 - 		"  jnz       2f\n\t" /* jump if the lock is being waited upon */
  53.347 - 		"1:\n\t"
  53.348 - 		LOCK_SECTION_START("")
  53.349 -@@ -239,7 +240,7 @@ static inline void __downgrade_write(str
  53.350 - {
  53.351 - 	__asm__ __volatile__(
  53.352 - 		"# beginning __downgrade_write\n\t"
  53.353 --LOCK_PREFIX	"  addl      %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
  53.354 -+LOCK	        "  addl      %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
  53.355 - 		"  js        2f\n\t" /* jump if the lock is being waited upon */
  53.356 - 		"1:\n\t"
  53.357 - 		LOCK_SECTION_START("")
  53.358 -@@ -263,7 +264,7 @@ LOCK_PREFIX	"  addl      %2,(%%eax)\n\t"
  53.359 - static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
  53.360 - {
  53.361 - 	__asm__ __volatile__(
  53.362 --LOCK_PREFIX	"addl %1,%0"
  53.363 -+LOCK	          "addl %1,%0"
  53.364 - 		: "=m"(sem->count)
  53.365 - 		: "ir"(delta), "m"(sem->count));
  53.366 - }
  53.367 -@@ -276,7 +277,7 @@ static inline int rwsem_atomic_update(in
  53.368 - 	int tmp = delta;
  53.369 - 
  53.370 - 	__asm__ __volatile__(
  53.371 --LOCK_PREFIX	"xadd %0,(%2)"
  53.372 -+LOCK  	          "xadd %0,(%2)"
  53.373 - 		: "+r"(tmp), "=m"(sem->count)
  53.374 - 		: "r"(sem), "m"(sem->count)
  53.375 - 		: "memory");
  53.376 -diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/smp_alt.h ./include/asm-i386/smp_alt.h
  53.377 ---- ../pristine-linux-2.6.16-rc3/include/asm-i386/smp_alt.h	1970-01-01 01:00:00.000000000 +0100
  53.378 -+++ ./include/asm-i386/smp_alt.h	2006-02-15 20:45:57.000000000 +0000
  53.379 -@@ -0,0 +1,32 @@
  53.380 -+#ifndef __ASM_SMP_ALT_H__
  53.381 -+#define __ASM_SMP_ALT_H__
  53.382 -+
  53.383 -+#include <linux/config.h>
  53.384 -+
  53.385 -+#ifdef CONFIG_SMP
  53.386 -+#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
  53.387 -+#define LOCK \
  53.388 -+        "6677: nop\n" \
  53.389 -+	".section __smp_alternatives,\"a\"\n" \
  53.390 -+	".long 6677b\n" \
  53.391 -+	".long 6678f\n" \
  53.392 -+	".previous\n" \
  53.393 -+	".section __smp_replacements,\"a\"\n" \
  53.394 -+	"6678: .byte 1\n" \
  53.395 -+	".byte 1\n" \
  53.396 -+	".byte 0\n" \
  53.397 -+        ".byte 1\n" \
  53.398 -+	".byte -1\n" \
  53.399 -+	"lock\n" \
  53.400 -+	"nop\n" \
  53.401 -+	".previous\n"
  53.402 -+void prepare_for_smp(void);
  53.403 -+void unprepare_for_smp(void);
  53.404 -+#else
  53.405 -+#define LOCK "lock ; "
  53.406 -+#endif
  53.407 -+#else
  53.408 -+#define LOCK ""
  53.409 -+#endif
  53.410 -+
  53.411 -+#endif /* __ASM_SMP_ALT_H__ */
  53.412 -diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/spinlock.h ./include/asm-i386/spinlock.h
  53.413 ---- ../pristine-linux-2.6.16-rc3/include/asm-i386/spinlock.h	2006-01-03 03:21:10.000000000 +0000
  53.414 -+++ ./include/asm-i386/spinlock.h	2006-02-15 20:45:57.000000000 +0000
  53.415 -@@ -6,6 +6,7 @@
  53.416 - #include <asm/page.h>
  53.417 - #include <linux/config.h>
  53.418 - #include <linux/compiler.h>
  53.419 -+#include <asm/smp_alt.h>
  53.420 - 
  53.421 - /*
  53.422 -  * Your basic SMP spinlocks, allowing only a single CPU anywhere
  53.423 -@@ -23,7 +24,8 @@
  53.424 - 
  53.425 - #define __raw_spin_lock_string \
  53.426 - 	"\n1:\t" \
  53.427 --	"lock ; decb %0\n\t" \
  53.428 -+	LOCK \
  53.429 -+	"decb %0\n\t" \
  53.430 - 	"jns 3f\n" \
  53.431 - 	"2:\t" \
  53.432 - 	"rep;nop\n\t" \
  53.433 -@@ -34,7 +36,8 @@
  53.434 - 
  53.435 - #define __raw_spin_lock_string_flags \
  53.436 - 	"\n1:\t" \
  53.437 --	"lock ; decb %0\n\t" \
  53.438 -+	LOCK \
  53.439 -+	"decb %0\n\t" \
  53.440 - 	"jns 4f\n\t" \
  53.441 - 	"2:\t" \
  53.442 - 	"testl $0x200, %1\n\t" \
  53.443 -@@ -65,10 +68,34 @@ static inline void __raw_spin_lock_flags
  53.444 - static inline int __raw_spin_trylock(raw_spinlock_t *lock)
  53.445 - {
  53.446 - 	char oldval;
  53.447 -+#ifdef CONFIG_SMP_ALTERNATIVES
  53.448 - 	__asm__ __volatile__(
  53.449 --		"xchgb %b0,%1"
  53.450 -+		"1:movb %1,%b0\n"
  53.451 -+		"movb $0,%1\n"
  53.452 -+		"2:"
  53.453 -+		".section __smp_alternatives,\"a\"\n"
  53.454 -+		".long 1b\n"
  53.455 -+		".long 3f\n"
  53.456 -+		".previous\n"
  53.457 -+		".section __smp_replacements,\"a\"\n"
  53.458 -+		"3: .byte 2b - 1b\n"
  53.459 -+		".byte 5f-4f\n"
  53.460 -+		".byte 0\n"
  53.461 -+		".byte 6f-5f\n"
  53.462 -+		".byte -1\n"
  53.463 -+		"4: xchgb %b0,%1\n"
  53.464 -+		"5: movb %1,%b0\n"
  53.465 -+		"movb $0,%1\n"
  53.466 -+		"6:\n"
  53.467 -+		".previous\n"
  53.468 - 		:"=q" (oldval), "=m" (lock->slock)
  53.469 - 		:"0" (0) : "memory");
  53.470 -+#else
  53.471 -+	__asm__ __volatile__(
  53.472 -+		"xchgb %b0,%1\n"
  53.473 -+		:"=q" (oldval), "=m" (lock->slock)
  53.474 -+		:"0" (0) : "memory");
  53.475 -+#endif
  53.476 - 	return oldval > 0;
  53.477 - }
  53.478 - 
  53.479 -@@ -178,12 +205,12 @@ static inline int __raw_write_trylock(ra
  53.480 - 
  53.481 - static inline void __raw_read_unlock(raw_rwlock_t *rw)
  53.482 - {
  53.483 --	asm volatile("lock ; incl %0" :"=m" (rw->lock) : : "memory");
  53.484 -+	asm volatile(LOCK "incl %0" :"=m" (rw->lock) : : "memory");
  53.485 - }
  53.486 - 
  53.487 - static inline void __raw_write_unlock(raw_rwlock_t *rw)
  53.488 - {
  53.489 --	asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ", %0"
  53.490 -+	asm volatile(LOCK "addl $" RW_LOCK_BIAS_STR ", %0"
  53.491 - 				 : "=m" (rw->lock) : : "memory");
  53.492 - }
  53.493 - 
  53.494 -diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/system.h ./include/asm-i386/system.h
  53.495 ---- ../pristine-linux-2.6.16-rc3/include/asm-i386/system.h	2006-02-15 20:38:57.000000000 +0000
  53.496 -+++ ./include/asm-i386/system.h	2006-02-15 20:45:57.000000000 +0000
  53.497 -@@ -5,7 +5,7 @@
  53.498 - #include <linux/kernel.h>
  53.499 - #include <asm/segment.h>
  53.500 - #include <asm/cpufeature.h>
  53.501 --#include <linux/bitops.h> /* for LOCK_PREFIX */
  53.502 -+#include <asm/smp_alt.h>
  53.503 - 
  53.504 - #ifdef __KERNEL__
  53.505 - 
  53.506 -@@ -271,19 +271,19 @@ static inline unsigned long __cmpxchg(vo
  53.507 - 	unsigned long prev;
  53.508 - 	switch (size) {
  53.509 - 	case 1:
  53.510 --		__asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
  53.511 -+		__asm__ __volatile__(LOCK "cmpxchgb %b1,%2"
  53.512 - 				     : "=a"(prev)
  53.513 - 				     : "q"(new), "m"(*__xg(ptr)), "0"(old)
  53.514 - 				     : "memory");
  53.515 - 		return prev;
  53.516 - 	case 2:
  53.517 --		__asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
  53.518 -+		__asm__ __volatile__(LOCK "cmpxchgw %w1,%2"
  53.519 - 				     : "=a"(prev)
  53.520 - 				     : "r"(new), "m"(*__xg(ptr)), "0"(old)
  53.521 - 				     : "memory");
  53.522 - 		return prev;
  53.523 - 	case 4:
  53.524 --		__asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
  53.525 -+		__asm__ __volatile__(LOCK "cmpxchgl %1,%2"
  53.526 - 				     : "=a"(prev)
  53.527 - 				     : "r"(new), "m"(*__xg(ptr)), "0"(old)
  53.528 - 				     : "memory");
  53.529 -@@ -336,7 +336,7 @@ static inline unsigned long long __cmpxc
  53.530 - 				      unsigned long long new)
  53.531 - {
  53.532 - 	unsigned long long prev;
  53.533 --	__asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
  53.534 -+	__asm__ __volatile__(LOCK "cmpxchg8b %3"
  53.535 - 			     : "=A"(prev)
  53.536 - 			     : "b"((unsigned long)new),
  53.537 - 			       "c"((unsigned long)(new >> 32)),
  53.538 -@@ -503,11 +503,55 @@ struct alt_instr { 
  53.539 - #endif
  53.540 - 
  53.541 - #ifdef CONFIG_SMP
  53.542 -+#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
  53.543 -+#define smp_alt_mb(instr)                                           \
  53.544 -+__asm__ __volatile__("6667:\nnop\nnop\nnop\nnop\nnop\nnop\n6668:\n" \
  53.545 -+		     ".section __smp_alternatives,\"a\"\n"          \
  53.546 -+		     ".long 6667b\n"                                \
  53.547 -+                     ".long 6673f\n"                                \
  53.548 -+		     ".previous\n"                                  \
  53.549 -+		     ".section __smp_replacements,\"a\"\n"          \
  53.550 -+		     "6673:.byte 6668b-6667b\n"                     \
  53.551 -+		     ".byte 6670f-6669f\n"                          \
  53.552 -+		     ".byte 6671f-6670f\n"                          \
  53.553 -+                     ".byte 0\n"                                    \
  53.554 -+		     ".byte %c0\n"                                  \
  53.555 -+		     "6669:lock;addl $0,0(%%esp)\n"                 \
  53.556 -+		     "6670:" instr "\n"                             \
  53.557 -+		     "6671:\n"                                      \
  53.558 -+		     ".previous\n"                                  \
  53.559 -+		     :                                              \
  53.560 -+		     : "i" (X86_FEATURE_XMM2)                       \
  53.561 -+		     : "memory")
  53.562 -+#define smp_rmb() smp_alt_mb("lfence")
  53.563 -+#define smp_mb()  smp_alt_mb("mfence")
  53.564 -+#define set_mb(var, value) do {                                     \
  53.565 -+unsigned long __set_mb_temp;                                        \
  53.566 -+__asm__ __volatile__("6667:movl %1, %0\n6668:\n"                    \
  53.567 -+		     ".section __smp_alternatives,\"a\"\n"          \
  53.568 -+		     ".long 6667b\n"                                \
  53.569 -+		     ".long 6673f\n"                                \
  53.570 -+		     ".previous\n"                                  \
  53.571 -+		     ".section __smp_replacements,\"a\"\n"          \
  53.572 -+		     "6673: .byte 6668b-6667b\n"                    \
  53.573 -+		     ".byte 6670f-6669f\n"                          \
  53.574 -+		     ".byte 0\n"                                    \
  53.575 -+		     ".byte 6671f-6670f\n"                          \
  53.576 -+		     ".byte -1\n"                                   \
  53.577 -+		     "6669: xchg %1, %0\n"                          \
  53.578 -+		     "6670:movl %1, %0\n"                           \
  53.579 -+		     "6671:\n"                                      \
  53.580 -+		     ".previous\n"                                  \
  53.581 -+		     : "=m" (var), "=r" (__set_mb_temp)             \
  53.582 -+		     : "1" (value)                                  \
  53.583 -+		     : "memory"); } while (0)
  53.584 -+#else
  53.585 - #define smp_mb()	mb()
  53.586 - #define smp_rmb()	rmb()
  53.587 -+#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
  53.588 -+#endif
  53.589 - #define smp_wmb()	wmb()
  53.590 - #define smp_read_barrier_depends()	read_barrier_depends()
  53.591 --#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
  53.592 - #else
  53.593 - #define smp_mb()	barrier()
  53.594 - #define smp_rmb()	barrier()
    54.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    54.2 +++ b/patches/linux-2.6.16-rc5/i386-mach-io-check-nmi.patch	Wed Mar 01 12:47:25 2006 -0700
    54.3 @@ -0,0 +1,45 @@
    54.4 +diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/traps.c ./arch/i386/kernel/traps.c
    54.5 +--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/traps.c	2006-02-27 15:46:58.000000000 +0000
    54.6 ++++ ./arch/i386/kernel/traps.c	2006-02-27 15:55:23.000000000 +0000
    54.7 +@@ -567,18 +567,11 @@ static void mem_parity_error(unsigned ch
    54.8 + 
    54.9 + static void io_check_error(unsigned char reason, struct pt_regs * regs)
   54.10 + {
   54.11 +-	unsigned long i;
   54.12 +-
   54.13 + 	printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
   54.14 + 	show_registers(regs);
   54.15 + 
   54.16 + 	/* Re-enable the IOCK line, wait for a few seconds */
   54.17 +-	reason = (reason & 0xf) | 8;
   54.18 +-	outb(reason, 0x61);
   54.19 +-	i = 2000;
   54.20 +-	while (--i) udelay(1000);
   54.21 +-	reason &= ~8;
   54.22 +-	outb(reason, 0x61);
   54.23 ++	clear_io_check_error(reason);
   54.24 + }
   54.25 + 
   54.26 + static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
   54.27 +diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/mach-default/mach_traps.h ./include/asm-i386/mach-default/mach_traps.h
   54.28 +--- ../pristine-linux-2.6.16-rc5/include/asm-i386/mach-default/mach_traps.h	2006-01-03 03:21:10.000000000 +0000
   54.29 ++++ ./include/asm-i386/mach-default/mach_traps.h	2006-02-27 15:55:23.000000000 +0000
   54.30 +@@ -15,6 +15,18 @@ static inline void clear_mem_error(unsig
   54.31 + 	outb(reason, 0x61);
   54.32 + }
   54.33 + 
   54.34 ++static inline void clear_io_check_error(unsigned char reason)
   54.35 ++{
   54.36 ++	unsigned long i;
   54.37 ++
   54.38 ++	reason = (reason & 0xf) | 8;
   54.39 ++	outb(reason, 0x61);
   54.40 ++	i = 2000;
   54.41 ++	while (--i) udelay(1000);
   54.42 ++	reason &= ~8;
   54.43 ++	outb(reason, 0x61);
   54.44 ++}
   54.45 ++
   54.46 + static inline unsigned char get_nmi_reason(void)
   54.47 + {
   54.48 + 	return inb(0x61);
    55.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    55.2 +++ b/patches/linux-2.6.16-rc5/net-csum.patch	Wed Mar 01 12:47:25 2006 -0700
    55.3 @@ -0,0 +1,41 @@
    55.4 +diff -pruN ../pristine-linux-2.6.16-rc5/net/ipv4/netfilter/ip_nat_proto_tcp.c ./net/ipv4/netfilter/ip_nat_proto_tcp.c
    55.5 +--- ../pristine-linux-2.6.16-rc5/net/ipv4/netfilter/ip_nat_proto_tcp.c	2006-02-27 15:47:38.000000000 +0000
    55.6 ++++ ./net/ipv4/netfilter/ip_nat_proto_tcp.c	2006-02-27 15:55:25.000000000 +0000
    55.7 +@@ -129,10 +129,14 @@ tcp_manip_pkt(struct sk_buff **pskb,
    55.8 + 	if (hdrsize < sizeof(*hdr))
    55.9 + 		return 1;
   55.10 + 
   55.11 +-	hdr->check = ip_nat_cheat_check(~oldip, newip,
   55.12 ++	if ((*pskb)->proto_csum_blank) {
   55.13 ++		hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check);
   55.14 ++	} else {
   55.15 ++		hdr->check = ip_nat_cheat_check(~oldip, newip,
   55.16 + 					ip_nat_cheat_check(oldport ^ 0xFFFF,
   55.17 + 							   newport,
   55.18 + 							   hdr->check));
   55.19 ++	}
   55.20 + 	return 1;
   55.21 + }
   55.22 + 
   55.23 +diff -pruN ../pristine-linux-2.6.16-rc5/net/ipv4/netfilter/ip_nat_proto_udp.c ./net/ipv4/netfilter/ip_nat_proto_udp.c
   55.24 +--- ../pristine-linux-2.6.16-rc5/net/ipv4/netfilter/ip_nat_proto_udp.c	2006-02-27 15:47:38.000000000 +0000
   55.25 ++++ ./net/ipv4/netfilter/ip_nat_proto_udp.c	2006-02-27 15:55:25.000000000 +0000
   55.26 +@@ -113,11 +113,16 @@ udp_manip_pkt(struct sk_buff **pskb,
   55.27 + 		newport = tuple->dst.u.udp.port;
   55.28 + 		portptr = &hdr->dest;
   55.29 + 	}
   55.30 +-	if (hdr->check) /* 0 is a special case meaning no checksum */
   55.31 +-		hdr->check = ip_nat_cheat_check(~oldip, newip,
   55.32 ++	if (hdr->check) { /* 0 is a special case meaning no checksum */
   55.33 ++		if ((*pskb)->proto_csum_blank) {
   55.34 ++			hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check);
   55.35 ++		} else {
   55.36 ++			hdr->check = ip_nat_cheat_check(~oldip, newip,
   55.37 + 					ip_nat_cheat_check(*portptr ^ 0xFFFF,
   55.38 + 							   newport,
   55.39 + 							   hdr->check));
   55.40 ++		}
   55.41 ++	}
   55.42 + 	*portptr = newport;
   55.43 + 	return 1;
   55.44 + }
    56.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    56.2 +++ b/patches/linux-2.6.16-rc5/pmd-shared.patch	Wed Mar 01 12:47:25 2006 -0700
    56.3 @@ -0,0 +1,111 @@
    56.4 +diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/mm/pageattr.c ./arch/i386/mm/pageattr.c
    56.5 +--- ../pristine-linux-2.6.16-rc5/arch/i386/mm/pageattr.c	2006-02-27 15:46:58.000000000 +0000
    56.6 ++++ ./arch/i386/mm/pageattr.c	2006-02-27 15:55:31.000000000 +0000
    56.7 +@@ -78,7 +78,7 @@ static void set_pmd_pte(pte_t *kpte, uns
    56.8 + 	unsigned long flags;
    56.9 + 
   56.10 + 	set_pte_atomic(kpte, pte); 	/* change init_mm */
   56.11 +-	if (PTRS_PER_PMD > 1)
   56.12 ++	if (HAVE_SHARED_KERNEL_PMD)
   56.13 + 		return;
   56.14 + 
   56.15 + 	spin_lock_irqsave(&pgd_lock, flags);
   56.16 +diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/mm/pgtable.c ./arch/i386/mm/pgtable.c
   56.17 +--- ../pristine-linux-2.6.16-rc5/arch/i386/mm/pgtable.c	2006-01-03 03:21:10.000000000 +0000
   56.18 ++++ ./arch/i386/mm/pgtable.c	2006-02-27 15:55:31.000000000 +0000
   56.19 +@@ -215,9 +215,10 @@ void pgd_ctor(void *pgd, kmem_cache_t *c
   56.20 + 		spin_lock_irqsave(&pgd_lock, flags);
   56.21 + 	}
   56.22 + 
   56.23 +-	clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
   56.24 +-			swapper_pg_dir + USER_PTRS_PER_PGD,
   56.25 +-			KERNEL_PGD_PTRS);
   56.26 ++	if (PTRS_PER_PMD == 1 || HAVE_SHARED_KERNEL_PMD)
   56.27 ++		clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
   56.28 ++				swapper_pg_dir + USER_PTRS_PER_PGD,
   56.29 ++				KERNEL_PGD_PTRS);
   56.30 + 	if (PTRS_PER_PMD > 1)
   56.31 + 		return;
   56.32 + 
   56.33 +@@ -249,6 +250,30 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
   56.34 + 			goto out_oom;
   56.35 + 		set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
   56.36 + 	}
   56.37 ++
   56.38 ++	if (!HAVE_SHARED_KERNEL_PMD) {
   56.39 ++		unsigned long flags;
   56.40 ++
   56.41 ++		for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
   56.42 ++			pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
   56.43 ++			if (!pmd)
   56.44 ++				goto out_oom;
   56.45 ++			set_pgd(&pgd[USER_PTRS_PER_PGD], __pgd(1 + __pa(pmd)));
   56.46 ++		}
   56.47 ++
   56.48 ++		spin_lock_irqsave(&pgd_lock, flags);
   56.49 ++		for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
   56.50 ++			unsigned long v = (unsigned long)i << PGDIR_SHIFT;
   56.51 ++			pgd_t *kpgd = pgd_offset_k(v);
   56.52 ++			pud_t *kpud = pud_offset(kpgd, v);
   56.53 ++			pmd_t *kpmd = pmd_offset(kpud, v);
   56.54 ++			pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
   56.55 ++			memcpy(pmd, kpmd, PAGE_SIZE);
   56.56 ++		}
   56.57 ++		pgd_list_add(pgd);
   56.58 ++		spin_unlock_irqrestore(&pgd_lock, flags);
   56.59 ++	}
   56.60 ++
   56.61 + 	return pgd;
   56.62 + 
   56.63 + out_oom:
   56.64 +@@ -263,9 +288,23 @@ void pgd_free(pgd_t *pgd)
   56.65 + 	int i;
   56.66 + 
   56.67 + 	/* in the PAE case user pgd entries are overwritten before usage */
   56.68 +-	if (PTRS_PER_PMD > 1)
   56.69 +-		for (i = 0; i < USER_PTRS_PER_PGD; ++i)
   56.70 +-			kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
   56.71 ++	if (PTRS_PER_PMD > 1) {
   56.72 ++		for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
   56.73 ++			pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
   56.74 ++			kmem_cache_free(pmd_cache, pmd);
   56.75 ++		}
   56.76 ++		if (!HAVE_SHARED_KERNEL_PMD) {
   56.77 ++			unsigned long flags;
   56.78 ++			spin_lock_irqsave(&pgd_lock, flags);
   56.79 ++			pgd_list_del(pgd);
   56.80 ++			spin_unlock_irqrestore(&pgd_lock, flags);
   56.81 ++			for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
   56.82 ++				pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
   56.83 ++				memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
   56.84 ++				kmem_cache_free(pmd_cache, pmd);
   56.85 ++			}
   56.86 ++		}
   56.87 ++	}
   56.88 + 	/* in the non-PAE case, free_pgtables() clears user pgd entries */
   56.89 + 	kmem_cache_free(pgd_cache, pgd);
   56.90 + }
   56.91 +diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/pgtable-2level-defs.h ./include/asm-i386/pgtable-2level-defs.h
   56.92 +--- ../pristine-linux-2.6.16-rc5/include/asm-i386/pgtable-2level-defs.h	2006-01-03 03:21:10.000000000 +0000
   56.93 ++++ ./include/asm-i386/pgtable-2level-defs.h	2006-02-27 15:55:31.000000000 +0000
   56.94 +@@ -1,6 +1,8 @@
   56.95 + #ifndef _I386_PGTABLE_2LEVEL_DEFS_H
   56.96 + #define _I386_PGTABLE_2LEVEL_DEFS_H
   56.97 + 
   56.98 ++#define HAVE_SHARED_KERNEL_PMD 0
   56.99 ++
  56.100 + /*
  56.101 +  * traditional i386 two-level paging structure:
  56.102 +  */
  56.103 +diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/pgtable-3level-defs.h ./include/asm-i386/pgtable-3level-defs.h
  56.104 +--- ../pristine-linux-2.6.16-rc5/include/asm-i386/pgtable-3level-defs.h	2006-01-03 03:21:10.000000000 +0000
  56.105 ++++ ./include/asm-i386/pgtable-3level-defs.h	2006-02-27 15:55:31.000000000 +0000
  56.106 +@@ -1,6 +1,8 @@
  56.107 + #ifndef _I386_PGTABLE_3LEVEL_DEFS_H
  56.108 + #define _I386_PGTABLE_3LEVEL_DEFS_H
  56.109 + 
  56.110 ++#define HAVE_SHARED_KERNEL_PMD 1
  56.111 ++
  56.112 + /*
  56.113 +  * PGDIR_SHIFT determines what a top-level page table entry can map
  56.114 +  */
    57.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    57.2 +++ b/patches/linux-2.6.16-rc5/smp-alts.patch	Wed Mar 01 12:47:25 2006 -0700
    57.3 @@ -0,0 +1,591 @@
    57.4 +diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/Kconfig ./arch/i386/Kconfig
    57.5 +--- ../pristine-linux-2.6.16-rc5/arch/i386/Kconfig	2006-02-27 15:46:58.000000000 +0000
    57.6 ++++ ./arch/i386/Kconfig	2006-02-27 15:55:34.000000000 +0000
    57.7 +@@ -202,6 +202,19 @@ config SMP
    57.8 + 
    57.9 + 	  If you don't know what to do here, say N.
   57.10 + 
   57.11 ++config SMP_ALTERNATIVES
   57.12 ++	bool "SMP alternatives support (EXPERIMENTAL)"
   57.13 ++	depends on SMP && EXPERIMENTAL
   57.14 ++	help
   57.15 ++	  Try to reduce the overhead of running an SMP kernel on a uniprocessor
   57.16 ++	  host slightly by replacing certain key instruction sequences
   57.17 ++	  according to whether we currently have more than one CPU available.
   57.18 ++	  This should provide a noticeable boost to performance when
   57.19 ++	  running SMP kernels on UP machines, and have negligible impact
   57.20 ++	  when running on a true SMP host.
   57.21 ++
   57.22 ++          If unsure, say N.
   57.23 ++	  
   57.24 + config NR_CPUS
   57.25 + 	int "Maximum number of CPUs (2-255)"
   57.26 + 	range 2 255
   57.27 +diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/Makefile ./arch/i386/kernel/Makefile
   57.28 +--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/Makefile	2006-02-27 15:46:58.000000000 +0000
   57.29 ++++ ./arch/i386/kernel/Makefile	2006-02-27 15:55:34.000000000 +0000
   57.30 +@@ -37,6 +37,7 @@ obj-$(CONFIG_EFI) 		+= efi.o efi_stub.o
   57.31 + obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault.o
   57.32 + obj-$(CONFIG_VM86)		+= vm86.o
   57.33 + obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
   57.34 ++obj-$(CONFIG_SMP_ALTERNATIVES)  += smpalts.o
   57.35 + 
   57.36 + EXTRA_AFLAGS   := -traditional
   57.37 + 
   57.38 +diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/smpalts.c ./arch/i386/kernel/smpalts.c
   57.39 +--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/smpalts.c	1970-01-01 01:00:00.000000000 +0100
   57.40 ++++ ./arch/i386/kernel/smpalts.c	2006-02-27 15:55:34.000000000 +0000
   57.41 +@@ -0,0 +1,85 @@
   57.42 ++#include <linux/kernel.h>
   57.43 ++#include <asm/system.h>
   57.44 ++#include <asm/smp_alt.h>
   57.45 ++#include <asm/processor.h>
   57.46 ++#include <asm/string.h>
   57.47 ++
   57.48 ++struct smp_replacement_record {
   57.49 ++	unsigned char targ_size;
   57.50 ++	unsigned char smp1_size;
   57.51 ++	unsigned char smp2_size;
   57.52 ++	unsigned char up_size;
   57.53 ++	unsigned char feature;
   57.54 ++	unsigned char data[0];
   57.55 ++};
   57.56 ++
   57.57 ++struct smp_alternative_record {
   57.58 ++	void *targ_start;
   57.59 ++	struct smp_replacement_record *repl;
   57.60 ++};
   57.61 ++
   57.62 ++extern struct smp_alternative_record __start_smp_alternatives_table,
   57.63 ++  __stop_smp_alternatives_table;
   57.64 ++extern unsigned long __init_begin, __init_end;
   57.65 ++
   57.66 ++void prepare_for_smp(void)
   57.67 ++{
   57.68 ++	struct smp_alternative_record *r;
   57.69 ++	printk(KERN_INFO "Enabling SMP...\n");
   57.70 ++	for (r = &__start_smp_alternatives_table;
   57.71 ++	     r != &__stop_smp_alternatives_table;
   57.72 ++	     r++) {
   57.73 ++		BUG_ON(r->repl->targ_size < r->repl->smp1_size);
   57.74 ++		BUG_ON(r->repl->targ_size < r->repl->smp2_size);
   57.75 ++		BUG_ON(r->repl->targ_size < r->repl->up_size);
   57.76 ++               if (system_state == SYSTEM_RUNNING &&
   57.77 ++                   r->targ_start >= (void *)&__init_begin &&
   57.78 ++                   r->targ_start < (void *)&__init_end)
   57.79 ++                       continue;
   57.80 ++		if (r->repl->feature != (unsigned char)-1 &&
   57.81 ++		    boot_cpu_has(r->repl->feature)) {
   57.82 ++			memcpy(r->targ_start,
   57.83 ++			       r->repl->data + r->repl->smp1_size,
   57.84 ++			       r->repl->smp2_size);
   57.85 ++			memset(r->targ_start + r->repl->smp2_size,
   57.86 ++			       0x90,
   57.87 ++			       r->repl->targ_size - r->repl->smp2_size);
   57.88 ++		} else {
   57.89 ++			memcpy(r->targ_start,
   57.90 ++			       r->repl->data,
   57.91 ++			       r->repl->smp1_size);
   57.92 ++			memset(r->targ_start + r->repl->smp1_size,
   57.93 ++			       0x90,
   57.94 ++			       r->repl->targ_size - r->repl->smp1_size);
   57.95 ++		}
   57.96 ++	}
   57.97 ++	/* Paranoia */
   57.98 ++	asm volatile ("jmp 1f\n1:");
   57.99 ++	mb();
  57.100 ++}
  57.101 ++
  57.102 ++void unprepare_for_smp(void)
  57.103 ++{
  57.104 ++	struct smp_alternative_record *r;
  57.105 ++	printk(KERN_INFO "Disabling SMP...\n");
  57.106 ++	for (r = &__start_smp_alternatives_table;
  57.107 ++	     r != &__stop_smp_alternatives_table;
  57.108 ++	     r++) {
  57.109 ++		BUG_ON(r->repl->targ_size < r->repl->smp1_size);
  57.110 ++		BUG_ON(r->repl->targ_size < r->repl->smp2_size);
  57.111 ++		BUG_ON(r->repl->targ_size < r->repl->up_size);
  57.112 ++               if (system_state == SYSTEM_RUNNING &&
  57.113 ++                   r->targ_start >= (void *)&__init_begin &&
  57.114 ++                   r->targ_start < (void *)&__init_end)
  57.115 ++                       continue;
  57.116 ++		memcpy(r->targ_start,
  57.117 ++		       r->repl->data + r->repl->smp1_size + r->repl->smp2_size,
  57.118 ++		       r->repl->up_size);
  57.119 ++		memset(r->targ_start + r->repl->up_size,
  57.120 ++		       0x90,
  57.121 ++		       r->repl->targ_size - r->repl->up_size);
  57.122 ++	}
  57.123 ++	/* Paranoia */
  57.124 ++	asm volatile ("jmp 1f\n1:");
  57.125 ++	mb();
  57.126 ++}
  57.127 +diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/smpboot.c ./arch/i386/kernel/smpboot.c
  57.128 +--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/smpboot.c	2006-02-27 15:46:58.000000000 +0000
  57.129 ++++ ./arch/i386/kernel/smpboot.c	2006-02-27 15:55:34.000000000 +0000
  57.130 +@@ -1208,6 +1208,11 @@ static void __init smp_boot_cpus(unsigne
  57.131 + 		if (max_cpus <= cpucount+1)
  57.132 + 			continue;
  57.133 + 
  57.134 ++#ifdef CONFIG_SMP_ALTERNATIVES
  57.135 ++		if (kicked == 1)
  57.136 ++			prepare_for_smp();
  57.137 ++#endif
  57.138 ++
  57.139 + 		if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
  57.140 + 			printk("CPU #%d not responding - cannot use it.\n",
  57.141 + 								apicid);
  57.142 +@@ -1386,6 +1391,11 @@ int __devinit __cpu_up(unsigned int cpu)
  57.143 + 		return -EIO;
  57.144 + 	}
  57.145 + 
  57.146 ++#ifdef CONFIG_SMP_ALTERNATIVES
  57.147 ++	if (num_online_cpus() == 1)
  57.148 ++		prepare_for_smp();
  57.149 ++#endif
  57.150 ++
  57.151 + 	local_irq_enable();
  57.152 + 	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
  57.153 + 	/* Unleash the CPU! */
  57.154 +diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/vmlinux.lds.S ./arch/i386/kernel/vmlinux.lds.S
  57.155 +--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/vmlinux.lds.S	2006-01-03 03:21:10.000000000 +0000
  57.156 ++++ ./arch/i386/kernel/vmlinux.lds.S	2006-02-27 15:55:34.000000000 +0000
  57.157 +@@ -34,6 +34,13 @@ SECTIONS
  57.158 +   __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
  57.159 +   __stop___ex_table = .;
  57.160 + 
  57.161 ++  . = ALIGN(16);
  57.162 ++  __start_smp_alternatives_table = .;
  57.163 ++  __smp_alternatives : { *(__smp_alternatives) }
  57.164 ++  __stop_smp_alternatives_table = .;
  57.165 ++
  57.166 ++  __smp_replacements : { *(__smp_replacements) }
  57.167 ++
  57.168 +   RODATA
  57.169 + 
  57.170 +   /* writeable */
  57.171 +diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/atomic.h ./include/asm-i386/atomic.h
  57.172 +--- ../pristine-linux-2.6.16-rc5/include/asm-i386/atomic.h	2006-02-27 15:47:25.000000000 +0000
  57.173 ++++ ./include/asm-i386/atomic.h	2006-02-27 15:55:34.000000000 +0000
  57.174 +@@ -4,18 +4,13 @@
  57.175 + #include <linux/config.h>
  57.176 + #include <linux/compiler.h>
  57.177 + #include <asm/processor.h>
  57.178 ++#include <asm/smp_alt.h>
  57.179 + 
  57.180 + /*
  57.181 +  * Atomic operations that C can't guarantee us.  Useful for
  57.182 +  * resource counting etc..
  57.183 +  */
  57.184 + 
  57.185 +-#ifdef CONFIG_SMP
  57.186 +-#define LOCK "lock ; "
  57.187 +-#else
  57.188 +-#define LOCK ""
  57.189 +-#endif
  57.190 +-
  57.191 + /*
  57.192 +  * Make sure gcc doesn't try to be clever and move things around
  57.193 +  * on us. We need to use _exactly_ the address the user gave us,
  57.194 +diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/bitops.h ./include/asm-i386/bitops.h
  57.195 +--- ../pristine-linux-2.6.16-rc5/include/asm-i386/bitops.h	2006-02-27 15:47:25.000000000 +0000
  57.196 ++++ ./include/asm-i386/bitops.h	2006-02-27 15:55:34.000000000 +0000
  57.197 +@@ -7,6 +7,7 @@
  57.198 + 
  57.199 + #include <linux/config.h>
  57.200 + #include <linux/compiler.h>
  57.201 ++#include <asm/smp_alt.h>
  57.202 + 
  57.203 + /*
  57.204 +  * These have to be done with inline assembly: that way the bit-setting
  57.205 +@@ -16,12 +17,6 @@
  57.206 +  * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
  57.207 +  */
  57.208 + 
  57.209 +-#ifdef CONFIG_SMP
  57.210 +-#define LOCK_PREFIX "lock ; "
  57.211 +-#else
  57.212 +-#define LOCK_PREFIX ""
  57.213 +-#endif
  57.214 +-
  57.215 + #define ADDR (*(volatile long *) addr)
  57.216 + 
  57.217 + /**
  57.218 +@@ -41,7 +36,7 @@
  57.219 +  */
  57.220 + static inline void set_bit(int nr, volatile unsigned long * addr)
  57.221 + {
  57.222 +-	__asm__ __volatile__( LOCK_PREFIX
  57.223 ++	__asm__ __volatile__( LOCK
  57.224 + 		"btsl %1,%0"
  57.225 + 		:"+m" (ADDR)
  57.226 + 		:"Ir" (nr));
  57.227 +@@ -76,7 +71,7 @@ static inline void __set_bit(int nr, vol
  57.228 +  */
  57.229 + static inline void clear_bit(int nr, volatile unsigned long * addr)
  57.230 + {
  57.231 +-	__asm__ __volatile__( LOCK_PREFIX
  57.232 ++	__asm__ __volatile__( LOCK
  57.233 + 		"btrl %1,%0"
  57.234 + 		:"+m" (ADDR)
  57.235 + 		:"Ir" (nr));
  57.236 +@@ -121,7 +116,7 @@ static inline void __change_bit(int nr, 
  57.237 +  */
  57.238 + static inline void change_bit(int nr, volatile unsigned long * addr)
  57.239 + {
  57.240 +-	__asm__ __volatile__( LOCK_PREFIX
  57.241 ++	__asm__ __volatile__( LOCK
  57.242 + 		"btcl %1,%0"
  57.243 + 		:"+m" (ADDR)
  57.244 + 		:"Ir" (nr));
  57.245 +@@ -140,7 +135,7 @@ static inline int test_and_set_bit(int n
  57.246 + {
  57.247 + 	int oldbit;
  57.248 + 
  57.249 +-	__asm__ __volatile__( LOCK_PREFIX
  57.250 ++	__asm__ __volatile__( LOCK
  57.251 + 		"btsl %2,%1\n\tsbbl %0,%0"
  57.252 + 		:"=r" (oldbit),"+m" (ADDR)
  57.253 + 		:"Ir" (nr) : "memory");
  57.254 +@@ -180,7 +175,7 @@ static inline int test_and_clear_bit(int
  57.255 + {
  57.256 + 	int oldbit;
  57.257 + 
  57.258 +-	__asm__ __volatile__( LOCK_PREFIX
  57.259 ++	__asm__ __volatile__( LOCK
  57.260 + 		"btrl %2,%1\n\tsbbl %0,%0"
  57.261 + 		:"=r" (oldbit),"+m" (ADDR)
  57.262 + 		:"Ir" (nr) : "memory");
  57.263 +@@ -231,7 +226,7 @@ static inline int test_and_change_bit(in
  57.264 + {
  57.265 + 	int oldbit;
  57.266 + 
  57.267 +-	__asm__ __volatile__( LOCK_PREFIX
  57.268 ++	__asm__ __volatile__( LOCK
  57.269 + 		"btcl %2,%1\n\tsbbl %0,%0"
  57.270 + 		:"=r" (oldbit),"+m" (ADDR)
  57.271 + 		:"Ir" (nr) : "memory");
  57.272 +diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/futex.h ./include/asm-i386/futex.h
  57.273 +--- ../pristine-linux-2.6.16-rc5/include/asm-i386/futex.h	2006-02-27 15:47:25.000000000 +0000
  57.274 ++++ ./include/asm-i386/futex.h	2006-02-27 15:55:34.000000000 +0000
  57.275 +@@ -28,7 +28,7 @@
  57.276 + "1:	movl	%2, %0\n\
  57.277 + 	movl	%0, %3\n"					\
  57.278 + 	insn "\n"						\
  57.279 +-"2:	" LOCK_PREFIX "cmpxchgl %3, %2\n\
  57.280 ++"2:	" LOCK "cmpxchgl %3, %2\n\
  57.281 + 	jnz	1b\n\
  57.282 + 3:	.section .fixup,\"ax\"\n\
  57.283 + 4:	mov	%5, %1\n\
  57.284 +@@ -68,7 +68,7 @@ futex_atomic_op_inuser (int encoded_op, 
  57.285 + #endif
  57.286 + 		switch (op) {
  57.287 + 		case FUTEX_OP_ADD:
  57.288 +-			__futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret,
  57.289 ++			__futex_atomic_op1(LOCK "xaddl %0, %2", ret,
  57.290 + 					   oldval, uaddr, oparg);
  57.291 + 			break;
  57.292 + 		case FUTEX_OP_OR:
  57.293 +diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/rwsem.h ./include/asm-i386/rwsem.h
  57.294 +--- ../pristine-linux-2.6.16-rc5/include/asm-i386/rwsem.h	2006-01-03 03:21:10.000000000 +0000
  57.295 ++++ ./include/asm-i386/rwsem.h	2006-02-27 15:55:34.000000000 +0000
  57.296 +@@ -40,6 +40,7 @@
  57.297 + 
  57.298 + #include <linux/list.h>
  57.299 + #include <linux/spinlock.h>
  57.300 ++#include <asm/smp_alt.h>
  57.301 + 
  57.302 + struct rwsem_waiter;
  57.303 + 
  57.304 +@@ -99,7 +100,7 @@ static inline void __down_read(struct rw
  57.305 + {
  57.306 + 	__asm__ __volatile__(
  57.307 + 		"# beginning down_read\n\t"
  57.308 +-LOCK_PREFIX	"  incl      (%%eax)\n\t" /* adds 0x00000001, returns the old value */
  57.309 ++LOCK	        "  incl      (%%eax)\n\t" /* adds 0x00000001, returns the old value */
  57.310 + 		"  js        2f\n\t" /* jump if we weren't granted the lock */
  57.311 + 		"1:\n\t"
  57.312 + 		LOCK_SECTION_START("")
  57.313 +@@ -130,7 +131,7 @@ static inline int __down_read_trylock(st
  57.314 + 		"  movl	     %1,%2\n\t"
  57.315 + 		"  addl      %3,%2\n\t"
  57.316 + 		"  jle	     2f\n\t"
  57.317 +-LOCK_PREFIX	"  cmpxchgl  %2,%0\n\t"
  57.318 ++LOCK	        "  cmpxchgl  %2,%0\n\t"
  57.319 + 		"  jnz	     1b\n\t"
  57.320 + 		"2:\n\t"
  57.321 + 		"# ending __down_read_trylock\n\t"
  57.322 +@@ -150,7 +151,7 @@ static inline void __down_write(struct r
  57.323 + 	tmp = RWSEM_ACTIVE_WRITE_BIAS;
  57.324 + 	__asm__ __volatile__(
  57.325 + 		"# beginning down_write\n\t"
  57.326 +-LOCK_PREFIX	"  xadd      %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
  57.327 ++LOCK	        "  xadd      %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
  57.328 + 		"  testl     %%edx,%%edx\n\t" /* was the count 0 before? */
  57.329 + 		"  jnz       2f\n\t" /* jump if we weren't granted the lock */
  57.330 + 		"1:\n\t"
  57.331 +@@ -188,7 +189,7 @@ static inline void __up_read(struct rw_s
  57.332 + 	__s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
  57.333 + 	__asm__ __volatile__(
  57.334 + 		"# beginning __up_read\n\t"
  57.335 +-LOCK_PREFIX	"  xadd      %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
  57.336 ++LOCK	        "  xadd      %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
  57.337 + 		"  js        2f\n\t" /* jump if the lock is being waited upon */
  57.338 + 		"1:\n\t"
  57.339 + 		LOCK_SECTION_START("")
  57.340 +@@ -214,7 +215,7 @@ static inline void __up_write(struct rw_
  57.341 + 	__asm__ __volatile__(
  57.342 + 		"# beginning __up_write\n\t"
  57.343 + 		"  movl      %2,%%edx\n\t"
  57.344 +-LOCK_PREFIX	"  xaddl     %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
  57.345 ++LOCK	        "  xaddl     %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
  57.346 + 		"  jnz       2f\n\t" /* jump if the lock is being waited upon */
  57.347 + 		"1:\n\t"
  57.348 + 		LOCK_SECTION_START("")
  57.349 +@@ -239,7 +240,7 @@ static inline void __downgrade_write(str
  57.350 + {
  57.351 + 	__asm__ __volatile__(
  57.352 + 		"# beginning __downgrade_write\n\t"
  57.353 +-LOCK_PREFIX	"  addl      %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
  57.354 ++LOCK	        "  addl      %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
  57.355 + 		"  js        2f\n\t" /* jump if the lock is being waited upon */
  57.356 + 		"1:\n\t"
  57.357 + 		LOCK_SECTION_START("")
  57.358 +@@ -263,7 +264,7 @@ LOCK_PREFIX	"  addl      %2,(%%eax)\n\t"
  57.359 + static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
  57.360 + {
  57.361 + 	__asm__ __volatile__(
  57.362 +-LOCK_PREFIX	"addl %1,%0"
  57.363 ++LOCK	          "addl %1,%0"
  57.364 + 		: "=m"(sem->count)
  57.365 + 		: "ir"(delta), "m"(sem->count));
  57.366 + }
  57.367 +@@ -276,7 +277,7 @@ static inline int rwsem_atomic_update(in
  57.368 + 	int tmp = delta;
  57.369 + 
  57.370 + 	__asm__ __volatile__(
  57.371 +-LOCK_PREFIX	"xadd %0,(%2)"
  57.372 ++LOCK  	          "xadd %0,(%2)"
  57.373 + 		: "+r"(tmp), "=m"(sem->count)
  57.374 + 		: "r"(sem), "m"(sem->count)
  57.375 + 		: "memory");
  57.376 +diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/smp_alt.h ./include/asm-i386/smp_alt.h
  57.377 +--- ../pristine-linux-2.6.16-rc5/include/asm-i386/smp_alt.h	1970-01-01 01:00:00.000000000 +0100
  57.378 ++++ ./include/asm-i386/smp_alt.h	2006-02-27 15:55:34.000000000 +0000
  57.379 +@@ -0,0 +1,32 @@
  57.380 ++#ifndef __ASM_SMP_ALT_H__
  57.381 ++#define __ASM_SMP_ALT_H__
  57.382 ++
  57.383 ++#include <linux/config.h>
  57.384 ++
  57.385 ++#ifdef CONFIG_SMP
  57.386 ++#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
  57.387 ++#define LOCK \
  57.388 ++        "6677: nop\n" \
  57.389 ++	".section __smp_alternatives,\"a\"\n" \
  57.390 ++	".long 6677b\n" \
  57.391 ++	".long 6678f\n" \
  57.392 ++	".previous\n" \
  57.393 ++	".section __smp_replacements,\"a\"\n" \
  57.394 ++	"6678: .byte 1\n" \
  57.395 ++	".byte 1\n" \
  57.396 ++	".byte 0\n" \
  57.397 ++        ".byte 1\n" \
  57.398 ++	".byte -1\n" \
  57.399 ++	"lock\n" \
  57.400 ++	"nop\n" \
  57.401 ++	".previous\n"
  57.402 ++void prepare_for_smp(void);
  57.403 ++void unprepare_for_smp(void);
  57.404 ++#else
  57.405 ++#define LOCK "lock ; "
  57.406 ++#endif
  57.407 ++#else
  57.408 ++#define LOCK ""
  57.409 ++#endif
  57.410 ++
  57.411 ++#endif /* __ASM_SMP_ALT_H__ */
  57.412 +diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/spinlock.h ./include/asm-i386/spinlock.h
  57.413 +--- ../pristine-linux-2.6.16-rc5/include/asm-i386/spinlock.h	2006-01-03 03:21:10.000000000 +0000
  57.414 ++++ ./include/asm-i386/spinlock.h	2006-02-27 15:55:34.000000000 +0000
  57.415 +@@ -6,6 +6,7 @@
  57.416 + #include <asm/page.h>
  57.417 + #include <linux/config.h>
  57.418 + #include <linux/compiler.h>
  57.419 ++#include <asm/smp_alt.h>
  57.420 + 
  57.421 + /*
  57.422 +  * Your basic SMP spinlocks, allowing only a single CPU anywhere
  57.423 +@@ -23,7 +24,8 @@
  57.424 + 
  57.425 + #define __raw_spin_lock_string \
  57.426 + 	"\n1:\t" \
  57.427 +-	"lock ; decb %0\n\t" \
  57.428 ++	LOCK \
  57.429 ++	"decb %0\n\t" \
  57.430 + 	"jns 3f\n" \
  57.431 + 	"2:\t" \
  57.432 + 	"rep;nop\n\t" \
  57.433 +@@ -34,7 +36,8 @@
  57.434 + 
  57.435 + #define __raw_spin_lock_string_flags \
  57.436 + 	"\n1:\t" \
  57.437 +-	"lock ; decb %0\n\t" \
  57.438 ++	LOCK \
  57.439 ++	"decb %0\n\t" \
  57.440 + 	"jns 4f\n\t" \
  57.441 + 	"2:\t" \
  57.442 + 	"testl $0x200, %1\n\t" \
  57.443 +@@ -65,10 +68,34 @@ static inline void __raw_spin_lock_flags
  57.444 + static inline int __raw_spin_trylock(raw_spinlock_t *lock)
  57.445 + {
  57.446 + 	char oldval;
  57.447 ++#ifdef CONFIG_SMP_ALTERNATIVES
  57.448 + 	__asm__ __volatile__(
  57.449 +-		"xchgb %b0,%1"
  57.450 ++		"1:movb %1,%b0\n"
  57.451 ++		"movb $0,%1\n"
  57.452 ++		"2:"
  57.453 ++		".section __smp_alternatives,\"a\"\n"
  57.454 ++		".long 1b\n"
  57.455 ++		".long 3f\n"
  57.456 ++		".previous\n"
  57.457 ++		".section __smp_replacements,\"a\"\n"
  57.458 ++		"3: .byte 2b - 1b\n"
  57.459 ++		".byte 5f-4f\n"
  57.460 ++		".byte 0\n"
  57.461 ++		".byte 6f-5f\n"
  57.462 ++		".byte -1\n"
  57.463 ++		"4: xchgb %b0,%1\n"
  57.464 ++		"5: movb %1,%b0\n"
  57.465 ++		"movb $0,%1\n"
  57.466 ++		"6:\n"
  57.467 ++		".previous\n"
  57.468 + 		:"=q" (oldval), "=m" (lock->slock)
  57.469 + 		:"0" (0) : "memory");
  57.470 ++#else
  57.471 ++	__asm__ __volatile__(
  57.472 ++		"xchgb %b0,%1\n"
  57.473 ++		:"=q" (oldval), "=m" (lock->slock)
  57.474 ++		:"0" (0) : "memory");
  57.475 ++#endif
  57.476 + 	return oldval > 0;
  57.477 + }
  57.478 + 
  57.479 +@@ -178,12 +205,12 @@ static inline int __raw_write_trylock(ra
  57.480 + 
  57.481 + static inline void __raw_read_unlock(raw_rwlock_t *rw)
  57.482 + {
  57.483 +-	asm volatile("lock ; incl %0" :"=m" (rw->lock) : : "memory");
  57.484 ++	asm volatile(LOCK "incl %0" :"=m" (rw->lock) : : "memory");
  57.485 + }
  57.486 + 
  57.487 + static inline void __raw_write_unlock(raw_rwlock_t *rw)
  57.488 + {
  57.489 +-	asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ", %0"
  57.490 ++	asm volatile(LOCK "addl $" RW_LOCK_BIAS_STR ", %0"
  57.491 + 				 : "=m" (rw->lock) : : "memory");
  57.492 + }
  57.493 + 
  57.494 +diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/system.h ./include/asm-i386/system.h
  57.495 +--- ../pristine-linux-2.6.16-rc5/include/asm-i386/system.h	2006-02-27 15:47:25.000000000 +0000
  57.496 ++++ ./include/asm-i386/system.h	2006-02-27 15:55:34.000000000 +0000
  57.497 +@@ -5,7 +5,7 @@
  57.498 + #include <linux/kernel.h>
  57.499 + #include <asm/segment.h>
  57.500 + #include <asm/cpufeature.h>
  57.501 +-#include <linux/bitops.h> /* for LOCK_PREFIX */
  57.502 ++#include <asm/smp_alt.h>
  57.503 + 
  57.504 + #ifdef __KERNEL__
  57.505 + 
  57.506 +@@ -271,19 +271,19 @@ static inline unsigned long __cmpxchg(vo
  57.507 + 	unsigned long prev;
  57.508 + 	switch (size) {
  57.509 + 	case 1:
  57.510 +-		__asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
  57.511 ++		__asm__ __volatile__(LOCK "cmpxchgb %b1,%2"
  57.512 + 				     : "=a"(prev)
  57.513 + 				     : "q"(new), "m"(*__xg(ptr)), "0"(old)
  57.514 + 				     : "memory");
  57.515 + 		return prev;
  57.516 + 	case 2:
  57.517 +-		__asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
  57.518 ++		__asm__ __volatile__(LOCK "cmpxchgw %w1,%2"
  57.519 + 				     : "=a"(prev)
  57.520 + 				     : "r"(new), "m"(*__xg(ptr)), "0"(old)
  57.521 + 				     : "memory");
  57.522 + 		return prev;
  57.523 + 	case 4:
  57.524 +-		__asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
  57.525 ++		__asm__ __volatile__(LOCK "cmpxchgl %1,%2"
  57.526 + 				     : "=a"(prev)
  57.527 + 				     : "r"(new), "m"(*__xg(ptr)), "0"(old)
  57.528 + 				     : "memory");
  57.529 +@@ -336,7 +336,7 @@ static inline unsigned long long __cmpxc
  57.530 + 				      unsigned long long new)
  57.531 + {
  57.532 + 	unsigned long long prev;
  57.533 +-	__asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
  57.534 ++	__asm__ __volatile__(LOCK "cmpxchg8b %3"
  57.535 + 			     : "=A"(prev)
  57.536 + 			     : "b"((unsigned long)new),
  57.537 + 			       "c"((unsigned long)(new >> 32)),
  57.538 +@@ -503,11 +503,55 @@ struct alt_instr { 
  57.539 + #endif
  57.540 + 
  57.541 + #ifdef CONFIG_SMP
  57.542 ++#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
  57.543 ++#define smp_alt_mb(instr)                                           \
  57.544 ++__asm__ __volatile__("6667:\nnop\nnop\nnop\nnop\nnop\nnop\n6668:\n" \
  57.545 ++		     ".section __smp_alternatives,\"a\"\n"          \
  57.546 ++		     ".long 6667b\n"                                \
  57.547 ++                     ".long 6673f\n"                                \
  57.548 ++		     ".previous\n"                                  \
  57.549 ++		     ".section __smp_replacements,\"a\"\n"          \
  57.550 ++		     "6673:.byte 6668b-6667b\n"                     \
  57.551 ++		     ".byte 6670f-6669f\n"                          \
  57.552 ++		     ".byte 6671f-6670f\n"                          \
  57.553 ++                     ".byte 0\n"                                    \
  57.554 ++		     ".byte %c0\n"                                  \
  57.555 ++		     "6669:lock;addl $0,0(%%esp)\n"                 \
  57.556 ++		     "6670:" instr "\n"                             \
  57.557 ++		     "6671:\n"                                      \
  57.558 ++		     ".previous\n"                                  \
  57.559 ++		     :                                              \
  57.560 ++		     : "i" (X86_FEATURE_XMM2)                       \
  57.561 ++		     : "memory")
  57.562 ++#define smp_rmb() smp_alt_mb("lfence")
  57.563 ++#define smp_mb()  smp_alt_mb("mfence")
  57.564 ++#define set_mb(var, value) do {                                     \
  57.565 ++unsigned long __set_mb_temp;                                        \
  57.566 ++__asm__ __volatile__("6667:movl %1, %0\n6668:\n"                    \
  57.567 ++		     ".section __smp_alternatives,\"a\"\n"          \
  57.568 ++		     ".long 6667b\n"                                \
  57.569 ++		     ".long 6673f\n"                                \
  57.570 ++		     ".previous\n"                                  \
  57.571 ++		     ".section __smp_replacements,\"a\"\n"          \
  57.572 ++		     "6673: .byte 6668b-6667b\n"                    \
  57.573 ++		     ".byte 6670f-6669f\n"                          \
  57.574 ++		     ".byte 0\n"                                    \
  57.575 ++		     ".byte 6671f-6670f\n"                          \
  57.576 ++		     ".byte -1\n"                                   \
  57.577 ++		     "6669: xchg %1, %0\n"                          \
  57.578 ++		     "6670:movl %1, %0\n"                           \
  57.579 ++		     "6671:\n"                                      \
  57.580 ++		     ".previous\n"                                  \
  57.581 ++		     : "=m" (var), "=r" (__set_mb_temp)             \
  57.582 ++		     : "1" (value)                                  \
  57.583 ++		     : "memory"); } while (0)
  57.584 ++#else
  57.585 + #define smp_mb()	mb()
  57.586 + #define smp_rmb()	rmb()
  57.587 ++#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
  57.588 ++#endif
  57.589 + #define smp_wmb()	wmb()
  57.590 + #define smp_read_barrier_depends()	read_barrier_depends()
  57.591 +-#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
  57.592 + #else
  57.593 + #define smp_mb()	barrier()
  57.594 + #define smp_rmb()	barrier()
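
Note on the smp-alts mechanism above: the LOCK macro in smp_alt.h emits a one-byte nop at label 6677 plus a record at 6678 whose five .byte directives map, in order, onto the fields of struct smp_replacement_record in smpalts.c -- targ_size=1 (the patchable nop), smp1_size=1 (the "lock" prefix byte), smp2_size=0 (no feature-gated variant), up_size=1 (a nop), feature=-1 (no CPU feature test) -- followed by the replacement bytes themselves, back to back. The following is an illustrative user-space C sketch, not part of the patch (names like patch_site are hypothetical); it models how prepare_for_smp()/unprepare_for_smp() select and install those bytes, omitting the feature-gated smp2 path for brevity:

	/* Sketch: models the byte selection done by prepare_for_smp() and
	 * unprepare_for_smp() above.  Sizes and data mirror what the LOCK
	 * macro emits; the feature-gated smp2 variant is ignored. */
	#include <stdio.h>
	#include <string.h>

	struct repl {
		unsigned char targ_size;  /* patchable bytes at the site      */
		unsigned char smp1_size;  /* size of the generic SMP variant  */
		unsigned char smp2_size;  /* size of the feature-gated variant*/
		unsigned char up_size;    /* size of the uniprocessor variant */
		unsigned char feature;    /* CPU feature for smp2, or -1      */
		unsigned char data[2];    /* smp1 bytes, then smp2, then up   */
	};

	static void patch_site(unsigned char *site, const struct repl *r, int smp)
	{
		const unsigned char *src;
		unsigned char n;

		if (smp) {                /* install the SMP variant */
			src = r->data;
			n = r->smp1_size;
		} else {                  /* install the UP variant */
			src = r->data + r->smp1_size + r->smp2_size;
			n = r->up_size;
		}
		memcpy(site, src, n);
		memset(site + n, 0x90, r->targ_size - n); /* pad with nops */
	}

	int main(void)
	{
		unsigned char site[1] = { 0x90 }; /* the nop at label 6677 */
		struct repl r = { 1, 1, 0, 1, (unsigned char)-1,
		                  { 0xf0 /* lock */, 0x90 /* nop */ } };

		patch_site(site, &r, 1);
		printf("SMP: %#04x\n", site[0]); /* 0xf0: lock prefix installed */
		patch_site(site, &r, 0);
		printf("UP:  %#04x\n", site[0]); /* 0x90: back to a plain nop */
		return 0;
	}
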
    58.1 --- a/tools/examples/Makefile	Wed Mar 01 10:01:54 2006 -0700
    58.2 +++ b/tools/examples/Makefile	Wed Mar 01 12:47:25 2006 -0700
    58.3 @@ -26,10 +26,11 @@ XEN_SCRIPTS += network-route vif-route
    58.4  XEN_SCRIPTS += network-nat vif-nat
    58.5  XEN_SCRIPTS += block
    58.6  XEN_SCRIPTS += block-enbd block-nbd
    58.7 -XEN_SCRIPTS += vtpm
    58.8 -XEN_SCRIPT_DATA = xen-script-common.sh
    58.9 +XEN_SCRIPTS += vtpm vtpm-delete
   58.10 +XEN_SCRIPTS += xen-hotplug-cleanup
   58.11 +XEN_SCRIPT_DATA = xen-script-common.sh locking.sh logging.sh
   58.12  XEN_SCRIPT_DATA += xen-hotplug-common.sh xen-network-common.sh vif-common.sh
   58.13 -XEN_SCRIPT_DATA += block-common.sh vtpm-common.sh
   58.14 +XEN_SCRIPT_DATA += block-common.sh vtpm-common.sh vtpm-hotplug-common.sh
   58.15  
   58.16  XEN_HOTPLUG_DIR = /etc/hotplug
   58.17  XEN_HOTPLUG_SCRIPTS = xen-backend.agent
    59.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    59.2 +++ b/tools/examples/locking.sh	Wed Mar 01 12:47:25 2006 -0700
    59.3 @@ -0,0 +1,98 @@
    59.4 +#
    59.5 +# Copyright (c) 2005 XenSource Ltd.
    59.6 +#
    59.7 +# This library is free software; you can redistribute it and/or
    59.8 +# modify it under the terms of version 2.1 of the GNU Lesser General Public
    59.9 +# License as published by the Free Software Foundation.
   59.10 +#
   59.11 +# This library is distributed in the hope that it will be useful,
   59.12 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
   59.13 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   59.14 +# Lesser General Public License for more details.
   59.15 +#
   59.16 +# You should have received a copy of the GNU Lesser General Public
   59.17 +# License along with this library; if not, write to the Free Software
   59.18 +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   59.19 +#
   59.20 +
   59.21 +#
   59.22 +# Serialisation
   59.23 +#
   59.24 +
   59.25 +LOCK_SLEEPTIME=1
   59.26 +LOCK_SPINNING_RETRIES=5
   59.27 +LOCK_RETRIES=10
   59.28 +LOCK_BASEDIR=/var/run/xen-hotplug
   59.29 +
   59.30 +
   59.31 +claim_lock()
   59.32 +{
   59.33 +  local lockdir="$LOCK_BASEDIR/$1"
   59.34 +  mkdir -p "$LOCK_BASEDIR"
   59.35 +  _claim_lock "$lockdir"
   59.36 +}
   59.37 +
   59.38 +
   59.39 +release_lock()
   59.40 +{
   59.41 +  _release_lock "$LOCK_BASEDIR/$1"
   59.42 +}
   59.43 +
   59.44 +
   59.45 +_claim_lock()
   59.46 +{
   59.47 +  local lockdir="$1"
   59.48 +  local owner=$(_lock_owner "$lockdir")
   59.49 +  local retries=0
   59.50 +
   59.51 +  while [ $retries -lt $LOCK_RETRIES ]
   59.52 +  do
   59.53 +    mkdir "$lockdir" 2>/dev/null && trap "release_lock $1; sigerr" ERR &&
   59.54 +      _update_lock_info "$lockdir" && return
   59.55 +
   59.56 +    local new_owner=$(_lock_owner "$lockdir")
   59.57 +    if [ "$new_owner" != "$owner" ]
   59.58 +    then
   59.59 +      owner="$new_owner"
   59.60 +      retries=0
   59.61 +    fi
   59.62 +
   59.63 +    if [ $retries -gt $LOCK_SPINNING_RETRIES ]
   59.64 +    then
   59.65 +      sleep $LOCK_SLEEPTIME
   59.66 +    else
   59.67 +      sleep 0
   59.68 +    fi
   59.69 +    retries=$(($retries + 1))
   59.70 +  done
   59.71 +  _steal_lock "$lockdir"
   59.72 +}
   59.73 +
   59.74 +
   59.75 +_release_lock()
   59.76 +{
   59.77 +  trap sigerr ERR
   59.78 +  rm -rf "$1" 2>/dev/null || true
   59.79 +}
   59.80 +
   59.81 +
   59.82 +_steal_lock()
   59.83 +{
   59.84 +  local lockdir="$1"
   59.85 +  local owner=$(cat "$lockdir/owner" 2>/dev/null || echo "unknown")
   59.86 +  log err "Forced to steal lock on $lockdir from $owner!"
   59.87 +  _release_lock "$lockdir"
   59.88 +  _claim_lock "$lockdir"
   59.89 +}
   59.90 +
   59.91 +
   59.92 +_lock_owner()
   59.93 +{
   59.94 +  cat "$1/owner" 2>/dev/null || echo "unknown"
   59.95 +}
   59.96 +
   59.97 +
   59.98 +_update_lock_info()
   59.99 +{
  59.100 +  echo "$$: $0" >"$1/owner"
  59.101 +}
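
The helpers above rely on mkdir(2) being atomic: the first caller to create the lock directory owns the lock, later callers spin and eventually steal a lock whose holder appears wedged (the owner file records "$$: $0" purely for that diagnostic). Callers bracket their critical sections with the claim/release pair, as vtpm-common.sh and xen-hotplug-cleanup do later in this changeset; the comment line here is illustrative:

	claim_lock vtpmdb
	# ... read-modify-write /etc/xen/vtpm.db under the lock ...
	release_lock vtpmdb
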
    60.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    60.2 +++ b/tools/examples/logging.sh	Wed Mar 01 12:47:25 2006 -0700
    60.3 @@ -0,0 +1,22 @@
    60.4 +#
    60.5 +# Copyright (c) 2005 XenSource Ltd.
    60.6 +#
    60.7 +# This library is free software; you can redistribute it and/or
    60.8 +# modify it under the terms of version 2.1 of the GNU Lesser General Public
    60.9 +# License as published by the Free Software Foundation.
   60.10 +#
   60.11 +# This library is distributed in the hope that it will be useful,
   60.12 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
   60.13 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   60.14 +# Lesser General Public License for more details.
   60.15 +#
   60.16 +# You should have received a copy of the GNU Lesser General Public
   60.17 +# License along with this library; if not, write to the Free Software
   60.18 +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   60.19 +#
   60.20 +
   60.21 +log() {
   60.22 +  local level="$1"
   60.23 +  shift
   60.24 +  logger -p "daemon.$level" -- "$0:" "$@" || echo "$0 $@" >&2
   60.25 +}
    61.1 --- a/tools/examples/vif-common.sh	Wed Mar 01 10:01:54 2006 -0700
    61.2 +++ b/tools/examples/vif-common.sh	Wed Mar 01 12:47:25 2006 -0700
    61.3 @@ -125,7 +125,7 @@ function handle_iptable()
    61.4  #
    61.5  function ip_of()
    61.6  {
    61.7 -  ip addr show "$1" | awk "/^.*inet.*$1\$/{print \$2}" | sed 's,/.*,,' | head -1
    61.8 +  ip addr show "$1" | awk "/^.*inet.*$1\$/{print \$2}" | sed -n '1 s,/.*,,p'
    61.9  }
   61.10  
   61.11  
    62.1 --- a/tools/examples/vtpm	Wed Mar 01 10:01:54 2006 -0700
    62.2 +++ b/tools/examples/vtpm	Wed Mar 01 12:47:25 2006 -0700
    62.3 @@ -1,7 +1,7 @@
    62.4  #!/bin/sh
    62.5  
    62.6  dir=$(dirname "$0")
    62.7 -. "$dir/vtpm-common.sh"
    62.8 +. "$dir/vtpm-hotplug-common.sh"
    62.9  
   62.10  vtpm_fatal_error=0
   62.11  
    63.1 --- a/tools/examples/vtpm-common.sh	Wed Mar 01 10:01:54 2006 -0700
    63.2 +++ b/tools/examples/vtpm-common.sh	Wed Mar 01 12:47:25 2006 -0700
    63.3 @@ -17,21 +17,8 @@
    63.4  #
    63.5  
    63.6  dir=$(dirname "$0")
    63.7 -. "$dir/xen-hotplug-common.sh"
    63.8 -
    63.9 -findCommand "$@"
   63.10 -if [ "$command" != "online" ]  &&
   63.11 -   [ "$command" != "offline" ] &&
   63.12 -   [ "$command" != "add" ]     &&
   63.13 -   [ "$command" != "remove" ]
   63.14 -then
   63.15 -	log err "Invalid command: $command"
   63.16 -	exit 1
   63.17 -fi
   63.18 -
   63.19 -
   63.20 -XENBUS_PATH="${XENBUS_PATH:?}"
   63.21 -
   63.22 +. "$dir/logging.sh"
   63.23 +. "$dir/locking.sh"
   63.24  
   63.25  VTPMDB="/etc/xen/vtpm.db"
   63.26  
   63.27 @@ -58,15 +45,19 @@ if [ -z "$VTPM_IMPL_DEFINED" ]; then
   63.28  	function vtpm_resume() {
   63.29  		true
   63.30  	}
   63.31 +	function vtpm_delete() {
   63.32 +		true
   63.33 +	}
   63.34  fi
   63.35  
   63.36 +
   63.37  #Find the instance number for the vtpm given the name of the domain
   63.38  # Parameters
   63.39  # - vmname : the name of the vm
   63.40  # Return value
   63.41  #  Returns '0' if instance number could not be found, otherwise
   63.42  #  it returns the instance number in the variable 'instance'
   63.43 -function find_instance () {
   63.44 +function vtpmdb_find_instance () {
   63.45  	local vmname=$1
   63.46  	local ret=0
   63.47  	instance=`cat $VTPMDB |                    \
   63.48 @@ -80,18 +71,17 @@ function find_instance () {
   63.49  	             }                             \
   63.50  	           }'`
   63.51  	if [ "$instance" != "" ]; then
   63.52 -		ret=1
   63.53 +		ret=$instance
   63.54  	fi
   63.55 -	return $ret
   63.56 +	echo "$ret"
   63.57  }
   63.58  
   63.59  
   63.60  # Check whether a particular instance number is still available
   63.61 -# returns '1' if it is available
   63.62 -function is_free_instancenum () {
   63.63 +# returns "0" if it is not available, "1" otherwise.
   63.64 +function vtpmdb_is_free_instancenum () {
   63.65  	local instance=$1
   63.66  	local avail=1
   63.67 -
   63.68  	#Allowed instance number range: 1-255
   63.69  	if [ $instance -eq 0 -o $instance -gt 255 ]; then
   63.70  		avail=0
   63.71 @@ -110,13 +100,13 @@ function is_free_instancenum () {
   63.72  			fi
   63.73  		done
   63.74  	fi
   63.75 -	return $avail
   63.76 +	echo "$avail"
   63.77  }
   63.78  
   63.79  
   63.80  # Get an available instance number given the database
   63.81  # Returns an unused instance number
   63.82 -function get_free_instancenum () {
   63.83 +function vtpmdb_get_free_instancenum () {
   63.84  	local ctr
   63.85  	local instances
   63.86  	local don
   63.87 @@ -145,12 +135,12 @@ function get_free_instancenum () {
   63.88  		fi
   63.89  		let ctr=ctr+1
   63.90  	done
   63.91 -	let instance=$ctr
   63.92 +	echo "$ctr"
   63.93  }
   63.94  
   63.95  
   63.96  # Add a domain name and instance number to the DB file
   63.97 -function add_instance () {
   63.98 +function vtpmdb_add_instance () {
   63.99  	local vmname=$1
  63.100  	local inst=$2
  63.101  
  63.102 @@ -159,8 +149,8 @@ function add_instance () {
  63.103  		echo "#1st column: domain name" >> $VTPMDB
  63.104  		echo "#2nd column: TPM instance number" >> $VTPMDB
  63.105  	fi
  63.106 -	validate_entry $vmname $inst
  63.107 -	if [ $? -eq 0 ]; then
  63.108 +	res=$(vtpmdb_validate_entry $vmname $inst)
  63.109 +	if [ $res -eq 0 ]; then
  63.110  		echo "$vmname $inst" >> $VTPMDB
  63.111  	fi
  63.112  }
  63.113 @@ -168,11 +158,10 @@ function add_instance () {
  63.114  
  63.115  #Validate whether an entry is the same as passed to this
  63.116  #function
  63.117 -function validate_entry () {
  63.118 +function vtpmdb_validate_entry () {
  63.119  	local rc=0
  63.120  	local vmname=$1
  63.121  	local inst=$2
  63.122 -	local res
  63.123  
  63.124  	res=`cat $VTPMDB |             \
  63.125  	     gawk -vvmname=$vmname     \
  63.126 @@ -197,13 +186,15 @@ function validate_entry () {
  63.127  	elif [ "$res" == "2" ]; then
  63.128  		let rc=2
  63.129  	fi
  63.130 -	return $rc
  63.131 +	echo "$rc"
  63.132  }
  63.133  
  63.134  
  63.135  #Remove an entry from the vTPM database given its domain name
  63.136 -function remove_entry () {
  63.137 +#and instance number
  63.138 +function vtpmdb_remove_entry () {
  63.139  	local vmname=$1
  63.140 +	local instance=$2
  63.141  	local VTPMDB_TMP="$VTPMDB".tmp
  63.142  	`cat $VTPMDB |             \
  63.143  	 gawk -vvmname=$vmname     \
  63.144 @@ -214,6 +205,7 @@ function remove_entry () {
  63.145  	 '} > $VTPMDB_TMP`
  63.146  	if [ -e $VTPMDB_TMP ]; then
  63.147  		mv -f $VTPMDB_TMP $VTPMDB
  63.148 +		vtpm_delete $instance
  63.149  	else
  63.150  		log err "Error creating temporary file '$VTPMDB_TMP'."
  63.151  	fi
  63.152 @@ -222,7 +214,7 @@ function remove_entry () {
  63.153  
  63.154  # Find the reason for the creation of this device:
  63.155  # Set global REASON variable to 'resume' or 'create'
  63.156 -function get_create_reason () {
  63.157 +function vtpm_get_create_reason () {
  63.158  	local resume=$(xenstore-read $XENBUS_PATH/resume)
  63.159  	if [ "$resume" == "True" ]; then
  63.160  		REASON="resume"
  63.161 @@ -231,32 +223,30 @@ function get_create_reason () {
  63.162  	fi
  63.163  }
  63.164  
  63.165 +
  63.166  #Create a vTPM instance
  63.167  # If no entry in the TPM database is found, the instance is
  63.168  # created and an entry added to the database.
  63.169  function vtpm_create_instance () {
  63.170  	local domname=$(xenstore_read "$XENBUS_PATH"/domain)
  63.171  	local res
  63.172 -	set +e
  63.173 -	get_create_reason
  63.174 +	local instance
  63.175 +	vtpm_get_create_reason
  63.176  
  63.177  	claim_lock vtpmdb
  63.178 -
  63.179 -	find_instance $domname
  63.180 -	res=$?
  63.181 -	if [ $res -eq 0 ]; then
  63.182 +	instance=$(vtpmdb_find_instance $domname)
  63.183 +	if [ "$instance" == "0" ]; then
  63.184  		#Try to give the preferred instance to the domain
  63.185  		instance=$(xenstore_read "$XENBUS_PATH"/pref_instance)
  63.186  		if [ "$instance" != "" ]; then
  63.187 -			is_free_instancenum $instance
  63.188 -			res=$?
  63.189 +			res=$(vtpmdb_is_free_instancenum $instance)
  63.190  			if [ $res -eq 0 ]; then
  63.191 -				get_free_instancenum
  63.192 +				instance=$(vtpmdb_get_free_instancenum)
  63.193  			fi
  63.194  		else
  63.195 -			get_free_instancenum
  63.196 +			instance=$(vtpmdb_get_free_instancenum)
  63.197  		fi
  63.198 -		add_instance $domname $instance
  63.199 +		vtpmdb_add_instance $domname $instance
  63.200  		if [ "$REASON" == "create" ]; then
  63.201  			vtpm_create $instance
  63.202  		elif [ "$REASON" == "resume" ]; then
  63.203 @@ -279,25 +269,40 @@ function vtpm_create_instance () {
  63.204  		true
  63.205  	fi
  63.206  	xenstore_write $XENBUS_PATH/instance $instance
  63.207 -	set -e
  63.208  }
  63.209  
  63.210  
  63.211 -#Remove an instance
  63.212 +#Remove an instance when a VM is terminating or suspending.
  63.213 +#Since it is assumed that the VM will appear again, the
  63.214 +#entry is kept in the VTPMDB file.
  63.215  function vtpm_remove_instance () {
  63.216  	local domname=$(xenstore_read "$XENBUS_PATH"/domain)
  63.217 -	set +e
  63.218 -	find_instance $domname
  63.219 -	res=$?
  63.220 -	if [ $res -eq 0 ]; then
  63.221 -		#Something is really wrong with the DB
  63.222 -		log err "vTPM DB file $VTPMDB has no entry for '$domname'"
  63.223 -	else
  63.224 +
  63.225 +	claim_lock vtpmdb
  63.226 +
  63.227 +	instance=$(vtpmdb_find_instance $domname)
  63.228 +
  63.229 +	if [ "$instance" != "0" ]; then
  63.230  		if [ "$REASON" == "suspend" ]; then
  63.231  			vtpm_suspend $instance
  63.232  		fi
  63.233  	fi
  63.234 -	set -e
  63.235 +
  63.236 +	release_lock vtpmdb
  63.237  }
  63.238  
  63.239  
  63.240 +#Remove an entry in the VTPMDB file given the domain's name
  63.241 +#1st parameter: The name of the domain
  63.242 +function vtpm_delete_instance () {
  63.243 +	local rc
  63.244 +
  63.245 +	claim_lock vtpmdb
  63.246 +
  63.247 +	instance=$(vtpmdb_find_instance $1)
  63.248 +	if [ "$instance" != "0" ]; then
  63.249 +		vtpmdb_remove_entry $1 $instance
  63.250 +	fi
  63.251 +
  63.252 +	release_lock vtpmdb
  63.253 +}
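
Note the calling-convention change running through this refactor: the old helpers smuggled their result out via the exit status, while the renamed vtpmdb_* helpers write it to stdout so callers capture it with command substitution. Both lines below are taken from the hunks above:

	# Old convention (removed above): result via exit status.
	find_instance $domname
	res=$?

	# New convention: result on stdout, captured by the caller.
	instance=$(vtpmdb_find_instance $domname)
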
    64.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    64.2 +++ b/tools/examples/vtpm-delete	Wed Mar 01 12:47:25 2006 -0700
    64.3 @@ -0,0 +1,9 @@
    64.4 +#!/bin/sh
    64.5 +
    64.6 +# This script must be called as follows:
    64.7 +# vtpm-delete <domain name>
    64.8 +
    64.9 +dir=$(dirname "$0")
   64.10 +. "$dir/vtpm-common.sh"
   64.11 +
   64.12 +vtpm_delete_instance $1
    65.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    65.2 +++ b/tools/examples/vtpm-hotplug-common.sh	Wed Mar 01 12:47:25 2006 -0700
    65.3 @@ -0,0 +1,35 @@
    65.4 +#
    65.5 +# Copyright (c) 2005 IBM Corporation
    65.6 +# Copyright (c) 2005 XenSource Ltd.
    65.7 +#
    65.8 +# This library is free software; you can redistribute it and/or
    65.9 +# modify it under the terms of version 2.1 of the GNU Lesser General Public
   65.10 +# License as published by the Free Software Foundation.
   65.11 +#
   65.12 +# This library is distributed in the hope that it will be useful,
   65.13 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
   65.14 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   65.15 +# Lesser General Public License for more details.
   65.16 +#
   65.17 +# You should have received a copy of the GNU Lesser General Public
   65.18 +# License along with this library; if not, write to the Free Software
   65.19 +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   65.20 +#
   65.21 +
   65.22 +dir=$(dirname "$0")
   65.23 +. "$dir/xen-hotplug-common.sh"
   65.24 +
   65.25 +findCommand "$@"
   65.26 +if [ "$command" != "online" ]  &&
   65.27 +   [ "$command" != "offline" ] &&
   65.28 +   [ "$command" != "add" ]     &&
   65.29 +   [ "$command" != "remove" ]
   65.30 +then
   65.31 +	log err "Invalid command: $command"
   65.32 +	exit 1
   65.33 +fi
   65.34 +
   65.35 +
   65.36 +XENBUS_PATH="${XENBUS_PATH:?}"
   65.37 +
   65.38 +. "$dir/vtpm-common.sh"
    66.1 --- a/tools/examples/xen-backend.agent	Wed Mar 01 10:01:54 2006 -0700
    66.2 +++ b/tools/examples/xen-backend.agent	Wed Mar 01 12:47:25 2006 -0700
    66.3 @@ -18,12 +18,7 @@ case "$ACTION" in
    66.4    add)
    66.5      ;;
    66.6    remove)
    66.7 -    # remove device frontend store entries
    66.8 -    xenstore-rm -t $(xenstore-read "$XENBUS_PATH/frontend") || true
    66.9 -
   66.10 -    # remove device backend store entries
   66.11 -    xenstore-rm -t "$XENBUS_PATH"       || true
   66.12 -    xenstore-rm -t "error/$XENBUS_PATH" || true
   66.13 +    /etc/xen/scripts/xen-hotplug-cleanup
   66.14      ;;
   66.15    online)
   66.16      ;;
    67.1 --- a/tools/examples/xen-backend.rules	Wed Mar 01 10:01:54 2006 -0700
    67.2 +++ b/tools/examples/xen-backend.rules	Wed Mar 01 12:47:25 2006 -0700
    67.3 @@ -2,6 +2,4 @@ SUBSYSTEM=="xen-backend", KERNEL=="vbd*"
    67.4  SUBSYSTEM=="xen-backend", KERNEL=="vtpm*", RUN+="/etc/xen/scripts/vtpm $env{ACTION}"
    67.5  SUBSYSTEM=="xen-backend", KERNEL=="vif*", ACTION=="online", RUN+="$env{script} online"
    67.6  SUBSYSTEM=="xen-backend", KERNEL=="vif*", ACTION=="offline", RUN+="$env{script} offline"
    67.7 -SUBSYSTEM=="xen-backend", ACTION=="remove", RUN+="/bin/bash -c '/usr/bin/xenstore-rm -t $$(/usr/bin/xenstore-read $env{XENBUS_PATH}/frontend)'"
    67.8 -SUBSYSTEM=="xen-backend", ACTION=="remove", RUN+="/usr/bin/xenstore-rm -t $env{XENBUS_PATH}"
    67.9 -SUBSYSTEM=="xen-backend", ACTION=="remove", RUN+="/usr/bin/xenstore-rm -t error/$env{XENBUS_PATH}"
   67.10 +SUBSYSTEM=="xen-backend", ACTION=="remove", RUN+="/etc/xen/scripts/xen-hotplug-cleanup"
    68.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    68.2 +++ b/tools/examples/xen-hotplug-cleanup	Wed Mar 01 12:47:25 2006 -0700
    68.3 @@ -0,0 +1,21 @@
    68.4 +#! /bin/sh
    68.5 +
    68.6 +dir=$(dirname "$0")
    68.7 +. "$dir/xen-hotplug-common.sh"
    68.8 +
    68.9 +# Claim the lock protecting /etc/xen/scripts/block.  This stops a race whereby
   68.10 +# paths in the store would disappear underneath that script as it attempted to
   68.11 +# read from the store checking for device sharing.
   68.12 +# Any other scripts that do similar things will have to have their lock
   68.13 +# claimed too.
   68.14 +# This is pretty horrible, but there's not really a nicer way of solving this.
   68.15 +claim_lock "block"
   68.16 +
   68.17 +# remove device frontend store entries
   68.18 +xenstore-rm -t $(xenstore-read "$XENBUS_PATH/frontend") || true
   68.19 +
   68.20 +# remove device backend store entries
   68.21 +xenstore-rm -t "$XENBUS_PATH"       || true
   68.22 +xenstore-rm -t "error/$XENBUS_PATH" || true
   68.23 +
   68.24 +release_lock "block"
    69.1 --- a/tools/examples/xen-hotplug-common.sh	Wed Mar 01 10:01:54 2006 -0700
    69.2 +++ b/tools/examples/xen-hotplug-common.sh	Wed Mar 01 12:47:25 2006 -0700
    69.3 @@ -17,7 +17,9 @@
    69.4  
    69.5  
    69.6  dir=$(dirname "$0")
    69.7 +. "$dir/logging.sh"
    69.8  . "$dir/xen-script-common.sh"
    69.9 +. "$dir/locking.sh"
   69.10  
   69.11  exec 2>>/var/log/xen-hotplug.log
   69.12  
   69.13 @@ -25,12 +27,6 @@ export PATH="/sbin:/bin:/usr/bin:/usr/sb
   69.14  export LANG="POSIX"
   69.15  unset $(set | grep ^LC_ | cut -d= -f1)
   69.16  
   69.17 -log() {
   69.18 -  local level="$1"
   69.19 -  shift
   69.20 -  logger -p "daemon.$level" -- "$0:" "$@" || echo "$0 $@" >&2
   69.21 -}
   69.22 -
   69.23  fatal() {
   69.24    xenstore_write "$XENBUS_PATH"/hotplug-status error
   69.25    log err "$@"
   69.26 @@ -93,87 +89,4 @@ xenstore_write() {
   69.27  }
   69.28  
   69.29  
   69.30 -#
   69.31 -# Serialisation
   69.32 -#
   69.33 -
   69.34 -LOCK_SLEEPTIME=1
   69.35 -LOCK_SPINNING_RETRIES=5
   69.36 -LOCK_RETRIES=10
   69.37 -LOCK_BASEDIR=/var/run/xen-hotplug
   69.38 -
   69.39 -
   69.40 -claim_lock()
   69.41 -{
   69.42 -  local lockdir="$LOCK_BASEDIR/$1"
   69.43 -  mkdir -p "$LOCK_BASEDIR"
   69.44 -  _claim_lock "$lockdir"
   69.45 -}
   69.46 -
   69.47 -
   69.48 -release_lock()
   69.49 -{
   69.50 -  _release_lock "$LOCK_BASEDIR/$1"
   69.51 -}
   69.52 -
   69.53 -
   69.54 -_claim_lock()
   69.55 -{
   69.56 -  local lockdir="$1"
   69.57 -  local owner=$(_lock_owner "$lockdir")
   69.58 -  local retries=0
   69.59 -
   69.60 -  while [ $retries -lt $LOCK_RETRIES ]
   69.61 -  do
   69.62 -    mkdir "$lockdir" 2>/dev/null && trap "release_lock $1; sigerr" ERR &&
   69.63 -      _update_lock_info "$lockdir" && return
   69.64 -
   69.65 -    local new_owner=$(_lock_owner "$lockdir")
   69.66 -    if [ "$new_owner" != "$owner" ]
   69.67 -    then
   69.68 -      owner="$new_owner"
   69.69 -      retries=0
   69.70 -    fi
   69.71 -
   69.72 -    if [ $retries -gt $LOCK_SPINNING_RETRIES ]
   69.73 -    then
   69.74 -      sleep $LOCK_SLEEPTIME
   69.75 -    else
   69.76 -      sleep 0
   69.77 -    fi
   69.78 -    retries=$(($retries + 1))
   69.79 -  done
   69.80 -  _steal_lock "$lockdir"
   69.81 -}
   69.82 -
   69.83 -
   69.84 -_release_lock()
   69.85 -{
   69.86 -  trap sigerr ERR
   69.87 -  rm -rf "$1" 2>/dev/null || true
   69.88 -}
   69.89 -
   69.90 -
   69.91 -_steal_lock()
   69.92 -{
   69.93 -  local lockdir="$1"
   69.94 -  local owner=$(cat "$lockdir/owner" 2>/dev/null || echo "unknown")
   69.95 -  log err "Forced to steal lock on $lockdir from $owner!"
   69.96 -  _release_lock "$lockdir"
   69.97 -  _claim_lock "$lockdir"
   69.98 -}
   69.99 -
  69.100 -
  69.101 -_lock_owner()
  69.102 -{
  69.103 -  cat "$1/owner" 2>/dev/null || echo "unknown"
  69.104 -}
  69.105 -
  69.106 -
  69.107 -_update_lock_info()
  69.108 -{
  69.109 -  echo "$$: $0" >"$1/owner"
  69.110 -}
  69.111 -
  69.112 -
  69.113  log debug "$@" "XENBUS_PATH=$XENBUS_PATH"
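
The serialisation helpers deleted above now come from the separately sourced locking.sh; the mechanism is a lock directory created with mkdir, which is atomic, so exactly one concurrent caller can win. A minimal C sketch of the same idiom follows — names, retry count and sleep interval are illustrative only (the shell version additionally records an owner file and steals stale locks via _steal_lock):

    /* Sketch of the atomic-mkdir lock idiom used by the shell helpers above.
     * This is not Xen code; it only demonstrates the technique. */
    #include <errno.h>
    #include <sys/stat.h>
    #include <unistd.h>

    static int claim_lock(const char *lockdir)
    {
        int retries;
        for (retries = 0; retries < 10; retries++) {
            if (mkdir(lockdir, 0700) == 0)   /* atomic: one winner */
                return 0;
            if (errno != EEXIST)             /* real error, not contention */
                return -1;
            sleep(1);                        /* holder still active: back off */
        }
        return -1;  /* a caller could steal here, as the shell version does */
    }

    static void release_lock(const char *lockdir)
    {
        rmdir(lockdir);
    }
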
    70.1 --- a/tools/firmware/hvmloader/Makefile	Wed Mar 01 10:01:54 2006 -0700
    70.2 +++ b/tools/firmware/hvmloader/Makefile	Wed Mar 01 12:47:25 2006 -0700
    70.3 @@ -19,7 +19,7 @@
    70.4  #
    70.5  
    70.6  XEN_ROOT = ../../..
    70.7 -include $(XEN_ROOT)/tools/Rules.mk
    70.8 +include $(XEN_ROOT)/Config.mk
    70.9  
   70.10  # The HVM loader is started in 32-bit mode at the address below:
   70.11  LOADADDR = 0x100000
   70.12 @@ -29,9 +29,13 @@ XENINC   =-I$(XEN_ROOT)/tools/libxc
   70.13  
   70.14  OBJECTS	 = hvmloader.o acpi_madt.o 
   70.15  
   70.16 -CC       = gcc
   70.17 +# Disable PIE/SSP if GCC supports them. They can break us.
   70.18 +CFLAGS  += $(call test-gcc-flag,$(CC),-nopie)
   70.19 +CFLAGS  += $(call test-gcc-flag,$(CC),-fno-stack-protector)
   70.20 +CFLAGS  += $(call test-gcc-flag,$(CC),-fno-stack-protector-all)
   70.21 +
   70.22  OBJCOPY  = objcopy
   70.23 -CFLAGS   = $(DEFINES) -I. $(XENINC) -Wall -fno-builtin -O2 -msoft-float
   70.24 +CFLAGS  += $(DEFINES) -I. $(XENINC) -Wall -fno-builtin -O2 -msoft-float
   70.25  CFLAGS  += -m32 -march=i686
   70.26  LDFLAGS  = -m32 -nostdlib -Wl,-N -Wl,-Ttext -Wl,$(LOADADDR)
   70.27  
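
The test-gcc-flag helper (presumably defined in the Config.mk now included above) expands to the given flag only when the compiler accepts it. The usual reason for forcing these flags off is that hvmloader is a freestanding image linked at a fixed LOADADDR: position-independent code defeats the fixed link address, and the stack protector emits calls to runtime support such as __stack_chk_fail that a -nostdlib binary cannot resolve.
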
    71.1 --- a/tools/firmware/vgabios/Makefile	Wed Mar 01 10:01:54 2006 -0700
    71.2 +++ b/tools/firmware/vgabios/Makefile	Wed Mar 01 12:47:25 2006 -0700
    71.3 @@ -1,6 +1,4 @@
    71.4  CC      = gcc
    71.5 -CFLAGS  = -g -O2 -Wall -Wstrict-prototypes
    71.6 -LDFLAGS = 
    71.7  
    71.8  GCC = gcc
    71.9  BCC = bcc
    72.1 --- a/tools/firmware/vmxassist/Makefile	Wed Mar 01 10:01:54 2006 -0700
    72.2 +++ b/tools/firmware/vmxassist/Makefile	Wed Mar 01 12:47:25 2006 -0700
    72.3 @@ -19,7 +19,7 @@
    72.4  #
    72.5  
    72.6  XEN_ROOT = ../../..
    72.7 -include $(XEN_ROOT)/tools/Rules.mk
    72.8 +include $(XEN_ROOT)/Config.mk
    72.9  
   72.10  # The emulator code lives in ROM space
   72.11  TEXTADDR=0x000D0000
   72.12 @@ -27,11 +27,14 @@ TEXTADDR=0x000D0000
   72.13  DEFINES=-DDEBUG -DTEXTADDR=$(TEXTADDR)
   72.14  XENINC=-I$(XEN_ROOT)/tools/libxc
   72.15  
   72.16 -LD       = ld
   72.17 -CC       = gcc
   72.18 +# Disable PIE/SSP if GCC supports them. They can break us.
   72.19 +CFLAGS  += $(call test-gcc-flag,$(CC),-nopie)
   72.20 +CFLAGS  += $(call test-gcc-flag,$(CC),-fno-stack-protector)
   72.21 +CFLAGS  += $(call test-gcc-flag,$(CC),-fno-stack-protector-all)
   72.22 +
   72.23  CPP      = cpp -P
   72.24  OBJCOPY  = objcopy -p -O binary -R .note -R .comment -R .bss -S --gap-fill=0
   72.25 -CFLAGS   = $(DEFINES) -I. $(XENINC) -Wall -fno-builtin -O2 -msoft-float
   72.26 +CFLAGS  += $(DEFINES) -I. $(XENINC) -Wall -fno-builtin -O2 -msoft-float
   72.27  CFLAGS  += -m32 -march=i686
   72.28  LDFLAGS  = -m elf_i386
   72.29  
    73.1 --- a/tools/ioemu/Makefile	Wed Mar 01 10:01:54 2006 -0700
    73.2 +++ b/tools/ioemu/Makefile	Wed Mar 01 12:47:25 2006 -0700
    73.3 @@ -1,6 +1,9 @@
    73.4 +XEN_ROOT=../..
    73.5 +include $(XEN_ROOT)/tools/Rules.mk
    73.6 +
    73.7  -include config-host.mak
    73.8  
    73.9 -CFLAGS=-Wall -O2 -g -fno-strict-aliasing 
   73.10 +CFLAGS+=-Wall -O2 -g -fno-strict-aliasing 
   73.11  ifdef CONFIG_DARWIN
   73.12  CFLAGS+= -mdynamic-no-pic
   73.13  endif
    74.1 --- a/tools/ioemu/hw/ide.c	Wed Mar 01 10:01:54 2006 -0700
    74.2 +++ b/tools/ioemu/hw/ide.c	Wed Mar 01 12:47:25 2006 -0700
    74.3 @@ -669,9 +669,6 @@ static int ide_read_dma_cb(IDEState *s,
    74.4      }
    74.5      if (s->io_buffer_index >= s->io_buffer_size && s->nsector == 0) {
    74.6          s->status = READY_STAT | SEEK_STAT;
    74.7 -        s->bmdma->status &= ~BM_STATUS_DMAING;
    74.8 -        s->bmdma->status |= BM_STATUS_INT;
    74.9 -        ide_set_irq(s);
   74.10  #ifdef DEBUG_IDE_ATAPI
   74.11          printf("dma status=0x%x\n", s->status);
   74.12  #endif
   74.13 @@ -738,9 +735,6 @@ static int ide_write_dma_cb(IDEState *s,
   74.14              if (n == 0) {
   74.15                  /* end of transfer */
   74.16                  s->status = READY_STAT | SEEK_STAT;
   74.17 -                s->bmdma->status &= ~BM_STATUS_DMAING;
   74.18 -                s->bmdma->status |= BM_STATUS_INT;
   74.19 -                ide_set_irq(s);
   74.20                  return 0;
   74.21              }
   74.22              if (n > MAX_MULT_SECTORS)
   74.23 @@ -987,9 +981,6 @@ static int ide_atapi_cmd_read_dma_cb(IDE
   74.24      if (s->packet_transfer_size <= 0) {
   74.25          s->status = READY_STAT;
   74.26          s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD;
   74.27 -        s->bmdma->status &= ~BM_STATUS_DMAING;
   74.28 -        s->bmdma->status |= BM_STATUS_INT;
   74.29 -        ide_set_irq(s);
   74.30  #ifdef DEBUG_IDE_ATAPI
   74.31          printf("dma status=0x%x\n", s->status);
   74.32  #endif
   74.33 @@ -2025,6 +2016,17 @@ static void ide_map(PCIDevice *pci_dev, 
   74.34      }
   74.35  }
   74.36  
   74.37 +static void ide_dma_finish(BMDMAState *bm)
   74.38 +{
   74.39 +    IDEState *s = bm->ide_if;
   74.40 +
   74.41 +    bm->status &= ~BM_STATUS_DMAING;
   74.42 +    bm->status |= BM_STATUS_INT;
   74.43 +    bm->dma_cb = NULL;
   74.44 +    bm->ide_if = NULL;
   74.45 +    ide_set_irq(s);
   74.46 +}
   74.47 +
   74.48  /* XXX: full callback usage to prepare non blocking I/Os support -
   74.49     error handling */
   74.50  #ifdef DMA_MULTI_THREAD
   74.51 @@ -2070,9 +2072,8 @@ static void ide_dma_loop(BMDMAState *bm)
   74.52          cur_addr += 8;
   74.53      }
   74.54      /* end of transfer */
   74.55 - the_end:
   74.56 -    bm->dma_cb = NULL;
   74.57 -    bm->ide_if = NULL;
   74.58 +the_end:
   74.59 +    ide_dma_finish(bm);
   74.60  }
   74.61  
   74.62  static void ide_dma_start(IDEState *s, IDEDMAFunc *dma_cb)
    75.1 --- a/tools/ioemu/hw/pcnet.c	Wed Mar 01 10:01:54 2006 -0700
    75.2 +++ b/tools/ioemu/hw/pcnet.c	Wed Mar 01 12:47:25 2006 -0700
    75.3 @@ -376,6 +376,10 @@ static int pcnet_can_receive(void *opaqu
    75.4      if (s->recv_pos > 0)
    75.5          return 0;
    75.6  
    75.7 +    pcnet_rdte_poll(s);
    75.8 +    if (!(CSR_CRST(s) & 0x8000)) {
    75.9 +        return 0;
   75.10 +    }
   75.11      return sizeof(s->buffer)-16;
   75.12  }
   75.13  
    76.1 --- a/tools/ioemu/target-i386-dm/Makefile	Wed Mar 01 10:01:54 2006 -0700
    76.2 +++ b/tools/ioemu/target-i386-dm/Makefile	Wed Mar 01 12:47:25 2006 -0700
    76.3 @@ -1,7 +1,8 @@
    76.4 +include config.mak
    76.5 +override TARGET_ARCH=i386
    76.6 +
    76.7  XEN_ROOT=../../..
    76.8  include $(XEN_ROOT)/tools/Rules.mk
    76.9 -include config.mak
   76.10 -override TARGET_ARCH=i386
   76.11  
   76.12  INSTALL_DIR := $(DESTDIR)/usr/$(LIBDIR)/xen/bin
   76.13  TARGET_PATH=$(SRC_PATH)/target-$(TARGET_ARCH)
   76.14 @@ -12,7 +13,7 @@ ifdef CONFIG_USER_ONLY
   76.15  VPATH+=:$(SRC_PATH)/linux-user
   76.16  DEFINES+=-I$(SRC_PATH)/linux-user -I$(SRC_PATH)/linux-user/$(TARGET_ARCH)
   76.17  endif
   76.18 -CFLAGS=-Wall -O2 -g -fno-strict-aliasing
   76.19 +CFLAGS+=-Wall -O2 -g -fno-strict-aliasing
   76.20  LDFLAGS=-g
   76.21  LIBS=
   76.22  HELPER_CFLAGS=$(CFLAGS)
    77.1 --- a/tools/libxc/xc_linux_build.c	Wed Mar 01 10:01:54 2006 -0700
    77.2 +++ b/tools/libxc/xc_linux_build.c	Wed Mar 01 12:47:25 2006 -0700
    77.3 @@ -46,6 +46,77 @@
    77.4  #define probe_aout9(image,image_size,load_funcs) 1
    77.5  #endif
    77.6  
    77.7 +static const char *feature_names[XENFEAT_NR_SUBMAPS*32] = {
    77.8 +    [XENFEAT_writable_page_tables]       = "writable_page_tables",
    77.9 +    [XENFEAT_writable_descriptor_tables] = "writable_descriptor_tables",
   77.10 +    [XENFEAT_auto_translated_physmap]    = "auto_translated_physmap",
   77.11 +    [XENFEAT_supervisor_mode_kernel]     = "supervisor_mode_kernel",
   77.12 +    [XENFEAT_pae_pgdir_above_4gb]        = "pae_pgdir_above_4gb"
   77.13 +};
   77.14 +
   77.15 +static inline void set_feature_bit (int nr, uint32_t *addr)
   77.16 +{
   77.17 +    addr[nr>>5] |= (1<<(nr&31));
   77.18 +}
   77.19 +
   77.20 +static inline int test_feature_bit(int nr, uint32_t *addr)
   77.21 +{
   77.22 +    return !!(addr[nr>>5] & (1<<(nr&31)));
   77.23 +}
   77.24 +
   77.25 +static int parse_features(
   77.26 +    const char *feats,
   77.27 +    uint32_t supported[XENFEAT_NR_SUBMAPS],
   77.28 +    uint32_t required[XENFEAT_NR_SUBMAPS])
   77.29 +{
   77.30 +    const char *end, *p;
   77.31 +    int i, req;
   77.32 +
   77.33 +    if ( (end = strchr(feats, ',')) == NULL )
   77.34 +        end = feats + strlen(feats);
   77.35 +
   77.36 +    while ( feats < end )
   77.37 +    {
   77.38 +        p = strchr(feats, '|');
   77.39 +        if ( (p == NULL) || (p > end) )
   77.40 +            p = end;
   77.41 +
   77.42 +        req = (*feats == '!');
   77.43 +        if ( req )
   77.44 +            feats++;
   77.45 +
   77.46 +        for ( i = 0; i < XENFEAT_NR_SUBMAPS*32; i++ )
   77.47 +        {
   77.48 +            if ( feature_names[i] == NULL )
   77.49 +                continue;
   77.50 +
   77.51 +            if ( strncmp(feature_names[i], feats, p-feats) == 0 )
   77.52 +            {
   77.53 +                set_feature_bit(i, supported);
   77.54 +                if ( required && req )
   77.55 +                    set_feature_bit(i, required);
   77.56 +                break;
   77.57 +            }
   77.58 +        }
   77.59 +
   77.60 +        if ( i == XENFEAT_NR_SUBMAPS*32 )
   77.61 +        {
   77.62 +            ERROR("Unknown feature \"%.*s\".\n", (int)(p-feats), feats);
   77.63 +            if ( req )
   77.64 +            {
   77.65 +                ERROR("Kernel requires an unknown hypervisor feature.\n");
   77.66 +                return 0; /* failure: callers test !parse_features() */
   77.67 +            }
   77.68 +        }
   77.69 +
   77.70 +        feats = p;
   77.71 +        if ( *feats == '|' )
   77.72 +            feats++;
   77.73 +    }
   77.74 +
   77.75 +    return 1; /* success */
   77.76 +}
   77.77 +
   77.78  static int probeimageformat(char *image,
   77.79                              unsigned long image_size,
   77.80                              struct load_funcs *load_funcs)
   77.81 @@ -344,7 +415,8 @@ static int setup_guest(int xc_handle,
   77.82                         unsigned long shared_info_frame,
   77.83                         unsigned long flags,
   77.84                         unsigned int store_evtchn, unsigned long *store_mfn,
   77.85 -                       unsigned int console_evtchn, unsigned long *console_mfn)
   77.86 +                       unsigned int console_evtchn, unsigned long *console_mfn,
   77.87 +                       uint32_t required_features[XENFEAT_NR_SUBMAPS])
   77.88  {
   77.89      unsigned long *page_array = NULL;
   77.90      struct load_funcs load_funcs;
   77.91 @@ -483,7 +555,8 @@ static int setup_guest(int xc_handle,
   77.92                         unsigned long shared_info_frame,
   77.93                         unsigned long flags,
   77.94                         unsigned int store_evtchn, unsigned long *store_mfn,
   77.95 -                       unsigned int console_evtchn, unsigned long *console_mfn)
   77.96 +                       unsigned int console_evtchn, unsigned long *console_mfn,
   77.97 +                       uint32_t required_features[XENFEAT_NR_SUBMAPS])
   77.98  {
   77.99      unsigned long *page_array = NULL;
  77.100      unsigned long count, i, hypercall_pfn;
  77.101 @@ -515,8 +588,9 @@ static int setup_guest(int xc_handle,
  77.102      unsigned long vpt_start;
  77.103      unsigned long vpt_end;
  77.104      unsigned long v_end;
  77.105 -    unsigned shadow_mode_enabled;
  77.106      unsigned long guest_store_mfn, guest_console_mfn, guest_shared_info_mfn;
  77.107 +    unsigned long shadow_mode_enabled;
  77.108 +    uint32_t supported_features[XENFEAT_NR_SUBMAPS] = { 0, };
  77.109  
  77.110      rc = probeimageformat(image, image_size, &load_funcs);
  77.111      if ( rc != 0 )
  77.112 @@ -534,8 +608,6 @@ static int setup_guest(int xc_handle,
  77.113          goto error_out;
  77.114      }
  77.115  
  77.116 -    shadow_mode_enabled = !!strstr(dsi.xen_guest_string,
  77.117 -                                   "SHADOW=translate");
  77.118      /*
  77.119       * Why do we need this? The number of page-table frames depends on the 
  77.120       * size of the bootstrap address space. But the size of the address space 
  77.121 @@ -637,6 +709,35 @@ static int setup_guest(int xc_handle,
  77.122      (load_funcs.loadimage)(image, image_size, xc_handle, dom, page_array,
  77.123                             &dsi);
  77.124  
  77.125 +    /* Parse and validate kernel features. */
  77.126 +    p = strstr(dsi.xen_guest_string, "FEATURES=");
  77.127 +    if ( p != NULL )
  77.128 +    {
  77.129 +        if ( !parse_features(p + strlen("FEATURES="),
  77.130 +                             supported_features,
  77.131 +                             required_features) )
  77.132 +        {
  77.133 +            ERROR("Failed to parse guest kernel features.\n");
  77.134 +            goto error_out;
  77.135 +        }
  77.136 +
  77.137 +        fprintf(stderr, "Supported features  = { %08x }.\n",
  77.138 +                supported_features[0]);
  77.139 +        fprintf(stderr, "Required features   = { %08x }.\n",
  77.140 +                required_features[0]);
  77.141 +    }
  77.142 +
  77.143 +    for ( i = 0; i < XENFEAT_NR_SUBMAPS; i++ )
  77.144 +    {
  77.145 +        if ( (supported_features[i]&required_features[i]) != required_features[i] )
  77.146 +        {
  77.147 +            ERROR("Guest kernel does not support a required feature.\n");
  77.148 +            goto error_out;
  77.149 +        }
  77.150 +    }
  77.151 +
  77.152 +    shadow_mode_enabled = test_feature_bit(XENFEAT_auto_translated_physmap, required_features);
  77.153 +
  77.154      /* Load the initial ramdisk image. */
  77.155      if ( initrd_len != 0 )
  77.156      {
  77.157 @@ -870,6 +971,7 @@ int xc_linux_build(int xc_handle,
  77.158                     const char *image_name,
  77.159                     const char *ramdisk_name,
  77.160                     const char *cmdline,
  77.161 +                   const char *features,
  77.162                     unsigned long flags,
  77.163                     unsigned int store_evtchn,
  77.164                     unsigned long *store_mfn,
  77.165 @@ -886,6 +988,16 @@ int xc_linux_build(int xc_handle,
  77.166      char         *image = NULL;
  77.167      unsigned long image_size, initrd_size=0;
  77.168      unsigned long vstartinfo_start, vkern_entry, vstack_start;
  77.169 +    uint32_t      features_bitmap[XENFEAT_NR_SUBMAPS] = { 0, };
  77.170 +
  77.171 +    if ( features != NULL )
  77.172 +    {
  77.173 +        if ( !parse_features(features, features_bitmap, NULL) )
  77.174 +        {
  77.175 +            PERROR("Failed to parse configured features\n");
  77.176 +            goto error_out;
  77.177 +        }
  77.178 +    }
  77.179  
  77.180      if ( (nr_pages = get_tot_pages(xc_handle, domid)) < 0 )
  77.181      {
  77.182 @@ -940,7 +1052,8 @@ int xc_linux_build(int xc_handle,
  77.183                       &vstack_start, ctxt, cmdline,
  77.184                       op.u.getdomaininfo.shared_info_frame,
  77.185                       flags, store_evtchn, store_mfn,
  77.186 -                     console_evtchn, console_mfn) < 0 )
  77.187 +                     console_evtchn, console_mfn,
  77.188 +                     features_bitmap) < 0 )
  77.189      {
  77.190          ERROR("Error constructing guest OS");
  77.191          goto error_out;
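
The new features argument takes the string syntax handled by parse_features above: feature names separated by '|', each optionally prefixed with '!' to mark it as required rather than merely supported. A hedged usage sketch, in which the handle, domain and argument values are hypothetical:

    /* Illustrative call only: all argument values are made up.  '!' marks a
     * feature the guest kernel must support; '|' separates feature names
     * (see parse_features above). */
    unsigned long store_mfn, console_mfn;
    int rc = xc_linux_build(xc_handle, domid,
                            "/boot/vmlinuz-2.6-xenU",    /* image_name   */
                            NULL,                        /* ramdisk_name */
                            "root=/dev/sda1 ro",         /* cmdline      */
                            "writable_page_tables|!auto_translated_physmap",
                            0,                           /* flags        */
                            store_evtchn, &store_mfn,
                            console_evtchn, &console_mfn);
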
    78.1 --- a/tools/libxc/xenguest.h	Wed Mar 01 10:01:54 2006 -0700
    78.2 +++ b/tools/libxc/xenguest.h	Wed Mar 01 12:47:25 2006 -0700
    78.3 @@ -47,6 +47,7 @@ int xc_linux_build(int xc_handle,
    78.4                     const char *image_name,
    78.5                     const char *ramdisk_name,
    78.6                     const char *cmdline,
    78.7 +                   const char *features,
    78.8                     unsigned long flags,
    78.9                     unsigned int store_evtchn,
   78.10                     unsigned long *store_mfn,
    79.1 --- a/tools/pygrub/src/pygrub	Wed Mar 01 10:01:54 2006 -0700
    79.2 +++ b/tools/pygrub/src/pygrub	Wed Mar 01 12:47:25 2006 -0700
    79.3 @@ -94,12 +94,18 @@ def get_active_offset(file):
    79.4              return struct.unpack("<L", buf[poff+8:poff+12])[0] * SECTOR_SIZE
    79.5      return -1
    79.6  
    79.7 -def get_config(fn):
    79.8 +def get_config(fn, isconfig = False):
    79.9      if not os.access(fn, os.R_OK):
   79.10          raise RuntimeError, "Unable to access %s" %(fn,)
   79.11  
   79.12      cf = grub.GrubConf.GrubConfigFile()
   79.13  
   79.14 +    if isconfig:
   79.15 +        # set the config file and parse it
   79.16 +        cf.filename = fn
   79.17 +        cf.parse()
   79.18 +        return cf
   79.19 +
   79.20      offset = 0
   79.21      if is_disk_image(fn):
   79.22          offset = get_active_offset(fn)
   79.23 @@ -130,9 +136,7 @@ def get_config(fn):
   79.24          # then parse the grub config
   79.25          cf.parse(buf)
   79.26      else:
   79.27 -        # set the config file and parse it
   79.28 -        cf.filename = fn
   79.29 -        cf.parse()
   79.30 +        raise RuntimeError, "Unable to read filesystem" 
   79.31      
   79.32      return cf
   79.33  
   79.34 @@ -214,7 +218,8 @@ if __name__ == "__main__":
   79.35  
   79.36      try:
   79.37          opts, args = getopt.gnu_getopt(sys.argv[1:], 'qh::',
   79.38 -                                   ["quiet", "help", "output=", "entry="])
   79.39 +                                   ["quiet", "help", "output=", "entry=",
   79.40 +                                    "isconfig"])
   79.41      except getopt.GetoptError:
   79.42          usage()
   79.43          sys.exit(1)
   79.44 @@ -227,6 +232,7 @@ if __name__ == "__main__":
   79.45      output = None
   79.46      entry = None
   79.47      interactive = True
   79.48 +    isconfig = False
   79.49      for o, a in opts:
   79.50          if o in ("-q", "--quiet"):
   79.51              interactive = False
   79.52 @@ -239,13 +245,15 @@ if __name__ == "__main__":
   79.53              entry = a
   79.54              # specifying the entry to boot implies non-interactive
   79.55              interactive = False
   79.56 +        elif o in ("--isconfig",):
   79.57 +            isconfig = True
   79.58  
   79.59      if output is None or output == "-":
   79.60          fd = sys.stdout.fileno()
   79.61      else:
   79.62          fd = os.open(output, os.O_WRONLY)
   79.63  
   79.64 -    cf = get_config(file)
   79.65 +    cf = get_config(file, isconfig)
   79.66      if interactive:
   79.67          curses.wrapper(run_main)
   79.68      else:
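
The new --isconfig switch short-circuits the disk-image path: the argument is taken to be a GRUB configuration file and parsed directly, rather than being probed as a disk image whose filesystem must be read. Presumably useful for testing the parser, e.g. pygrub --isconfig /boot/grub/menu.lst (path hypothetical).
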
    80.1 --- a/tools/python/xen/lowlevel/xc/xc.c	Wed Mar 01 10:01:54 2006 -0700
    80.2 +++ b/tools/python/xen/lowlevel/xc/xc.c	Wed Mar 01 12:47:25 2006 -0700
    80.3 @@ -326,27 +326,29 @@ static PyObject *pyxc_linux_build(XcObje
    80.4                                    PyObject *kwds)
    80.5  {
    80.6      uint32_t dom;
    80.7 -    char *image, *ramdisk = NULL, *cmdline = "";
    80.8 +    char *image, *ramdisk = NULL, *cmdline = "", *features = NULL;
    80.9      int flags = 0;
   80.10      int store_evtchn, console_evtchn;
   80.11      unsigned long store_mfn = 0;
   80.12      unsigned long console_mfn = 0;
   80.13  
   80.14 -    static char *kwd_list[] = { "dom", "store_evtchn", 
   80.15 -                                "console_evtchn", "image", 
   80.16 +    static char *kwd_list[] = { "dom", "store_evtchn",
   80.17 +                                "console_evtchn", "image",
   80.18  				/* optional */
   80.19 -				"ramdisk", "cmdline", "flags", NULL };
   80.20 +				"ramdisk", "cmdline", "flags",
   80.21 +				"features", NULL };
   80.22  
   80.23 -    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiis|ssi", kwd_list,
   80.24 +    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiis|ssis", kwd_list,
   80.25                                        &dom, &store_evtchn,
   80.26 -				      &console_evtchn, &image, 
   80.27 +				      &console_evtchn, &image,
   80.28  				      /* optional */
   80.29 -				      &ramdisk, &cmdline, &flags) )
   80.30 +				      &ramdisk, &cmdline, &flags,
   80.31 +				      &features) )
   80.32          return NULL;
   80.33  
   80.34      if ( xc_linux_build(self->xc_handle, dom, image,
   80.35 -                        ramdisk, cmdline, flags,
   80.36 -                        store_evtchn, &store_mfn, 
   80.37 +                        ramdisk, cmdline, features, flags,
   80.38 +                        store_evtchn, &store_mfn,
   80.39  			console_evtchn, &console_mfn) != 0 ) {
   80.40          if (!errno)
   80.41               errno = EINVAL;
    81.1 --- a/tools/python/xen/xend/XendBootloader.py	Wed Mar 01 10:01:54 2006 -0700
    81.2 +++ b/tools/python/xen/xend/XendBootloader.py	Wed Mar 01 12:47:25 2006 -0700
    81.3 @@ -1,7 +1,7 @@
    81.4  #
    81.5  # XendBootloader.py - Framework to run a boot loader for picking the kernel
    81.6  #
    81.7 -# Copyright 2005 Red Hat, Inc.
    81.8 +# Copyright 2005-2006 Red Hat, Inc.
    81.9  # Jeremy Katz <katzj@redhat.com>
   81.10  #
   81.11  # This software may be freely redistributed under the terms of the GNU
   81.12 @@ -13,13 +13,12 @@
   81.13  #
   81.14  
   81.15  import os, select, errno
   81.16 +import random
   81.17  import sxp
   81.18  
   81.19  from XendLogging import log
   81.20  from XendError import VmError
   81.21  
   81.22 -BL_FIFO = "/var/lib/xen/xenbl"
   81.23 -
   81.24  def bootloader(blexec, disk, quiet = 0, vcpus = None, entry = None):
   81.25      """Run the boot loader executable on the given disk and return a
   81.26      config image.
   81.27 @@ -38,14 +37,18 @@ def bootloader(blexec, disk, quiet = 0, 
   81.28          log.error(msg)
   81.29          raise VmError(msg)
   81.30  
   81.31 -    os.mkfifo(BL_FIFO, 0600)
   81.32 +    while True:
   81.33 +        fifo = "/var/lib/xen/xenbl.%s" %(random.randint(0, 32000),)
   81.34 +        if not os.path.exists(fifo):
   81.35 +            break
   81.36 +    os.mkfifo(fifo, 0600)
   81.37  
   81.38      child = os.fork()
   81.39      if (not child):
   81.40          args = [ blexec ]
   81.41          if quiet:
   81.42              args.append("-q")
   81.43 -        args.append("--output=%s" %(BL_FIFO,))
   81.44 +        args.append("--output=%s" %(fifo,))
   81.45          if entry is not None:
   81.46              args.append("--entry=%s" %(entry,))
   81.47          args.append(disk)
   81.48 @@ -59,7 +62,7 @@ def bootloader(blexec, disk, quiet = 0, 
   81.49  
   81.50      while 1:
   81.51          try:
   81.52 -            r = os.open(BL_FIFO, os.O_RDONLY)
   81.53 +            r = os.open(fifo, os.O_RDONLY)
   81.54          except OSError, e:
   81.55              if e.errno == errno.EINTR:
   81.56                  continue
   81.57 @@ -74,7 +77,7 @@ def bootloader(blexec, disk, quiet = 0, 
   81.58          
   81.59      os.waitpid(child, 0)
   81.60      os.close(r)
   81.61 -    os.unlink(BL_FIFO)
   81.62 +    os.unlink(fifo)
   81.63  
   81.64      if len(ret) == 0:
   81.65          msg = "Boot loader didn't return any data!"
    82.1 --- a/tools/python/xen/xend/XendDomainInfo.py	Wed Mar 01 10:01:54 2006 -0700
    82.2 +++ b/tools/python/xen/xend/XendDomainInfo.py	Wed Mar 01 12:47:25 2006 -0700
    82.3 @@ -1502,15 +1502,14 @@ class XendDomainInfo:
    82.4          if not self.info['bootloader']:
    82.5              return
    82.6          # if we're restarting with a bootloader, we need to run it
    82.7 -        # FIXME: this assumes the disk is the first device and
    82.8 -        # that we're booting from the first disk
    82.9          blcfg = None
   82.10          config = self.sxpr()
   82.11          # FIXME: this assumes that we want to use the first disk
   82.12 -        dev = sxp.child_value(config, "device")
   82.13 -        if dev:
   82.14 -            disk = sxp.child_value(dev, "uname")
   82.15 -            fn = blkdev_uname_to_file(disk)
   82.16 +        for dev in sxp.children(config, "device"):
   82.17 +            disk = sxp.child(dev, "vbd")
   82.18 +            if disk is None:
   82.19 +                continue
   82.20 +            fn = blkdev_uname_to_file(sxp.child_value(disk, "uname"))
   82.21              blcfg = bootloader(self.info['bootloader'], fn, 1,
   82.22                                 self.info['vcpus'])
   82.23          if blcfg is None:
    83.1 --- a/tools/python/xen/xend/image.py	Wed Mar 01 10:01:54 2006 -0700
    83.2 +++ b/tools/python/xen/xend/image.py	Wed Mar 01 12:47:25 2006 -0700
    83.3 @@ -68,6 +68,7 @@ class ImageHandler:
    83.4          self.kernel = None
    83.5          self.ramdisk = None
    83.6          self.cmdline = None
    83.7 +        self.features = None
    83.8  
    83.9          self.configure(imageConfig, deviceConfig)
   83.10  
   83.11 @@ -89,6 +90,7 @@ class ImageHandler:
   83.12          if args:
   83.13              self.cmdline += " " + args
   83.14          self.ramdisk = get_cfg("ramdisk", '')
   83.15 +        self.features = get_cfg("features", '')
   83.16          
   83.17          self.vm.storeVm(("image/ostype", self.ostype),
   83.18                          ("image/kernel", self.kernel),
   83.19 @@ -175,13 +177,15 @@ class LinuxImageHandler(ImageHandler):
   83.20          log.debug("cmdline        = %s", self.cmdline)
   83.21          log.debug("ramdisk        = %s", self.ramdisk)
   83.22          log.debug("vcpus          = %d", self.vm.getVCpuCount())
   83.23 +        log.debug("features       = %s", self.features)
   83.24  
   83.25          return xc.linux_build(dom            = self.vm.getDomid(),
   83.26                                image          = self.kernel,
   83.27                                store_evtchn   = store_evtchn,
   83.28                                console_evtchn = console_evtchn,
   83.29                                cmdline        = self.cmdline,
   83.30 -                              ramdisk        = self.ramdisk)
   83.31 +                              ramdisk        = self.ramdisk,
   83.32 +                              features       = self.features)
   83.33  
   83.34  class HVMImageHandler(ImageHandler):
   83.35  
    84.1 --- a/tools/python/xen/xend/server/netif.py	Wed Mar 01 10:01:54 2006 -0700
    84.2 +++ b/tools/python/xen/xend/server/netif.py	Wed Mar 01 12:47:25 2006 -0700
    84.3 @@ -113,7 +113,8 @@ class NetifController(DevController):
    84.4                             script.replace(xroot.network_script_dir + os.sep,
    84.5                                            "")])
    84.6          if ip:
    84.7 -            result.append(['ip', ip.split(" ")])
    84.8 +            for i in ip.split(" "):
    84.9 +                result.append(['ip', i])
   84.10          if bridge:
   84.11              result.append(['bridge', bridge])
   84.12          if mac:
    85.1 --- a/tools/python/xen/xm/create.py	Wed Mar 01 10:01:54 2006 -0700
    85.2 +++ b/tools/python/xen/xm/create.py	Wed Mar 01 12:47:25 2006 -0700
    85.3 @@ -137,6 +137,10 @@ gopts.var('ramdisk', val='FILE',
    85.4            fn=set_value, default='',
    85.5            use="Path to ramdisk.")
    85.6  
    85.7 +gopts.var('features', val='FEATURES',
    85.8 +          fn=set_value, default='',
    85.9 +          use="Features to enable in guest kernel")
   85.10 +
   85.11  gopts.var('builder', val='FUNCTION',
   85.12            fn=set_value, default='linux',
   85.13            use="Function to use to build the domain.")
   85.14 @@ -445,6 +449,8 @@ def configure_image(vals):
   85.15          config_image.append(['root', cmdline_root])
   85.16      if vals.extra:
   85.17          config_image.append(['args', vals.extra])
   85.18 +    if vals.features:
   85.19 +        config_image.append(['features', vals.features])
   85.20  
   85.21      if vals.builder == 'hvm':
   85.22          configure_hvm(config_image, vals)
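
With the option plumbed through create.py, image.py and the xc binding, a guest's feature string can presumably be given in the domain configuration, e.g. features = 'writable_page_tables' (a hypothetical example; the value uses the same '|'/'!' syntax that libxc's parse_features accepts).
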
    86.1 --- a/tools/tests/Makefile	Wed Mar 01 10:01:54 2006 -0700
    86.2 +++ b/tools/tests/Makefile	Wed Mar 01 12:47:25 2006 -0700
    86.3 @@ -4,13 +4,12 @@ include $(XEN_ROOT)/tools/Rules.mk
    86.4  
    86.5  TARGET := test_x86_emulator
    86.6  
    86.7 -CC     := gcc
    86.8 -CFLAGS := -O2 -Wall -Werror -D__TEST_HARNESS__
    86.9 +HOSTCFLAGS += -D__TEST_HARNESS__
   86.10  
   86.11  all: $(TARGET)
   86.12  
   86.13  $(TARGET): x86_emulate.o test_x86_emulator.o
   86.14 -	$(CC) -o $@ $^
   86.15 +	$(HOSTCC) -o $@ $^
   86.16  
   86.17  clean:
   86.18  	rm -rf $(TARGET) *.o *~ core
   86.19 @@ -18,7 +17,7 @@ clean:
   86.20  install:
   86.21  
   86.22  x86_emulate.o: $(XEN_ROOT)/xen/arch/x86/x86_emulate.c
   86.23 -	$(CC) $(CFLAGS) -I$(XEN_ROOT)/xen/include -c -o $@ $<
   86.24 +	$(HOSTCC) $(HOSTCFLAGS) -I$(XEN_ROOT)/xen/include -c -o $@ $<
   86.25  
   86.26  %.o: %.c
   86.27 -	$(CC) $(CFLAGS) -I$(XEN_ROOT)/xen/include -c -o $@ $<
   86.28 +	$(HOSTCC) $(HOSTCFLAGS) -I$(XEN_ROOT)/xen/include -c -o $@ $<
    87.1 --- a/tools/xenstore/xs.c	Wed Mar 01 10:01:54 2006 -0700
    87.2 +++ b/tools/xenstore/xs.c	Wed Mar 01 12:47:25 2006 -0700
    87.3 @@ -31,7 +31,6 @@
    87.4  #include <signal.h>
    87.5  #include <stdint.h>
    87.6  #include <errno.h>
    87.7 -#include <sys/ioctl.h>
    87.8  #include <pthread.h>
    87.9  #include "xs.h"
   87.10  #include "list.h"
   87.11 @@ -343,7 +342,6 @@ static void *xs_talkv(struct xs_handle *
   87.12  		free(ret);
   87.13  		saved_errno = EBADF;
   87.14  		goto close_fd;
   87.15 -		
   87.16  	}
   87.17  	return ret;
   87.18  
    88.1 --- a/tools/xm-test/configure.ac	Wed Mar 01 10:01:54 2006 -0700
    88.2 +++ b/tools/xm-test/configure.ac	Wed Mar 01 12:47:25 2006 -0700
    88.3 @@ -93,6 +93,7 @@ AC_CONFIG_FILES([
    88.4      tests/unpause/Makefile
    88.5      tests/vcpu-pin/Makefile
    88.6      tests/vcpu-disable/Makefile
    88.7 +    tests/vtpm/Makefile
    88.8      tests/enforce_dom0_cpus/Makefile
    88.9      lib/XmTestReport/xmtest.py
   88.10      lib/XmTestLib/config.py
    89.1 --- a/tools/xm-test/lib/XmTestLib/Network.py	Wed Mar 01 10:01:54 2006 -0700
    89.2 +++ b/tools/xm-test/lib/XmTestLib/Network.py	Wed Mar 01 12:47:25 2006 -0700
    89.3 @@ -22,6 +22,7 @@
    89.4  import sys;
    89.5  import os;
    89.6  import atexit;
    89.7 +import random;
    89.8  
    89.9  from Test import *
   89.10  from Xm import *
   89.11 @@ -53,12 +54,22 @@ class XmNetwork:
   89.12          if rc == 0:
   89.13              SKIP("Zeroconf address found: " + out)
   89.14  
   89.15 +        # Randomize one octet of the IP addresses we choose, so that
   89.16 +        # multiple machines running network tests don't interfere 
   89.17 +        # with each other. 
   89.18 +        self.subnet = random.randint(1,254)
   89.19 +
   89.20      def calc_ip_address(self, dom, interface):
   89.21          # Generate an IP address from the dom# and eth#:
   89.22 -        #      169.254.(eth#+153).(dom#+10)
   89.23 +        #      169.254.(self.subnet).(eth# * 16 + dom# + 1)
   89.24          ethnum = int(interface[len("eth"):])
   89.25 +        if (ethnum > 15):
   89.26 +            raise NetworkError("ethnum > 15 : " + interface)
   89.27          domnum = int(dom[len("dom"):])
   89.28 -        return "169.254."+ str(ethnum+153) + "." + str(domnum+10)
   89.29 +        if (domnum > 14):
   89.30 +            raise NetworkError("domnum > 14 : " + dom)
   89.31 +
   89.32 +        return "169.254."+ str(self.subnet) + "." + str(ethnum*16+domnum+1)
   89.33  
   89.34      def ip(self, dom, interface, todomname=None, toeth=None, bridge=None):
   89.35          newip = self.calc_ip_address(dom, interface)
   89.36 @@ -96,4 +107,4 @@ class XmNetwork:
   89.37          return newip
   89.38  
   89.39      def mask(self, dom, interface):
   89.40 -        return "255.255.255.0"
   89.41 +        return "255.255.255.240"
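
A worked example of the new addressing scheme: with randomly chosen subnet octet S, dom1/eth0 gets 169.254.S.(0*16 + 1 + 1) = 169.254.S.2, while dom1/eth1 gets 169.254.S.(1*16 + 1 + 1) = 169.254.S.18. The 255.255.255.240 (/28) netmask carves the last octet into 16-address blocks, so each eth# occupies its own block, and the dom# <= 14 check keeps host values within the 1..15 range of a block.
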
    90.1 --- a/tools/xm-test/lib/XmTestLib/XenDomain.py	Wed Mar 01 10:01:54 2006 -0700
    90.2 +++ b/tools/xm-test/lib/XmTestLib/XenDomain.py	Wed Mar 01 12:47:25 2006 -0700
    90.3 @@ -99,6 +99,7 @@ class XenConfig:
    90.4          # These options need to be lists
    90.5          self.defaultOpts["disk"] = []
    90.6          self.defaultOpts["vif"]  = []
    90.7 +        self.defaultOpts["vtpm"] = []
    90.8  
    90.9          self.opts = self.defaultOpts
   90.10  
    91.1 --- a/tools/xm-test/tests/Makefile.am	Wed Mar 01 10:01:54 2006 -0700
    91.2 +++ b/tools/xm-test/tests/Makefile.am	Wed Mar 01 12:47:25 2006 -0700
    91.3 @@ -23,6 +23,7 @@ SUBDIRS = 	               	\
    91.4  		unpause         \
    91.5  		vcpu-disable    \
    91.6  		vcpu-pin	\
    91.7 +		vtpm            \
    91.8  		enforce_dom0_cpus	\
    91.9  		save restore migrate
   91.10  
    92.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    92.2 +++ b/tools/xm-test/tests/vtpm/01_vtpm-list_pos.py	Wed Mar 01 12:47:25 2006 -0700
    92.3 @@ -0,0 +1,45 @@
    92.4 +#!/usr/bin/python
    92.5 +
    92.6 +# Copyright (C) International Business Machines Corp., 2006
    92.7 +# Author: Stefan Berger <stefanb@us.ibm.com>
    92.8 +
    92.9 +# Positive Test: create domain with virtual TPM attached at build time,
   92.10 +#                verify list
   92.11 +
   92.12 +
   92.13 +from XmTestLib import *
   92.14 +
   92.15 +def vtpm_cleanup(domName):
   92.16 +	# Since this is only a temporary domain I clean up the domain from the
   92.17 +	# Since this is only a temporary domain, clean up its entry in the
   92.18 +	# virtual TPM directory
   92.19 +
   92.20 +if ENABLE_HVM_SUPPORT:
   92.21 +    SKIP("vtpm-list not supported for HVM domains")
   92.22 +
   92.23 +config = {"vtpm":"instance=1,backend=0"}
   92.24 +domain = XmTestDomain(extraConfig=config)
   92.25 +
   92.26 +try:
   92.27 +    domain.start()
   92.28 +except DomainError, e:
   92.29 +    if verbose:
   92.30 +        print e.extra
   92.31 +    vtpm_cleanup(domain.getName())
   92.32 +    FAIL("Unable to create domain")
   92.33 +
   92.34 +domName = domain.getName()
   92.35 +
   92.36 +status, output = traceCommand("xm vtpm-list %s" % domain.getId())
   92.37 +eyecatcher = "/local/domain/0/backend/vtpm"
   92.38 +where = output.find(eyecatcher)
   92.39 +if status != 0:
   92.40 +    vtpm_cleanup(domName)
   92.41 +    FAIL("xm vtpm-list returned bad status, expected 0, status is %i" % status)
   92.42 +elif where < 0:
   92.43 +    vtpm_cleanup(domName)
   92.44 +    FAIL("Fail to list virtual TPM device")
   92.45 +
   92.46 +domain.stop()
   92.47 +
   92.48 +vtpm_cleanup(domName)
    93.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    93.2 +++ b/tools/xm-test/tests/vtpm/02_vtpm-cat_pcrs.py	Wed Mar 01 12:47:25 2006 -0700
    93.3 @@ -0,0 +1,81 @@
    93.4 +#!/usr/bin/python
    93.5 +
    93.6 +# Copyright (C) International Business Machines Corp., 2006
    93.7 +# Author: Stefan Berger <stefanb@us.ibm.com>
    93.8 +
    93.9 +# Positive Test: create domain with virtual TPM attached at build time,
   93.10 +#                check list of pcrs
   93.11 +
   93.12 +from XmTestLib import *
   93.13 +
   93.14 +def vtpm_cleanup(domName):
   93.15 +	# Since this is only a temporary domain, clean up its entry in the
   93.16 +	# virtual TPM directory
   93.17 +	traceCommand("/etc/xen/scripts/vtpm-delete %s" % domName)
   93.18 +
   93.19 +if ENABLE_HVM_SUPPORT:
   93.20 +    SKIP("vtpm tests not supported for HVM domains")
   93.21 +
   93.22 +status, output = traceCommand("ls /dev/tpm0")
   93.23 +if re.search("No such file or directory",output):
   93.24 +    SKIP("This machine has no hardware TPM; cannot run this test")
   93.25 +
   93.26 +status, output = traceCommand("ps aux | grep vtpm_manager | grep -v grep")
   93.27 +if output == "":
   93.28 +    FAIL("virtual TPM manager must be started to run this test")
   93.29 +
   93.30 +# vtpm manager has been detected
   93.31 +config = {"vtpm":"instance=1,backend=0"}
   93.32 +domain = XmTestDomain(extraConfig=config)
   93.33 +
   93.34 +try:
   93.35 +    domain.start()
   93.36 +except DomainError, e:
   93.37 +    if verbose:
   93.38 +        print e.extra
   93.39 +    vtpm_cleanup(domain.getName())
   93.40 +    FAIL("Unable to create domain")
   93.41 +
   93.42 +domName = domain.getName()
   93.43 +
   93.44 +try:
   93.45 +    console = XmConsole(domain.getName())
   93.46 +except ConsoleError, e:
   93.47 +    vtpm_cleanup(domName)
   93.48 +    FAIL(str(e))
   93.49 +
   93.50 +try:
   93.51 +    console.sendInput("input")
   93.52 +    run = console.runCmd("ls /sys")
   93.53 +except ConsoleError, e:
   93.54 +    saveLog(console.getHistory())
   93.55 +    vtpm_cleanup(domName)
   93.56 +    FAIL(str(e))
   93.57 +
   93.58 +if re.search("No such file",run["output"]):
   93.59 +    try:
   93.60 +        run = console.runCmd("mkdir /sys")
   93.61 +        run = console.runCmd("mount -t sysfs /sys /sys")
   93.62 +    except ConsoleError, e:
   93.63 +        saveLog(console.getHistory())
   93.64 +        vtpm_cleanup(domName)
   93.65 +        FAIL(str(e))
   93.66 +
   93.67 +try:
   93.68 +    run = console.runCmd("cat /sys/devices/platform/tpm_vtpm/pcrs")
   93.69 +except ConsoleError, e:
   93.70 +    saveLog(console.getHistory())
   93.71 +    vtpm_cleanup(domName)
   93.72 +    FAIL(str(e))
   93.73 +
   93.74 +if re.search("No such file",run["output"]):
   93.75 +    FAIL("TPM frontend support not compiled into (domU?) kernel")
   93.76 +
   93.77 +console.closeConsole()
   93.78 +
   93.79 +domain.stop()
   93.80 +
   93.81 +vtpm_cleanup(domName)
   93.82 +
   93.83 +if not re.search("PCR-00:",run["output"]):
   93.84 +    FAIL("Virtual TPM is not working correctly on /dev/vtpm on backend side")
    94.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    94.2 +++ b/tools/xm-test/tests/vtpm/Makefile.am	Wed Mar 01 12:47:25 2006 -0700
    94.3 @@ -0,0 +1,22 @@
    94.4 +
    94.5 +SUBDIRS =
    94.6 +
    94.7 +TESTS = 01_vtpm-list_pos.test \
    94.8 +        02_vtpm-cat_pcrs.test
    94.9 +
   94.10 +XFAIL_TESTS =
   94.11 +
   94.12 +EXTRA_DIST = $(TESTS) $(XFAIL_TESTS)
   94.13 +
   94.14 +TESTS_ENVIRONMENT=@TENV@
   94.15 +
   94.16 +%.test: %.py
   94.17 +	cp $< $@
   94.18 +	chmod +x $@
   94.19 +
   94.20 +clean-local: am_config_clean-local
   94.21 +
   94.22 +am_config_clean-local:
   94.23 +	rm -f *test
   94.24 +	rm -f *log
   94.25 +	rm -f *~
    95.1 --- a/xen/Rules.mk	Wed Mar 01 10:01:54 2006 -0700
    95.2 +++ b/xen/Rules.mk	Wed Mar 01 12:47:25 2006 -0700
    95.3 @@ -45,7 +45,7 @@ ALL_OBJS += $(BASEDIR)/arch/$(TARGET_ARC
    95.4  
    95.5  include $(BASEDIR)/arch/$(TARGET_ARCH)/Rules.mk
    95.6  
    95.7 -CFLAGS += -g
    95.8 +CFLAGS += -g -D__XEN__
    95.9  
   95.10  ifneq ($(debug),y)
   95.11  CFLAGS += -DNDEBUG
    96.1 --- a/xen/arch/ia64/vmx/vmx_hypercall.c	Wed Mar 01 10:01:54 2006 -0700
    96.2 +++ b/xen/arch/ia64/vmx/vmx_hypercall.c	Wed Mar 01 12:47:25 2006 -0700
    96.3 @@ -57,45 +57,7 @@ void hyper_mmu_update(void)
    96.4      vcpu_set_gr(vcpu, 8, ret, 0);
    96.5      vmx_vcpu_increment_iip(vcpu);
    96.6  }
    96.7 -/* turn off temporarily, we will merge hypercall parameter convention with xeno, when
    96.8 -    VTI domain need to call hypercall */
    96.9 -#if 0
   96.10 -unsigned long __hypercall_create_continuation(
   96.11 -    unsigned int op, unsigned int nr_args, ...)
   96.12 -{
   96.13 -    struct mc_state *mcs = &mc_state[smp_processor_id()];
   96.14 -    VCPU *vcpu = current;
   96.15 -    struct cpu_user_regs *regs = vcpu_regs(vcpu);
   96.16 -    unsigned int i;
   96.17 -    va_list args;
   96.18  
   96.19 -    va_start(args, nr_args);
   96.20 -    if ( test_bit(_MCSF_in_multicall, &mcs->flags) ) {
   96.21 -	panic("PREEMPT happen in multicall\n");	// Not support yet
   96.22 -    } else {
   96.23 -	vcpu_set_gr(vcpu, 15, op, 0);
   96.24 -	for ( i = 0; i < nr_args; i++) {
   96.25 -	    switch (i) {
   96.26 -	    case 0: vcpu_set_gr(vcpu, 16, va_arg(args, unsigned long), 0);
   96.27 -		    break;
   96.28 -	    case 1: vcpu_set_gr(vcpu, 17, va_arg(args, unsigned long), 0);
   96.29 -		    break;
   96.30 -	    case 2: vcpu_set_gr(vcpu, 18, va_arg(args, unsigned long), 0);
   96.31 -		    break;
   96.32 -	    case 3: vcpu_set_gr(vcpu, 19, va_arg(args, unsigned long), 0);
   96.33 -		    break;
   96.34 -	    case 4: vcpu_set_gr(vcpu, 20, va_arg(args, unsigned long), 0);
   96.35 -		    break;
   96.36 -	    default: panic("Too many args for hypercall continuation\n");
   96.37 -		    break;
   96.38 -	    }
   96.39 -	}
   96.40 -    }
   96.41 -    vcpu->arch.hypercall_continuation = 1;
   96.42 -    va_end(args);
   96.43 -    return op;
   96.44 -}
   96.45 -#endif
   96.46  void hyper_dom_mem_op(void)
   96.47  {
   96.48      VCPU *vcpu=current;
    97.1 --- a/xen/arch/ia64/xen/process.c	Wed Mar 01 10:01:54 2006 -0700
    97.2 +++ b/xen/arch/ia64/xen/process.c	Wed Mar 01 12:47:25 2006 -0700
    97.3 @@ -801,30 +801,48 @@ printf("*** Handled privop masquerading 
    97.4  	reflect_interruption(isr,regs,vector);
    97.5  }
    97.6  
    97.7 -unsigned long __hypercall_create_continuation(
    97.8 -	unsigned int op, unsigned int nr_args, ...)
    97.9 +unsigned long hypercall_create_continuation(
   97.10 +	unsigned int op, const char *format, ...)
   97.11  {
   97.12      struct mc_state *mcs = &mc_state[smp_processor_id()];
   97.13      struct vcpu *v = current;
   97.14 +    const char *p = format;
   97.15 +    unsigned long arg;
   97.16      unsigned int i;
   97.17      va_list args;
   97.18  
   97.19 -    va_start(args, nr_args);
   97.20 +    va_start(args, format);
   97.21      if ( test_bit(_MCSF_in_multicall, &mcs->flags) ) {
   97.22  	panic("PREEMPT happen in multicall\n");	// Not support yet
   97.23      } else {
   97.24  	vcpu_set_gr(v, 2, op, 0);
   97.25 -	for ( i = 0; i < nr_args; i++) {
   97.26 +	for ( i = 0; *p != '\0'; i++) {
   97.27 +            switch ( *p++ )
   97.28 +            {
   97.29 +            case 'i':
   97.30 +                arg = (unsigned long)va_arg(args, unsigned int);
   97.31 +                break;
   97.32 +            case 'l':
   97.33 +                arg = (unsigned long)va_arg(args, unsigned long);
   97.34 +                break;
   97.35 +            case 'p':
   97.36 +            case 'h':
   97.37 +                arg = (unsigned long)va_arg(args, void *);
   97.38 +                break;
   97.39 +            default:
   97.40 +                arg = 0;
   97.41 +                BUG();
   97.42 +            }
   97.43  	    switch (i) {
   97.44 -	    case 0: vcpu_set_gr(v, 14, va_arg(args, unsigned long), 0);
   97.45 +	    case 0: vcpu_set_gr(v, 14, arg, 0);
   97.46  		    break;
   97.47 -	    case 1: vcpu_set_gr(v, 15, va_arg(args, unsigned long), 0);
   97.48 +	    case 1: vcpu_set_gr(v, 15, arg, 0);
   97.49  		    break;
   97.50 -	    case 2: vcpu_set_gr(v, 16, va_arg(args, unsigned long), 0);
   97.51 +	    case 2: vcpu_set_gr(v, 16, arg, 0);
   97.52  		    break;
   97.53 -	    case 3: vcpu_set_gr(v, 17, va_arg(args, unsigned long), 0);
   97.54 +	    case 3: vcpu_set_gr(v, 17, arg, 0);
   97.55  		    break;
   97.56 -	    case 4: vcpu_set_gr(v, 18, va_arg(args, unsigned long), 0);
   97.57 +	    case 4: vcpu_set_gr(v, 18, arg, 0);
   97.58  		    break;
   97.59  	    default: panic("Too many args for hypercall continuation\n");
   97.60  		    break;
    98.1 --- a/xen/arch/x86/Makefile	Wed Mar 01 10:01:54 2006 -0700
    98.2 +++ b/xen/arch/x86/Makefile	Wed Mar 01 12:47:25 2006 -0700
    98.3 @@ -33,6 +33,10 @@ ifeq ($(TARGET_SUBARCH),x86_32)
    98.4   endif
    98.5  endif
    98.6  
    98.7 +ifneq ($(supervisor_mode_kernel),y)
    98.8 +OBJS := $(subst x86_32/supervisor_mode_kernel.o,,$(OBJS))
    98.9 +endif
   98.10 +
   98.11  OBJS := $(subst $(TARGET_SUBARCH)/asm-offsets.o,,$(OBJS))
   98.12  OBJS := $(subst $(TARGET_SUBARCH)/xen.lds.o,,$(OBJS))
   98.13  
   98.14 @@ -44,7 +48,7 @@ default: $(TARGET)
   98.15  
   98.16  $(TARGET): $(TARGET)-syms boot/mkelf32
   98.17  	./boot/mkelf32 $(TARGET)-syms $(TARGET) 0x100000 \
   98.18 -	`nm $(TARGET)-syms | sort | tail -n 1 | sed -e 's/^\([^ ]*\).*/0x\1/'`
   98.19 +	`$(NM) $(TARGET)-syms | sort | tail -n 1 | sed -e 's/^\([^ ]*\).*/0x\1/'`
   98.20  
   98.21  $(CURDIR)/arch.o: $(OBJS)
   98.22  	$(LD) $(LDFLAGS) -r -o $@ $(OBJS)
    99.1 --- a/xen/arch/x86/Rules.mk	Wed Mar 01 10:01:54 2006 -0700
    99.2 +++ b/xen/arch/x86/Rules.mk	Wed Mar 01 12:47:25 2006 -0700
    99.3 @@ -6,6 +6,7 @@
    99.4  # 'make clean' before rebuilding.
    99.5  #
    99.6  pae ?= n
    99.7 +supervisor_mode_kernel ?= n
    99.8  
    99.9  CFLAGS  += -nostdinc -fno-builtin -fno-common -fno-strict-aliasing
   99.10  CFLAGS  += -iwithprefix include -Wall -Werror -Wno-pointer-arith -pipe
   99.11 @@ -32,6 +33,9 @@ ifeq ($(pae),y)
   99.12  CFLAGS  += -DCONFIG_X86_PAE=1
   99.13  endif
   99.14  endif
   99.15 +ifeq ($(supervisor_mode_kernel),y)
   99.16 +CFLAGS  += -DCONFIG_X86_SUPERVISOR_MODE_KERNEL=1
   99.17 +endif
   99.18  
   99.19  ifeq ($(TARGET_SUBARCH),x86_64)
   99.20  CFLAGS  += -m64 -mno-red-zone -fpic -fno-reorder-blocks
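
Like the existing pae knob beside it, the new option is presumably selected on the make command line (supervisor_mode_kernel=y), which defines CONFIG_X86_SUPERVISOR_MODE_KERNEL here and, per the Makefile change above, keeps x86_32/supervisor_mode_kernel.o in the link.
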
   100.1 --- a/xen/arch/x86/boot/mkelf32.c	Wed Mar 01 10:01:54 2006 -0700
   100.2 +++ b/xen/arch/x86/boot/mkelf32.c	Wed Mar 01 12:47:25 2006 -0700
   100.3 @@ -244,7 +244,7 @@ int main(int argc, char **argv)
   100.4  
   100.5      inimage  = argv[1];
   100.6      outimage = argv[2];
   100.7 -    loadbase = strtoul(argv[3], NULL, 16);
   100.8 +    loadbase = strtoull(argv[3], NULL, 16);
   100.9      final_exec_addr = strtoul(argv[4], NULL, 16);
  100.10  
  100.11      infd = open(inimage, O_RDONLY);
   101.1 --- a/xen/arch/x86/dom0_ops.c	Wed Mar 01 10:01:54 2006 -0700
   101.2 +++ b/xen/arch/x86/dom0_ops.c	Wed Mar 01 12:47:25 2006 -0700
   101.3 @@ -181,10 +181,13 @@ long arch_do_dom0_op(struct dom0_op *op,
   101.4      {
   101.5          dom0_physinfo_t *pi = &op->u.physinfo;
   101.6  
   101.7 -        pi->threads_per_core = smp_num_siblings;
   101.8 -        pi->cores_per_socket = boot_cpu_data.x86_max_cores;
   101.9 +        pi->threads_per_core =
  101.10 +            cpus_weight(cpu_sibling_map[0]);
  101.11 +        pi->cores_per_socket =
  101.12 +            cpus_weight(cpu_core_map[0]) / pi->threads_per_core;
  101.13          pi->sockets_per_node = 
  101.14 -            num_online_cpus() / (pi->threads_per_core * pi->cores_per_socket);
  101.15 +            num_online_cpus() / cpus_weight(cpu_core_map[0]);
  101.16 +
  101.17          pi->nr_nodes         = 1;
  101.18          pi->total_pages      = total_pages;
  101.19          pi->free_pages       = avail_domheap_pages();
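
A worked example of the new topology arithmetic, for a hypothetical two-socket machine with dual-core, hyperthreaded CPUs: cpus_weight(cpu_sibling_map[0]) = 2 threads per core; cpus_weight(cpu_core_map[0]) = 4 threads per socket, giving cores_per_socket = 4/2 = 2; with 8 CPUs online, sockets_per_node = 8/4 = 2.
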
   102.1 --- a/xen/arch/x86/domain.c	Wed Mar 01 10:01:54 2006 -0700
   102.2 +++ b/xen/arch/x86/domain.c	Wed Mar 01 12:47:25 2006 -0700
   102.3 @@ -351,17 +351,17 @@ int arch_set_info_guest(
   102.4  
   102.5      if ( !(c->flags & VGCF_HVM_GUEST) )
   102.6      {
   102.7 -        fixup_guest_selector(c->user_regs.ss);
   102.8 -        fixup_guest_selector(c->kernel_ss);
   102.9 -        fixup_guest_selector(c->user_regs.cs);
  102.10 +        fixup_guest_stack_selector(c->user_regs.ss);
  102.11 +        fixup_guest_stack_selector(c->kernel_ss);
  102.12 +        fixup_guest_code_selector(c->user_regs.cs);
  102.13  
  102.14  #ifdef __i386__
  102.15 -        fixup_guest_selector(c->event_callback_cs);
  102.16 -        fixup_guest_selector(c->failsafe_callback_cs);
  102.17 +        fixup_guest_code_selector(c->event_callback_cs);
  102.18 +        fixup_guest_code_selector(c->failsafe_callback_cs);
  102.19  #endif
  102.20  
  102.21          for ( i = 0; i < 256; i++ )
  102.22 -            fixup_guest_selector(c->trap_ctxt[i].cs);
  102.23 +            fixup_guest_code_selector(c->trap_ctxt[i].cs);
  102.24      }
  102.25      else if ( !hvm_enabled )
  102.26        return -EINVAL;
  102.27 @@ -784,6 +784,11 @@ void context_switch(struct vcpu *prev, s
  102.28  
  102.29      context_saved(prev);
  102.30  
  102.31 +    /* Update per-VCPU guest runstate shared memory area (if registered). */
  102.32 +    if ( next->runstate_guest != NULL )
  102.33 +        __copy_to_user(next->runstate_guest, &next->runstate,
  102.34 +                       sizeof(next->runstate));
  102.35 +
  102.36      schedule_tail(next);
  102.37      BUG();
  102.38  }
  102.39 @@ -820,56 +825,77 @@ void sync_vcpu_execstate(struct vcpu *v)
  102.40      flush_tlb_mask(v->vcpu_dirty_cpumask);
  102.41  }
  102.42  
  102.43 -unsigned long __hypercall_create_continuation(
  102.44 -    unsigned int op, unsigned int nr_args, ...)
  102.45 +#define next_arg(fmt, args) ({                                              \
  102.46 +    unsigned long __arg;                                                    \
  102.47 +    switch ( *(fmt)++ )                                                     \
  102.48 +    {                                                                       \
  102.49 +    case 'i': __arg = (unsigned long)va_arg(args, unsigned int);  break;    \
  102.50 +    case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break;    \
  102.51 +    case 'p': __arg = (unsigned long)va_arg(args, void *);        break;    \
  102.52 +    case 'h': __arg = (unsigned long)va_arg(args, void *);        break;    \
  102.53 +    default:  __arg = 0; BUG();                                             \
  102.54 +    }                                                                       \
  102.55 +    __arg;                                                                  \
  102.56 +})
  102.57 +
  102.58 +unsigned long hypercall_create_continuation(
  102.59 +    unsigned int op, const char *format, ...)
  102.60  {
  102.61      struct mc_state *mcs = &mc_state[smp_processor_id()];
  102.62      struct cpu_user_regs *regs;
  102.63 +    const char *p = format;
  102.64 +    unsigned long arg;
  102.65      unsigned int i;
  102.66      va_list args;
  102.67  
  102.68 -    va_start(args, nr_args);
  102.69 +    va_start(args, format);
  102.70  
  102.71      if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
  102.72      {
  102.73          __set_bit(_MCSF_call_preempted, &mcs->flags);
  102.74  
  102.75 -        for ( i = 0; i < nr_args; i++ )
  102.76 -            mcs->call.args[i] = va_arg(args, unsigned long);
  102.77 +        for ( i = 0; *p != '\0'; i++ )
  102.78 +            mcs->call.args[i] = next_arg(p, args);
  102.79      }
  102.80      else
  102.81      {
  102.82          regs       = guest_cpu_user_regs();
  102.83  #if defined(__i386__)
  102.84          regs->eax  = op;
  102.85 -        regs->eip -= 2;  /* re-execute 'int 0x82' */
  102.86  
  102.87 -        for ( i = 0; i < nr_args; i++ )
  102.88 +        if ( supervisor_mode_kernel )
  102.89 +            regs->eip &= ~31; /* re-execute entire hypercall entry stub */
  102.90 +        else
  102.91 +            regs->eip -= 2;   /* re-execute 'int 0x82' */
  102.92 +
  102.93 +        for ( i = 0; *p != '\0'; i++ )
  102.94          {
  102.95 +            arg = next_arg(p, args);
  102.96              switch ( i )
  102.97              {
  102.98 -            case 0: regs->ebx = va_arg(args, unsigned long); break;
  102.99 -            case 1: regs->ecx = va_arg(args, unsigned long); break;
 102.100 -            case 2: regs->edx = va_arg(args, unsigned long); break;
 102.101 -            case 3: regs->esi = va_arg(args, unsigned long); break;
 102.102 -            case 4: regs->edi = va_arg(args, unsigned long); break;
 102.103 -            case 5: regs->ebp = va_arg(args, unsigned long); break;
 102.104 +            case 0: regs->ebx = arg; break;
 102.105 +            case 1: regs->ecx = arg; break;
 102.106 +            case 2: regs->edx = arg; break;
 102.107 +            case 3: regs->esi = arg; break;
 102.108 +            case 4: regs->edi = arg; break;
 102.109 +            case 5: regs->ebp = arg; break;
 102.110              }
 102.111          }
 102.112  #elif defined(__x86_64__)
 102.113          regs->rax  = op;
 102.114          regs->rip -= 2;  /* re-execute 'syscall' */
 102.115  
 102.116 -        for ( i = 0; i < nr_args; i++ )
 102.117 +        for ( i = 0; *p != '\0'; i++ )
 102.118          {
 102.119 +            arg = next_arg(p, args);
 102.120              switch ( i )
 102.121              {
 102.122 -            case 0: regs->rdi = va_arg(args, unsigned long); break;
 102.123 -            case 1: regs->rsi = va_arg(args, unsigned long); break;
 102.124 -            case 2: regs->rdx = va_arg(args, unsigned long); break;
 102.125 -            case 3: regs->r10 = va_arg(args, unsigned long); break;
 102.126 -            case 4: regs->r8  = va_arg(args, unsigned long); break;
 102.127 -            case 5: regs->r9  = va_arg(args, unsigned long); break;
 102.128 +            case 0: regs->rdi = arg; break;
 102.129 +            case 1: regs->rsi = arg; break;
 102.130 +            case 2: regs->rdx = arg; break;
 102.131 +            case 3: regs->r10 = arg; break;
 102.132 +            case 4: regs->r8  = arg; break;
 102.133 +            case 5: regs->r9  = arg; break;
 102.134              }
 102.135          }
 102.136  #endif
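
Under the new convention, callers describe their arguments with a format string ('i' = int, 'l' = unsigned long, 'p'/'h' = pointer or handle) instead of passing a bare count. A hedged sketch of a preemptible hypercall re-registering itself — the hypercall and arguments are hypothetical, not taken from this changeset:

    /* Illustrative only: one int argument and one pointer/handle argument. */
    if ( hypercall_preempt_check() )
        return hypercall_create_continuation(
            __HYPERVISOR_memory_op, "ih", cmd, arg);
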
   103.1 --- a/xen/arch/x86/domain_build.c	Wed Mar 01 10:01:54 2006 -0700
   103.2 +++ b/xen/arch/x86/domain_build.c	Wed Mar 01 12:47:25 2006 -0700
   103.3 @@ -28,6 +28,9 @@
   103.4  
   103.5  #include <public/version.h>
   103.6  
   103.7 +extern unsigned long initial_images_nrpages(void);
   103.8 +extern void discard_initial_images(void);
   103.9 +
  103.10  static long dom0_nrpages;
  103.11  
  103.12  /*
  103.13 @@ -181,7 +184,8 @@ static void parse_features(
  103.14          {
  103.15              printk("Unknown kernel feature \"%.*s\".\n",
  103.16                     (int)(p-feats), feats);
  103.17 -            panic("Domain 0 requires an unknown hypervisor feature.\n");
  103.18 +            if ( req )
  103.19 +                panic("Domain 0 requires an unknown hypervisor feature.\n");
  103.20          }
  103.21  
  103.22          feats = p;
  103.23 @@ -248,9 +252,6 @@ int construct_dom0(struct domain *d,
  103.24      uint32_t dom0_features_supported[XENFEAT_NR_SUBMAPS] = { 0 };
  103.25      uint32_t dom0_features_required[XENFEAT_NR_SUBMAPS] = { 0 };
  103.26  
  103.27 -    extern void translate_l2pgtable(
  103.28 -        struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn);
  103.29 -
  103.30      /* Sanity! */
  103.31      BUG_ON(d->domain_id != 0);
  103.32      BUG_ON(d->vcpu[0] == NULL);
  103.33 @@ -271,18 +272,14 @@ int construct_dom0(struct domain *d,
  103.34       */
  103.35      if ( dom0_nrpages == 0 )
  103.36      {
  103.37 -        dom0_nrpages = avail_domheap_pages() +
  103.38 -            ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
  103.39 -            ((image_len  + PAGE_SIZE - 1) >> PAGE_SHIFT);
  103.40 +        dom0_nrpages = avail_domheap_pages() + initial_images_nrpages();
  103.41          dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT));
  103.42          dom0_nrpages = -dom0_nrpages;
  103.43      }
  103.44  
  103.45      /* Negative memory specification means "all memory - specified amount". */
  103.46      if ( dom0_nrpages < 0 )
  103.47 -        nr_pages = avail_domheap_pages() +
  103.48 -            ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
  103.49 -            ((image_len  + PAGE_SIZE - 1) >> PAGE_SHIFT) +
  103.50 +        nr_pages = avail_domheap_pages() + initial_images_nrpages() +
  103.51              dom0_nrpages;
  103.52      else
  103.53          nr_pages = dom0_nrpages;
  103.54 @@ -704,16 +701,12 @@ int construct_dom0(struct domain *d,
  103.55          hypercall_page_initialise((void *)hypercall_page);
  103.56      }
  103.57  
  103.58 -    init_domheap_pages(
  103.59 -        _image_start, (_image_start+image_len+PAGE_SIZE-1) & PAGE_MASK);
  103.60 -
  103.61 -    /* Copy the initial ramdisk and free temporary buffer. */
  103.62 +    /* Copy the initial ramdisk. */
  103.63      if ( initrd_len != 0 )
  103.64 -    {
  103.65          memcpy((void *)vinitrd_start, initrd_start, initrd_len);
  103.66 -        init_domheap_pages(
  103.67 -            _initrd_start, (_initrd_start+initrd_len+PAGE_SIZE-1) & PAGE_MASK);
  103.68 -    }
  103.69 +
  103.70 +    /* Free temporary buffers. */
  103.71 +    discard_initial_images();
  103.72  
  103.73      /* Set up start info area. */
  103.74      si = (start_info_t *)vstartinfo_start;
  103.75 @@ -792,6 +785,25 @@ int construct_dom0(struct domain *d,
  103.76          update_pagetables(v);
  103.77      }
  103.78  
  103.79 +    if ( supervisor_mode_kernel )
  103.80 +    {
  103.81 +        v->arch.guest_context.kernel_ss &= ~3;
  103.82 +        v->arch.guest_context.user_regs.ss &= ~3;
  103.83 +        v->arch.guest_context.user_regs.es &= ~3;
  103.84 +        v->arch.guest_context.user_regs.ds &= ~3;
  103.85 +        v->arch.guest_context.user_regs.fs &= ~3;
  103.86 +        v->arch.guest_context.user_regs.gs &= ~3;
  103.87 +        printk("Dom0 runs in ring 0 (supervisor mode)\n");
  103.88 +        if ( !test_bit(XENFEAT_supervisor_mode_kernel,
  103.89 +                       dom0_features_supported) )
  103.90 +            panic("Dom0 does not support supervisor-mode execution\n");
  103.91 +    }
  103.92 +    else
  103.93 +    {
  103.94 +        if ( test_bit(XENFEAT_supervisor_mode_kernel, dom0_features_required) )
  103.95 +            panic("Dom0 requires supervisor-mode execution\n");
  103.96 +    }
  103.97 +
  103.98      rc = 0;
  103.99  
 103.100      /* DOM0 is permitted full I/O capabilities. */
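
The new supervisor_mode_kernel checks above are plain bitmap tests over the XENFEAT submaps. As a worked illustration of the arithmetic (the real XENFEAT_* numbering comes from public/features.h; the value used here is an assumption):

    #include <stdio.h>

    #define XENFEAT_NR_SUBMAPS             1
    #define XENFEAT_supervisor_mode_kernel 3   /* assumed value, illustration only */

    /* Feature f lives in word f/32, bit f%32, of a submap array. */
    static int test_feature(const unsigned int *submaps, int f)
    {
        return (submaps[f / 32] >> (f % 32)) & 1;
    }

    int main(void)
    {
        unsigned int supported[XENFEAT_NR_SUBMAPS] =
            { 1u << XENFEAT_supervisor_mode_kernel };
        unsigned int required[XENFEAT_NR_SUBMAPS] = { 0 };

        /* Mirrors the two panics above: a ring-0 dom0 must advertise
         * support, and a kernel that requires ring 0 must get it. */
        printf("supports smk: %d\n",
               test_feature(supported, XENFEAT_supervisor_mode_kernel));
        printf("requires smk: %d\n",
               test_feature(required, XENFEAT_supervisor_mode_kernel));
        return 0;
    }
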
   104.1 --- a/xen/arch/x86/hvm/hvm.c	Wed Mar 01 10:01:54 2006 -0700
   104.2 +++ b/xen/arch/x86/hvm/hvm.c	Wed Mar 01 12:47:25 2006 -0700
   104.3 @@ -25,6 +25,7 @@
   104.4  #include <xen/sched.h>
   104.5  #include <xen/irq.h>
   104.6  #include <xen/softirq.h>
   104.7 +#include <xen/domain.h>
   104.8  #include <xen/domain_page.h>
   104.9  #include <asm/current.h>
  104.10  #include <asm/io.h>
  104.11 @@ -59,9 +60,9 @@ static void hvm_zap_mmio_range(
  104.12  
  104.13      for ( i = 0; i < nr_pfn; i++ )
  104.14      {
  104.15 -        if ( pfn + i >= 0xfffff ) 
  104.16 +        if ( pfn + i >= 0xfffff )
  104.17              break;
  104.18 -        
  104.19 +
  104.20          __copy_to_user(&phys_to_machine_mapping[pfn + i], &val, sizeof (val));
  104.21      }
  104.22  }
  104.23 @@ -217,7 +218,7 @@ void hvm_pic_assist(struct vcpu *v)
  104.24      global_iodata_t *spg;
  104.25      u16   *virq_line, irqs;
  104.26      struct hvm_virpic *pic = &v->domain->arch.hvm_domain.vpic;
  104.27 -    
  104.28 +
  104.29      spg = &get_sp(v->domain)->sp_global;
  104.30      virq_line  = &spg->pic_clear_irr;
  104.31      if ( *virq_line ) {
  104.32 @@ -312,6 +313,52 @@ void hvm_print_line(struct vcpu *v, cons
  104.33  }
  104.34  
  104.35  /*
   104.36 + * Only called in HVM domain BSP context.
   104.37 + * When booting, vcpuid is always equal to apic_id.
  104.38 + */
  104.39 +int hvm_bringup_ap(int vcpuid, int trampoline_vector)
  104.40 +{
  104.41 +    struct vcpu *bsp = current, *v;
  104.42 +    struct domain *d = bsp->domain;
  104.43 +    struct vcpu_guest_context *ctxt;
  104.44 +    int rc = 0;
  104.45 +
  104.46 +    /* current must be HVM domain BSP */
  104.47 +    if ( !(HVM_DOMAIN(bsp) && bsp->vcpu_id == 0) ) {
  104.48 +        printk("Not calling hvm_bringup_ap from BSP context.\n");
  104.49 +        domain_crash_synchronous();
  104.50 +    }
  104.51 +
  104.52 +    if ( (v = d->vcpu[vcpuid]) == NULL )
  104.53 +        return -ENOENT;
  104.54 +
  104.55 +    if ( (ctxt = xmalloc(struct vcpu_guest_context)) == NULL ) {
  104.56 +        printk("Failed to allocate memory in hvm_bringup_ap.\n");
  104.57 +        return -ENOMEM;
  104.58 +    }
  104.59 +
  104.60 +    hvm_init_ap_context(ctxt, vcpuid, trampoline_vector);
  104.61 +
  104.62 +    LOCK_BIGLOCK(d);
  104.63 +    rc = -EEXIST;
  104.64 +    if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
  104.65 +        rc = boot_vcpu(d, vcpuid, ctxt);
  104.66 +    UNLOCK_BIGLOCK(d);
  104.67 +
  104.68 +    if ( rc != 0 )
  104.69 +        printk("AP %d bringup failed in boot_vcpu %x.\n", vcpuid, rc);
  104.70 +    else {
  104.71 +        if ( test_and_clear_bit(_VCPUF_down, &d->vcpu[vcpuid]->vcpu_flags) )
  104.72 +            vcpu_wake(d->vcpu[vcpuid]);
  104.73 +        printk("AP %d bringup suceeded.\n", vcpuid);
  104.74 +    }
  104.75 +
  104.76 +    xfree(ctxt);
  104.77 +
  104.78 +    return rc;
  104.79 +}
  104.80 +
  104.81 +/*
  104.82   * Local variables:
  104.83   * mode: C
  104.84   * c-set-style: "BSD"
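
hvm_bringup_ap() above follows a test-then-boot idiom: the _VCPUF_initialised check and the boot_vcpu() call must sit under the same lock, or two racing SIPIs could both initialise the vcpu. A reduced, self-contained model of that shape, with stand-in types and a pthread mutex playing the role of the domain big lock:

    #include <pthread.h>
    #include <stdio.h>

    struct toy_vcpu { int initialised; int down; };

    static pthread_mutex_t biglock = PTHREAD_MUTEX_INITIALIZER;

    static int boot_vcpu(struct toy_vcpu *v) { v->initialised = 1; return 0; }

    static int bringup_ap(struct toy_vcpu *v)
    {
        int rc = -1;                    /* stands in for -EEXIST */

        pthread_mutex_lock(&biglock);
        if ( !v->initialised )          /* test and boot atomically */
            rc = boot_vcpu(v);
        pthread_mutex_unlock(&biglock);

        if ( rc == 0 && v->down )       /* wake only a freshly booted AP */
        {
            v->down = 0;
            printf("AP woken\n");
        }
        return rc;
    }

    int main(void)
    {
        struct toy_vcpu ap = { 0, 1 };
        printf("first:  %d\n", bringup_ap(&ap));   /* boots and wakes */
        printf("second: %d\n", bringup_ap(&ap));   /* already initialised */
        return 0;
    }
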
   105.1 --- a/xen/arch/x86/hvm/svm/emulate.c	Wed Mar 01 10:01:54 2006 -0700
   105.2 +++ b/xen/arch/x86/hvm/svm/emulate.c	Wed Mar 01 12:47:25 2006 -0700
   105.3 @@ -86,7 +86,7 @@ static inline unsigned long DECODE_GPR_V
   105.4      case 0x7:
   105.5          value = regs->edi;
   105.6          break;
   105.7 -#if X86_64
   105.8 +#if __x86_64__
   105.9      case 0x8:
  105.10          value = regs->r8;
  105.11          break;
  105.12 @@ -318,20 +318,14 @@ unsigned long get_effective_addr_sib(str
  105.13  
  105.14  
  105.15  /* Get the register/mode number of src register in ModRM register. */
  105.16 -unsigned int decode_dest_reg(u8 m)
  105.17 +unsigned int decode_dest_reg(u8 prefix, u8 m)
  105.18  {
  105.19 -#if __x86_64__
  105.20 -    ASSERT(0); /* Need to adjust for REX prefix if applicable */
  105.21 -#endif
  105.22 -    return (m >> 3) & 7;
  105.23 +    return DECODE_MODRM_REG(prefix, m);
  105.24  }
  105.25  
  105.26 -unsigned int decode_src_reg(u8 m)
  105.27 +unsigned int decode_src_reg(u8 prefix, u8 m)
  105.28  {
  105.29 -#if __x86_64__
  105.30 -    ASSERT(0); /* Need to adjust for REX prefix if applicable */
  105.31 -#endif
  105.32 -    return m & 7;
  105.33 +    return DECODE_MODRM_RM(prefix, m);
  105.34  }
  105.35  
  105.36  
  105.37 @@ -431,7 +425,7 @@ static const u8 *opc_bytes[INSTR_MAX_COU
  105.38   * The caller can either pass a NULL pointer to the guest_eip_buf, or a pointer
  105.39   * to enough bytes to satisfy the instruction including prefix bytes.
  105.40   */
  105.41 -unsigned int __get_instruction_length_from_list(struct vmcb_struct *vmcb,
  105.42 +int __get_instruction_length_from_list(struct vmcb_struct *vmcb,
  105.43          enum instruction_index *list, unsigned int list_count, 
  105.44          u8 *guest_eip_buf, enum instruction_index *match)
  105.45  {
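
The DECODE_MODRM_REG/DECODE_MODRM_RM macros that now back decode_dest_reg()/decode_src_reg() are not shown in this hunk. A plausible sketch, assuming the standard x86-64 encoding in which REX.R (bit 2 of a 0x40-0x4F prefix) supplies bit 3 of the ModRM reg field and REX.B (bit 0) supplies bit 3 of the rm field; with no REX prefix both collapse to the 3-bit i386 fields:

    #include <stdio.h>

    #define DECODE_MODRM_REG(prefix, modrm) \
        ((((prefix) & 0x04) << 1) | (((modrm) >> 3) & 7))
    #define DECODE_MODRM_RM(prefix, modrm) \
        ((((prefix) & 0x01) << 3) | ((modrm) & 7))

    int main(void)
    {
        /* mov %dr7,%r8: REX.B=1 (0x41), ModRM 0xF8 -> reg=7 (dr7), rm=8 (r8). */
        unsigned char rex = 0x41, modrm = 0xF8;
        printf("reg=%u rm=%u\n",
               DECODE_MODRM_REG(rex, modrm), DECODE_MODRM_RM(rex, modrm));
        return 0;
    }
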
   106.1 --- a/xen/arch/x86/hvm/svm/intr.c	Wed Mar 01 10:01:54 2006 -0700
   106.2 +++ b/xen/arch/x86/hvm/svm/intr.c	Wed Mar 01 12:47:25 2006 -0700
   106.3 @@ -80,12 +80,7 @@ interrupt_post_injection(struct vcpu * v
   106.4  {
   106.5      struct hvm_virpit *vpit = &(v->domain->arch.hvm_domain.vpit);
   106.6  
   106.7 -    switch(type)
   106.8 -    {
   106.9 -    case VLAPIC_DELIV_MODE_EXT:
  106.10 -    case VLAPIC_DELIV_MODE_FIXED:
  106.11 -    case VLAPIC_DELIV_MODE_LPRI:
  106.12 -        if ( is_pit_irq(v, vector, type) ) {
  106.13 +    if ( is_pit_irq(v, vector, type) ) {
  106.14              if ( !vpit->first_injected ) {
  106.15                  vpit->first_injected = 1;
  106.16                  vpit->pending_intr_nr = 0;
  106.17 @@ -95,12 +90,15 @@ interrupt_post_injection(struct vcpu * v
  106.18              }
  106.19              vpit->inject_point = NOW();
  106.20              svm_set_tsc_shift (v, vpit);
  106.21 -        }
  106.22 +    }
  106.23 +
  106.24 +    switch(type)
  106.25 +    {
  106.26 +    case VLAPIC_DELIV_MODE_EXT:
  106.27          break;
  106.28  
  106.29      default:
  106.30 -        printk("Not support interrupt type: %d\n", type);
  106.31 -        break;
  106.32 +        vlapic_post_injection(v, vector, type);
  106.33      }
  106.34  }
  106.35  
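
The reordering above changes the dispatch shape: PIT bookkeeping now runs for every delivery mode, after which ExtINT needs nothing more and everything else is handed to the local-APIC model. Reduced to a skeleton (names and the PIT vector are stand-ins):

    #include <stdio.h>

    enum deliv_mode { DELIV_EXT, DELIV_FIXED, DELIV_LPRI };

    static void pit_bookkeeping(int vector)      { printf("PIT ack %d\n", vector); }
    static void lapic_post_injection(int vector) { printf("LAPIC post %d\n", vector); }
    static int  is_pit_irq(int vector)           { return vector == 0x20; /* assumed */ }

    static void post_injection(int vector, enum deliv_mode type)
    {
        if ( is_pit_irq(vector) )       /* unconditional, all delivery modes */
            pit_bookkeeping(vector);

        switch ( type )
        {
        case DELIV_EXT:                 /* ExtINT: nothing more to do */
            break;
        default:                        /* everything else -> vlapic */
            lapic_post_injection(vector);
        }
    }

    int main(void)
    {
        post_injection(0x20, DELIV_EXT);
        post_injection(0x31, DELIV_FIXED);
        return 0;
    }
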
   107.1 --- a/xen/arch/x86/hvm/svm/svm.c	Wed Mar 01 10:01:54 2006 -0700
   107.2 +++ b/xen/arch/x86/hvm/svm/svm.c	Wed Mar 01 12:47:25 2006 -0700
   107.3 @@ -164,7 +164,7 @@ void asidpool_retire( struct vmcb_struct
   107.4  }
   107.5  
   107.6  static inline void svm_inject_exception(struct vmcb_struct *vmcb, 
   107.7 -                                        int trap, int error_code)
   107.8 +                                        int trap, int ev, int error_code)
   107.9  {
  107.10      eventinj_t event;
  107.11  
  107.12 @@ -172,7 +172,7 @@ static inline void svm_inject_exception(
  107.13      event.fields.v = 1;
  107.14      event.fields.type = EVENTTYPE_EXCEPTION;
  107.15      event.fields.vector = trap;
  107.16 -    event.fields.ev = 1;
  107.17 +    event.fields.ev = ev;
  107.18      event.fields.errorcode = error_code;
  107.19  
  107.20      ASSERT(vmcb->eventinj.fields.v == 0);
  107.21 @@ -237,109 +237,62 @@ void svm_load_cpu_guest_regs(struct vcpu
  107.22  }
  107.23  
  107.24  #ifdef __x86_64__
  107.25 -static struct svm_msr_state percpu_msr[NR_CPUS];
  107.26 -
  107.27 -static u32 msr_data_index[VMX_MSR_COUNT] =
  107.28 -{
  107.29 -    MSR_LSTAR, MSR_STAR, MSR_CSTAR,
  107.30 -    MSR_SYSCALL_MASK, MSR_EFER,
  107.31 -};
  107.32  
  107.33  void svm_save_segments(struct vcpu *v)
  107.34  {
  107.35 -    rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_svm.msr_content.shadow_gs);
  107.36  }
  107.37 -
  107.38 -/*
  107.39 - * To avoid MSR save/restore at every VM exit/entry time, we restore
  107.40 - * the x86_64 specific MSRs at domain switch time. Since those MSRs are
  107.41 - * are not modified once set for generic domains, we don't save them,
  107.42 - * but simply reset them to the values set at percpu_traps_init().
  107.43 - */
  107.44  void svm_load_msrs(void)
  107.45  {
  107.46 -    struct svm_msr_state *host_state = &percpu_msr[smp_processor_id()];
  107.47 -    int i;
  107.48 -
  107.49 -    while ( host_state->flags )
  107.50 -    {
  107.51 -        i = find_first_set_bit(host_state->flags);
  107.52 -        wrmsrl(msr_data_index[i], host_state->msr_items[i]);
  107.53 -        clear_bit(i, &host_state->flags);
  107.54 -    }
  107.55  }
  107.56 -
  107.57 -static void svm_save_init_msrs(void)
  107.58 +void svm_restore_msrs(struct vcpu *v)
  107.59  {
  107.60 -    struct svm_msr_state *host_state = &percpu_msr[smp_processor_id()];
  107.61 -    int i;
  107.62 -
  107.63 -    for ( i = 0; i < SVM_MSR_COUNT; i++ )
  107.64 -        rdmsrl(msr_data_index[i], host_state->msr_items[i]);
  107.65  }
  107.66  
  107.67 -#define CASE_READ_MSR(address)                               \
  107.68 -    case MSR_ ## address:                                    \
  107.69 -    msr_content = msr->msr_items[SVM_INDEX_MSR_ ## address]; \
  107.70 -    break
  107.71 -
  107.72 -#define CASE_WRITE_MSR(address)                              \
  107.73 -    case MSR_ ## address:                                    \
  107.74 -    msr->msr_items[SVM_INDEX_MSR_ ## address] = msr_content; \
  107.75 -    if (!test_bit(SVM_INDEX_MSR_ ## address, &msr->flags))   \
  107.76 -    {                                                        \
  107.77 -        set_bit(SVM_INDEX_MSR_ ## address, &msr->flags);     \
  107.78 -    }                                                        \
  107.79 -    break
  107.80 -
  107.81 -
  107.82  #define IS_CANO_ADDRESS(add) 1
  107.83  
  107.84  static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
  107.85  {
  107.86      u64 msr_content = 0;
  107.87      struct vcpu *vc = current;
  107.88 -    struct svm_msr_state *msr = &vc->arch.hvm_svm.msr_content;
  107.89 +    //    struct svm_msr_state *msr = &vc->arch.hvm_svm.msr_content;
  107.90      struct vmcb_struct *vmcb = vc->arch.hvm_svm.vmcb;
  107.91  
  107.92      switch (regs->ecx)
  107.93      {
  107.94      case MSR_EFER:
  107.95 -        msr_content = msr->msr_items[SVM_INDEX_MSR_EFER];
  107.96 -        HVM_DBG_LOG(DBG_LEVEL_2, "EFER msr_content %llx\n", 
  107.97 -                (unsigned long long)msr_content);
  107.98 -
  107.99 -        if (test_bit(SVM_CPU_STATE_LME_ENABLED, &vc->arch.hvm_svm.cpu_state))
 107.100 -            msr_content |= 1 << _EFER_LME;
 107.101 -
 107.102 -        if (SVM_LONG_GUEST(vc))
 107.103 -            msr_content |= 1 << _EFER_LMA;
 107.104 -
 107.105 +        // msr_content = msr->msr_items[SVM_INDEX_MSR_EFER];
 107.106 +        msr_content = vmcb->efer;      
 107.107 +        msr_content &= ~EFER_SVME;
 107.108          break;
 107.109  
 107.110      case MSR_FS_BASE:
 107.111 -        if (!(SVM_LONG_GUEST(vc)))
 107.112 -            /* XXX should it be GP fault */
 107.113 -            domain_crash_synchronous();
 107.114 -        
 107.115          msr_content = vmcb->fs.base;
 107.116          break;
 107.117  
 107.118      case MSR_GS_BASE:
 107.119 -        if (!(SVM_LONG_GUEST(vc)))
 107.120 -            domain_crash_synchronous();
 107.121 -
 107.122          msr_content = vmcb->gs.base;
 107.123          break;
 107.124  
 107.125      case MSR_SHADOW_GS_BASE:
 107.126 -        msr_content = msr->shadow_gs;
 107.127 +        msr_content = vmcb->kerngsbase;
 107.128          break;
 107.129  
 107.130 -    CASE_READ_MSR(STAR);
 107.131 -    CASE_READ_MSR(LSTAR);
 107.132 -    CASE_READ_MSR(CSTAR);
 107.133 -    CASE_READ_MSR(SYSCALL_MASK);
 107.134 +    case MSR_STAR:
 107.135 +         msr_content = vmcb->star;
 107.136 +         break;
 107.137 + 
 107.138 +    case MSR_LSTAR:
 107.139 +         msr_content = vmcb->lstar;
 107.140 +         break;
 107.141 + 
 107.142 +    case MSR_CSTAR:
 107.143 +         msr_content = vmcb->cstar;
 107.144 +         break;
 107.145 + 
 107.146 +    case MSR_SYSCALL_MASK:
 107.147 +         msr_content = vmcb->sfmask;
 107.148 +         break;
 107.149 +
 107.150      default:
 107.151          return 0;
 107.152      }
 107.153 @@ -356,8 +309,6 @@ static inline int long_mode_do_msr_write
 107.154  {
 107.155      u64 msr_content = regs->eax | ((u64)regs->edx << 32); 
 107.156      struct vcpu *vc = current;
 107.157 -    struct svm_msr_state *msr = &vc->arch.hvm_svm.msr_content;
 107.158 -    struct svm_msr_state *host_state = &percpu_msr[smp_processor_id()];
 107.159      struct vmcb_struct *vmcb = vc->arch.hvm_svm.vmcb;
 107.160  
 107.161      HVM_DBG_LOG(DBG_LEVEL_1, "mode_do_msr_write msr %lx msr_content %lx\n", 
 107.162 @@ -373,26 +324,20 @@ static inline int long_mode_do_msr_write
 107.163                      || !test_bit(SVM_CPU_STATE_PAE_ENABLED,
 107.164                                   &vc->arch.hvm_svm.cpu_state))
 107.165              {
 107.166 -                svm_inject_exception(vmcb, TRAP_gp_fault, 0);
 107.167 +                svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0);
 107.168              }
 107.169          }
 107.170  
 107.171          if (msr_content & EFER_LME)
 107.172              set_bit(SVM_CPU_STATE_LME_ENABLED, &vc->arch.hvm_svm.cpu_state);
 107.173  
  107.174 +        /* We have already recorded that we want LME, so it will be set
  107.175 +         * the next time CR0 gets updated. Clear the bit here and continue.
  107.176 +         */
 107.177 +        if ((msr_content ^ vmcb->efer) & EFER_LME)
 107.178 +            msr_content &= ~EFER_LME;  
 107.179          /* No update for LME/LMA since it have no effect */
 107.180 -        msr->msr_items[SVM_INDEX_MSR_EFER] = msr_content;
 107.181 -        if (msr_content & ~(EFER_LME | EFER_LMA))
 107.182 -        {
 107.183 -            msr->msr_items[SVM_INDEX_MSR_EFER] = msr_content;
 107.184 -            if (!test_bit(SVM_INDEX_MSR_EFER, &msr->flags))
 107.185 -            { 
 107.186 -                rdmsrl(MSR_EFER, host_state->msr_items[SVM_INDEX_MSR_EFER]);
 107.187 -                set_bit(SVM_INDEX_MSR_EFER, &host_state->flags);
 107.188 -                set_bit(SVM_INDEX_MSR_EFER, &msr->flags);  
 107.189 -                wrmsrl(MSR_EFER, msr_content);
 107.190 -            }
 107.191 -        }
 107.192 +        vmcb->efer = msr_content | EFER_SVME;
 107.193          break;
 107.194  
 107.195      case MSR_FS_BASE:
 107.196 @@ -403,63 +348,42 @@ static inline int long_mode_do_msr_write
 107.197          if (!IS_CANO_ADDRESS(msr_content))
 107.198          {
 107.199              HVM_DBG_LOG(DBG_LEVEL_1, "Not cano address of msr write\n");
 107.200 -            svm_inject_exception(vmcb, TRAP_gp_fault, 0);
 107.201 +            svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0);
 107.202          }
 107.203  
 107.204          if (regs->ecx == MSR_FS_BASE)
 107.205 -	    vmcb->fs.base = msr_content;
 107.206 +            vmcb->fs.base = msr_content;
 107.207          else 
 107.208 -	    vmcb->gs.base = msr_content;
 107.209 +            vmcb->gs.base = msr_content;
 107.210          break;
 107.211  
 107.212      case MSR_SHADOW_GS_BASE:
 107.213 -        if (!(SVM_LONG_GUEST(vc)))
 107.214 -            domain_crash_synchronous();
 107.215 -
 107.216 -        vc->arch.hvm_svm.msr_content.shadow_gs = msr_content;
 107.217 -        wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
 107.218 -        break;
 107.219 -
 107.220 -    CASE_WRITE_MSR(STAR);
 107.221 -    CASE_WRITE_MSR(LSTAR);
 107.222 -    CASE_WRITE_MSR(CSTAR);
 107.223 -    CASE_WRITE_MSR(SYSCALL_MASK);
 107.224 +         vmcb->kerngsbase = msr_content;
 107.225 +         break;
 107.226 + 
 107.227 +    case MSR_STAR:
 107.228 +         vmcb->star = msr_content;
 107.229 +         break;
 107.230 + 
 107.231 +    case MSR_LSTAR:
 107.232 +         vmcb->lstar = msr_content;
 107.233 +         break;
 107.234 + 
 107.235 +    case MSR_CSTAR:
 107.236 +         vmcb->cstar = msr_content;
 107.237 +         break;
 107.238 + 
 107.239 +    case MSR_SYSCALL_MASK:
 107.240 +         vmcb->sfmask = msr_content;
 107.241 +         break;
 107.242 +
 107.243      default:
 107.244          return 0;
 107.245      }
 107.246      return 1;
 107.247  }
 107.248  
 107.249 -void
 107.250 -svm_restore_msrs(struct vcpu *v)
 107.251 -{
 107.252 -    int i = 0;
 107.253 -    struct svm_msr_state *guest_state;
 107.254 -    struct svm_msr_state *host_state;
 107.255 -    unsigned long guest_flags;
 107.256 -
 107.257 -    guest_state = &v->arch.hvm_svm.msr_content;;
 107.258 -    host_state = &percpu_msr[smp_processor_id()];
 107.259 -
 107.260 -    wrmsrl(MSR_SHADOW_GS_BASE, guest_state->shadow_gs);
 107.261 -    guest_flags = guest_state->flags;
 107.262 -    if (!guest_flags)
 107.263 -        return;
 107.264 -
 107.265 -    while (guest_flags){
 107.266 -        i = find_first_set_bit(guest_flags);
 107.267 -
 107.268 -        HVM_DBG_LOG(DBG_LEVEL_2,
 107.269 -                    "restore guest's index %d msr %lx with %lx\n",
 107.270 -                    i, (unsigned long) msr_data_index[i], (unsigned long) guest_state->msr_items[i]);
 107.271 -        set_bit(i, &host_state->flags);
 107.272 -        wrmsrl(msr_data_index[i], guest_state->msr_items[i]);
 107.273 -        clear_bit(i, &guest_flags);
 107.274 -    }
 107.275 -}
 107.276  #else
 107.277 -#define	svm_save_init_msrs()	((void)0)
 107.278 -
 107.279  static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
 107.280  {
 107.281      return 0;
 107.282 @@ -497,11 +421,30 @@ int svm_instruction_length(struct vcpu *
 107.283  {
 107.284      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 107.285      unsigned long cr0 = vmcb->cr0, eflags = vmcb->rflags, mode;
 107.286 -
 107.287 -    mode = (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE) ? 2 : 4;
  107.288 +    /* Check which operating mode the guest is running in. */
 107.289 +    if( vmcb->efer & EFER_LMA )
 107.290 +        mode = vmcb->cs.attributes.fields.l ? 8 : 4;
 107.291 +    else
 107.292 +        mode = (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE) ? 2 : 4;
 107.293      return svm_instrlen(guest_cpu_user_regs(), mode);
 107.294  }
 107.295  
 107.296 +unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
 107.297 +{
 107.298 +    switch ( num )
 107.299 +    {
 107.300 +    case 0:
 107.301 +        return v->arch.hvm_svm.cpu_shadow_cr0;
 107.302 +    case 2:
 107.303 +        return v->arch.hvm_svm.cpu_cr2;
 107.304 +    case 3:
 107.305 +        return v->arch.hvm_svm.cpu_cr3;
 107.306 +    default:
 107.307 +        BUG();
 107.308 +    }
 107.309 +    return 0;                   /* dummy */
 107.310 +}
 107.311 +
 107.312  int start_svm(void)
 107.313  {
 107.314      u32 eax, ecx, edx;
 107.315 @@ -519,8 +462,6 @@ int start_svm(void)
 107.316      asidpool_init(smp_processor_id());    
 107.317      printk("AMD SVM Extension is enabled for cpu %d.\n", smp_processor_id());
 107.318      
 107.319 -    svm_save_init_msrs();
 107.320 -
 107.321      /* Setup HVM interfaces */
 107.322      hvm_funcs.disable = stop_svm;
 107.323  
 107.324 @@ -542,6 +483,7 @@ int start_svm(void)
 107.325      hvm_funcs.realmode = svm_realmode;
 107.326      hvm_funcs.paging_enabled = svm_paging_enabled;
 107.327      hvm_funcs.instruction_length = svm_instruction_length;
 107.328 +    hvm_funcs.get_guest_ctrl_reg = svm_get_ctrl_reg;
 107.329  
 107.330      hvm_enabled = 1;    
 107.331  
 107.332 @@ -631,8 +573,17 @@ void save_svm_cpu_user_regs(struct vcpu 
 107.333  }
 107.334  
 107.335  #if defined (__x86_64__)
 107.336 -void svm_store_cpu_user_regs(struct cpu_user_regs *regs, struct vcpu *c )
 107.337 +void svm_store_cpu_user_regs(struct cpu_user_regs *regs, struct vcpu *v )
 107.338  {
 107.339 +    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 107.340 +
 107.341 +    regs->rip    = vmcb->rip;
 107.342 +    regs->rsp    = vmcb->rsp;
 107.343 +    regs->rflags = vmcb->rflags;
 107.344 +    regs->cs     = vmcb->cs.sel;
 107.345 +    regs->ds     = vmcb->ds.sel;
 107.346 +    regs->es     = vmcb->es.sel;
 107.347 +    regs->ss     = vmcb->ss.sel;
 107.348  }
 107.349  #elif defined (__i386__)
 107.350  void svm_store_cpu_user_regs(struct cpu_user_regs *regs, struct vcpu *v)
 107.351 @@ -810,7 +761,8 @@ void svm_relinquish_resources(struct vcp
 107.352      vpit = &v->domain->arch.hvm_domain.vpit;
 107.353      kill_timer(&vpit->pit_timer);
 107.354      kill_timer(&v->arch.hvm_svm.hlt_timer);
 107.355 -    if ( hvm_apic_support(v->domain) ) {
 107.356 +    if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) ) 
 107.357 +    {
 107.358          kill_timer( &(VLAPIC(v)->vlapic_timer) );
 107.359          xfree( VLAPIC(v) );
 107.360      }
 107.361 @@ -819,8 +771,29 @@ void svm_relinquish_resources(struct vcp
 107.362  
 107.363  void arch_svm_do_resume(struct vcpu *v) 
 107.364  {
 107.365 -    svm_do_resume(v);
 107.366 -    reset_stack_and_jump(svm_asm_do_resume);
  107.367 +    /* Is the VCPU being pinned to a different core? */
 107.368 +    if ( v->arch.hvm_svm.launch_core == smp_processor_id()) {
 107.369 +        svm_do_resume( v );
 107.370 +        reset_stack_and_jump( svm_asm_do_resume );
 107.371 +    }
 107.372 +    else {
 107.373 +        printk("VCPU core pinned: %d to %d\n", v->arch.hvm_svm.launch_core, smp_processor_id() );
 107.374 +        v->arch.hvm_svm.launch_core = smp_processor_id();
 107.375 +        svm_migrate_timers( v );
 107.376 +        svm_do_resume( v );
 107.377 +        reset_stack_and_jump( svm_asm_do_resume );
 107.378 +    }
 107.379 +}
 107.380 +
 107.381 +
 107.382 +void svm_migrate_timers(struct vcpu *v)
 107.383 +{
 107.384 +    struct hvm_virpit *vpit = &(v->domain->arch.hvm_domain.vpit);
 107.385 +
 107.386 +    migrate_timer( &vpit->pit_timer, v->processor );
 107.387 +    migrate_timer( &v->arch.hvm_svm.hlt_timer, v->processor );
 107.388 +    if ( hvm_apic_support(v->domain) && VLAPIC( v ))
 107.389 +        migrate_timer( &(VLAPIC(v)->vlapic_timer ), v->processor );
 107.390  }
 107.391  
 107.392  
 107.393 @@ -860,9 +833,9 @@ static int svm_do_page_fault(unsigned lo
 107.394  	/* No support for APIC */
 107.395          if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000)
 107.396          { 
 107.397 -            unsigned long inst_len;
 107.398 -	    inst_len = svm_instruction_length(v);
 107.399 -            if (inst_len == (unsigned long)-1)
 107.400 +            int inst_len;
 107.401 +            inst_len = svm_instruction_length(v);
 107.402 +            if (inst_len == -1)
 107.403              {
 107.404                  printf("%s: INST_LEN - Unable to decode properly.\n", __func__);
 107.405                  domain_crash_synchronous();
 107.406 @@ -915,6 +888,14 @@ static void svm_do_general_protection_fa
 107.407      eip = vmcb->rip;
 107.408      error_code = vmcb->exitinfo1;
 107.409  
 107.410 +    if (vmcb->idtr.limit == 0) {
 107.411 +        printf("Huh? We got a GP Fault with an invalid IDTR!\n");
 107.412 +        svm_dump_vmcb(__func__, vmcb);
 107.413 +        svm_dump_regs(__func__, regs);
 107.414 +        svm_dump_inst(vmcb->rip); 
 107.415 +        __hvm_bug(regs);
 107.416 +    }
 107.417 +
 107.418      HVM_DBG_LOG(DBG_LEVEL_1,
 107.419                  "svm_general_protection_fault: eip = %lx, erro_code = %lx",
 107.420                  eip, error_code);
 107.421 @@ -927,7 +908,7 @@ static void svm_do_general_protection_fa
 107.422  
 107.423      
 107.424      /* Reflect it back into the guest */
 107.425 -    svm_inject_exception(vmcb, TRAP_gp_fault, error_code);
 107.426 +    svm_inject_exception(vmcb, TRAP_gp_fault, 1, error_code);
 107.427  }
 107.428  
 107.429  /* Reserved bits: [31:14], [12:1] */
 107.430 @@ -939,7 +920,7 @@ static void svm_vmexit_do_cpuid(struct v
 107.431      unsigned int eax, ebx, ecx, edx;
 107.432      unsigned long eip;
 107.433      struct vcpu *v = current;
 107.434 -    unsigned int inst_len;
 107.435 +    int inst_len;
 107.436  
 107.437      ASSERT(vmcb);
 107.438  
 107.439 @@ -956,21 +937,29 @@ static void svm_vmexit_do_cpuid(struct v
 107.440  
 107.441      if (input == 1)
 107.442      {
 107.443 +#ifndef __x86_64__
 107.444          if ( hvm_apic_support(v->domain) &&
 107.445                  !vlapic_global_enabled((VLAPIC(v))) )
 107.446 +#endif
 107.447              clear_bit(X86_FEATURE_APIC, &edx);
 107.448  	    
 107.449 -#ifdef __x86_64__
 107.450 +#if CONFIG_PAGING_LEVELS < 3
 107.451 +        clear_bit(X86_FEATURE_PAE, &edx);
 107.452 +        clear_bit(X86_FEATURE_PSE, &edx);
 107.453 +        clear_bit(X86_FEATURE_PSE36, &edx);
 107.454 +#else
 107.455          if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
 107.456 -#endif
 107.457          {
 107.458 +            if ( !v->domain->arch.hvm_domain.pae_enabled )
 107.459 +                clear_bit(X86_FEATURE_PAE, &edx);
 107.460              clear_bit(X86_FEATURE_PSE, &edx);
 107.461 -            clear_bit(X86_FEATURE_PAE, &edx);
 107.462              clear_bit(X86_FEATURE_PSE36, &edx);
 107.463          }
 107.464 +#endif
 107.465  	
 107.466          /* Clear out reserved bits. */
 107.467          ecx &= ~SVM_VCPU_CPUID_L1_RESERVED; /* mask off reserved bits */
 107.468 +        clear_bit(X86_FEATURE_MWAIT & 31, &ecx);
 107.469      }
 107.470  #ifdef __i386__
 107.471      else if ( input == 0x80000001 )
 107.472 @@ -991,6 +980,7 @@ static void svm_vmexit_do_cpuid(struct v
 107.473              eip, input, eax, ebx, ecx, edx);
 107.474  
 107.475      inst_len = __get_instruction_length(vmcb, INSTR_CPUID, NULL);
 107.476 +    ASSERT(inst_len > 0);
 107.477      __update_guest_eip(vmcb, inst_len);
 107.478  }
 107.479  
 107.480 @@ -1083,9 +1073,11 @@ static void svm_dr_access (struct vcpu *
 107.481      unsigned long *reg_p = 0;
 107.482      unsigned int gpreg = 0;
 107.483      unsigned long eip;
 107.484 -    unsigned int inst_len; 
 107.485 +    int inst_len; 
 107.486 +    int index;
 107.487      struct vmcb_struct *vmcb;
 107.488      u8 buffer[MAX_INST_LEN];
 107.489 +    u8 prefix = 0;
 107.490  
 107.491      vmcb = v->arch.hvm_svm.vmcb;
 107.492      
 107.493 @@ -1093,13 +1085,15 @@ static void svm_dr_access (struct vcpu *
 107.494  
 107.495      eip = vmcb->rip;
 107.496      inst_copy_from_guest(buffer, svm_rip2pointer(vmcb), sizeof(buffer));
 107.497 -
 107.498 -    ASSERT(buffer[0] == 0x0f && (buffer[1] & 0xFD) == 0x21);
 107.499 -
 107.500 -    gpreg = decode_src_reg(buffer[2]);
 107.501 -#if DEBUG
 107.502 -    ASSERT(reg == decode_dest_reg(buffer[2]));
 107.503 -#endif
 107.504 +    index = skip_prefix_bytes(buffer, sizeof(buffer));
 107.505 +    
 107.506 +    ASSERT(buffer[index+0] == 0x0f && (buffer[index+1] & 0xFD) == 0x21);
 107.507 +
 107.508 +    if (index > 0 && (buffer[index-1] & 0xF0) == 0x40)
 107.509 +        prefix = buffer[index-1];
 107.510 +
 107.511 +    gpreg = decode_src_reg(prefix, buffer[index + 2]);
 107.512 +    ASSERT(reg == decode_dest_reg(prefix, buffer[index + 2]));
 107.513  
 107.514      HVM_DBG_LOG(DBG_LEVEL_1, "svm_dr_access : eip=%lx, reg=%d, gpreg = %x",
 107.515              eip, reg, gpreg);
 107.516 @@ -1120,6 +1114,7 @@ static void svm_dr_access (struct vcpu *
 107.517          __hvm_bug(regs);
 107.518          break;
 107.519      }
 107.520 +    ASSERT(inst_len > 0);
 107.521      __update_guest_eip(vmcb, inst_len);
 107.522  }
 107.523  
 107.524 @@ -1335,13 +1330,13 @@ static void svm_io_instruction(struct vc
 107.525      }
 107.526  }
 107.527  
 107.528 -
 107.529  static int svm_set_cr0(unsigned long value)
 107.530  {
 107.531      struct vcpu *v = current;
 107.532      unsigned long mfn;
 107.533      int paging_enabled;
 107.534      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 107.535 +    unsigned long crn;
 107.536  
 107.537      ASSERT(vmcb);
 107.538  
 107.539 @@ -1377,7 +1372,7 @@ static int svm_set_cr0(unsigned long val
 107.540                      &v->arch.hvm_svm.cpu_state))
 107.541          {
 107.542              HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable\n");
 107.543 -            svm_inject_exception(vmcb, TRAP_gp_fault, 0);
 107.544 +            svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0);
 107.545          }
 107.546  
 107.547          if (test_bit(SVM_CPU_STATE_LME_ENABLED, &v->arch.hvm_svm.cpu_state))
 107.548 @@ -1386,14 +1381,7 @@ static int svm_set_cr0(unsigned long val
 107.549              HVM_DBG_LOG(DBG_LEVEL_1, "Enable the Long mode\n");
 107.550              set_bit(SVM_CPU_STATE_LMA_ENABLED,
 107.551                      &v->arch.hvm_svm.cpu_state);
 107.552 -#if 0
 107.553 -            __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
 107.554 -            vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE;
 107.555 -            __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
 107.556 -#else
 107.557 -	    printk("Cannot yet set SVM_CPU_STATE_LMA_ENABLED\n");
 107.558 -	    domain_crash_synchronous();
 107.559 -#endif
 107.560 +            vmcb->efer |= (EFER_LMA | EFER_LME);
 107.561  
 107.562  #if CONFIG_PAGING_LEVELS >= 4 
 107.563              if (!shadow_set_guest_paging_levels(v->domain, 4)) 
 107.564 @@ -1404,8 +1392,9 @@ static int svm_set_cr0(unsigned long val
 107.565  #endif
 107.566          }
 107.567          else
 107.568 +#endif  /* __x86_64__ */
 107.569          {
 107.570 -#if CONFIG_PAGING_LEVELS >= 4
 107.571 +#if CONFIG_PAGING_LEVELS >= 3
 107.572              if (!shadow_set_guest_paging_levels(v->domain, 2))
 107.573              {
 107.574                  printk("Unsupported guest paging levels\n");
 107.575 @@ -1414,33 +1403,18 @@ static int svm_set_cr0(unsigned long val
 107.576  #endif
 107.577          }
 107.578  
 107.579 -#if 0
 107.580 -        unsigned long crn;
 107.581 -
 107.582          /* update CR4's PAE if needed */
 107.583 -        __vmread(GUEST_CR4, &crn);
 107.584 +        crn = vmcb->cr4;
 107.585          if ((!(crn & X86_CR4_PAE)) 
 107.586                  && test_bit(SVM_CPU_STATE_PAE_ENABLED, 
 107.587                      &v->arch.hvm_svm.cpu_state))
 107.588          {
 107.589              HVM_DBG_LOG(DBG_LEVEL_1, "enable PAE on cr4\n");
 107.590 -            __vmwrite(GUEST_CR4, crn | X86_CR4_PAE);
 107.591 +            vmcb->cr4 |= X86_CR4_PAE;
 107.592          }
 107.593 -#else
 107.594 -	printk("Cannot yet set SVM_CPU_STATE_PAE_ENABLED\n");
 107.595 -	domain_crash_synchronous(); 
 107.596 -#endif
 107.597 -#elif defined(__i386__)
 107.598 -	{
 107.599 -            unsigned long old_base_mfn;
 107.600 -            old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
 107.601 -            if (old_base_mfn)
 107.602 -                put_page(mfn_to_page(old_base_mfn));
 107.603 -	}
 107.604 -#endif
 107.605  
 107.606          /* Now arch.guest_table points to machine physical. */
 107.607 -        v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
 107.608 +        v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT);
 107.609          update_pagetables(v);
 107.610  
 107.611          HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", 
 107.612 @@ -1461,7 +1435,7 @@ static int svm_set_cr0(unsigned long val
 107.613       */
 107.614      if ((value & X86_CR0_PE) == 0) {
 107.615      	if (value & X86_CR0_PG) {
 107.616 -            svm_inject_exception(vmcb, TRAP_gp_fault, 0);
 107.617 +            svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0);
 107.618              return 0;
 107.619          }
 107.620  
 107.621 @@ -1472,7 +1446,6 @@ static int svm_set_cr0(unsigned long val
 107.622      return 1;
 107.623  }
 107.624  
 107.625 -
 107.626  /*
 107.627   * Read from control registers. CR0 and CR4 are read from the shadow.
 107.628   */
 107.629 @@ -1497,7 +1470,7 @@ static void mov_from_cr(int cr, int gp, 
 107.630          value = (unsigned long) v->arch.hvm_svm.cpu_cr3;
 107.631          break;
 107.632      case 4:
 107.633 -        value = vmcb->cr4;
 107.634 +        value = (unsigned long) v->arch.hvm_svm.cpu_shadow_cr4;
 107.635          break;
 107.636      case 8:
 107.637  #if 0
 107.638 @@ -1579,7 +1552,7 @@ static int mov_to_cr(int gpreg, int cr, 
 107.639              }
 107.640  
 107.641              old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
 107.642 -            v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
 107.643 +            v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT);
 107.644  
 107.645              if (old_base_mfn)
 107.646                  put_page(mfn_to_page(old_base_mfn));
 107.647 @@ -1596,12 +1569,19 @@ static int mov_to_cr(int gpreg, int cr, 
 107.648  
 107.649      case 4:         
 107.650          /* CR4 */
 107.651 -        if (value & X86_CR4_PAE)
 107.652 -            __hvm_bug(regs);    /* not implemented */
 107.653 -
 107.654 -        old_cr = vmcb->cr4;
 107.655 -        
 107.656 -        vmcb->cr4 = value;
 107.657 +        if (value & X86_CR4_PAE) {
 107.658 +            set_bit(SVM_CPU_STATE_PAE_ENABLED, &v->arch.hvm_svm.cpu_state);
 107.659 +        } else {
 107.660 +            if (test_bit(SVM_CPU_STATE_LMA_ENABLED,
 107.661 +                         &v->arch.hvm_svm.cpu_state)) {
 107.662 +                svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0);
 107.663 +            }
 107.664 +            clear_bit(SVM_CPU_STATE_PAE_ENABLED, &v->arch.hvm_svm.cpu_state);
 107.665 +        }
 107.666 +
 107.667 +        old_cr = v->arch.hvm_svm.cpu_shadow_cr4;
 107.668 +        v->arch.hvm_svm.cpu_shadow_cr4 = value;
 107.669 +        vmcb->cr4 = value | SVM_CR4_HOST_MASK;
 107.670    
 107.671          /*
 107.672           * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
 107.673 @@ -1630,10 +1610,12 @@ static int svm_cr_access(struct vcpu *v,
 107.674          struct cpu_user_regs *regs)
 107.675  {
 107.676      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 107.677 -    unsigned int inst_len = 0;
 107.678 +    int inst_len = 0;
 107.679 +    int index;
 107.680      unsigned int gpreg;
 107.681      unsigned long value;
 107.682 -    u8 buffer[6];   
 107.683 +    u8 buffer[MAX_INST_LEN];   
 107.684 +    u8 prefix = 0;
 107.685      int result = 1;
 107.686      enum instruction_index list_a[] = {INSTR_MOV2CR, INSTR_CLTS, INSTR_LMSW};
 107.687      enum instruction_index list_b[] = {INSTR_MOVCR2, INSTR_SMSW};
 107.688 @@ -1642,29 +1624,41 @@ static int svm_cr_access(struct vcpu *v,
 107.689      ASSERT(vmcb);
 107.690  
 107.691      inst_copy_from_guest(buffer, svm_rip2pointer(vmcb), sizeof(buffer));
  107.692 +    /* Get the index of the first actual instruction byte; we will need
  107.693 +     * to know where the prefix lives later on.
  107.694 +     */
 107.695 +    index = skip_prefix_bytes(buffer, sizeof(buffer));
 107.696      
 107.697      if (type == TYPE_MOV_TO_CR) 
 107.698      {
 107.699          inst_len = __get_instruction_length_from_list(vmcb, list_a, 
 107.700 -                ARR_SIZE(list_a), buffer, &match);
 107.701 +                ARR_SIZE(list_a), &buffer[index], &match);
 107.702      }
 107.703      else
 107.704      {
 107.705          inst_len = __get_instruction_length_from_list(vmcb, list_b, 
 107.706 -                ARR_SIZE(list_b), buffer, &match);
 107.707 +                ARR_SIZE(list_b), &buffer[index], &match);
 107.708      }
 107.709  
 107.710 +    ASSERT(inst_len > 0);
 107.711 +
 107.712 +    inst_len += index;
 107.713 +
 107.714 +    /* Check for REX prefix - it's ALWAYS the last byte of any prefix bytes */
 107.715 +    if (index > 0 && (buffer[index-1] & 0xF0) == 0x40)
 107.716 +        prefix = buffer[index-1];
 107.717 +
 107.718      HVM_DBG_LOG(DBG_LEVEL_1, "eip = %lx", (unsigned long) vmcb->rip);
 107.719  
 107.720      switch (match) 
 107.721      {
 107.722      case INSTR_MOV2CR:
 107.723 -        gpreg = decode_src_reg(buffer[2]);
 107.724 +        gpreg = decode_src_reg(prefix, buffer[index+2]);
 107.725          result = mov_to_cr(gpreg, cr, regs);
 107.726          break;
 107.727  
 107.728      case INSTR_MOVCR2:
 107.729 -        gpreg = decode_src_reg(buffer[2]);
 107.730 +        gpreg = decode_src_reg(prefix, buffer[index+2]);
 107.731          mov_from_cr(cr, gpreg, regs);
 107.732          break;
 107.733  
 107.734 @@ -1680,7 +1674,7 @@ static int svm_cr_access(struct vcpu *v,
 107.735          if (svm_dbg_on)
 107.736              svm_dump_inst(svm_rip2pointer(vmcb));
 107.737          
 107.738 -        gpreg = decode_src_reg(buffer[2]);
 107.739 +        gpreg = decode_src_reg(prefix, buffer[index+2]);
 107.740          value = get_reg(gpreg, regs, vmcb) & 0xF;
 107.741  
 107.742          if (svm_dbg_on)
 107.743 @@ -1698,7 +1692,7 @@ static int svm_cr_access(struct vcpu *v,
 107.744      case INSTR_SMSW:
 107.745          svm_dump_inst(svm_rip2pointer(vmcb));
 107.746          value = v->arch.hvm_svm.cpu_shadow_cr0;
 107.747 -        gpreg = decode_src_reg(buffer[2]);
 107.748 +        gpreg = decode_src_reg(prefix, buffer[index+2]);
 107.749          set_reg(gpreg, value, regs, vmcb);
 107.750  
 107.751          if (svm_dbg_on)
 107.752 @@ -1721,7 +1715,7 @@ static int svm_cr_access(struct vcpu *v,
 107.753  static inline void svm_do_msr_access(struct vcpu *v, struct cpu_user_regs *regs)
 107.754  {
 107.755      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 107.756 -    unsigned int  inst_len;
 107.757 +    int  inst_len;
 107.758      int64_t tsc_sum;
 107.759  
 107.760      ASSERT(vmcb);
 107.761 @@ -1813,7 +1807,9 @@ static inline void svm_vmexit_do_hlt(str
 107.762          next_wakeup = next_pit;
 107.763      if ( next_wakeup != - 1 )
 107.764          set_timer(&current->arch.hvm_svm.hlt_timer, next_wakeup);
  107.765 +/* Temporary workaround for 8828/8822 evtchn patches causing SVM failure.
 107.766      hvm_safe_block();
 107.767 +*/
 107.768  }
 107.769  
 107.770  
 107.771 @@ -1860,7 +1856,7 @@ void svm_handle_invlpg(const short invlp
 107.772      struct vcpu *v = current;
 107.773      u8 opcode[MAX_INST_SIZE], prefix, length = MAX_INST_SIZE;
 107.774      unsigned long g_vaddr;
 107.775 -    unsigned int inst_len;
 107.776 +    int inst_len;
 107.777      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 107.778  
 107.779      ASSERT(vmcb);
 107.780 @@ -1877,6 +1873,7 @@ void svm_handle_invlpg(const short invlp
 107.781      if (invlpga)
 107.782      {
 107.783          inst_len = __get_instruction_length(vmcb, INSTR_INVLPGA, opcode);
 107.784 +        ASSERT(inst_len > 0);
 107.785          __update_guest_eip(vmcb, inst_len);
 107.786  
 107.787          /* 
 107.788 @@ -1890,6 +1887,7 @@ void svm_handle_invlpg(const short invlp
 107.789          /* What about multiple prefix codes? */
 107.790          prefix = (is_prefix(opcode[0])?opcode[0]:0);
 107.791          inst_len = __get_instruction_length(vmcb, INSTR_INVLPG, opcode);
 107.792 +        ASSERT(inst_len > 0);
 107.793  
 107.794          inst_len--;
 107.795          length -= inst_len;
 107.796 @@ -1941,7 +1939,10 @@ static int svm_do_vmmcall_reset_to_realm
 107.797      v->arch.hvm_svm.cpu_shadow_cr0 = X86_CR0_ET;
 107.798  
 107.799      vmcb->cr2 = 0;
 107.800 -    vmcb->cr4 = 0;
 107.801 +    vmcb->efer = EFER_SVME;
 107.802 +
 107.803 +    vmcb->cr4 = SVM_CR4_HOST_MASK;
 107.804 +    v->arch.hvm_svm.cpu_shadow_cr4 = 0;
 107.805  
 107.806      /* This will jump to ROMBIOS */
 107.807      vmcb->rip = 0xFFF0;
 107.808 @@ -2011,12 +2012,13 @@ static int svm_do_vmmcall_reset_to_realm
 107.809  static int svm_do_vmmcall(struct vcpu *v, struct cpu_user_regs *regs)
 107.810  {
 107.811      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 107.812 -    unsigned int inst_len;
 107.813 +    int inst_len;
 107.814  
 107.815      ASSERT(vmcb);
 107.816      ASSERT(regs);
 107.817  
 107.818      inst_len = __get_instruction_length(vmcb, INSTR_VMCALL, NULL);
 107.819 +    ASSERT(inst_len > 0);
 107.820  
 107.821      /* VMMCALL sanity check */
 107.822      if (vmcb->cpl > get_vmmcall_cpl(regs->edi))
 107.823 @@ -2470,7 +2472,7 @@ asmlinkage void svm_vmexit_handler(struc
 107.824          {
 107.825              v->arch.hvm_svm.injecting_event = 1;
 107.826              /* Inject #PG using Interruption-Information Fields */
 107.827 -            svm_inject_exception(vmcb, TRAP_page_fault, regs.error_code);
 107.828 +            svm_inject_exception(vmcb, TRAP_page_fault, 1, regs.error_code);
 107.829  
 107.830              v->arch.hvm_svm.cpu_cr2 = va;
 107.831              vmcb->cr2 = va;
 107.832 @@ -2665,26 +2667,23 @@ asmlinkage void svm_asid(void)
 107.833  {
 107.834      struct vcpu *v = current;
 107.835      struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 107.836 -    int core = smp_processor_id();
 107.837 -    int oldcore = v->arch.hvm_svm.core; 
 107.838 -    /* 
 107.839 -     * if need to assign new asid or if switching cores, 
 107.840 -     * then retire asid for old core, and assign new for new core.
 107.841 -     */
 107.842 -    if( v->arch.hvm_svm.core != core ) {
 107.843 -        if (svm_dbg_on)
 107.844 -            printk("old core %d new core %d\n",(int)v->arch.hvm_svm.core,(int)core);
 107.845 -        v->arch.hvm_svm.core = core;
 107.846 -    }
 107.847 -    if( test_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags) ||
 107.848 -          (oldcore != core)) {
 107.849 -        if(!asidpool_assign_next(vmcb, 1, 
 107.850 -	            oldcore, core)) {
 107.851 +
 107.852 +   /*
  107.853 +    * If we need to assign a new ASID, or are switching cores,
  107.854 +    * retire the ASID for the old core and assign a new one for the current core.
 107.855 +    */
 107.856 +    if ( test_bit( ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags ) ||
 107.857 +       ( v->arch.hvm_svm.asid_core != v->arch.hvm_svm.launch_core )) {
 107.858 +        /* recycle asid */
 107.859 +        if ( !asidpool_assign_next( vmcb, 1,
 107.860 +	     v->arch.hvm_svm.asid_core, v->arch.hvm_svm.launch_core )) {
 107.861              /* If we get here, we have a major problem */
 107.862              domain_crash_synchronous();
 107.863          }
 107.864 +
 107.865 +        v->arch.hvm_svm.asid_core = v->arch.hvm_svm.launch_core;
 107.866 +        clear_bit( ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags );
 107.867      }
 107.868 -    clear_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
 107.869  }
 107.870  
 107.871  /*
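
A thread running through the svm.c changes above is that EFER is now virtualised through the VMCB copy rather than shadowed in per-cpu MSR state. The invariant that makes this safe is worth spelling out: the guest must never see or clear EFER.SVME, so reads mask the bit out and writes OR it back in. The bit position below is architectural; the rest is a sketch:

    #include <stdio.h>
    #include <stdint.h>

    #define EFER_SVME (1ULL << 12)   /* architectural position of SVME */

    static uint64_t guest_efer_read(uint64_t vmcb_efer)
    {
        return vmcb_efer & ~EFER_SVME;     /* hide SVME from the guest */
    }

    static uint64_t guest_efer_write(uint64_t msr_content)
    {
        return msr_content | EFER_SVME;    /* keep SVME set underneath */
    }

    int main(void)
    {
        uint64_t vmcb_efer = guest_efer_write(0x500);  /* guest sets LME|LMA, say */
        printf("vmcb: %#llx, guest view: %#llx\n",
               (unsigned long long)vmcb_efer,
               (unsigned long long)guest_efer_read(vmcb_efer));
        return 0;
    }
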
   108.1 --- a/xen/arch/x86/hvm/svm/vmcb.c	Wed Mar 01 10:01:54 2006 -0700
   108.2 +++ b/xen/arch/x86/hvm/svm/vmcb.c	Wed Mar 01 12:47:25 2006 -0700
   108.3 @@ -190,7 +190,6 @@ static int construct_init_vmcb_guest(str
   108.4      unsigned long eflags;
   108.5      unsigned long shadow_cr;
   108.6      struct vmcb_struct *vmcb = arch_svm->vmcb;
   108.7 -    struct Xgt_desc_struct desc;
   108.8  
   108.9      /* Allows IRQs to be shares */
  108.10      vmcb->vintr.fields.intr_masking = 1;
  108.11 @@ -224,9 +223,9 @@ static int construct_init_vmcb_guest(str
  108.12      vmcb->fs.base = 0;
  108.13      vmcb->gs.base = 0;
  108.14  
  108.15 -    __asm__ __volatile__ ("sidt  (%0) \n" :: "a"(&desc) : "memory");
  108.16 -    vmcb->idtr.base = desc.address;
  108.17 -    vmcb->idtr.limit = desc.size;
  108.18 +    /* Guest Interrupt descriptor table */
  108.19 +    vmcb->idtr.base = 0;
  108.20 +    vmcb->idtr.limit = 0;
  108.21  
  108.22      /* Set up segment attributes */
  108.23      attrib.bytes = 0;
  108.24 @@ -248,15 +247,11 @@ static int construct_init_vmcb_guest(str
  108.25      attrib.fields.type = 0xb;   /* type=0xb -> executable/readable, accessed */
  108.26      vmcb->cs.attributes = attrib;
  108.27  
  108.28 -    /* Global descriptor table */
  108.29 -    //NMERGE7500 - can probably remove access to gdtr
  108.30 -    vmcb->gdtr.base = regs->edx;
  108.31 -    regs->edx = 0;
  108.32 -    ASSERT(regs->eax <= 0xFFFF); /* Make sure we're in the limit */
  108.33 -    vmcb->gdtr.limit = regs->eax;
  108.34 -    regs->eax = 0;
  108.35 +    /* Guest Global descriptor table */
  108.36 +    vmcb->gdtr.base = 0;
  108.37 +    vmcb->gdtr.limit = 0;
  108.38  
  108.39 -    /* Local Descriptor Table */
  108.40 +    /* Guest Local Descriptor Table */
  108.41      attrib.fields.s = 0; /* not code or data segement */
  108.42      attrib.fields.type = 0x2; /* LDT */
  108.43      attrib.fields.db = 0; /* 16-bit */
  108.44 @@ -279,11 +274,10 @@ static int construct_init_vmcb_guest(str
  108.45      /* CR3 is set in svm_final_setup_guest */
  108.46  
  108.47      __asm__ __volatile__ ("mov %%cr4,%0" : "=r" (crn) :); 
  108.48 -    shadow_cr = crn;
  108.49 -    vmcb->cr4 = shadow_cr;
  108.50 +    arch_svm->cpu_shadow_cr4 = crn & ~(X86_CR4_PGE | X86_CR4_PSE);
  108.51 +    vmcb->cr4 = crn | SVM_CR4_HOST_MASK;
  108.52  
  108.53 -//MERGE7500 - should write a 0 instead to rsp?
  108.54 -    vmcb->rsp = regs->esp;
  108.55 +    vmcb->rsp = 0;
  108.56      vmcb->rip = regs->eip;
  108.57  
  108.58      eflags = regs->eflags & ~HVM_EFLAGS_RESERVED_0; /* clear 0s */
  108.59 @@ -306,7 +300,7 @@ void destroy_vmcb(struct arch_svm_struct
  108.60  {
  108.61      if(arch_svm->vmcb != NULL)
  108.62      {
  108.63 -        asidpool_retire(arch_svm->vmcb, arch_svm->core);
  108.64 +        asidpool_retire(arch_svm->vmcb, arch_svm->asid_core);
  108.65           free_vmcb(arch_svm->vmcb);
  108.66      }
  108.67      if(arch_svm->iopm != NULL) {
  108.68 @@ -404,18 +398,17 @@ err_out:
  108.69  
  108.70  void svm_do_launch(struct vcpu *v)
  108.71  {
  108.72 -    /* Update CR3, GDT, LDT, TR */
  108.73 -    struct vmcb_struct *vmcb;
  108.74 +    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
  108.75      int core = smp_processor_id();
  108.76 -    vmcb = v->arch.hvm_svm.vmcb;
  108.77      ASSERT(vmcb);
  108.78  
  108.79 +    /* Update CR3, GDT, LDT, TR */
  108.80      svm_stts(v);
  108.81  
  108.82 -    /* current core is the one we will perform the vmrun on */
  108.83 -    v->arch.hvm_svm.core = core;
  108.84 +    /* current core is the one we intend to perform the VMRUN on */
  108.85 +    v->arch.hvm_svm.launch_core = v->arch.hvm_svm.asid_core = core;
  108.86      clear_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
  108.87 -    if ( !asidpool_assign_next(vmcb, 0, core, core) )
  108.88 +    if ( !asidpool_assign_next( vmcb, 0, core, core ))
  108.89          BUG();
  108.90  
  108.91      if (v->vcpu_id == 0)
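
CR4 gets the same shadow treatment here that CR0 already had: cpu_shadow_cr4 holds what the guest believes, while the VMCB runs with host-required bits forced on via SVM_CR4_HOST_MASK. A minimal sketch of the split (the mask value is an assumption for illustration; the real one is defined in the SVM headers):

    #include <stdio.h>

    #define X86_CR4_PAE        0x0020
    #define SVM_CR4_HOST_MASK  X86_CR4_PAE  /* assumed: bits the host must keep */

    struct cr4_state { unsigned long shadow, real; };

    /* Guest writes go through here: remember what the guest asked for,
     * but run the hardware with the host-required bits forced on. */
    static void set_guest_cr4(struct cr4_state *s, unsigned long value)
    {
        s->shadow = value;
        s->real   = value | SVM_CR4_HOST_MASK;
    }

    int main(void)
    {
        struct cr4_state s;
        set_guest_cr4(&s, 0);               /* guest thinks CR4 is 0 ... */
        printf("guest sees %#lx, hw runs %#lx\n", s.shadow, s.real);
        return 0;
    }
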
   109.1 --- a/xen/arch/x86/hvm/svm/x86_64/exits.S	Wed Mar 01 10:01:54 2006 -0700
   109.2 +++ b/xen/arch/x86/hvm/svm/x86_64/exits.S	Wed Mar 01 12:47:25 2006 -0700
   109.3 @@ -107,8 +107,6 @@ ENTRY(svm_asm_do_launch)
   109.4          movq %rax, VMCB_rax(%rcx)
   109.5          movq VCPU_svm_hsa_pa(%rbx), %rax
   109.6          VMSAVE
   109.7 -	/* XXX FPU SAVE */
   109.8 -	/* XXX DO TSC OFFSET */
   109.9  
  109.10          movq VCPU_svm_vmcb_pa(%rbx), %rax
  109.11          popq %r15
  109.12 @@ -137,9 +135,7 @@ ENTRY(svm_asm_do_launch)
  109.13          VMSAVE
  109.14          /* rax is the only register we're allowed to touch here... */
  109.15  
  109.16 -	/* XXX FPU SAVE */
  109.17          GET_CURRENT(%rax)
  109.18 -	/* XXX DO TSC OFFSET */
  109.19          movq VCPU_svm_hsa_pa(%rax), %rax
  109.20          VMLOAD
  109.21  
   110.1 --- a/xen/arch/x86/hvm/vlapic.c	Wed Mar 01 10:01:54 2006 -0700
   110.2 +++ b/xen/arch/x86/hvm/vlapic.c	Wed Mar 01 12:47:25 2006 -0700
   110.3 @@ -225,27 +225,35 @@ static int vlapic_accept_irq(struct vcpu
   110.4          break;
   110.5  
   110.6      case VLAPIC_DELIV_MODE_INIT:
   110.7 -        if (!level && trig_mode == 1) {        //Deassert
   110.8 +        if ( !level && trig_mode == 1 ) {        //Deassert
   110.9              printk("This hvm_vlapic is for P4, no work for De-assert init\n");
  110.10          } else {
  110.11              /* FIXME How to check the situation after vcpu reset? */
  110.12 -            vlapic->init_sipi_sipi_state = VLAPIC_INIT_SIPI_SIPI_STATE_WAIT_SIPI;
  110.13 -            if (vlapic->vcpu) {
  110.14 -                vcpu_pause(vlapic->vcpu);
  110.15 +            if ( test_and_clear_bit(_VCPUF_initialised, &v->vcpu_flags) ) {
  110.16 +                printk("Reset hvm vcpu not supported yet\n");
  110.17 +                domain_crash_synchronous();
  110.18              }
  110.19 +            v->arch.hvm_vcpu.init_sipi_sipi_state =
  110.20 +                HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI;
  110.21 +            result = 1;
  110.22          }
  110.23          break;
  110.24  
  110.25      case VLAPIC_DELIV_MODE_STARTUP:
  110.26 -        if (vlapic->init_sipi_sipi_state != VLAPIC_INIT_SIPI_SIPI_STATE_WAIT_SIPI)
  110.27 +        if ( v->arch.hvm_vcpu.init_sipi_sipi_state ==
  110.28 +                HVM_VCPU_INIT_SIPI_SIPI_STATE_NORM )
  110.29              break;
  110.30 -        vlapic->init_sipi_sipi_state = VLAPIC_INIT_SIPI_SIPI_STATE_NORM;
  110.31 -        if (!vlapic->vcpu) {
  110.32 -            /* XXX Call hvm_bringup_ap here */
  110.33 -             result = 0;
  110.34 -        }else{
  110.35 -            //hvm_vcpu_reset(vlapic->vcpu);
  110.36 +
  110.37 +        v->arch.hvm_vcpu.init_sipi_sipi_state =
  110.38 +                HVM_VCPU_INIT_SIPI_SIPI_STATE_NORM;
  110.39 +
  110.40 +        if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) ) {
  110.41 +            printk("SIPI for initialized vcpu vcpuid %x\n", v->vcpu_id);
  110.42 +            domain_crash_synchronous();
  110.43          }
  110.44 +
  110.45 +        if ( hvm_bringup_ap(v->vcpu_id, vector) != 0 )
  110.46 +            result = 0;
  110.47          break;
  110.48  
  110.49      default:
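
The vlapic rework moves the INIT/SIPI handshake state onto the vcpu and lets a SIPI actually start the AP through hvm_bringup_ap(). Stripped of the HVM plumbing, the two-state machine looks like this (state names shortened from HVM_VCPU_INIT_SIPI_SIPI_STATE_*):

    #include <stdio.h>

    enum sipi_state { STATE_NORM, STATE_WAIT_SIPI };

    struct toy_vcpu { int id; int initialised; enum sipi_state state; };

    static int bringup_ap(struct toy_vcpu *v) { v->initialised = 1; return 0; }

    static void deliver_init(struct toy_vcpu *v)
    {
        v->state = STATE_WAIT_SIPI;     /* arm: the next SIPI boots the AP */
    }

    static void deliver_sipi(struct toy_vcpu *v, int vector)
    {
        if ( v->state == STATE_NORM )   /* spurious SIPI: ignore */
            return;
        v->state = STATE_NORM;
        if ( !v->initialised )
            bringup_ap(v);              /* vector tells the AP where to start */
        printf("vcpu %d started via vector %#x\n", v->id, vector);
    }

    int main(void)
    {
        struct toy_vcpu ap = { 1, 0, STATE_NORM };
        deliver_sipi(&ap, 0x9A);        /* ignored: no INIT yet */
        deliver_init(&ap);
        deliver_sipi(&ap, 0x9A);        /* boots the AP */
        return 0;
    }
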
   111.1 --- a/xen/arch/x86/hvm/vmx/io.c	Wed Mar 01 10:01:54 2006 -0700
   111.2 +++ b/xen/arch/x86/hvm/vmx/io.c	Wed Mar 01 12:47:25 2006 -0700
   111.3 @@ -113,13 +113,15 @@ asmlinkage void vmx_intr_assist(void)
   111.4      struct hvm_virpit *vpit = &plat->vpit;
   111.5      struct hvm_virpic *pic= &plat->vpic;
   111.6  
   111.7 -    hvm_pic_assist(v);
   111.8 -    __vmread_vcpu(v, CPU_BASED_VM_EXEC_CONTROL, &cpu_exec_control);
   111.9 -    if ( vpit->pending_intr_nr ) {
  111.10 +    if ( v->vcpu_id == 0 )
  111.11 +        hvm_pic_assist(v);
  111.12 +
  111.13 +    if ( (v->vcpu_id == 0) && vpit->pending_intr_nr ) {
  111.14          pic_set_irq(pic, 0, 0);
  111.15          pic_set_irq(pic, 0, 1);
  111.16      }
  111.17  
  111.18 +    __vmread_vcpu(v, CPU_BASED_VM_EXEC_CONTROL, &cpu_exec_control);
  111.19      __vmread(VM_ENTRY_INTR_INFO_FIELD, &intr_fields);
  111.20  
  111.21      if (intr_fields & INTR_INFO_VALID_MASK) {
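
The io.c change serialises the platform-wide interrupt models: PIC and PIT state is shared by the whole domain, so only VCPU 0's exit path may touch it, while the per-vcpu interrupt-window logic keeps running everywhere. The gate, reduced to its skeleton:

    #include <stdio.h>

    struct platform { int pit_pending; };   /* shared, domain-wide state */

    static void intr_assist(int vcpu_id, struct platform *plat)
    {
        if ( vcpu_id == 0 && plat->pit_pending )
        {
            plat->pit_pending--;            /* edge: lower then raise IRQ0 */
            printf("vcpu0 injected a PIT tick\n");
        }
        /* per-vcpu interrupt-window handling continues for every vcpu */
    }

    int main(void)
    {
        struct platform plat = { 1 };
        intr_assist(1, &plat);              /* AP: leaves PIC/PIT alone */
        intr_assist(0, &plat);              /* BSP: services the tick */
        return 0;
    }
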
   112.1 --- a/xen/arch/x86/hvm/vmx/vmx.c	Wed Mar 01 10:01:54 2006 -0700
   112.2 +++ b/xen/arch/x86/hvm/vmx/vmx.c	Wed Mar 01 12:47:25 2006 -0700
   112.3 @@ -448,6 +448,37 @@ unsigned long vmx_get_ctrl_reg(struct vc
   112.4      return 0;                   /* dummy */
   112.5  }
   112.6  
   112.7 +/* SMP VMX guest support */
   112.8 +void vmx_init_ap_context(struct vcpu_guest_context *ctxt,
   112.9 +                         int vcpuid, int trampoline_vector)
  112.10 +{
  112.11 +    int i;
  112.12 +
  112.13 +    memset(ctxt, 0, sizeof(*ctxt));
  112.14 +
  112.15 +    /*
  112.16 +     * Initial register values:
  112.17 +     */
  112.18 +    ctxt->user_regs.eip = VMXASSIST_BASE;
  112.19 +    ctxt->user_regs.edx = vcpuid;
  112.20 +    ctxt->user_regs.ebx = trampoline_vector;
  112.21 +
  112.22 +    ctxt->flags = VGCF_HVM_GUEST;
  112.23 +
  112.24 +    /* Virtual IDT is empty at start-of-day. */
  112.25 +    for ( i = 0; i < 256; i++ )
  112.26 +    {
  112.27 +        ctxt->trap_ctxt[i].vector = i;
  112.28 +        ctxt->trap_ctxt[i].cs     = FLAT_KERNEL_CS;
  112.29 +    }
  112.30 +
  112.31 +    /* No callback handlers. */
  112.32 +#if defined(__i386__)
  112.33 +    ctxt->event_callback_cs     = FLAT_KERNEL_CS;
  112.34 +    ctxt->failsafe_callback_cs  = FLAT_KERNEL_CS;
  112.35 +#endif
  112.36 +}
  112.37 +
  112.38  void do_nmi(struct cpu_user_regs *);
  112.39  
  112.40  static int check_vmx_controls(ctrls, msr)
  112.41 @@ -545,6 +576,8 @@ int start_vmx(void)
  112.42      hvm_funcs.instruction_length = vmx_instruction_length;
  112.43      hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
  112.44  
  112.45 +    hvm_funcs.init_ap_context = vmx_init_ap_context;
  112.46 +
  112.47      hvm_enabled = 1;
  112.48  
  112.49      return 1;
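
init_ap_context joins the hvm_funcs vtable here: the generic hvm_bringup_ap() path (see the hvm.c hunk above) fills an AP's boot context through this hook, which dispatches to the VMX or SVM implementation. A reduced model of the pattern, with stand-in types and an assumed load address in place of VMXASSIST_BASE:

    #include <stdio.h>
    #include <string.h>

    struct guest_context { unsigned long eip, edx, ebx; };

    struct hvm_function_table {
        void (*init_ap_context)(struct guest_context *ctxt,
                                int vcpuid, int trampoline_vector);
    };

    static void vmx_like_init(struct guest_context *ctxt,
                              int vcpuid, int trampoline_vector)
    {
        memset(ctxt, 0, sizeof(*ctxt));
        ctxt->eip = 0xD0000;           /* stand-in for VMXASSIST_BASE   */
        ctxt->edx = vcpuid;            /* the AP learns its own id ...  */
        ctxt->ebx = trampoline_vector; /* ... and where to jump next    */
    }

    static struct hvm_function_table hvm_funcs =
        { .init_ap_context = vmx_like_init };

    int main(void)
    {
        struct guest_context ctxt;
        hvm_funcs.init_ap_context(&ctxt, 1, 0x9A);
        printf("eip=%#lx edx=%lu ebx=%#lx\n", ctxt.eip, ctxt.edx, ctxt.ebx);
        return 0;
    }
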
   113.1 --- a/xen/arch/x86/mm.c	Wed Mar 01 10:01:54 2006 -0700
   113.2 +++ b/xen/arch/x86/mm.c	Wed Mar 01 12:47:25 2006 -0700
   113.3 @@ -97,11 +97,11 @@
   113.4  #include <xen/domain_page.h>
   113.5  #include <xen/event.h>
   113.6  #include <xen/iocap.h>
   113.7 +#include <xen/guest_access.h>
   113.8  #include <asm/shadow.h>
   113.9  #include <asm/page.h>
  113.10  #include <asm/flushtlb.h>
  113.11  #include <asm/io.h>
  113.12 -#include <asm/uaccess.h>
  113.13  #include <asm/ldt.h>
  113.14  #include <asm/x86_emulate.h>
  113.15  #include <public/memory.h>
  113.16 @@ -475,7 +475,8 @@ get_page_from_l1e(
  113.17      {
  113.18          MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
  113.19                  " for dom%d",
  113.20 -                mfn, get_gpfn_from_mfn(mfn), l1e_get_intpte(l1e), d->domain_id);
  113.21 +                mfn, get_gpfn_from_mfn(mfn),
  113.22 +                l1e_get_intpte(l1e), d->domain_id);
  113.23      }
  113.24  
  113.25      return okay;
  113.26 @@ -515,7 +516,6 @@ get_page_from_l2e(
  113.27  
  113.28  
  113.29  #if CONFIG_PAGING_LEVELS >= 3
  113.30 -
  113.31  static int 
  113.32  get_page_from_l3e(
  113.33      l3_pgentry_t l3e, unsigned long pfn,
  113.34 @@ -545,11 +545,9 @@ get_page_from_l3e(
  113.35  #endif
  113.36      return rc;
  113.37  }
  113.38 -
  113.39  #endif /* 3 level */
  113.40  
  113.41  #if CONFIG_PAGING_LEVELS >= 4
  113.42 -
  113.43  static int 
  113.44  get_page_from_l4e(
  113.45      l4_pgentry_t l4e, unsigned long pfn, 
  113.46 @@ -579,7 +577,6 @@ get_page_from_l4e(
  113.47  
  113.48      return rc;
  113.49  }
  113.50 -
  113.51  #endif /* 4 level */
  113.52  
  113.53  
  113.54 @@ -649,28 +646,23 @@ static void put_page_from_l2e(l2_pgentry
  113.55  
  113.56  
  113.57  #if CONFIG_PAGING_LEVELS >= 3
  113.58 -
  113.59  static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
  113.60  {
  113.61      if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) && 
  113.62           (l3e_get_pfn(l3e) != pfn) )
  113.63          put_page_and_type(mfn_to_page(l3e_get_pfn(l3e)));
  113.64  }
  113.65 -
  113.66  #endif
  113.67  
  113.68  #if CONFIG_PAGING_LEVELS >= 4
  113.69 -
  113.70  static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
  113.71  {
  113.72      if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && 
  113.73           (l4e_get_pfn(l4e) != pfn) )
  113.74          put_page_and_type(mfn_to_page(l4e_get_pfn(l4e)));
  113.75  }
  113.76 -
  113.77  #endif
  113.78  
  113.79 -
  113.80  static int alloc_l1_table(struct page_info *page)
  113.81  {
  113.82      struct domain *d = page_get_owner(page);
  113.83 @@ -1569,43 +1561,71 @@ int new_guest_cr3(unsigned long mfn)
  113.84      int okay;
  113.85      unsigned long old_base_mfn;
  113.86  
  113.87 +    ASSERT(writable_pagetable_in_sync(d));
  113.88 +
  113.89      if ( shadow_mode_refcounts(d) )
  113.90 -        okay = get_page_from_pagenr(mfn, d);
  113.91 -    else
  113.92 -        okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
  113.93 -
  113.94 -    if ( likely(okay) )
  113.95      {
  113.96 -        invalidate_shadow_ldt(v);
  113.97 -
  113.98 -        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
  113.99 -        v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
 113.100 -        update_pagetables(v); /* update shadow_table and monitor_table */
 113.101 -
 113.102 -        write_ptbase(v);
 113.103 -
 113.104 +        okay = get_page_from_pagenr(mfn, d);
 113.105 +        if ( unlikely(!okay) )
 113.106 +        {
 113.107 +            MEM_LOG("Error while installing new baseptr %lx", mfn);
 113.108 +            return 0;
 113.109 +        }
 113.110 +    }
 113.111 +    else
 113.112 +    {
 113.113 +        okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
 113.114 +        if ( unlikely(!okay) )
 113.115 +        {
 113.116 +            /* Switch to idle pagetable: this VCPU has no active p.t. now. */
 113.117 +            old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
 113.118 +            v->arch.guest_table = mk_pagetable(0);
 113.119 +            update_pagetables(v);
 113.120 +            write_cr3(__pa(idle_pg_table));
 113.121 +            if ( old_base_mfn != 0 )
 113.122 +                put_page_and_type(mfn_to_page(old_base_mfn));
 113.123 +
 113.124 +            /* Retry the validation with no active p.t. for this VCPU. */
 113.125 +            okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
 113.126 +            if ( !okay )
 113.127 +            {
 113.128 +                /* Failure here is unrecoverable: the VCPU has no pagetable! */
 113.129 +                MEM_LOG("Fatal error while installing new baseptr %lx", mfn);
 113.130 +                domain_crash(d);
 113.131 +                percpu_info[v->processor].deferred_ops = 0;
 113.132 +                return 0;
 113.133 +            }
 113.134 +        }
 113.135 +    }
 113.136 +
 113.137 +    invalidate_shadow_ldt(v);
 113.138 +
 113.139 +    old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
 113.140 +    v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
 113.141 +    update_pagetables(v); /* update shadow_table and monitor_table */
 113.142 +
 113.143 +    write_ptbase(v);
 113.144 +
 113.145 +    if ( likely(old_base_mfn != 0) )
 113.146 +    {
 113.147          if ( shadow_mode_refcounts(d) )
 113.148              put_page(mfn_to_page(old_base_mfn));
 113.149          else
 113.150              put_page_and_type(mfn_to_page(old_base_mfn));
 113.151 -
 113.152 -        /* CR3 also holds a ref to its shadow... */
 113.153 -        if ( shadow_mode_enabled(d) )
 113.154 -        {
 113.155 -            if ( v->arch.monitor_shadow_ref )
 113.156 -                put_shadow_ref(v->arch.monitor_shadow_ref);
 113.157 -            v->arch.monitor_shadow_ref =
 113.158 -                pagetable_get_pfn(v->arch.monitor_table);
 113.159 -            ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref)));
 113.160 -            get_shadow_ref(v->arch.monitor_shadow_ref);
 113.161 -        }
 113.162      }
 113.163 -    else
 113.164 +
 113.165 +    /* CR3 also holds a ref to its shadow... */
 113.166 +    if ( shadow_mode_enabled(d) )
 113.167      {
 113.168 -        MEM_LOG("Error while installing new baseptr %lx", mfn);
 113.169 +        if ( v->arch.monitor_shadow_ref )
 113.170 +            put_shadow_ref(v->arch.monitor_shadow_ref);
 113.171 +        v->arch.monitor_shadow_ref =
 113.172 +            pagetable_get_pfn(v->arch.monitor_table);
 113.173 +        ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref)));
 113.174 +        get_shadow_ref(v->arch.monitor_shadow_ref);
 113.175      }
 113.176  
 113.177 -    return okay;
 113.178 +    return 1;
 113.179  }
 113.180  
 113.181  static void process_deferred_ops(unsigned int cpu)
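
In outline, the rewritten new_guest_cr3() converts a validation failure on the non-refcounted path into one retry, with the old base's references dropped first -- a plausible reason the retry can succeed (editorial sketch; all names are those used in the hunk above):

    okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
    if ( unlikely(!okay) )
    {
        /* Drop the old base, leaving this VCPU on the idle pagetable:
         * the old base's type references may be what is blocking
         * validation of the new one. */
        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
        v->arch.guest_table = mk_pagetable(0);
        update_pagetables(v);
        write_cr3(__pa(idle_pg_table));
        if ( old_base_mfn != 0 )
            put_page_and_type(mfn_to_page(old_base_mfn));

        okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
        if ( !okay )
            domain_crash(d); /* unrecoverable: the VCPU has no pagetable */
    }
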
 113.182 @@ -1625,7 +1645,7 @@ static void process_deferred_ops(unsigne
 113.183          else
 113.184              local_flush_tlb();
 113.185      }
 113.186 -        
 113.187 +
 113.188      if ( deferred_ops & DOP_RELOAD_LDT )
 113.189          (void)map_ldt_shadow_page(0);
 113.190  
 113.191 @@ -1752,9 +1772,9 @@ int do_mmuext_op(
 113.192      {
 113.193          if ( hypercall_preempt_check() )
 113.194          {
 113.195 -            rc = hypercall4_create_continuation(
 113.196 -                __HYPERVISOR_mmuext_op, uops,
 113.197 -                (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
 113.198 +            rc = hypercall_create_continuation(
 113.199 +                __HYPERVISOR_mmuext_op, "pipi",
 113.200 +                uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
 113.201              break;
 113.202          }
 113.203  
 113.204 @@ -2018,9 +2038,9 @@ int do_mmu_update(
 113.205      {
 113.206          if ( hypercall_preempt_check() )
 113.207          {
 113.208 -            rc = hypercall4_create_continuation(
 113.209 -                __HYPERVISOR_mmu_update, ureqs, 
 113.210 -                (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
 113.211 +            rc = hypercall_create_continuation(
 113.212 +                __HYPERVISOR_mmu_update, "pipi",
 113.213 +                ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
 113.214              break;
 113.215          }
 113.216  
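
The replacement continuation helper is driven by a printf-like format string; as used here, 'p' marks a pointer argument and 'i' an integer (a reading of the convention from these call sites, not a definition from this changeset):

    rc = hypercall_create_continuation(
        __HYPERVISOR_mmu_update, "pipi",
        ureqs,                              /* p: guest request array   */
        (count - i) | MMU_UPDATE_PREEMPTED, /* i: work remaining + flag */
        pdone,                              /* p: guest done counter    */
        foreigndom);                        /* i: foreign domain id     */
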
 113.217 @@ -2769,7 +2789,7 @@ long do_update_descriptor(u64 pa, u64 de
 113.218  }
 113.219  
 113.220  
 113.221 -long arch_memory_op(int op, void *arg)
 113.222 +long arch_memory_op(int op, GUEST_HANDLE(void) arg)
 113.223  {
 113.224      struct xen_reserved_phys_area xrpa;
 113.225      unsigned long pfn;
 113.226 @@ -2779,7 +2799,7 @@ long arch_memory_op(int op, void *arg)
 113.227      switch ( op )
 113.228      {
 113.229      case XENMEM_reserved_phys_area:
 113.230 -        if ( copy_from_user(&xrpa, arg, sizeof(xrpa)) )
 113.231 +        if ( copy_from_guest(&xrpa, arg, 1) )
 113.232              return -EFAULT;
 113.233  
 113.234          /* No guest has more than one reserved area. */
 113.235 @@ -2813,7 +2833,7 @@ long arch_memory_op(int op, void *arg)
 113.236  
 113.237          put_domain(d);
 113.238  
 113.239 -        if ( copy_to_user(arg, &xrpa, sizeof(xrpa)) )
 113.240 +        if ( copy_to_guest(arg, &xrpa, 1) )
 113.241              return -EFAULT;
 113.242  
 113.243          break;
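
Note the changed third argument in these conversions: the typed guest-access helpers count elements of the handle's type, where the old uaccess calls counted bytes. Side by side (sketch):

    /* before: raw guest pointer, length in bytes */
    if ( copy_from_user(&xrpa, arg, sizeof(xrpa)) )
        return -EFAULT;

    /* after: typed guest handle, count in elements -- one struct */
    if ( copy_from_guest(&xrpa, arg, 1) )
        return -EFAULT;
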
   114.1 --- a/xen/arch/x86/setup.c	Wed Mar 01 10:01:54 2006 -0700
   114.2 +++ b/xen/arch/x86/setup.c	Wed Mar 01 12:47:25 2006 -0700
   114.3 @@ -144,6 +144,20 @@ static void __init do_initcalls(void)
   114.4  
   114.5  static struct e820entry e820_raw[E820MAX];
   114.6  
   114.7 +static unsigned long initial_images_start, initial_images_end;
   114.8 +
   114.9 +unsigned long initial_images_nrpages(void)
  114.10 +{
  114.11 +    unsigned long s = initial_images_start + PAGE_SIZE - 1;
  114.12 +    unsigned long e = initial_images_end;
  114.13 +    return ((e >> PAGE_SHIFT) - (s >> PAGE_SHIFT));
  114.14 +}
  114.15 +
  114.16 +void discard_initial_images(void)
  114.17 +{
  114.18 +    init_domheap_pages(initial_images_start, initial_images_end);
  114.19 +}
  114.20 +
  114.21  void __init __start_xen(multiboot_info_t *mbi)
  114.22  {
  114.23      char *cmdline;
  114.24 @@ -152,7 +166,6 @@ void __init __start_xen(multiboot_info_t
  114.25      unsigned int initrdidx = 1;
  114.26      module_t *mod = (module_t *)__va(mbi->mods_addr);
  114.27      unsigned long nr_pages, modules_length;
  114.28 -    unsigned long initial_images_start, initial_images_end;
  114.29      paddr_t s, e;
  114.30      int i, e820_warn = 0, e820_raw_nr = 0, bytes = 0;
  114.31      struct ns16550_defaults ns16550 = {
  114.32 @@ -437,11 +450,7 @@ void __init __start_xen(multiboot_info_t
  114.33          set_in_cr4(X86_CR4_OSXMMEXCPT);
  114.34  
  114.35      if ( opt_nosmp )
  114.36 -    {
  114.37          max_cpus = 0;
  114.38 -        smp_num_siblings = 1;
  114.39 -        boot_cpu_data.x86_max_cores = 1;
  114.40 -    }
  114.41  
  114.42      smp_prepare_cpus(max_cpus);
  114.43  
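
initial_images_nrpages() counts only pages lying wholly inside the image range: the start is rounded up to a page boundary and the end truncated down. A worked case (editorial; assumes PAGE_SIZE == 4096, PAGE_SHIFT == 12):

    /* initial_images_start = 0x1010, initial_images_end = 0x5000:
     *   s = 0x1010 + 0xfff = 0x200f   ->  s >> 12 == 2
     *   e = 0x5000                    ->  e >> 12 == 5
     * nrpages = 5 - 2 = 3  (frames 0x2000, 0x3000, 0x4000; the partial
     * frame holding 0x1010 is excluded). */
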
   115.1 --- a/xen/arch/x86/shadow32.c	Wed Mar 01 10:01:54 2006 -0700
   115.2 +++ b/xen/arch/x86/shadow32.c	Wed Mar 01 12:47:25 2006 -0700
   115.3 @@ -43,7 +43,8 @@ static void free_writable_pte_prediction
   115.4  static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn);
   115.5  #endif
   115.6  
   115.7 -static void free_p2m_table(struct vcpu *v);
   115.8 +static int alloc_p2m_table(struct domain *d);
   115.9 +static void free_p2m_table(struct domain *d);
  115.10  
  115.11  /********
  115.12  
  115.13 @@ -739,7 +740,7 @@ static void alloc_monitor_pagetable(stru
  115.14      mpl2e = (l2_pgentry_t *)map_domain_page_global(mmfn);
  115.15      memset(mpl2e, 0, PAGE_SIZE);
  115.16  
  115.17 -    memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
  115.18 +    memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
  115.19             &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
  115.20             HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
  115.21  
  115.22 @@ -760,6 +761,23 @@ static void alloc_monitor_pagetable(stru
  115.23  
  115.24      if ( v->vcpu_id == 0 )
  115.25          alloc_p2m_table(d);
  115.26 +    else
  115.27 +    {
  115.28 +        unsigned long mfn;
  115.29 +
  115.30 +        mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
  115.31 +        if ( mfn )
  115.32 +        {
  115.33 +            l2_pgentry_t *l2tab;
  115.34 +
  115.35 +            l2tab = map_domain_page(mfn);
  115.36 +
  115.37 +            mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
  115.38 +                l2tab[l2_table_offset(RO_MPT_VIRT_START)];
  115.39 +
  115.40 +            unmap_domain_page(l2tab);
  115.41 +        }
  115.42 +    }
  115.43  }
  115.44  
  115.45  /*
  115.46 @@ -771,7 +789,7 @@ void free_monitor_pagetable(struct vcpu 
  115.47      unsigned long mfn;
  115.48  
  115.49      ASSERT( pagetable_get_paddr(v->arch.monitor_table) );
  115.50 -    
  115.51 +
  115.52      mpl2e = v->arch.monitor_vtable;
  115.53  
  115.54      /*
  115.55 @@ -794,7 +812,7 @@ void free_monitor_pagetable(struct vcpu 
  115.56      }
  115.57  
  115.58      if ( v->vcpu_id == 0 )
  115.59 -        free_p2m_table(v);
  115.60 +        free_p2m_table(v->domain);
  115.61  
  115.62      /*
  115.63       * Then free monitor_table.
  115.64 @@ -808,8 +826,8 @@ void free_monitor_pagetable(struct vcpu 
  115.65  }
  115.66  
  115.67  static int
  115.68 -map_p2m_entry(
  115.69 -    l1_pgentry_t *l1tab, unsigned long va, unsigned long gpa, unsigned long mfn)
  115.70 +map_p2m_entry(l1_pgentry_t *l1tab, unsigned long va,
  115.71 +              unsigned long gpa, unsigned long mfn)
  115.72  {
  115.73      unsigned long *l0tab = NULL;
  115.74      l1_pgentry_t l1e = { 0 };
  115.75 @@ -820,27 +838,22 @@ map_p2m_entry(
  115.76      {
  115.77          page = alloc_domheap_page(NULL);
  115.78          if ( !page )
  115.79 -            goto fail;
  115.80 -
  115.81 -        if ( l0tab  )
  115.82 -            unmap_domain_page(l0tab);
  115.83 +            return 0;
  115.84 +
  115.85          l0tab = map_domain_page(page_to_mfn(page));
  115.86 -        memset(l0tab, 0, PAGE_SIZE );
  115.87 +        memset(l0tab, 0, PAGE_SIZE);
  115.88 +
  115.89          l1e = l1tab[l1_table_offset(va)] =
  115.90              l1e_from_page(page, __PAGE_HYPERVISOR);
  115.91      }
  115.92 -    else if ( l0tab == NULL)
  115.93 +    else
  115.94          l0tab = map_domain_page(l1e_get_pfn(l1e));
  115.95  
  115.96 -    l0tab[gpa & ((PAGE_SIZE / sizeof (mfn)) - 1) ] = mfn;
  115.97 -
  115.98 -    if ( l0tab )
  115.99 -        unmap_domain_page(l0tab);
 115.100 +    l0tab[gpa & ((PAGE_SIZE / sizeof(mfn)) - 1)] = mfn;
 115.101 +
 115.102 +    unmap_domain_page(l0tab);
 115.103  
 115.104      return 1;
 115.105 -
 115.106 -fail:
 115.107 -    return 0;
 115.108  }
 115.109  
 115.110  int
 115.111 @@ -853,7 +866,6 @@ set_p2m_entry(struct domain *d, unsigned
 115.112      l1_pgentry_t *l1;
 115.113      struct page_info *l1page;
 115.114      unsigned long va = pfn << PAGE_SHIFT;
 115.115 -    int error;
 115.116  
 115.117      if ( shadow_mode_external(d) )
 115.118      {
 115.119 @@ -877,6 +889,7 @@ set_p2m_entry(struct domain *d, unsigned
 115.120  
 115.121      if ( shadow_mode_external(d) )
 115.122      {
 115.123 +        int error;
 115.124          l1_pgentry_t *l1tab = NULL;
 115.125          l2_pgentry_t l2e;
 115.126  
 115.127 @@ -885,14 +898,13 @@ set_p2m_entry(struct domain *d, unsigned
 115.128          ASSERT( l2e_get_flags(l2e) & _PAGE_PRESENT );
 115.129  
 115.130          l1tab = map_domain_page(l2e_get_pfn(l2e));
 115.131 -        error = map_p2m_entry(l1tab, va, pfn, mfn);
 115.132 -        if ( !error )
 115.133 -            domain_crash_synchronous(); 
 115.134 +        if ( !(error = map_p2m_entry(l1tab, va, pfn, mfn)) )
 115.135 +            domain_crash(d);
 115.136  
 115.137          unmap_domain_page(l1tab);
 115.138          unmap_domain_page_with_cache(l2, l2cache);
 115.139  
 115.140 -        return 1;
 115.141 +        return error;
 115.142      }
 115.143  
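
The error handling in set_p2m_entry() changes shape as well as value: a map_p2m_entry() failure now requests an asynchronous domain_crash() instead of domain_crash_synchronous(), so execution still reaches the unmaps, and the caller sees 0 where it previously always saw 1. In outline (names as above):

    if ( !(error = map_p2m_entry(l1tab, va, pfn, mfn)) )
        domain_crash(d);        /* deferred: the cleanup below still runs */

    unmap_domain_page(l1tab);
    unmap_domain_page_with_cache(l2, l2cache);

    return error;               /* 1 on success, 0 after a crash request */
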
 115.144      /*
 115.145 @@ -926,7 +938,7 @@ set_p2m_entry(struct domain *d, unsigned
 115.146      return 1;
 115.147  }
 115.148  
 115.149 -int
 115.150 +static int
 115.151  alloc_p2m_table(struct domain *d)
 115.152  {
 115.153      struct list_head *list_ent;
 115.154 @@ -937,7 +949,7 @@ alloc_p2m_table(struct domain *d)
 115.155      l2_pgentry_t l2e = { 0 };
 115.156      struct page_info *page;
 115.157      unsigned long gpfn, mfn;
 115.158 -    int error;
 115.159 +    int error = 0;
 115.160  
 115.161      if ( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) )
 115.162      {
 115.163 @@ -955,6 +967,9 @@ alloc_p2m_table(struct domain *d)
 115.164          }
 115.165          else
 115.166              l1tab = map_domain_page(l2e_get_pfn(l2e));
 115.167 +
 115.168 +        if ( l2tab )
 115.169 +            unmap_domain_page(l2tab);
 115.170      }
 115.171      else
 115.172      {
 115.173 @@ -972,23 +987,23 @@ alloc_p2m_table(struct domain *d)
 115.174          page = list_entry(list_ent, struct page_info, list);
 115.175          mfn = page_to_mfn(page);
 115.176  
 115.177 -        error = map_p2m_entry(l1tab, va, gpfn, mfn);
 115.178 -        if ( !error )
 115.179 -            domain_crash_synchronous(); 
 115.180 +        if ( !(error = map_p2m_entry(l1tab, va, gpfn, mfn)) )
 115.181 +        {
 115.182 +            domain_crash(d);
 115.183 +            break;
 115.184 +        }
 115.185  
 115.186          list_ent = frame_table[mfn].list.next;
 115.187          va += sizeof(mfn);
 115.188      }
 115.189  
 115.190 -    if (l2tab)
 115.191 -        unmap_domain_page(l2tab);
 115.192      unmap_domain_page(l1tab);
 115.193  
 115.194 -    return 1;
 115.195 +    return error;
 115.196  }
 115.197  
 115.198 -static void 
 115.199 -free_p2m_table(struct vcpu *v)
 115.200 +static void
 115.201 +free_p2m_table(struct domain *d)
 115.202  {
 115.203      unsigned long va;
 115.204      l2_pgentry_t *l2tab;
 115.205 @@ -996,10 +1011,10 @@ free_p2m_table(struct vcpu *v)
 115.206      l2_pgentry_t l2e;
 115.207      l1_pgentry_t l1e;
 115.208  
 115.209 -    ASSERT ( pagetable_get_pfn(v->arch.monitor_table) );
 115.210 +    ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
 115.211  
 115.212      l2tab = map_domain_page(
 115.213 -        pagetable_get_pfn(v->arch.monitor_table));
 115.214 +        pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
 115.215  
 115.216      for ( va = RO_MPT_VIRT_START; va < RO_MPT_VIRT_END; )
 115.217      {
 115.218 @@ -1015,11 +1030,13 @@ free_p2m_table(struct vcpu *v)
 115.219  
 115.220                  if ( l1e_get_flags(l1e) & _PAGE_PRESENT )
 115.221                      free_domheap_page(mfn_to_page(l1e_get_pfn(l1e)));
 115.222 -                va += PAGE_SIZE; 
 115.223 +                va += PAGE_SIZE;
 115.224              }
 115.225              unmap_domain_page(l1tab);
 115.226              free_domheap_page(mfn_to_page(l2e_get_pfn(l2e)));
 115.227          }
 115.228 +        else
 115.229 +            va += PAGE_SIZE * L1_PAGETABLE_ENTRIES;
 115.230      }
 115.231      unmap_domain_page(l2tab);
 115.232  }
 115.233 @@ -1246,7 +1263,7 @@ int __shadow_mode_enable(struct domain *
 115.234  
 115.235      if ( shadow_mode_refcounts(d) )
 115.236      {
 115.237 -        struct list_head *list_ent; 
 115.238 +        struct list_head *list_ent;
 115.239          struct page_info *page;
 115.240  
 115.241          /*
   116.1 --- a/xen/arch/x86/shadow_public.c	Wed Mar 01 10:01:54 2006 -0700
   116.2 +++ b/xen/arch/x86/shadow_public.c	Wed Mar 01 12:47:25 2006 -0700
   116.3 @@ -31,7 +31,8 @@
   116.4  #include <xen/trace.h>
   116.5  #include <asm/shadow_64.h>
   116.6  
   116.7 -static void free_p2m_table(struct vcpu *v);
   116.8 +static int alloc_p2m_table(struct domain *d);
   116.9 +static void free_p2m_table(struct domain *d);
  116.10  
  116.11  #define SHADOW_MAX_GUEST32(_encoded) ((L1_PAGETABLE_ENTRIES_32 - 1) - ((_encoded) >> 16))
  116.12  
  116.13 @@ -328,6 +329,23 @@ static void alloc_monitor_pagetable(stru
  116.14  
  116.15      if ( v->vcpu_id == 0 )
  116.16          alloc_p2m_table(d);
  116.17 +    else
  116.18 +    {
  116.19 +        unsigned long mfn;
  116.20 +
  116.21 +        mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
  116.22 +        if ( mfn )
  116.23 +        {
  116.24 +            l4_pgentry_t *l4tab;
  116.25 +
  116.26 +            l4tab = map_domain_page(mfn);
  116.27 +
  116.28 +            mpl4e[l4_table_offset(RO_MPT_VIRT_START)] =
  116.29 +                l4tab[l4_table_offset(RO_MPT_VIRT_START)];
  116.30 +
  116.31 +            unmap_domain_page(l4tab);
  116.32 +        }
  116.33 +    }
  116.34  }
  116.35  
  116.36  void free_monitor_pagetable(struct vcpu *v)
  116.37 @@ -338,7 +356,7 @@ void free_monitor_pagetable(struct vcpu 
  116.38       * free monitor_table.
  116.39       */
  116.40      if ( v->vcpu_id == 0 )
  116.41 -        free_p2m_table(v);
  116.42 +        free_p2m_table(v->domain);
  116.43  
  116.44      /*
  116.45       * Then free monitor_table.
  116.46 @@ -397,13 +415,49 @@ static void alloc_monitor_pagetable(stru
  116.47              l2e_empty();
  116.48      mpl2e[l2_table_offset(RO_MPT_VIRT_START)] = l2e_empty();
  116.49  
  116.50 -    unmap_domain_page(mpl2e);
  116.51 -
  116.52      v->arch.monitor_table = mk_pagetable(m3mfn << PAGE_SHIFT); /* < 4GB */
  116.53      v->arch.monitor_vtable = (l2_pgentry_t *) mpl3e;
  116.54  
  116.55      if ( v->vcpu_id == 0 )
  116.56          alloc_p2m_table(d);
  116.57 +    else
  116.58 +    {
  116.59 +        unsigned long mfn;
  116.60 +
  116.61 +        mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
  116.62 +        if ( mfn )
  116.63 +        {
  116.64 +            l3_pgentry_t *l3tab, l3e;
  116.65 +            l2_pgentry_t *l2tab;
  116.66 +
  116.67 +            l3tab = map_domain_page(mfn);
  116.68 +            l3e = l3tab[l3_table_offset(RO_MPT_VIRT_START)];
  116.69 +
  116.70 +            /*
  116.71 +             * NB: when CONFIG_PAGING_LEVELS == 3,
  116.72 +             * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here.
  116.73 +             * alloc_monitor_pagetable should guarantee this.
  116.74 +             */
  116.75 +            if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
  116.76 +                BUG();
  116.77 +
  116.78 +            l2tab = map_domain_page(l3e_get_pfn(l3e));
  116.79 +
  116.80 +            /*
  116.81 +             * Just one l2 slot is used here, so at most 2M for p2m table:
  116.82 +             *      ((4K * 512)/sizeof(unsigned long)) * 4K = 2G
  116.83 +             * should be OK on PAE xen, since Qemu DM can only map 1.5G VMX
  116.84 +             * guest memory.
  116.85 +             */
  116.86 +            mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
  116.87 +                l2tab[l2_table_offset(RO_MPT_VIRT_START)];
  116.88 +
  116.89 +            unmap_domain_page(l2tab);
  116.90 +            unmap_domain_page(l3tab);
  116.91 +        }
  116.92 +    }
  116.93 +
  116.94 +    unmap_domain_page(mpl2e);
  116.95  }
  116.96  
  116.97  void free_monitor_pagetable(struct vcpu *v)
  116.98 @@ -413,7 +467,7 @@ void free_monitor_pagetable(struct vcpu 
  116.99       * free monitor_table.
 116.100       */
 116.101      if ( v->vcpu_id == 0 )
 116.102 -        free_p2m_table(v);
 116.103 +        free_p2m_table(v->domain);
 116.104  
 116.105      m3mfn = pagetable_get_pfn(v->arch.monitor_table);
 116.106      m2mfn = l2e_get_pfn(v->arch.monitor_vtable[L3_PAGETABLE_ENTRIES - 1]);
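
Unpacking the sizing comment above (editorial; sizeof(unsigned long) == 4 on PAE x86-32): one L2 slot maps one L1 table, i.e. 512 entries x 4KiB = 2MiB of p2m table, and each 4-byte p2m entry covers one 4KiB guest frame, so

    (4K * 512) / sizeof(unsigned long) * 4K
      = 2M / 4 * 4K
      = 512Ki entries * 4KiB
      = 2GiB of guest pseudo-physical space,

comfortably above the ~1.5G that the comment says the qemu device model can map for a VMX guest.
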
 116.107 @@ -1348,14 +1402,14 @@ int _shadow_mode_refcounts(struct domain
 116.108  }
 116.109  
 116.110  static int
 116.111 -map_p2m_entry(
 116.112 -    pgentry_64_t *top_tab, unsigned long va, unsigned long gpa, unsigned long mfn)
 116.113 +map_p2m_entry(pgentry_64_t *top_tab, unsigned long va,
 116.114 +              unsigned long gpfn, unsigned long mfn)
 116.115  {
 116.116  #if CONFIG_PAGING_LEVELS >= 4
 116.117      pgentry_64_t l4e = { 0 };
 116.118 +    pgentry_64_t *l3tab = NULL;
 116.119  #endif
 116.120  #if CONFIG_PAGING_LEVELS >= 3
 116.121 -    pgentry_64_t *l3tab = NULL;
 116.122      pgentry_64_t l3e = { 0 };
 116.123  #endif
 116.124      l2_pgentry_t *l2tab = NULL;
 116.125 @@ -1367,7 +1421,7 @@ map_p2m_entry(
 116.126  
 116.127  #if CONFIG_PAGING_LEVELS >= 4
 116.128      l4e = top_tab[l4_table_offset(va)];
 116.129 -    if ( !(entry_get_flags(l4e) & _PAGE_PRESENT) ) 
 116.130 +    if ( !(entry_get_flags(l4e) & _PAGE_PRESENT) )
 116.131      {
 116.132          page = alloc_domheap_page(NULL);
 116.133          if ( !page )
 116.134 @@ -1375,17 +1429,14 @@ map_p2m_entry(
 116.135  
 116.136          l3tab = map_domain_page(page_to_mfn(page));
 116.137          memset(l3tab, 0, PAGE_SIZE);
 116.138 -        l4e = top_tab[l4_table_offset(va)] = 
 116.139 +        l4e = top_tab[l4_table_offset(va)] =
 116.140              entry_from_page(page, __PAGE_HYPERVISOR);
 116.141 -    } 
 116.142 -    else if ( l3tab == NULL)
 116.143 +    }
 116.144 +    else
 116.145          l3tab = map_domain_page(entry_get_pfn(l4e));
 116.146  
 116.147      l3e = l3tab[l3_table_offset(va)];
 116.148 -#else
 116.149 -    l3e = top_tab[l3_table_offset(va)];
 116.150 -#endif
 116.151 -    if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) ) 
 116.152 +    if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) )
 116.153      {
 116.154          page = alloc_domheap_page(NULL);
 116.155          if ( !page )
 116.156 @@ -1393,14 +1444,29 @@ map_p2m_entry(
 116.157  
 116.158          l2tab = map_domain_page(page_to_mfn(page));
 116.159          memset(l2tab, 0, PAGE_SIZE);
 116.160 -        l3e = l3tab[l3_table_offset(va)] = 
 116.161 +        l3e = l3tab[l3_table_offset(va)] =
 116.162              entry_from_page(page, __PAGE_HYPERVISOR);
 116.163 -    } 
 116.164 -    else if ( l2tab == NULL) 
 116.165 +    }
 116.166 +    else
 116.167          l2tab = map_domain_page(entry_get_pfn(l3e));
 116.168  
 116.169 +    unmap_domain_page(l3tab);
 116.170 +#else
 116.171 +    l3e = top_tab[l3_table_offset(va)];
 116.172 +
 116.173 +    /*
 116.174 +     * NB: when CONFIG_PAGING_LEVELS == 3,
 116.175 +     * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here.
 116.176 +     * alloc_monitor_pagetable should guarantee this.
 116.177 +     */
 116.178 +    if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) )
 116.179 +        BUG();
 116.180 +
 116.181 +    l2tab = map_domain_page(entry_get_pfn(l3e));
 116.182 +#endif
 116.183 +
 116.184      l2e = l2tab[l2_table_offset(va)];
 116.185 -    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) 
 116.186 +    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
 116.187      {
 116.188          page = alloc_domheap_page(NULL);
 116.189          if ( !page )
 116.190 @@ -1408,14 +1474,16 @@ map_p2m_entry(
 116.191  
 116.192          l1tab = map_domain_page(page_to_mfn(page));
 116.193          memset(l1tab, 0, PAGE_SIZE);
 116.194 -        l2e = l2tab[l2_table_offset(va)] = 
 116.195 +        l2e = l2tab[l2_table_offset(va)] =
 116.196              l2e_from_page(page, __PAGE_HYPERVISOR);
 116.197 -    } 
 116.198 -    else if ( l1tab == NULL) 
 116.199 +    }
 116.200 +    else
 116.201          l1tab = map_domain_page(l2e_get_pfn(l2e));
 116.202  
 116.203 +    unmap_domain_page(l2tab);
 116.204 +
 116.205      l1e = l1tab[l1_table_offset(va)];
 116.206 -    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) ) 
 116.207 +    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
 116.208      {
 116.209          page = alloc_domheap_page(NULL);
 116.210          if ( !page )
 116.211 @@ -1423,96 +1491,88 @@ map_p2m_entry(
 116.212  
 116.213          l0tab = map_domain_page(page_to_mfn(page));
 116.214          memset(l0tab, 0, PAGE_SIZE);
 116.215 -        l1e = l1tab[l1_table_offset(va)] = 
 116.216 +        l1e = l1tab[l1_table_offset(va)] =
 116.217              l1e_from_page(page, __PAGE_HYPERVISOR);
 116.218      }
 116.219 -    else if ( l0tab == NULL) 
 116.220 +    else
 116.221          l0tab = map_domain_page(l1e_get_pfn(l1e));
 116.222  
 116.223 -    l0tab[gpa & ((PAGE_SIZE / sizeof (mfn)) - 1) ] = mfn;
 116.224 +    unmap_domain_page(l1tab);
 116.225  
 116.226 -    if ( l2tab )
 116.227 -    {
 116.228 -        unmap_domain_page(l2tab);
 116.229 -        l2tab = NULL;
 116.230 -    }
 116.231 -    if ( l1tab )
 116.232 -    {
 116.233 -        unmap_domain_page(l1tab);
 116.234 -        l1tab = NULL;
 116.235 -    }
 116.236 -    if ( l0tab )
 116.237 -    {
 116.238 -        unmap_domain_page(l0tab);
 116.239 -        l0tab = NULL;
 116.240 -    }
 116.241 +    l0tab[gpfn & ((PAGE_SIZE / sizeof (mfn)) - 1) ] = mfn;
 116.242 +
 116.243 +    unmap_domain_page(l0tab);
 116.244  
 116.245      return 1;
 116.246  
 116.247  nomem:
 116.248 -
 116.249      return 0;
 116.250  }
 116.251  
 116.252  int
 116.253 -set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn,
 116.254 +set_p2m_entry(struct domain *d, unsigned long gpfn, unsigned long mfn,
 116.255                struct domain_mmap_cache *l2cache,
 116.256                struct domain_mmap_cache *l1cache)
 116.257  {
 116.258 -    unsigned long tabpfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
 116.259 -    pgentry_64_t *top;
 116.260 -    unsigned long va = RO_MPT_VIRT_START + (pfn * sizeof (unsigned long));
 116.261 +    unsigned long tabmfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
 116.262 +    unsigned long va = RO_MPT_VIRT_START + (gpfn * sizeof(unsigned long));
 116.263 +    pgentry_64_t *top_tab;
 116.264      int error;
 116.265  
 116.266 -    ASSERT(tabpfn != 0);
 116.267 +    ASSERT(tabmfn != 0);
 116.268      ASSERT(shadow_lock_is_acquired(d));
 116.269  
 116.270 -    top = map_domain_page_with_cache(tabpfn, l2cache);
 116.271 -    error = map_p2m_entry(top, va, pfn, mfn);
 116.272 -    unmap_domain_page_with_cache(top, l2cache);
 116.273 +    top_tab = map_domain_page_with_cache(tabmfn, l2cache);
 116.274  
 116.275 -    if ( !error )
 116.276 -         domain_crash_synchronous();
 116.277 -        
 116.278 -    return 1;
 116.279 +    if ( !(error = map_p2m_entry(top_tab, va, gpfn, mfn)) )
 116.280 +        domain_crash(d);
 116.281 +
 116.282 +    unmap_domain_page_with_cache(top_tab, l2cache);
 116.283 +
 116.284 +    return error;
 116.285  }
 116.286  
 116.287 -int
 116.288 +static int
 116.289  alloc_p2m_table(struct domain *d)
 116.290  {
 116.291      struct list_head *list_ent;
 116.292      unsigned long va = RO_MPT_VIRT_START; /*  phys_to_machine_mapping */
 116.293      pgentry_64_t *top_tab = NULL;
 116.294      unsigned long mfn;
 116.295 -    int gpa;
 116.296 +    int gpfn, error = 0;
 116.297  
 116.298 -    ASSERT ( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
 116.299 +    ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
 116.300  
 116.301      top_tab = map_domain_page(
 116.302          pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
 116.303  
 116.304 -
 116.305      list_ent = d->page_list.next;
 116.306  
 116.307 -    for ( gpa = 0; list_ent != &d->page_list; gpa++ ) 
 116.308 +    for ( gpfn = 0; list_ent != &d->page_list; gpfn++ )
 116.309      {
 116.310          struct page_info *page;
 116.311 +
 116.312          page = list_entry(list_ent, struct page_info, list);
 116.313          mfn = page_to_mfn(page);
 116.314  
 116.315 -        map_p2m_entry(top_tab, va, gpa, mfn);
 116.316 +        if ( !(error = map_p2m_entry(top_tab, va, gpfn, mfn)) )
 116.317 +        {
 116.318 +            domain_crash(d);
 116.319 +            break;
 116.320 +        }
 116.321 +
 116.322          list_ent = frame_table[mfn].list.next;
 116.323          va += sizeof(mfn);
 116.324      }
 116.325  
 116.326      unmap_domain_page(top_tab);
 116.327  
 116.328 -    return 1;
 116.329 +    return error;
 116.330  }
 116.331  
 116.332  #if CONFIG_PAGING_LEVELS >= 3
 116.333  static void
 116.334 -free_p2m_table(struct vcpu *v)
 116.335 +free_p2m_table(struct domain *d)
 116.336  {
 116.337      unsigned long va;
 116.338      l1_pgentry_t *l1tab;
 116.339 @@ -1520,27 +1580,35 @@ free_p2m_table(struct vcpu *v)
 116.340      l2_pgentry_t *l2tab;
 116.341      l2_pgentry_t l2e;
 116.342  #if CONFIG_PAGING_LEVELS >= 3
 116.343 -    l3_pgentry_t *l3tab; 
 116.344 +    l3_pgentry_t *l3tab;
 116.345      l3_pgentry_t l3e;
 116.346  #endif
 116.347  #if CONFIG_PAGING_LEVELS == 4
 116.348      int i3;
 116.349 -    l4_pgentry_t *l4tab; 
 116.350 +    l4_pgentry_t *l4tab;
 116.351      l4_pgentry_t l4e;
 116.352  #endif
 116.353  
 116.354 -    ASSERT ( pagetable_get_pfn(v->arch.monitor_table) );
 116.355 +    ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
 116.356  
 116.357  #if CONFIG_PAGING_LEVELS == 4
 116.358      l4tab = map_domain_page(
 116.359 -        pagetable_get_pfn(v->arch.monitor_table));
 116.360 +        pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
 116.361  #endif
 116.362  #if CONFIG_PAGING_LEVELS == 3
 116.363      l3tab = map_domain_page(
 116.364 -        pagetable_get_pfn(v->arch.monitor_table));
 116.365 +        pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
 116.366  
 116.367 -    va = RO_MPT_VIRT_START;
 116.368 -    l3e = l3tab[l3_table_offset(va)];
 116.369 +    l3e = l3tab[l3_table_offset(RO_MPT_VIRT_START)];
 116.370 +
 116.371 +    /*
 116.372 +     * NB: when CONFIG_PAGING_LEVELS == 3,
 116.373 +     * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here.
 116.374 +     * alloc_monitor_pagetable should guarantee this.
 116.375 +     */
 116.376 +    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
 116.377 +        BUG();
 116.378 +
 116.379      l2tab = map_domain_page(l3e_get_pfn(l3e));
 116.380  #endif
 116.381  
 116.382 @@ -1555,8 +1623,8 @@ free_p2m_table(struct vcpu *v)
 116.383  
 116.384              for ( i3 = 0; i3 < L3_PAGETABLE_ENTRIES; i3++ )
 116.385              {
 116.386 +                l3e = l3tab[l3_table_offset(va)];
 116.387  
 116.388 -                l3e = l3tab[l3_table_offset(va)];
 116.389                  if ( l3e_get_flags(l3e) & _PAGE_PRESENT )
 116.390                  {
 116.391                      int i2;
 116.392 @@ -1567,12 +1635,13 @@ free_p2m_table(struct vcpu *v)
 116.393                      {
 116.394  #endif
 116.395                          l2e = l2tab[l2_table_offset(va)];
 116.396 +
 116.397                          if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
 116.398                          {
 116.399                              int i1;
 116.400  
 116.401                              l1tab = map_domain_page(l2e_get_pfn(l2e));
 116.402 -                            
 116.403 +
 116.404                              /*
 116.405                               * unsigned long phys_to_machine_mapping[]
 116.406                               */
 116.407 @@ -1591,7 +1660,7 @@ free_p2m_table(struct vcpu *v)
 116.408                          else
 116.409                              va += PAGE_SIZE * L1_PAGETABLE_ENTRIES;
 116.410  
 116.411 -#if CONFIG_PAGING_LEVELS == 4                    
 116.412 +#if CONFIG_PAGING_LEVELS == 4
 116.413                      }
 116.414                      unmap_domain_page(l2tab);
 116.415                      free_domheap_page(mfn_to_page(l3e_get_pfn(l3e)));
 116.416 @@ -1603,7 +1672,7 @@ free_p2m_table(struct vcpu *v)
 116.417              free_domheap_page(mfn_to_page(l4e_get_pfn(l4e)));
 116.418          }
 116.419          else
 116.420 -            va += PAGE_SIZE * 
 116.421 +            va += PAGE_SIZE *
 116.422                  L1_PAGETABLE_ENTRIES * L2_PAGETABLE_ENTRIES * L3_PAGETABLE_ENTRIES;
 116.423  #endif
 116.424      }
 116.425 @@ -1622,7 +1691,7 @@ void shadow_l1_normal_pt_update(
 116.426      paddr_t pa, l1_pgentry_t gpte,
 116.427      struct domain_mmap_cache *cache)
 116.428  {
 116.429 -    unsigned long sl1mfn;    
 116.430 +    unsigned long sl1mfn;
 116.431      l1_pgentry_t *spl1e, spte;
 116.432  
 116.433      shadow_lock(d);
   117.1 --- a/xen/arch/x86/traps.c	Wed Mar 01 10:01:54 2006 -0700
   117.2 +++ b/xen/arch/x86/traps.c	Wed Mar 01 12:47:25 2006 -0700
   117.3 @@ -951,6 +951,7 @@ static int emulate_privileged_op(struct 
   117.4              
   117.5          case 3: /* Write CR3 */
   117.6              LOCK_BIGLOCK(v->domain);
   117.7 +            cleanup_writable_pagetable(v->domain);
   117.8              (void)new_guest_cr3(gmfn_to_mfn(v->domain, paddr_to_pfn(*reg)));
   117.9              UNLOCK_BIGLOCK(v->domain);
  117.10              break;
  117.11 @@ -1002,7 +1003,6 @@ static int emulate_privileged_op(struct 
  117.12  #endif
  117.13          default:
  117.14              if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
  117.15 -                 (regs->ecx != MSR_EFER) ||
  117.16                   (regs->eax != l) || (regs->edx != h) )
  117.17                  DPRINTK("Domain attempted WRMSR %p from "
  117.18                          "%08x:%08x to %08lx:%08lx.\n",
  117.19 @@ -1033,8 +1033,8 @@ static int emulate_privileged_op(struct 
  117.20                  goto fail;
  117.21              break;
  117.22          default:
  117.23 -            DPRINTK("Domain attempted RDMSR %p.\n", _p(regs->ecx));
  117.24              /* Everyone can read the MSR space. */
  117.25 +            /*DPRINTK("Domain attempted RDMSR %p.\n", _p(regs->ecx));*/
  117.26              if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
  117.27                  goto fail;
  117.28              break;
  117.29 @@ -1416,8 +1416,8 @@ long do_set_trap_table(struct trap_info 
  117.30      {
  117.31          if ( hypercall_preempt_check() )
  117.32          {
  117.33 -            rc = hypercall1_create_continuation(
  117.34 -                __HYPERVISOR_set_trap_table, traps);
  117.35 +            rc = hypercall_create_continuation(
  117.36 +                __HYPERVISOR_set_trap_table, "p", traps);
  117.37              break;
  117.38          }
  117.39  
  117.40 @@ -1430,7 +1430,7 @@ long do_set_trap_table(struct trap_info 
  117.41          if ( cur.address == 0 )
  117.42              break;
  117.43  
  117.44 -        fixup_guest_selector(cur.cs);
  117.45 +        fixup_guest_code_selector(cur.cs);
  117.46  
  117.47          memcpy(&dst[cur.vector], &cur, sizeof(cur));
  117.48  
   118.1 --- a/xen/arch/x86/x86_32/asm-offsets.c	Wed Mar 01 10:01:54 2006 -0700
   118.2 +++ b/xen/arch/x86/x86_32/asm-offsets.c	Wed Mar 01 12:47:25 2006 -0700
   118.3 @@ -72,6 +72,13 @@ void __dummy__(void)
   118.4      DEFINE(_VCPUF_nmi_masked, _VCPUF_nmi_masked);
   118.5      BLANK();
   118.6  
   118.7 +    OFFSET(TSS_ss0, struct tss_struct, ss0);
   118.8 +    OFFSET(TSS_esp0, struct tss_struct, esp0);
   118.9 +    OFFSET(TSS_ss1, struct tss_struct, ss1);
  118.10 +    OFFSET(TSS_esp1, struct tss_struct, esp1);
  118.11 +    DEFINE(TSS_sizeof, sizeof(struct tss_struct));
  118.12 +    BLANK();
  118.13 +
  118.14      OFFSET(VCPU_svm_vmcb_pa, struct vcpu, arch.hvm_svm.vmcb_pa);
  118.15      OFFSET(VCPU_svm_hsa_pa,  struct vcpu, arch.hvm_svm.host_save_pa);
  118.16      OFFSET(VCPU_svm_vmcb, struct vcpu, arch.hvm_svm.vmcb);
   119.1 --- a/xen/arch/x86/x86_32/entry.S	Wed Mar 01 10:01:54 2006 -0700
   119.2 +++ b/xen/arch/x86/x86_32/entry.S	Wed Mar 01 12:47:25 2006 -0700
   119.3 @@ -77,6 +77,13 @@
   119.4  restore_all_guest:
   119.5          testl $X86_EFLAGS_VM,UREGS_eflags(%esp)
   119.6          jnz  restore_all_vm86
   119.7 +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
   119.8 +        testl $2,UREGS_cs(%esp)
   119.9 +        jnz   1f
  119.10 +        call  restore_ring0_guest
  119.11 +        jmp   restore_all_vm86
  119.12 +1:
  119.13 +#endif
  119.14  FLT1:   mov  UREGS_ds(%esp),%ds
  119.15  FLT2:   mov  UREGS_es(%esp),%es
  119.16  FLT3:   mov  UREGS_fs(%esp),%fs
  119.17 @@ -157,6 +164,7 @@ restore_all_xen:
  119.18          ALIGN
  119.19  ENTRY(hypercall)
  119.20          subl $4,%esp
  119.21 +        FIXUP_RING0_GUEST_STACK
  119.22  	SAVE_ALL(b)
  119.23          sti
  119.24          GET_CURRENT(%ebx)
  119.25 @@ -294,6 +302,11 @@ FLT14:  movl %eax,%gs:(%esi)
  119.26          popl %eax
  119.27          shll $16,%eax                    # Bits 16-23: saved_upcall_mask
  119.28          movw UREGS_cs+4(%esp),%ax        # Bits  0-15: CS
  119.29 +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
  119.30 +        testw $2,%ax
  119.31 +        jnz  FLT15
  119.32 +        and  $~3,%ax                     # RPL 1 -> RPL 0
  119.33 +#endif
  119.34  FLT15:  movl %eax,%gs:4(%esi) 
  119.35          test $0x00FF0000,%eax            # Bits 16-23: saved_upcall_mask
  119.36          setz %ch                         # %ch == !saved_upcall_mask
  119.37 @@ -388,6 +401,7 @@ ENTRY(divide_error)
  119.38  	pushl $TRAP_divide_error<<16
  119.39  	ALIGN
  119.40  error_code:
  119.41 +        FIXUP_RING0_GUEST_STACK
  119.42          SAVE_ALL_NOSEGREGS(a)
  119.43          SET_XEN_SEGMENTS(a)
  119.44          testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%esp)
  119.45 @@ -505,6 +519,10 @@ ENTRY(spurious_interrupt_bug)
  119.46  	jmp error_code
  119.47  
  119.48  ENTRY(nmi)
  119.49 +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
  119.50 +        # NMI entry protocol is incompatible with a guest kernel in ring 0.
  119.51 +        iret
  119.52 +#else
  119.53          # Save state but do not trash the segment registers!
  119.54          # We may otherwise be unable to reload them or copy them to ring 1. 
  119.55  	pushl %eax
  119.56 @@ -546,6 +564,7 @@ 1:      movl  %ss:APIC_ICR(%eax),%ebx
  119.57          movl  $(APIC_DM_FIXED | APIC_DEST_SELF | APIC_DEST_LOGICAL | \
  119.58                  TRAP_deferred_nmi),%ss:APIC_ICR(%eax)
  119.59          jmp   restore_all_xen
  119.60 +#endif /* !CONFIG_X86_SUPERVISOR_MODE_KERNEL */
  119.61  
  119.62  ENTRY(setup_vm86_frame)
  119.63          # Copies the entire stack frame forwards by 16 bytes.
   120.1 --- a/xen/arch/x86/x86_32/mm.c	Wed Mar 01 10:01:54 2006 -0700
   120.2 +++ b/xen/arch/x86/x86_32/mm.c	Wed Mar 01 12:47:25 2006 -0700
   120.3 @@ -23,6 +23,7 @@
   120.4  #include <xen/init.h>
   120.5  #include <xen/mm.h>
   120.6  #include <xen/sched.h>
   120.7 +#include <xen/guest_access.h>
   120.8  #include <asm/current.h>
   120.9  #include <asm/page.h>
  120.10  #include <asm/flushtlb.h>
  120.11 @@ -180,9 +181,18 @@ void subarch_init_memory(struct domain *
  120.12              page_set_owner(page, dom_xen);
  120.13          }
  120.14      }
  120.15 +
  120.16 +    if ( supervisor_mode_kernel )
  120.17 +    {
  120.18 +        /* Guest kernel runs in ring 0, not ring 1. */
  120.19 +        struct desc_struct *d;
  120.20 +        d = &gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY];
  120.21 +        d[0].b &= ~_SEGMENT_DPL;
  120.22 +        d[1].b &= ~_SEGMENT_DPL;
  120.23 +    }
  120.24  }
  120.25  
  120.26 -long subarch_memory_op(int op, void *arg)
  120.27 +long subarch_memory_op(int op, GUEST_HANDLE(void) arg)
  120.28  {
  120.29      struct xen_machphys_mfn_list xmml;
  120.30      unsigned long mfn;
  120.31 @@ -192,7 +202,7 @@ long subarch_memory_op(int op, void *arg
  120.32      switch ( op )
  120.33      {
  120.34      case XENMEM_machphys_mfn_list:
  120.35 -        if ( copy_from_user(&xmml, arg, sizeof(xmml)) )
  120.36 +        if ( copy_from_guest(&xmml, arg, 1) )
  120.37              return -EFAULT;
  120.38  
  120.39          max = min_t(unsigned int, xmml.max_extents, mpt_size >> 21);
  120.40 @@ -201,11 +211,12 @@ long subarch_memory_op(int op, void *arg
  120.41          {
  120.42              mfn = l2e_get_pfn(idle_pg_table_l2[l2_linear_offset(
  120.43                  RDWR_MPT_VIRT_START + (i << 21))]) + l1_table_offset(i << 21);
  120.44 -            if ( put_user(mfn, &xmml.extent_start[i]) )
  120.45 +            if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
  120.46                  return -EFAULT;
  120.47          }
  120.48  
  120.49 -        if ( put_user(i, &((struct xen_machphys_mfn_list *)arg)->nr_extents) )
  120.50 +        xmml.nr_extents = i;
  120.51 +        if ( copy_to_guest(arg, &xmml, 1) )
  120.52              return -EFAULT;
  120.53  
  120.54          break;
  120.55 @@ -223,7 +234,7 @@ long do_stack_switch(unsigned long ss, u
  120.56      int nr = smp_processor_id();
  120.57      struct tss_struct *t = &init_tss[nr];
  120.58  
  120.59 -    fixup_guest_selector(ss);
  120.60 +    fixup_guest_stack_selector(ss);
  120.61  
  120.62      current->arch.guest_context.kernel_ss = ss;
  120.63      current->arch.guest_context.kernel_sp = esp;
  120.64 @@ -240,6 +251,10 @@ int check_descriptor(struct desc_struct 
  120.65      u32 a = d->a, b = d->b;
  120.66      u16 cs;
  120.67  
  120.68 +    /* Let a ring0 guest kernel set any descriptor it wants to. */
  120.69 +    if ( supervisor_mode_kernel )
  120.70 +        return 1;
  120.71 +
  120.72      /* A not-present descriptor will always fault, so is safe. */
  120.73      if ( !(b & _SEGMENT_P) ) 
  120.74          goto good;
  120.75 @@ -273,7 +288,7 @@ int check_descriptor(struct desc_struct 
  120.76  
  120.77          /* Validate and fix up the target code selector. */
  120.78          cs = a >> 16;
  120.79 -        fixup_guest_selector(cs);
  120.80 +        fixup_guest_code_selector(cs);
  120.81          if ( !guest_gate_selector_okay(cs) )
  120.82              goto bad;
  120.83          a = d->a = (d->a & 0xffffU) | (cs << 16);
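
For the ring-0 kernel case, subarch_init_memory() edits the two FLAT_RING1 GDT descriptors in place. A sketch of the effect (assuming _SEGMENT_DPL masks the two-bit DPL field in the descriptor's high word, which is how it is used here):

    struct desc_struct *d =
        &gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY];
    d[0].b &= ~_SEGMENT_DPL;   /* FLAT_RING1_CS descriptor: DPL 1 -> 0 */
    d[1].b &= ~_SEGMENT_DPL;   /* the adjacent ring-1 data descriptor  */
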
   121.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   121.2 +++ b/xen/arch/x86/x86_32/supervisor_mode_kernel.S	Wed Mar 01 12:47:25 2006 -0700
   121.3 @@ -0,0 +1,145 @@
   121.4 +/*
   121.5 + * Handle stack fixup for guest running in RING 0.
   121.6 + *
   121.7 + * Copyright (c) 2006 Ian Campbell
   121.8 + *
   121.9 + * When a guest kernel is allowed to run in RING 0, a hypercall,
  121.10 + * interrupt or exception interrupting the guest kernel will not cause
  121.11 + * a privilege level change, and therefore the stack will not be
  121.12 + * switched to the Xen stack.
  121.13 + *
  121.14 + * To fix this we look for RING 0 activation frames with a stack
  121.15 + * pointer below HYPERVISOR_VIRT_START (indicating a guest kernel
  121.16 + * frame) and repair them by locating the Xen stack via the TSS
  121.17 + * and moving the activation frame to the Xen stack. In the process we
  121.18 + * convert the frame into an inter-privilege frame returning to RING 1
  121.19 + * so that we can catch and reverse the process on exit.
  121.20 + */
  121.21 +
  121.22 +#include <xen/config.h>
  121.23 +#include <asm/asm_defns.h>
  121.24 +#include <public/xen.h>
  121.25 +
  121.26 +        # Upon entry the stack should be the Xen stack and contain:
  121.27 +        #   %ss, %esp, EFLAGS, %cs|1, %eip, ERROR, SAVE_ALL, RETURN
  121.28 +        # On exit the stack should be %ss:%esp (i.e. the guest stack)
  121.29 +        # and contain:
  121.30 +        #   EFLAGS, %cs, %eip, ERROR, SAVE_ALL, RETURN
  121.31 +        ALIGN
  121.32 +ENTRY(restore_ring0_guest)
  121.33 +        # Point %gs:%esi to guest stack.
  121.34 +RRG0:   movw UREGS_ss+4(%esp),%gs
  121.35 +        movl UREGS_esp+4(%esp),%esi
  121.36 +
  121.37 +        # Copy EFLAGS...EBX, RETURN from Xen stack to guest stack.
  121.38 +        movl $(UREGS_kernel_sizeof>>2)+1,%ecx
  121.39 +
  121.40 +1:      subl $4,%esi
  121.41 +        movl -4(%esp,%ecx,4),%eax
  121.42 +RRG1:   movl %eax,%gs:(%esi)
  121.43 +        loop 1b
  121.44 +
  121.45 +RRG2:   andl $~3,%gs:UREGS_cs+4(%esi)
  121.46 +
  121.47 +        movl %gs,%eax
  121.48 +
  121.49 +        # We need to do this because these registers are not present
  121.50 +        # on the guest stack so they cannot be restored by the code in
  121.51 +        # restore_all_guest.
  121.52 +RRG3:   mov  UREGS_ds+4(%esp),%ds
  121.53 +RRG4:   mov  UREGS_es+4(%esp),%es
  121.54 +RRG5:   mov  UREGS_fs+4(%esp),%fs
  121.55 +RRG6:   mov  UREGS_gs+4(%esp),%gs
  121.56 +
  121.57 +RRG7:   movl %eax,%ss
  121.58 +        movl %esi,%esp
  121.59 +
  121.60 +        ret
  121.61 +.section __ex_table,"a"
  121.62 +        .long RRG0,domain_crash_synchronous
  121.63 +        .long RRG1,domain_crash_synchronous
  121.64 +        .long RRG2,domain_crash_synchronous
  121.65 +        .long RRG3,domain_crash_synchronous
  121.66 +        .long RRG4,domain_crash_synchronous
  121.67 +        .long RRG5,domain_crash_synchronous
  121.68 +        .long RRG6,domain_crash_synchronous
  121.69 +        .long RRG7,domain_crash_synchronous
  121.70 +.previous
  121.71 +
  121.72 +        # Upon entry the stack should be a guest stack and contain:
  121.73 +        #   EFLAGS, %cs, %eip, ERROR, RETURN
  121.74 +        # On exit the stack should be the Xen stack and contain:
  121.75 +        #   %ss, %esp, EFLAGS, %cs|1, %eip, ERROR, RETURN
  121.76 +        ALIGN
  121.77 +ENTRY(fixup_ring0_guest_stack)
  121.78 +        pushl %eax
  121.79 +        pushl %ecx
  121.80 +        pushl %ds
  121.81 +        pushl %gs
  121.82 +        pushl %esi
  121.83 +
  121.84 +        movw  $__HYPERVISOR_DS,%ax
  121.85 +        movw  %ax,%ds
  121.86 +
  121.87 +        # Point %gs:%esi to guest stack frame.
  121.88 +        movw  %ss,%ax
  121.89 +        movw  %ax,%gs
  121.90 +        movl  %esp,%esi
  121.91 +        # Account for entries on the guest stack:
  121.92 +        # * Pushed by normal exception/interrupt/hypercall mechanisms
  121.93 +        #   * EFLAGS, %cs, %eip, ERROR == 4 words.
  121.94 +        # * Pushed by the fixup routine
  121.95 +        #   * [RETURN], %eax, %ecx, %ds, %gs and %esi == 6 words.
  121.96 +        addl $((6+4)*4),%esi
  121.97 +
  121.98 +        # %gs:%esi now points to the guest stack before the
  121.99 +        # interrupt/exception occurred.
 121.100 +
 121.101 +        /*
 121.102 +         * Reverse the __TSS macro, giving us the CPU number.
 121.103 +         * The TSS for this cpu is at init_tss + ( cpu * 128 ).
 121.104 +         */
 121.105 +        str   %ecx
 121.106 +        shrl  $3,%ecx                                   # Calculate GDT index for TSS.
 121.107 +        subl  $(FIRST_RESERVED_GDT_ENTRY+8),%ecx        # %ecx = 2*cpu.
 121.108 +        shll  $6,%ecx                                   # Each TSS entry is 0x80 bytes
 121.109 +        addl  $init_tss,%ecx                            # but we have 2*cpu from above.
 121.110 +
 121.111 +        # Load Xen stack from TSS.
 121.112 +        movw  TSS_ss0(%ecx),%ax
 121.113 +TRP1:   movw  %ax,%ss
 121.114 +        movl  TSS_esp0(%ecx),%esp
 121.115 +
 121.116 +        pushl %gs
 121.117 +        pushl %esi
 121.118 +
 121.119 +        # Move EFLAGS, %cs, %eip, ERROR, RETURN, %eax, %ecx, %ds, %gs, %esi
 121.120 +        # from guest stack to Xen stack.
 121.121 +        movl  $10,%ecx
 121.122 +1:      subl  $4,%esp
 121.123 +        subl  $4,%esi
 121.124 +TRP2:   movl  %gs:(%esi),%eax
 121.125 +        movl  %eax,(%esp)
 121.126 +        loop  1b
 121.127 +
 121.128 +        # CS = CS|1 to simulate RING1 stack frame.
 121.129 +        orl   $1,32(%esp)
 121.130 +
 121.131 +        popl  %esi
 121.132 +        popl  %gs
 121.133 +        popl  %ds
 121.134 +        popl  %ecx
 121.135 +        popl  %eax
 121.136 +        ret
 121.137 +.section __ex_table,"a"
 121.138 +        .long TRP1,domain_crash_synchronous
 121.139 +        .long TRP2,domain_crash_synchronous
 121.140 +.previous
 121.141 +
 121.142 +domain_crash_synchronous_string:
 121.143 +        .asciz "domain_crash_sync called from supervisor_mode_kernel.S (%lx)\n"
 121.144 +
 121.145 +domain_crash_synchronous:
 121.146 +        pushl $domain_crash_synchronous_string
 121.147 +        call  printf
 121.148 +        jmp   __domain_crash_synchronous
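
The selector arithmetic in fixup_ring0_guest_stack above reverses the __TSS GDT mapping. Worked through (editorial; the constants are those implied by the subtraction in the code):

    /* str yields the running task's TSS selector, i.e.
     *    sel = (FIRST_RESERVED_GDT_ENTRY + 8 + 2*cpu) << 3
     * so, step by step:
     *    sel >> 3                          ->  FIRST_RESERVED_GDT_ENTRY + 8 + 2*cpu
     *    - (FIRST_RESERVED_GDT_ENTRY + 8)  ->  2*cpu
     *    << 6                              ->  2*cpu * 64 == cpu * 0x80
     *    + init_tss                        ->  &init_tss[cpu]
     * since each struct tss_struct occupies 0x80 (128) bytes. */
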
   122.1 --- a/xen/arch/x86/x86_32/traps.c	Wed Mar 01 10:01:54 2006 -0700
   122.2 +++ b/xen/arch/x86/x86_32/traps.c	Wed Mar 01 12:47:25 2006 -0700
   122.3 @@ -256,8 +256,14 @@ void init_int80_direct_trap(struct vcpu 
   122.4       * We can't virtualise interrupt gates, as there's no way to get
   122.5       * the CPU to automatically clear the events_mask variable. Also we
   122.6       * must ensure that the CS is safe to poke into an interrupt gate.
   122.7 +     *
   122.8 +     * When running with supervisor_mode_kernel enabled a direct trap
   122.9 +     * to the guest OS cannot be used because the INT instruction will
  122.10 +     * switch to the Xen stack and we need to swap back to the guest
  122.11 +     * kernel stack before passing control to the system call entry point.
  122.12       */
  122.13 -    if ( TI_GET_IF(ti) || !guest_gate_selector_okay(ti->cs) )
  122.14 +    if ( TI_GET_IF(ti) || !guest_gate_selector_okay(ti->cs) ||
  122.15 +         supervisor_mode_kernel )
  122.16      {
  122.17          v->arch.int80_desc.a = v->arch.int80_desc.b = 0;
  122.18          return;
  122.19 @@ -278,8 +284,8 @@ long do_set_callbacks(unsigned long even
  122.20  {
  122.21      struct vcpu *d = current;
  122.22  
  122.23 -    fixup_guest_selector(event_selector);
  122.24 -    fixup_guest_selector(failsafe_selector);
  122.25 +    fixup_guest_code_selector(event_selector);
  122.26 +    fixup_guest_code_selector(failsafe_selector);
  122.27  
  122.28      d->arch.guest_context.event_callback_cs     = event_selector;
  122.29      d->arch.guest_context.event_callback_eip    = event_address;
  122.30 @@ -289,12 +295,51 @@ long do_set_callbacks(unsigned long even
  122.31      return 0;
  122.32  }
  122.33  
  122.34 -void hypercall_page_initialise(void *hypercall_page)
  122.35 +static void hypercall_page_initialise_ring0_kernel(void *hypercall_page)
  122.36 +{
  122.37 +    extern asmlinkage int hypercall(void);
  122.38 +    char *p;
  122.39 +    int i;
  122.40 +
  122.41 +    /* Fill in all the transfer points with template machine code. */
  122.42 +
  122.43 +    for ( i = 0; i < NR_hypercalls; i++ )
  122.44 +    {
  122.45 +        p = (char *)(hypercall_page + (i * 32));
  122.46 +
  122.47 +        *(u8  *)(p+ 0) = 0x9c;      /* pushf */
  122.48 +        *(u8  *)(p+ 1) = 0xfa;      /* cli */
  122.49 +        *(u8  *)(p+ 2) = 0xb8;      /* mov $<i>,%eax */
  122.50 +        *(u32 *)(p+ 3) = i;
  122.51 +        *(u8  *)(p+ 7) = 0x9a;      /* lcall $__HYPERVISOR_CS,&hypercall */
  122.52 +        *(u32 *)(p+ 8) = (u32)&hypercall;
  122.53 +        *(u16 *)(p+12) = (u16)__HYPERVISOR_CS;
  122.54 +        *(u8  *)(p+14) = 0xc3;      /* ret */
  122.55 +    }
  122.56 +
  122.57 +    /*
  122.58 +     * HYPERVISOR_iret is special because it doesn't return and expects a
  122.59 +     * special stack frame. Guests jump at this transfer point instead of
  122.60 +     * calling it.
  122.61 +     */
  122.62 +    p = (char *)(hypercall_page + (__HYPERVISOR_iret * 32));
  122.63 +    *(u8  *)(p+ 0) = 0x50;      /* push %eax */
  122.64 +    *(u8  *)(p+ 1) = 0x9c;      /* pushf */
  122.65 +    *(u8  *)(p+ 2) = 0xfa;      /* cli */
  122.66 +    *(u8  *)(p+ 3) = 0xb8;      /* mov $<i>,%eax */
  122.67 +    *(u32 *)(p+ 4) = __HYPERVISOR_iret;
  122.68 +    *(u8  *)(p+ 8) = 0x9a;      /* lcall $__HYPERVISOR_CS,&hypercall */
  122.69 +    *(u32 *)(p+ 9) = (u32)&hypercall;
  122.70 +    *(u16 *)(p+13) = (u16)__HYPERVISOR_CS;
  122.71 +}
  122.72 +
  122.73 +static void hypercall_page_initialise_ring1_kernel(void *hypercall_page)
  122.74  {
  122.75      char *p;
  122.76      int i;
  122.77  
  122.78      /* Fill in all the transfer points with template machine code. */
  122.79 +
  122.80      for ( i = 0; i < (PAGE_SIZE / 32); i++ )
  122.81      {
  122.82          p = (char *)(hypercall_page + (i * 32));
  122.83 @@ -316,6 +361,14 @@ void hypercall_page_initialise(void *hyp
  122.84      *(u16 *)(p+ 6) = 0x82cd;  /* int  $0x82 */
  122.85  }
  122.86  
  122.87 +void hypercall_page_initialise(void *hypercall_page)
  122.88 +{
  122.89 +    if ( supervisor_mode_kernel )
  122.90 +        hypercall_page_initialise_ring0_kernel(hypercall_page);
  122.91 +    else
  122.92 +        hypercall_page_initialise_ring1_kernel(hypercall_page);
  122.93 +}
  122.94 +
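
For reference, one 32-byte ring-0 hypercall stub written by the loop above decodes as follows (a plain disassembly of the bytes the code emits):

    /* off  bytes              insn
     *  +0   9c                pushf
     *  +1   fa                cli
     *  +2   b8 <imm32>        mov   $<i>,%eax
     *  +7   9a <off32> <cs16> lcall $__HYPERVISOR_CS,&hypercall
     * +14   c3                ret
     *
     * pushf/cli appear to stand in for what the int $0x82 gate does for
     * a ring-1 guest -- save EFLAGS and mask events -- since a same-ring
     * far call switches neither stack nor flags automatically. */
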
  122.95  /*
  122.96   * Local variables:
  122.97   * mode: C
   123.1 --- a/xen/arch/x86/x86_64/mm.c	Wed Mar 01 10:01:54 2006 -0700
   123.2 +++ b/xen/arch/x86/x86_64/mm.c	Wed Mar 01 12:47:25 2006 -0700
   123.3 @@ -22,6 +22,7 @@
   123.4  #include <xen/init.h>
   123.5  #include <xen/mm.h>
   123.6  #include <xen/sched.h>
   123.7 +#include <xen/guest_access.h>
   123.8  #include <asm/current.h>
   123.9  #include <asm/asm_defns.h>
  123.10  #include <asm/page.h>
  123.11 @@ -182,7 +183,7 @@ void subarch_init_memory(struct domain *
  123.12      }
  123.13  }
  123.14  
  123.15 -long subarch_memory_op(int op, void *arg)
  123.16 +long subarch_memory_op(int op, GUEST_HANDLE(void) arg)
  123.17  {
  123.18      struct xen_machphys_mfn_list xmml;
  123.19      l3_pgentry_t l3e;
  123.20 @@ -194,7 +195,7 @@ long subarch_memory_op(int op, void *arg
  123.21      switch ( op )
  123.22      {
  123.23      case XENMEM_machphys_mfn_list:
  123.24 -        if ( copy_from_user(&xmml, arg, sizeof(xmml)) )
  123.25 +        if ( copy_from_guest(&xmml, arg, 1) )
  123.26              return -EFAULT;
  123.27  
  123.28          for ( i = 0, v = RDWR_MPT_VIRT_START;
  123.29 @@ -209,11 +210,12 @@ long subarch_memory_op(int op, void *arg
  123.30              if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
  123.31                  break;
  123.32              mfn = l2e_get_pfn(l2e) + l1_table_offset(v);
  123.33 -            if ( put_user(mfn, &xmml.extent_start[i]) )
  123.34 +            if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
  123.35                  return -EFAULT;
  123.36          }
  123.37  
  123.38 -        if ( put_user(i, &((struct xen_machphys_mfn_list *)arg)->nr_extents) )
  123.39 +        xmml.nr_extents = i;
  123.40 +        if ( copy_to_guest(arg, &xmml, 1) )
  123.41              return -EFAULT;
  123.42  
  123.43          break;
  123.44 @@ -228,7 +230,7 @@ long subarch_memory_op(int op, void *arg
  123.45  
  123.46  long do_stack_switch(unsigned long ss, unsigned long esp)
  123.47  {
  123.48 -    fixup_guest_selector(ss);
  123.49 +    fixup_guest_stack_selector(ss);
  123.50      current->arch.guest_context.kernel_ss = ss;
  123.51      current->arch.guest_context.kernel_sp = esp;
  123.52      return 0;
  123.53 @@ -315,7 +317,7 @@ int check_descriptor(struct desc_struct 
  123.54  
  123.55      /* Validate and fix up the target code selector. */
  123.56      cs = a >> 16;
  123.57 -    fixup_guest_selector(cs);
  123.58 +    fixup_guest_code_selector(cs);
  123.59      if ( !guest_gate_selector_okay(cs) )
  123.60          goto bad;
  123.61      a = d->a = (d->a & 0xffffU) | (cs << 16);
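
The conversions above (copy_from_user -> copy_from_guest, put_user -> copy_to_guest_offset) replace raw guest pointers with typed guest handles, so common code stops assuming guest memory is directly addressable. A simplified model of the accessors -- the real, per-architecture definitions live in xen/include/xen/guest_access.h and the asm headers:

/*
 * Simplified sketch of a guest handle: a typed wrapper around a guest
 * virtual address. Offsets are counted in elements, not bytes, which
 * is what copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) relies on.
 */
#define __DEFINE_GUEST_HANDLE(name, type) \
    typedef struct { type *p; } __guest_handle_##name
#define GUEST_HANDLE(name) __guest_handle_##name

__DEFINE_GUEST_HANDLE(xen_ulong, unsigned long);

#define copy_from_guest(ptr, hnd, nr) \
    copy_from_user((ptr), (hnd).p, (nr) * sizeof(*(ptr)))
#define copy_to_guest_offset(hnd, off, ptr, nr) \
    copy_to_user((hnd).p + (off), (ptr), (nr) * sizeof(*(ptr)))
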
   124.1 --- a/xen/common/dom0_ops.c	Wed Mar 01 10:01:54 2006 -0700
   124.2 +++ b/xen/common/dom0_ops.c	Wed Mar 01 12:47:25 2006 -0700
   124.3 @@ -46,6 +46,7 @@ static void getdomaininfo(struct domain 
   124.4      struct vcpu   *v;
   124.5      u64 cpu_time = 0;
   124.6      int flags = DOMFLAGS_BLOCKED;
   124.7 +    struct vcpu_runstate_info runstate;
   124.8      
   124.9      info->domain = d->domain_id;
  124.10      info->nr_online_vcpus = 0;
  124.11 @@ -55,7 +56,8 @@ static void getdomaininfo(struct domain 
  124.12       * - domain is marked as running if any of its vcpus is running
  124.13       */
  124.14      for_each_vcpu ( d, v ) {
  124.15 -        cpu_time += v->cpu_time;
  124.16 +        vcpu_runstate_get(v, &runstate);
  124.17 +        cpu_time += runstate.time[RUNSTATE_running];
  124.18          info->max_vcpu_id = v->vcpu_id;
  124.19          if ( !test_bit(_VCPUF_down, &v->vcpu_flags) )
  124.20          {
  124.21 @@ -165,8 +167,16 @@ long do_dom0_op(struct dom0_op *u_dom0_o
  124.22          domid_t        dom;
  124.23          struct vcpu   *v;
  124.24          unsigned int   i, cnt[NR_CPUS] = { 0 };
  124.25 +        cpumask_t      cpu_exclude_map;
  124.26          static domid_t rover = 0;
  124.27  
  124.28 +        /*
  124.29 +         * Running the domain 0 kernel in ring 0 is not compatible
  124.30 +         * with multiple guests.
  124.31 +         */
  124.32 +        if ( supervisor_mode_kernel )
  124.33 +            return -EINVAL;
  124.34 +
  124.35          dom = op->u.createdomain.domain;
  124.36          if ( (dom > 0) && (dom < DOMID_FIRST_RESERVED) )
  124.37          {
  124.38 @@ -195,18 +205,29 @@ long do_dom0_op(struct dom0_op *u_dom0_o
  124.39          read_lock(&domlist_lock);
  124.40          for_each_domain ( d )
  124.41              for_each_vcpu ( d, v )
  124.42 -                cnt[v->processor]++;
  124.43 +                if ( !test_bit(_VCPUF_down, &v->vcpu_flags) )
  124.44 +                    cnt[v->processor]++;
  124.45          read_unlock(&domlist_lock);
  124.46          
  124.47          /*
  124.48 -         * If we're on a HT system, we only use the first HT for dom0, other 
  124.49 -         * domains will all share the second HT of each CPU. Since dom0 is on 
  124.50 -         * CPU 0, we favour high numbered CPUs in the event of a tie.
  124.51 +         * If we're on an HT system, we only auto-allocate to a non-primary HT.
  124.52 +         * We favour high numbered CPUs in the event of a tie.
  124.53           */
  124.54 -        pro = smp_num_siblings - 1;
  124.55 -        for ( i = pro; i < num_online_cpus(); i += smp_num_siblings )
  124.56 +        pro = first_cpu(cpu_sibling_map[0]);
  124.57 +        if ( cpus_weight(cpu_sibling_map[0]) > 1 )
  124.58 +            pro = next_cpu(pro, cpu_sibling_map[0]);
  124.59 +        cpu_exclude_map = cpu_sibling_map[0];
  124.60 +        for_each_online_cpu ( i )
  124.61 +        {
  124.62 +            if ( cpu_isset(i, cpu_exclude_map) )
  124.63 +                continue;
  124.64 +            if ( (i == first_cpu(cpu_sibling_map[i])) &&
  124.65 +                 (cpus_weight(cpu_sibling_map[i]) > 1) )
  124.66 +                continue;
  124.67 +            cpus_or(cpu_exclude_map, cpu_exclude_map, cpu_sibling_map[i]);
  124.68              if ( cnt[i] <= cnt[pro] )
  124.69                  pro = i;
  124.70 +        }
  124.71  
  124.72          ret = -ENOMEM;
  124.73          if ( (d = domain_create(dom, pro)) == NULL )
  124.74 @@ -485,6 +506,7 @@ long do_dom0_op(struct dom0_op *u_dom0_o
  124.75      { 
  124.76          struct domain *d;
  124.77          struct vcpu   *v;
  124.78 +        struct vcpu_runstate_info runstate;
  124.79  
  124.80          ret = -ESRCH;
  124.81          if ( (d = find_domain_by_id(op->u.getvcpuinfo.domain)) == NULL )
  124.82 @@ -498,10 +520,12 @@ long do_dom0_op(struct dom0_op *u_dom0_o
  124.83          if ( (v = d->vcpu[op->u.getvcpuinfo.vcpu]) == NULL )
  124.84              goto getvcpuinfo_out;
  124.85  
  124.86 +        vcpu_runstate_get(v, &runstate);
  124.87 +
  124.88          op->u.getvcpuinfo.online   = !test_bit(_VCPUF_down, &v->vcpu_flags);
  124.89          op->u.getvcpuinfo.blocked  = test_bit(_VCPUF_blocked, &v->vcpu_flags);
  124.90          op->u.getvcpuinfo.running  = test_bit(_VCPUF_running, &v->vcpu_flags);
  124.91 -        op->u.getvcpuinfo.cpu_time = v->cpu_time;
  124.92 +        op->u.getvcpuinfo.cpu_time = runstate.time[RUNSTATE_running];
  124.93          op->u.getvcpuinfo.cpu      = v->processor;
  124.94          op->u.getvcpuinfo.cpumap   = 0;
  124.95          memcpy(&op->u.getvcpuinfo.cpumap,
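
Two behavioural notes on the hunks above: reported cpu_time is now derived from the runstate accounting (time accrued in RUNSTATE_running) rather than a dedicated counter, and the placement loop skips CPU 0's whole core plus the primary thread of every other hyperthreaded core before picking the least-loaded remaining CPU. A standalone sketch of that selection policy, with sibling maps as plain bitmasks instead of the cpumask API (illustrative only):

/*
 * pick_cpu(): mirror of the loop above. sibling[i] is a bitmask of the
 * hyperthread siblings of CPU i (including i itself); cnt[i] counts
 * resident, non-down vcpus. Ties go to the higher-numbered CPU because
 * the comparison is <=.
 */
static int pick_cpu(int ncpus, const unsigned int *sibling, const int *cnt)
{
    unsigned int exclude = sibling[0];      /* dom0's core is off-limits */
    int i, pro = __builtin_ctz(sibling[0]);

    if (__builtin_popcount(sibling[0]) > 1) /* prefer its second thread */
        pro = __builtin_ctz(sibling[0] & (sibling[0] - 1));

    for (i = 0; i < ncpus; i++) {
        if (exclude & (1u << i))
            continue;
        if (i == __builtin_ctz(sibling[i]) &&
            __builtin_popcount(sibling[i]) > 1)
            continue;                       /* skip primary HT threads */
        exclude |= sibling[i];
        if (cnt[i] <= cnt[pro])
            pro = i;
    }
    return pro;
}
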
   125.1 --- a/xen/common/domain.c	Wed Mar 01 10:01:54 2006 -0700
   125.2 +++ b/xen/common/domain.c	Wed Mar 01 12:47:25 2006 -0700
   125.3 @@ -451,6 +451,41 @@ long do_vcpu_op(int cmd, int vcpuid, voi
   125.4      case VCPUOP_is_up:
   125.5          rc = !test_bit(_VCPUF_down, &v->vcpu_flags);
   125.6          break;
   125.7 +
   125.8 +    case VCPUOP_get_runstate_info:
   125.9 +    {
  125.10 +        struct vcpu_runstate_info runstate;
  125.11 +        vcpu_runstate_get(v, &runstate);
  125.12 +        if ( copy_to_user(arg, &runstate, sizeof(runstate)) )
  125.13 +            rc = -EFAULT;
  125.14 +        break;
  125.15 +    }
  125.16 +
  125.17 +    case VCPUOP_register_runstate_memory_area:
  125.18 +    {
  125.19 +        struct vcpu_register_runstate_memory_area area;
  125.20 +
  125.21 +        rc = -EINVAL;
  125.22 +        if ( v != current )
  125.23 +            break;
  125.24 +
  125.25 +        rc = -EFAULT;
  125.26 +        if ( copy_from_user(&area, arg, sizeof(area)) )
  125.27 +            break;
  125.28 +
  125.29 +        if ( !access_ok(area.addr.v, sizeof(*area.addr.v)) )
  125.30 +            break;
  125.31 +
  125.32 +        rc = 0;
  125.33 +        v->runstate_guest = area.addr.v;
  125.34 +        __copy_to_user(v->runstate_guest, &v->runstate, sizeof(v->runstate));
  125.35 +
  125.36 +        break;
  125.37 +    }
  125.38 +
  125.39 +    default:
  125.40 +        rc = -ENOSYS;
  125.41 +        break;
  125.42      }
  125.43  
  125.44      return rc;
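
The two new VCPUOPs above give guests runstate visibility: a one-shot snapshot, or a registered per-vcpu area that Xen refreshes on every context switch. A hedged sketch of the guest side, assuming the public/vcpu.h structures and a HYPERVISOR_vcpu_op hypercall wrapper:

/*
 * Register a runstate area for the calling vcpu. Registration must be
 * issued by the vcpu itself -- the handler above rejects v != current
 * with -EINVAL. Xen then keeps the area current across context switches.
 */
static struct vcpu_runstate_info my_runstate;

static int register_runstate_area(int vcpu)
{
    struct vcpu_register_runstate_memory_area area;

    area.addr.v = &my_runstate;
    return HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
                              vcpu, &area);
}
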
   126.1 --- a/xen/common/kernel.c	Wed Mar 01 10:01:54 2006 -0700
   126.2 +++ b/xen/common/kernel.c	Wed Mar 01 12:47:25 2006 -0700
   126.3 @@ -195,6 +195,8 @@ long do_xen_version(int cmd, void *arg)
   126.4                      (1U << XENFEAT_writable_page_tables) |
   126.5                      (1U << XENFEAT_auto_translated_physmap) |
   126.6                      (1U << XENFEAT_pae_pgdir_above_4gb);
   126.7 +            if ( supervisor_mode_kernel )
   126.8 +                fi.submap |= 1U << XENFEAT_supervisor_mode_kernel;
   126.9              break;
  126.10          default:
  126.11              return -EINVAL;
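
With the feature bit above exported, a guest can probe at boot whether it was launched with ring-0 support. Sketch, assuming the public/version.h and public/features.h names and a HYPERVISOR_xen_version wrapper:

/* Returns nonzero if this guest's kernel runs in ring 0 under Xen. */
static int running_ring0_kernel(void)
{
    struct xen_feature_info fi = { .submap_idx = 0 };

    if (HYPERVISOR_xen_version(XENVER_get_features, &fi) != 0)
        return 0;
    return !!(fi.submap & (1U << XENFEAT_supervisor_mode_kernel));
}
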
   127.1 --- a/xen/common/keyhandler.c	Wed Mar 01 10:01:54 2006 -0700
   127.2 +++ b/xen/common/keyhandler.c	Wed Mar 01 12:47:25 2006 -0700
   127.3 @@ -169,8 +169,6 @@ static void dump_domains(unsigned char k
   127.4  }
   127.5  
   127.6  extern void dump_runq(unsigned char key);
   127.7 -extern void print_sched_histo(unsigned char key);
   127.8 -extern void reset_sched_histo(unsigned char key);
   127.9  #ifndef NDEBUG
  127.10  extern void audit_domains_key(unsigned char key);
  127.11  #endif
  127.12 @@ -207,10 +205,6 @@ void initialize_keytable(void)
  127.13      register_keyhandler(
  127.14          'h', show_handlers, "show this message");
  127.15      register_keyhandler(
  127.16 -        'l', print_sched_histo, "print sched latency histogram");
  127.17 -    register_keyhandler(
  127.18 -        'L', reset_sched_histo, "reset sched latency histogram");
  127.19 -    register_keyhandler(
  127.20          'q', dump_domains, "dump domain (and guest debug) info");
  127.21      register_keyhandler(
  127.22          'r', dump_runq,      "dump run queues");
   128.1 --- a/xen/common/memory.c	Wed Mar 01 10:01:54 2006 -0700
   128.2 +++ b/xen/common/memory.c	Wed Mar 01 12:47:25 2006 -0700
   128.3 @@ -16,6 +16,7 @@
   128.4  #include <xen/event.h>
   128.5  #include <xen/shadow.h>
   128.6  #include <xen/iocap.h>
   128.7 +#include <xen/guest_access.h>
   128.8  #include <asm/current.h>
   128.9  #include <asm/hardirq.h>
  128.10  #include <public/memory.h>
  128.11 @@ -30,7 +31,7 @@
  128.12  static long
  128.13  increase_reservation(
  128.14      struct domain *d, 
  128.15 -    unsigned long *extent_list, 
  128.16 +    GUEST_HANDLE(xen_ulong) extent_list,
  128.17      unsigned int   nr_extents,
  128.18      unsigned int   extent_order,
  128.19      unsigned int   flags,
  128.20 @@ -39,8 +40,8 @@ increase_reservation(
  128.21      struct page_info *page;
  128.22      unsigned long     i, mfn;
  128.23  
  128.24 -    if ( (extent_list != NULL) &&
  128.25 -         !array_access_ok(extent_list, nr_extents, sizeof(*extent_list)) )
  128.26 +    if ( !guest_handle_is_null(extent_list) &&
  128.27 +         !guest_handle_okay(extent_list, nr_extents) )
  128.28          return 0;
  128.29  
  128.30      if ( (extent_order != 0) &&
  128.31 @@ -65,10 +66,10 @@ increase_reservation(
  128.32          }
  128.33  
  128.34          /* Inform the domain of the new page's machine address. */ 
  128.35 -        if ( extent_list != NULL )
  128.36 +        if ( !guest_handle_is_null(extent_list) )
  128.37          {
  128.38              mfn = page_to_mfn(page);
  128.39 -            if ( unlikely(__copy_to_user(&extent_list[i], &mfn, sizeof(mfn))) )
  128.40 +            if ( unlikely(__copy_to_guest_offset(extent_list, i, &mfn, 1)) )
  128.41                  return i;
  128.42          }
  128.43      }
  128.44 @@ -79,16 +80,16 @@ increase_reservation(
  128.45  static long
  128.46  populate_physmap(
  128.47      struct domain *d, 
  128.48 -    unsigned long *extent_list, 
  128.49 -    unsigned int   nr_extents,
  128.50 -    unsigned int   extent_order,
  128.51 -    unsigned int   flags,
  128.52 -    int           *preempted)
  128.53 +    GUEST_HANDLE(xen_ulong) extent_list,
  128.54 +    unsigned int  nr_extents,
  128.55 +    unsigned int  extent_order,
  128.56 +    unsigned int  flags,
  128.57 +    int          *preempted)
  128.58  {
  128.59      struct page_info *page;
  128.60      unsigned long    i, j, gpfn, mfn;
  128.61  
  128.62 -    if ( !array_access_ok(extent_list, nr_extents, sizeof(*extent_list)) )
  128.63 +    if ( !guest_handle_okay(extent_list, nr_extents) )
  128.64          return 0;
  128.65  
  128.66      if ( (extent_order != 0) &&
  128.67 @@ -103,7 +104,7 @@ populate_physmap(
  128.68              goto out;
  128.69          }
  128.70  
  128.71 -        if ( unlikely(__copy_from_user(&gpfn, &extent_list[i], sizeof(gpfn))) )
  128.72 +        if ( unlikely(__copy_from_guest_offset(&gpfn, extent_list, i, 1)) )
  128.73              goto out;
  128.74  
  128.75          if ( unlikely((page = alloc_domheap_pages(
  128.76 @@ -128,7 +129,7 @@ populate_physmap(
  128.77                  set_gpfn_from_mfn(mfn + j, gpfn + j);
  128.78  
  128.79              /* Inform the domain of the new page's machine address. */ 
  128.80 -            if ( unlikely(__copy_to_user(&extent_list[i], &mfn, sizeof(mfn))) )
  128.81 +            if ( unlikely(__copy_to_guest_offset(extent_list, i, &mfn, 1)) )
  128.82                  goto out;
  128.83          }
  128.84      }
  128.85 @@ -139,8 +140,8 @@ populate_physmap(
  128.86      
  128.87  static long
  128.88  decrease_reservation(
  128.89 -    struct domain *d, 
  128.90 -    unsigned long *extent_list, 
  128.91 +    struct domain *d,
  128.92 +    GUEST_HANDLE(xen_ulong) extent_list,
  128.93      unsigned int   nr_extents,
  128.94      unsigned int   extent_order,
  128.95      unsigned int   flags,
  128.96 @@ -149,7 +150,7 @@ decrease_reservation(
  128.97      struct page_info *page;
  128.98      unsigned long    i, j, gmfn, mfn;
  128.99  
 128.100 -    if ( !array_access_ok(extent_list, nr_extents, sizeof(*extent_list)) )
 128.101 +    if ( !guest_handle_okay(extent_list, nr_extents) )
 128.102          return 0;
 128.103  
 128.104      for ( i = 0; i < nr_extents; i++ )
 128.105 @@ -160,7 +161,7 @@ decrease_reservation(
 128.106              return i;
 128.107          }
 128.108  
 128.109 -        if ( unlikely(__copy_from_user(&gmfn, &extent_list[i], sizeof(gmfn))) )
 128.110 +        if ( unlikely(__copy_from_guest_offset(&gmfn, extent_list, i, 1)) )
 128.111              return i;
 128.112  
 128.113          for ( j = 0; j < (1 << extent_order); j++ )
 128.114 @@ -197,21 +198,21 @@ decrease_reservation(
 128.115  
 128.116  static long
 128.117  translate_gpfn_list(
 128.118 -    struct xen_translate_gpfn_list *uop, unsigned long *progress)
 128.119 +    GUEST_HANDLE(xen_translate_gpfn_list_t) uop, unsigned long *progress)
 128.120  {
 128.121      struct xen_translate_gpfn_list op;
 128.122      unsigned long i, gpfn, mfn;
 128.123      struct domain *d;
 128.124  
 128.125 -    if ( copy_from_user(&op, uop, sizeof(op)) )
 128.126 +    if ( copy_from_guest(&op, uop, 1) )
 128.127          return -EFAULT;
 128.128  
 128.129      /* Is size too large for us to encode a continuation? */
 128.130      if ( op.nr_gpfns > (ULONG_MAX >> START_EXTENT_SHIFT) )
 128.131          return -EINVAL;
 128.132  
 128.133 -    if ( !array_access_ok(op.gpfn_list, op.nr_gpfns, sizeof(*op.gpfn_list)) ||
 128.134 -         !array_access_ok(op.mfn_list, op.nr_gpfns, sizeof(*op.mfn_list)) )
 128.135 +    if ( !guest_handle_okay(op.gpfn_list, op.nr_gpfns) ||
 128.136 +         !guest_handle_okay(op.mfn_list,  op.nr_gpfns) )
 128.137          return -EFAULT;
 128.138  
 128.139      if ( op.domid == DOMID_SELF )
 128.140 @@ -237,8 +238,7 @@ translate_gpfn_list(
 128.141              return -EAGAIN;
 128.142          }
 128.143  
 128.144 -        if ( unlikely(__copy_from_user(&gpfn, &op.gpfn_list[i],
 128.145 -                                       sizeof(gpfn))) )
 128.146 +        if ( unlikely(__copy_from_guest_offset(&gpfn, op.gpfn_list, i, 1)) )
 128.147          {
 128.148              put_domain(d);
 128.149              return -EFAULT;
 128.150 @@ -246,8 +246,7 @@ translate_gpfn_list(
 128.151  
 128.152          mfn = gmfn_to_mfn(d, gpfn);
 128.153  
 128.154 -        if ( unlikely(__copy_to_user(&op.mfn_list[i], &mfn,
 128.155 -                                     sizeof(mfn))) )
 128.156 +        if ( unlikely(__copy_to_guest_offset(op.mfn_list, i, &mfn, 1)) )
 128.157          {
 128.158              put_domain(d);
 128.159              return -EFAULT;
 128.160 @@ -258,7 +257,7 @@ translate_gpfn_list(
 128.161      return 0;
 128.162  }
 128.163  
 128.164 -long do_memory_op(unsigned long cmd, void *arg)
 128.165 +long do_memory_op(unsigned long cmd, GUEST_HANDLE(void) arg)
 128.166  {
 128.167      struct domain *d;
 128.168      int rc, op, flags = 0, preempted = 0;
 128.169 @@ -273,7 +272,7 @@ long do_memory_op(unsigned long cmd, voi
 128.170      case XENMEM_increase_reservation:
 128.171      case XENMEM_decrease_reservation:
 128.172      case XENMEM_populate_physmap:
 128.173 -        if ( copy_from_user(&reservation, arg, sizeof(reservation)) )
 128.174 +        if ( copy_from_guest(&reservation, arg, 1) )
 128.175              return -EFAULT;
 128.176  
 128.177          /* Is size too large for us to encode a continuation? */
 128.178 @@ -283,9 +282,9 @@ long do_memory_op(unsigned long cmd, voi
 128.179          start_extent = cmd >> START_EXTENT_SHIFT;
 128.180          if ( unlikely(start_extent > reservation.nr_extents) )
 128.181              return -EINVAL;
 128.182 -        
 128.183 -        if ( reservation.extent_start != NULL )
 128.184 -            reservation.extent_start += start_extent;
 128.185 +
 128.186 +        if ( !guest_handle_is_null(reservation.extent_start) )
 128.187 +            guest_handle_add_offset(reservation.extent_start, start_extent);
 128.188          reservation.nr_extents -= start_extent;
 128.189  
 128.190          if ( (reservation.address_bits != 0) &&
 128.191 @@ -342,8 +341,9 @@ long do_memory_op(unsigned long cmd, voi
 128.192          rc += start_extent;
 128.193  
 128.194          if ( preempted )
 128.195 -            return hypercall2_create_continuation(
 128.196 -                __HYPERVISOR_memory_op, op | (rc << START_EXTENT_SHIFT), arg);
 128.197 +            return hypercall_create_continuation(
 128.198 +                __HYPERVISOR_memory_op, "lh",
 128.199 +                op | (rc << START_EXTENT_SHIFT), arg);
 128.200  
 128.201          break;
 128.202  
 128.203 @@ -353,10 +353,10 @@ long do_memory_op(unsigned long cmd, voi
 128.204  
 128.205      case XENMEM_current_reservation:
 128.206      case XENMEM_maximum_reservation:
 128.207 -        if ( copy_from_user(&domid, (domid_t *)arg, sizeof(domid)) )
 128.208 +        if ( copy_from_guest(&domid, arg, 1) )
 128.209              return -EFAULT;
 128.210  
 128.211 -        if ( likely((domid = (unsigned long)arg) == DOMID_SELF) )
 128.212 +        if ( likely(domid == DOMID_SELF) )
 128.213              d = current->domain;
 128.214          else if ( !IS_PRIV(current->domain) )
 128.215              return -EPERM;
 128.216 @@ -372,12 +372,13 @@ long do_memory_op(unsigned long cmd, voi
 128.217  
 128.218      case XENMEM_translate_gpfn_list:
 128.219          progress = cmd >> START_EXTENT_SHIFT;
 128.220 -        rc = translate_gpfn_list(arg, &progress);
 128.221 +        rc = translate_gpfn_list(
 128.222 +            guest_handle_cast(arg, xen_translate_gpfn_list_t),
 128.223 +            &progress);
 128.224          if ( rc == -EAGAIN )
 128.225 -            return hypercall2_create_continuation(
 128.226 -                __HYPERVISOR_memory_op,
 128.227 -                op | (progress << START_EXTENT_SHIFT),
 128.228 -                arg);
 128.229 +            return hypercall_create_continuation(
 128.230 +                __HYPERVISOR_memory_op, "lh",
 128.231 +                op | (progress << START_EXTENT_SHIFT), arg);
 128.232          break;
 128.233  
 128.234      default:
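
The switch from hypercall2_create_continuation to hypercall_create_continuation carries a format string describing each continuation argument; the call sites in this changeset use 'i' (int), 'l' (long), 'p' (kernel pointer) and 'h' (guest handle). Progress itself is packed into the cmd word. Sketch of that encoding -- the real shift constant lives in the public memory interface, so the value here is only illustrative:

#define START_EXTENT_SHIFT 4    /* assumed for illustration */

/* Pack the sub-op and the number of extents already done into cmd... */
static unsigned long continue_cmd(unsigned long op, unsigned long done)
{
    return op | (done << START_EXTENT_SHIFT);
}

/* ...and unpack on re-entry, as do_memory_op() does at its top:
 *   op           = cmd & ((1UL << START_EXTENT_SHIFT) - 1);
 *   start_extent = cmd >> START_EXTENT_SHIFT;
 */
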
   129.1 --- a/xen/common/multicall.c	Wed Mar 01 10:01:54 2006 -0700
   129.2 +++ b/xen/common/multicall.c	Wed Mar 01 12:47:25 2006 -0700
   129.3 @@ -81,8 +81,8 @@ long do_multicall(struct multicall_entry
   129.4              if ( i < nr_calls )
   129.5              {
   129.6                  mcs->flags = 0;
   129.7 -                return hypercall2_create_continuation(
   129.8 -                    __HYPERVISOR_multicall, &call_list[i], nr_calls-i);
   129.9 +                return hypercall_create_continuation(
  129.10 +                    __HYPERVISOR_multicall, "pi", &call_list[i], nr_calls-i);
  129.11              }
  129.12          }
  129.13      }
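
The same restart idiom in miniature: run batch entries until preemption, then have the hypervisor re-invoke the hypercall with the unfinished tail. All names below are illustrative stand-ins, not Xen APIs:

struct entry { int op; };

extern void do_one(struct entry *e);
extern int  preempt_pending(void);
extern long resume_with(struct entry *tail, unsigned int left);

static long process_batch(struct entry *list, unsigned int n)
{
    unsigned int i;

    for (i = 0; i < n; i++) {
        do_one(&list[i]);
        if (preempt_pending() && (i + 1) < n)
            return resume_with(&list[i + 1], n - (i + 1));
    }
    return 0;   /* whole batch completed */
}
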
   130.1 --- a/xen/common/page_alloc.c	Wed Mar 01 10:01:54 2006 -0700
   130.2 +++ b/xen/common/page_alloc.c	Wed Mar 01 12:47:25 2006 -0700
   130.3 @@ -32,6 +32,7 @@
   130.4  #include <xen/softirq.h>
   130.5  #include <xen/shadow.h>
   130.6  #include <xen/domain_page.h>
   130.7 +#include <xen/keyhandler.h>
   130.8  #include <asm/page.h>
   130.9  
  130.10  /*
  130.11 @@ -662,6 +663,26 @@ unsigned long avail_domheap_pages(void)
  130.12  }
  130.13  
  130.14  
  130.15 +static void pagealloc_keyhandler(unsigned char key)
  130.16 +{
  130.17 +    printk("Physical memory information:\n");
  130.18 +    printk("    Xen heap: %lukB free\n"
  130.19 +           "    DMA heap: %lukB free\n"
  130.20 +           "    Dom heap: %lukB free\n",
  130.21 +           avail[MEMZONE_XEN]<<(PAGE_SHIFT-10),
  130.22 +           avail[MEMZONE_DMADOM]<<(PAGE_SHIFT-10),
  130.23 +           avail[MEMZONE_DOM]<<(PAGE_SHIFT-10));
  130.24 +}
  130.25 +
  130.26 +
  130.27 +static __init int pagealloc_keyhandler_init(void)
  130.28 +{
  130.29 +    register_keyhandler('m', pagealloc_keyhandler, "memory info");
  130.30 +    return 0;
  130.31 +}
  130.32 +__initcall(pagealloc_keyhandler_init);
  130.33 +
  130.34 +
  130.35  
  130.36  /*************************
  130.37   * PAGE SCRUBBING
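
For reference, the debug-key API used by the new 'm' handler, in its smallest form; the key 'z' and the names below are hypothetical:

/* Any subsystem can hook a console debug key at boot via an __initcall. */
#include <xen/keyhandler.h>
#include <xen/init.h>
#include <xen/lib.h>

static void demo_keyhandler(unsigned char key)
{
    printk("demo: key '%c' pressed\n", key);
}

static __init int demo_keyhandler_init(void)
{
    register_keyhandler('z', demo_keyhandler, "demo handler");
    return 0;
}
__initcall(demo_keyhandler_init);
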
   131.1 --- a/xen/common/sched_bvt.c	Wed Mar 01 10:01:54 2006 -0700
   131.2 +++ b/xen/common/sched_bvt.c	Wed Mar 01 12:47:25 2006 -0700
   131.3 @@ -132,13 +132,13 @@ static void unwarp_timer_fn(void *data)
   131.4      vcpu_schedule_unlock_irq(v);
   131.5  }
   131.6  
   131.7 -static inline u32 calc_avt(struct vcpu *d, s_time_t now)
   131.8 +static inline u32 calc_avt(struct vcpu *v, s_time_t now)
   131.9  {
  131.10      u32 ranfor, mcus;
  131.11 -    struct bvt_dom_info *inf = BVT_INFO(d->domain);
  131.12 -    struct bvt_vcpu_info *einf = EBVT_INFO(d);
  131.13 +    struct bvt_dom_info *inf = BVT_INFO(v->domain);
  131.14 +    struct bvt_vcpu_info *einf = EBVT_INFO(v);
  131.15      
  131.16 -    ranfor = (u32)(now - d->lastschd);
  131.17 +    ranfor = (u32)(now - v->runstate.state_entry_time);
  131.18      mcus = (ranfor + MCU - 1)/MCU;
  131.19  
  131.20      return einf->avt + mcus * inf->mcu_advance;
  131.21 @@ -262,7 +262,7 @@ static void bvt_wake(struct vcpu *v)
  131.22      curr_evt = calc_evt(curr, calc_avt(curr, now));
  131.23      /* Calculate the time the current domain would run assuming
  131.24         the second smallest evt is of the newly woken domain */
  131.25 -    r_time = curr->lastschd +
  131.26 +    r_time = curr->runstate.state_entry_time +
  131.27          ((einf->evt - curr_evt) / BVT_INFO(curr->domain)->mcu_advance) +
  131.28          ctx_allow;
  131.29  
  131.30 @@ -558,7 +558,6 @@ static void bvt_dump_cpu_state(int i)
  131.31          printk("%3d: %u has=%c ", loop++, v->domain->domain_id,
  131.32                 test_bit(_VCPUF_running, &v->vcpu_flags) ? 'T':'F');
  131.33          bvt_dump_runq_el(v);
  131.34 -        printk("c=0x%X%08X\n", (u32)(v->cpu_time>>32), (u32)v->cpu_time);
  131.35          printk("         l: %p n: %p  p: %p\n",
  131.36                 &vcpu_inf->run_list, vcpu_inf->run_list.next,
  131.37                 vcpu_inf->run_list.prev);
   132.1 --- a/xen/common/sched_sedf.c	Wed Mar 01 10:01:54 2006 -0700
   132.2 +++ b/xen/common/sched_sedf.c	Wed Mar 01 12:47:25 2006 -0700
   132.3 @@ -1408,18 +1408,14 @@ static void sedf_dump_domain(struct vcpu
   132.4  {
   132.5      printk("%i.%i has=%c ", d->domain->domain_id, d->vcpu_id,
   132.6             test_bit(_VCPUF_running, &d->vcpu_flags) ? 'T':'F');
   132.7 -    printk("p=%"PRIu64" sl=%"PRIu64" ddl=%"PRIu64" w=%hu c=%"PRIu64
   132.8 +    printk("p=%"PRIu64" sl=%"PRIu64" ddl=%"PRIu64" w=%hu"
   132.9             " sc=%i xtr(%s)=%"PRIu64" ew=%hu",
  132.10             EDOM_INFO(d)->period, EDOM_INFO(d)->slice, EDOM_INFO(d)->deadl_abs,
  132.11 -           EDOM_INFO(d)->weight, d->cpu_time,
  132.12 +           EDOM_INFO(d)->weight,
  132.13             EDOM_INFO(d)->score[EXTRA_UTIL_Q],
  132.14             (EDOM_INFO(d)->status & EXTRA_AWARE) ? "yes" : "no",
  132.15             EDOM_INFO(d)->extra_time_tot, EDOM_INFO(d)->extraweight);
  132.16      
  132.17 -    if ( d->cpu_time != 0 )
  132.18 -        printf(" (%"PRIu64"%%)", (EDOM_INFO(d)->extra_time_tot * 100)
  132.19 -               / d->cpu_time);
  132.20 -
  132.21  #ifdef SEDF_STATS
  132.22      if ( EDOM_INFO(d)->block_time_tot != 0 )
  132.23          printf(" pen=%"PRIu64"%%", (EDOM_INFO(d)->penalty_time_tot * 100) /
   133.1 --- a/xen/common/schedule.c	Wed Mar 01 10:01:54 2006 -0700
   133.2 +++ b/xen/common/schedule.c	Wed Mar 01 12:47:25 2006 -0700
   133.3 @@ -36,14 +36,6 @@ extern void arch_getdomaininfo_ctxt(stru
   133.4  static char opt_sched[10] = "sedf";
   133.5  string_param("sched", opt_sched);
   133.6  
   133.7 -/*#define WAKE_HISTO*/
   133.8 -/*#define BLOCKTIME_HISTO*/
   133.9 -#if defined(WAKE_HISTO)
  133.10 -#define BUCKETS 31
  133.11 -#elif defined(BLOCKTIME_HISTO)
  133.12 -#define BUCKETS 200
  133.13 -#endif
  133.14 -
  133.15  #define TIME_SLOP      (s32)MICROSECS(50)     /* allow time to slip a bit */
  133.16  
  133.17  /* Various timer handlers. */
  133.18 @@ -73,6 +65,36 @@ static struct scheduler ops;
  133.19  /* Per-CPU periodic timer sends an event to the currently-executing domain. */
  133.20  static struct timer t_timer[NR_CPUS]; 
  133.21  
  133.22 +static inline void vcpu_runstate_change(
  133.23 +    struct vcpu *v, int new_state, s_time_t new_entry_time)
  133.24 +{
  133.25 +    ASSERT(v->runstate.state != new_state);
  133.26 +    ASSERT(spin_is_locked(&schedule_data[v->processor].schedule_lock));
  133.27 +
  133.28 +    v->runstate.time[v->runstate.state] +=
  133.29 +        new_entry_time - v->runstate.state_entry_time;
  133.30 +    v->runstate.state_entry_time = new_entry_time;
  133.31 +    v->runstate.state = new_state;
  133.32 +}
  133.33 +
  133.34 +void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
  133.35 +{
  133.36 +    if ( likely(v == current) )
  133.37 +    {
  133.38 +        /* Fast lock-free path. */
  133.39 +        memcpy(runstate, &v->runstate, sizeof(*runstate));
  133.40 +        ASSERT(runstate->state == RUNSTATE_running);
  133.41 +        runstate->time[RUNSTATE_running] += NOW() - runstate->state_entry_time;
  133.42 +    }
  133.43 +    else
  133.44 +    {
  133.45 +        vcpu_schedule_lock_irq(v);
  133.46 +        memcpy(runstate, &v->runstate, sizeof(*runstate));
  133.47 +        runstate->time[runstate->state] += NOW() - runstate->state_entry_time;
  133.48 +        vcpu_schedule_unlock_irq(v);
  133.49 +    }
  133.50 +}
  133.51 +
  133.52  struct domain *alloc_domain(void)
  133.53  {
  133.54      struct domain *d;
  133.55 @@ -119,6 +141,9 @@ struct vcpu *alloc_vcpu(
  133.56      v->cpu_affinity = is_idle_domain(d) ?
  133.57          cpumask_of_cpu(cpu_id) : CPU_MASK_ALL;
  133.58  
  133.59 +    v->runstate.state = is_idle_vcpu(v) ? RUNSTATE_running : RUNSTATE_offline;
  133.60 +    v->runstate.state_entry_time = NOW();
  133.61 +
  133.62      if ( (vcpu_id != 0) && !is_idle_domain(d) )
  133.63          set_bit(_VCPUF_down, &v->vcpu_flags);
  133.64  
  133.65 @@ -165,8 +190,15 @@ void vcpu_sleep_nosync(struct vcpu *v)
  133.66      unsigned long flags;
  133.67  
  133.68      vcpu_schedule_lock_irqsave(v, flags);
  133.69 +
  133.70      if ( likely(!vcpu_runnable(v)) )
  133.71 +    {
  133.72 +        if ( v->runstate.state == RUNSTATE_runnable )
  133.73 +            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
  133.74 +
  133.75          SCHED_OP(sleep, v);
  133.76 +    }
  133.77 +
  133.78      vcpu_schedule_unlock_irqrestore(v, flags);
  133.79  
  133.80      TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
  133.81 @@ -187,11 +219,19 @@ void vcpu_wake(struct vcpu *v)
  133.82      unsigned long flags;
  133.83  
  133.84      vcpu_schedule_lock_irqsave(v, flags);
  133.85 +
  133.86      if ( likely(vcpu_runnable(v)) )
  133.87      {
  133.88 +        if ( v->runstate.state >= RUNSTATE_blocked )
  133.89 +            vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
  133.90          SCHED_OP(wake, v);
  133.91 -        v->wokenup = NOW();
  133.92      }
  133.93 +    else if ( !test_bit(_VCPUF_blocked, &v->vcpu_flags) )
  133.94 +    {
  133.95 +        if ( v->runstate.state == RUNSTATE_blocked )
  133.96 +            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
  133.97 +    }
  133.98 +
  133.99      vcpu_schedule_unlock_irqrestore(v, flags);
 133.100  
 133.101      TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
 133.102 @@ -376,8 +416,6 @@ static void __enter_scheduler(void)
 133.103  
 133.104      stop_timer(&schedule_data[cpu].s_timer);
 133.105      
 133.106 -    prev->cpu_time += now - prev->lastschd;
 133.107 -
 133.108      /* get policy-specific decision on scheduling... */
 133.109      next_slice = ops.do_schedule(now);
 133.110  
 133.111 @@ -386,8 +424,6 @@ static void __enter_scheduler(void)
 133.112  
 133.113      schedule_data[cpu].curr = next;
 133.114      
 133.115 -    next->lastschd = now;
 133.116 -
 133.117      set_timer(&schedule_data[cpu].s_timer, now + r_time);
 133.118  
 133.119      if ( unlikely(prev == next) )
 133.120 @@ -397,38 +433,23 @@ static void __enter_scheduler(void)
 133.121      }
 133.122  
 133.123      TRACE_2D(TRC_SCHED_SWITCH_INFPREV,
 133.124 -             prev->domain->domain_id, now - prev->lastschd);
 133.125 +             prev->domain->domain_id,
 133.126 +             now - prev->runstate.state_entry_time);
 133.127      TRACE_3D(TRC_SCHED_SWITCH_INFNEXT,
 133.128 -             next->domain->domain_id, now - next->wokenup, r_time);
 133.129 -
 133.130 -    /*
 133.131 -     * Logic of wokenup field in domain struct:
 133.132 -     * Used to calculate "waiting time", which is the time that a domain
 133.133 -     * spends being "runnable", but not actually running. wokenup is set
 133.134 -     * set whenever a domain wakes from sleeping. However, if wokenup is not
 133.135 -     * also set here then a preempted runnable domain will get a screwed up
 133.136 -     * "waiting time" value next time it is scheduled.
 133.137 -     */
 133.138 -    prev->wokenup = now;
 133.139 +             next->domain->domain_id,
 133.140 +             (next->runstate.state == RUNSTATE_runnable) ?
 133.141 +             (now - next->runstate.state_entry_time) : 0,
 133.142 +             r_time);
 133.143  
 133.144 -#if defined(WAKE_HISTO)
 133.145 -    if ( !is_idle_vcpu(next) && next->wokenup )
 133.146 -    {
 133.147 -        ulong diff = (ulong)(now - next->wokenup);
 133.148 -        diff /= (ulong)MILLISECS(1);
 133.149 -        if (diff <= BUCKETS-2)  schedule_data[cpu].hist[diff]++;
 133.150 -        else                    schedule_data[cpu].hist[BUCKETS-1]++;
 133.151 -    }
 133.152 -    next->wokenup = (s_time_t)0;
 133.153 -#elif defined(BLOCKTIME_HISTO)
 133.154 -    prev->lastdeschd = now;
 133.155 -    if ( !is_idle_vcpu(next) )
 133.156 -    {
 133.157 -        ulong diff = (ulong)((now - next->lastdeschd) / MILLISECS(10));
 133.158 -        if (diff <= BUCKETS-2)  schedule_data[cpu].hist[diff]++;
 133.159 -        else                    schedule_data[cpu].hist[BUCKETS-1]++;
 133.160 -    }
 133.161 -#endif
 133.162 +    ASSERT(prev->runstate.state == RUNSTATE_running);
 133.163 +    vcpu_runstate_change(
 133.164 +        prev,
 133.165 +        (test_bit(_VCPUF_blocked, &prev->vcpu_flags) ? RUNSTATE_blocked :
 133.166 +         (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
 133.167 +        now);
 133.168 +
 133.169 +    ASSERT(next->runstate.state != RUNSTATE_running);
 133.170 +    vcpu_runstate_change(next, RUNSTATE_running, now);
 133.171  
 133.172      ASSERT(!test_bit(_VCPUF_running, &next->vcpu_flags));
 133.173      set_bit(_VCPUF_running, &next->vcpu_flags);
 133.174 @@ -568,47 +589,6 @@ void dump_runq(unsigned char key)
 133.175      local_irq_restore(flags);
 133.176  }
 133.177  
 133.178 -#if defined(WAKE_HISTO) || defined(BLOCKTIME_HISTO)
 133.179 -
 133.180 -void print_sched_histo(unsigned char key)
 133.181 -{
 133.182 -    int i, j, k;
 133.183 -    for_each_online_cpu ( k )
 133.184 -    {
 133.185 -        j = 0;
 133.186 -        printf ("CPU[%02d]: scheduler latency histogram (ms:[count])\n", k);
 133.187 -        for ( i = 0; i < BUCKETS; i++ )
 133.188 -        {
 133.189 -            if ( schedule_data[k].hist[i] != 0 )
 133.190 -            {
 133.191 -                if ( i < BUCKETS-1 )
 133.192 -                    printk("%2d:[%7u]    ", i, schedule_data[k].hist[i]);
 133.193 -                else
 133.194 -                    printk(" >:[%7u]    ", schedule_data[k].hist[i]);
 133.195 -                if ( !(++j % 5) )
 133.196 -                    printk("\n");
 133.197 -            }
 133.198 -        }
 133.199 -        printk("\n");
 133.200 -    }
 133.201 -      
 133.202 -}
 133.203 -
 133.204 -void reset_sched_histo(unsigned char key)
 133.205 -{
 133.206 -    int i, j;
 133.207 -    for ( j = 0; j < NR_CPUS; j++ )
 133.208 -        for ( i=0; i < BUCKETS; i++ ) 
 133.209 -            schedule_data[j].hist[i] = 0;
 133.210 -}
 133.211 -
 133.212 -#else
 133.213 -
 133.214 -void print_sched_histo(unsigned char key) { }
 133.215 -void reset_sched_histo(unsigned char key) { }
 133.216 -
 133.217 -#endif
 133.218 -
 133.219  /*
 133.220   * Local variables:
 133.221   * mode: C
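
The runstate machinery above replaces the ad-hoc cpu_time/wokenup/histogram fields: every transition charges the elapsed interval to the state being left, so the four time[] buckets jointly account for a vcpu's entire lifetime. Consumer-side arithmetic then falls out directly -- a sketch, assuming the RUNSTATE_* enum and struct vcpu_runstate_info from the public interface:

/*
 * Time "stolen" from a vcpu between two snapshots (e.g. successive
 * VCPUOP_get_runstate_info calls): whatever accrued while it was
 * ready-but-not-running or forcibly offline. Values are nanoseconds.
 */
static uint64_t stolen_ns(const struct vcpu_runstate_info *then,
                          const struct vcpu_runstate_info *now)
{
    return (now->time[RUNSTATE_runnable] - then->time[RUNSTATE_runnable])
         + (now->time[RUNSTATE_offline]  - then->time[RUNSTATE_offline]);
}
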
   134.1 --- a/xen/drivers/char/console.c	Wed Mar 01 10:01:54 2006 -0700
   134.2 +++ b/xen/drivers/char/console.c	Wed Mar 01 12:47:25 2006 -0700
   134.3 @@ -335,8 +335,9 @@ long guest_console_write(char *buffer, i
   134.4          }
   134.5  
   134.6          if ( hypercall_preempt_check() )
   134.7 -            return hypercall3_create_continuation(
   134.8 -                __HYPERVISOR_console_io, CONSOLEIO_write, count, buffer);
   134.9 +            return hypercall_create_continuation(
  134.10 +                __HYPERVISOR_console_io, "iip",
  134.11 +                CONSOLEIO_write, count, buffer);
  134.12  
  134.13          kcount = min_t(int, count, sizeof(kbuf)-1);
  134.14          if ( copy_from_user(kbuf, buffer, kcount) )
   135.1 --- a/xen/include/asm-ia64/config.h	Wed Mar 01 10:01:54 2006 -0700
   135.2 +++ b/xen/include/asm-ia64/config.h	Wed Mar 01 12:47:25 2006 -0700
   135.3 @@ -37,6 +37,8 @@
   135.4  //leave SMP for a later time
   135.5  //#undef CONFIG_SMP
   135.6  
   135.7 +#define supervisor_mode_kernel (0)
   135.8 +
   135.9  #define MAX_DMADOM_PFN (0x7FFFFFFFUL >> PAGE_SHIFT) /* 31 addressable bits */
  135.10  
  135.11  #ifndef __ASSEMBLY__
  135.12 @@ -190,11 +192,6 @@ void sort_main_extable(void);
  135.13  
  135.14  #define find_first_set_bit(x)	(ffs(x)-1)	// FIXME: Is this right???
  135.15  
  135.16 -// from include/asm-x86/*/uaccess.h
  135.17 -#define array_access_ok(addr,count,size)			\
  135.18 -    (likely(sizeof(count) <= 4) /* disallow 64-bit counts */ &&  \
  135.19 -     access_ok(type,addr,count*size))
  135.20 -
  135.21  // see drivers/char/console.c
  135.22  #ifndef VALIDATE_VT
  135.23  #define	OPT_CONSOLE_STR "com1"
  135.24 @@ -299,7 +296,6 @@ extern int ht_per_core;
  135.25  //#define raw_smp_processor_id()	0
  135.26  //#endif
  135.27  
  135.28 -
  135.29  #ifndef __ASSEMBLY__
  135.30  #include <linux/linkage.h>
  135.31  #define FORCE_CRASH()	asm("break.m 0;;");
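
Defining supervisor_mode_kernel as the literal (0) lets the guards added to common code by this changeset vanish at compile time on ia64, with no per-arch stubs. For instance:

/* The dom0_ops.c check reduces to if (0) here and is compiled out. */
static inline long reject_if_ring0_kernel(void)
{
    if ( supervisor_mode_kernel )
        return -EINVAL;
    return 0;
}
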
   136.1 --- a/xen/include/asm-ia64/linux-xen/asm/README.origin	Wed Mar 01 10:01:54 2006 -0700
   136.2 +++ b/xen/include/asm-ia64/linux-xen/asm/README.origin	Wed Mar 01 12:47:25 2006 -0700
   136.3 @@ -22,4 +22,3 @@ spinlock.h		-> linux/include/asm-ia64/sp
   136.4  system.h		-> linux/include/asm-ia64/system.h
   136.5  tlbflush.h		-> linux/include/asm-ia64/tlbflush.h
   136.6  types.h			-> linux/include/asm-ia64/types.h
   136.7 -uaccess.h		-> linux/include/asm-ia64/uaccess.h
   137.1 --- a/xen/include/asm-ia64/linux-xen/asm/uaccess.h	Wed Mar 01 10:01:54 2006 -0700
   137.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
   137.3 @@ -1,415 +0,0 @@
   137.4 -#ifndef _ASM_IA64_UACCESS_H
   137.5 -#define _ASM_IA64_UACCESS_H
   137.6 -
   137.7 -/*
   137.8 - * This file defines various macros to transfer memory areas across
   137.9 - * the user/kernel boundary.  This needs to be done carefully because
  137.10 - * this code is executed in kernel mode and uses user-specified
  137.11 - * addresses.  Thus, we need to be careful not to let the user to
  137.12 - * trick us into accessing kernel memory that would normally be
  137.13 - * inaccessible.  This code is also fairly performance sensitive,
  137.14 - * so we want to spend as little time doing safety checks as
  137.15 - * possible.
  137.16 - *
  137.17 - * To make matters a bit more interesting, these macros sometimes also
  137.18 - * called from within the kernel itself, in which case the address
  137.19 - * validity check must be skipped.  The get_fs() macro tells us what
  137.20 - * to do: if get_fs()==USER_DS, checking is performed, if
  137.21 - * get_fs()==KERNEL_DS, checking is bypassed.
  137.22 - *
  137.23 - * Note that even if the memory area specified by the user is in a
  137.24 - * valid address range, it is still possible that we'll get a page
  137.25 - * fault while accessing it.  This is handled by filling out an
  137.26 - * exception handler fixup entry for each instruction that has the
  137.27 - * potential to fault.  When such a fault occurs, the page fault
  137.28 - * handler checks to see whether the faulting instruction has a fixup
  137.29 - * associated and, if so, sets r8 to -EFAULT and clears r9 to 0 and
  137.30 - * then resumes execution at the continuation point.
  137.31 - *
  137.32 - * Based on <asm-alpha/uaccess.h>.
  137.33 - *
  137.34 - * Copyright (C) 1998, 1999, 2001-2004 Hewlett-Packard Co
  137.35 - *	David Mosberger-Tang <davidm@hpl.hp.com>
  137.36 - */
  137.37 -
  137.38 -#include <linux/compiler.h>
  137.39 -#include <linux/errno.h>
  137.40 -#include <linux/sched.h>
  137.41 -#include <linux/page-flags.h>
  137.42 -#include <linux/mm.h>
  137.43 -
  137.44 -#include <asm/intrinsics.h>
  137.45 -#include <asm/pgtable.h>
  137.46 -#include <asm/io.h>
  137.47 -
  137.48 -/*
  137.49 - * For historical reasons, the following macros are grossly misnamed:
  137.50 - */
  137.51 -#define KERNEL_DS	((mm_segment_t) { ~0UL })		/* cf. access_ok() */
  137.52 -#define USER_DS		((mm_segment_t) { TASK_SIZE-1 })	/* cf. access_ok() */
  137.53 -
  137.54 -#define VERIFY_READ	0
  137.55 -#define VERIFY_WRITE	1
  137.56 -
  137.57 -#define get_ds()  (KERNEL_DS)
  137.58 -#define get_fs()  (current_thread_info()->addr_limit)
  137.59 -#define set_fs(x) (current_thread_info()->addr_limit = (x))
  137.60 -
  137.61 -#define segment_eq(a, b)	((a).seg == (b).seg)
  137.62 -
  137.63 -/*
  137.64 - * When accessing user memory, we need to make sure the entire area really is in
  137.65 - * user-level space.  In order to do this efficiently, we make sure that the page at
  137.66 - * address TASK_SIZE is never valid.  We also need to make sure that the address doesn't
  137.67 - * point inside the virtually mapped linear page table.
  137.68 - */
  137.69 -#ifdef XEN
  137.70 -#define IS_VMM_ADDRESS(addr) ((((addr) >> 60) ^ ((addr) >> 59)) & 1)
  137.71 -#define __access_ok(addr, size, segment) (!IS_VMM_ADDRESS((unsigned long)(addr)))
  137.72 -#else
  137.73 -#define __access_ok(addr, size, segment)						\
  137.74 -({											\
  137.75 -	__chk_user_ptr(addr);								\
  137.76 -	(likely((unsigned long) (addr) <= (segment).seg)				\
  137.77 -	 && ((segment).seg == KERNEL_DS.seg						\
  137.78 -	     || likely(REGION_OFFSET((unsigned long) (addr)) < RGN_MAP_LIMIT)));	\
  137.79 -})
  137.80 -#endif
  137.81 -#define access_ok(type, addr, size)	__access_ok((addr), (size), get_fs())
  137.82 -
  137.83 -/* this function will go away soon - use access_ok() instead */
  137.84 -static inline int __deprecated
  137.85 -verify_area (int type, const void __user *addr, unsigned long size)
  137.86 -{
  137.87 -	return access_ok(type, addr, size) ? 0 : -EFAULT;
  137.88 -}
  137.89 -
  137.90 -/*
  137.91 - * These are the main single-value transfer routines.  They automatically
  137.92 - * use the right size if we just have the right pointer type.
  137.93 - *
  137.94 - * Careful to not
  137.95 - * (a) re-use the arguments for side effects (sizeof/typeof is ok)
  137.96 - * (b) require any knowledge of processes at this stage
  137.97 - */
  137.98 -#define put_user(x, ptr)	__put_user_check((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)), get_fs())
  137.99 -#define get_user(x, ptr)	__get_user_check((x), (ptr), sizeof(*(ptr)), get_fs())
 137.100 -
 137.101 -/*
 137.102 - * The "__xxx" versions do not do address space checking, useful when
 137.103 - * doing multiple accesses to the same area (the programmer has to do the
 137.104 - * checks by hand with "access_ok()")
 137.105 - */
 137.106 -#define __put_user(x, ptr)	__put_user_nocheck((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)))
 137.107 -#define __get_user(x, ptr)	__get_user_nocheck((x), (ptr), sizeof(*(ptr)))
 137.108 -
 137.109 -extern long __put_user_unaligned_unknown (void);
 137.110 -
 137.111 -#define __put_user_unaligned(x, ptr)								\
 137.112 -({												\
 137.113 -	long __ret;										\
 137.114 -	switch (sizeof(*(ptr))) {								\
 137.115 -		case 1: __ret = __put_user((x), (ptr)); break;					\
 137.116 -		case 2: __ret = (__put_user((x), (u8 __user *)(ptr)))				\
 137.117 -			| (__put_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break;		\
 137.118 -		case 4: __ret = (__put_user((x), (u16 __user *)(ptr)))				\
 137.119 -			| (__put_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break;		\
 137.120 -		case 8: __ret = (__put_user((x), (u32 __user *)(ptr)))				\
 137.121 -			| (__put_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break;		\
 137.122 -		default: __ret = __put_user_unaligned_unknown();				\
 137.123 -	}											\
 137.124 -	__ret;											\
 137.125 -})
 137.126 -
 137.127 -extern long __get_user_unaligned_unknown (void);
 137.128 -
 137.129 -#define __get_user_unaligned(x, ptr)								\
 137.130 -({												\
 137.131 -	long __ret;										\
 137.132 -	switch (sizeof(*(ptr))) {								\
 137.133 -		case 1: __ret = __get_user((x), (ptr)); break;					\
 137.134 -		case 2: __ret = (__get_user((x), (u8 __user *)(ptr)))				\
 137.135 -			| (__get_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break;		\
 137.136 -		case 4: __ret = (__get_user((x), (u16 __user *)(ptr)))				\
 137.137 -			| (__get_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break;		\
 137.138 -		case 8: __ret = (__get_user((x), (u32 __user *)(ptr)))				\
 137.139 -			| (__get_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break;		\
 137.140 -		default: __ret = __get_user_unaligned_unknown();				\
 137.141 -	}											\
 137.142 -	__ret;											\
 137.143 -})
 137.144 -
 137.145 -#ifdef ASM_SUPPORTED
 137.146 -  struct __large_struct { unsigned long buf[100]; };
 137.147 -# define __m(x) (*(struct __large_struct __user *)(x))
 137.148 -
 137.149 -/* We need to declare the __ex_table section before we can use it in .xdata.  */
 137.150 -asm (".section \"__ex_table\", \"a\"\n\t.previous");
 137.151 -
 137.152 -# define __get_user_size(val, addr, n, err)							\
 137.153 -do {												\
 137.154 -	register long __gu_r8 asm ("r8") = 0;							\
 137.155 -	register long __gu_r9 asm ("r9");							\
 137.156 -	asm ("\n[1:]\tld"#n" %0=%2%P2\t// %0 and %1 get overwritten by exception handler\n"	\
 137.157 -	     "\t.xdata4 \"__ex_table\", 1b-., 1f-.+4\n"						\
 137.158 -	     "[1:]"										\
 137.159 -	     : "=r"(__gu_r9), "=r"(__gu_r8) : "m"(__m(addr)), "1"(__gu_r8));			\
 137.160 -	(err) = __gu_r8;									\
 137.161 -	(val) = __gu_r9;									\
 137.162 -} while (0)
 137.163 -
 137.164 -/*
 137.165 - * The "__put_user_size()" macro tells gcc it reads from memory instead of writing it.  This
 137.166 - * is because they do not write to any memory gcc knows about, so there are no aliasing
 137.167 - * issues.
 137.168 - */
 137.169 -# define __put_user_size(val, addr, n, err)							\
 137.170 -do {												\
 137.171 -	register long __pu_r8 asm ("r8") = 0;							\
 137.172 -	asm volatile ("\n[1:]\tst"#n" %1=%r2%P1\t// %0 gets overwritten by exception handler\n"	\
 137.173 -		      "\t.xdata4 \"__ex_table\", 1b-., 1f-.\n"					\
 137.174 -		      "[1:]"									\
 137.175 -		      : "=r"(__pu_r8) : "m"(__m(addr)), "rO"(val), "0"(__pu_r8));		\
 137.176 -	(err) = __pu_r8;									\
 137.177 -} while (0)
 137.178 -
 137.179 -#else /* !ASM_SUPPORTED */
 137.180 -# define RELOC_TYPE	2	/* ip-rel */
 137.181 -# define __get_user_size(val, addr, n, err)				\
 137.182 -do {									\
 137.183 -	__ld_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE);	\
 137.184 -	(err) = ia64_getreg(_IA64_REG_R8);				\
 137.185 -	(val) = ia64_getreg(_IA64_REG_R9);				\
 137.186 -} while (0)
 137.187 -# define __put_user_size(val, addr, n, err)							\
 137.188 -do {												\
 137.189 -	__st_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE, (unsigned long) (val));	\
 137.190 -	(err) = ia64_getreg(_IA64_REG_R8);							\
 137.191 -} while (0)
 137.192 -#endif /* !ASM_SUPPORTED */
 137.193 -
 137.194 -extern void __get_user_unknown (void);
 137.195 -
 137.196 -/*
 137.197 - * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
 137.198 - * could clobber r8 and r9 (among others).  Thus, be careful not to evaluate it while
 137.199 - * using r8/r9.
 137.200 - */
 137.201 -#define __do_get_user(check, x, ptr, size, segment)					\
 137.202 -({											\
 137.203 -	const __typeof__(*(ptr)) __user *__gu_ptr = (ptr);				\
 137.204 -	__typeof__ (size) __gu_size = (size);						\
 137.205 -	long __gu_err = -EFAULT, __gu_val = 0;						\
 137.206 -											\
 137.207 -	if (!check || __access_ok(__gu_ptr, size, segment))				\
 137.208 -		switch (__gu_size) {							\
 137.209 -		      case 1: __get_user_size(__gu_val, __gu_ptr, 1, __gu_err); break;	\
 137.210 -		      case 2: __get_user_size(__gu_val, __gu_ptr, 2, __gu_err); break;	\
 137.211 -		      case 4: __get_user_size(__gu_val, __gu_ptr, 4, __gu_err); break;	\
 137.212 -		      case 8: __get_user_size(__gu_val, __gu_ptr, 8, __gu_err); break;	\
 137.213 -		      default: __get_user_unknown(); break;				\
 137.214 -		}									\
 137.215 -	(x) = (__typeof__(*(__gu_ptr))) __gu_val;					\
 137.216 -	__gu_err;									\
 137.217 -})
 137.218 -
 137.219 -#define __get_user_nocheck(x, ptr, size)	__do_get_user(0, x, ptr, size, KERNEL_DS)
 137.220 -#define __get_user_check(x, ptr, size, segment)	__do_get_user(1, x, ptr, size, segment)
 137.221 -
 137.222 -extern void __put_user_unknown (void);
 137.223 -
 137.224 -/*
 137.225 - * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
 137.226 - * could clobber r8 (among others).  Thus, be careful not to evaluate them while using r8.
 137.227 - */
 137.228 -#define __do_put_user(check, x, ptr, size, segment)					\
 137.229 -({											\
 137.230 -	__typeof__ (x) __pu_x = (x);							\
 137.231 -	__typeof__ (*(ptr)) __user *__pu_ptr = (ptr);					\
 137.232 -	__typeof__ (size) __pu_size = (size);						\
 137.233 -	long __pu_err = -EFAULT;							\
 137.234 -											\
 137.235 -	if (!check || __access_ok(__pu_ptr, __pu_size, segment))			\
 137.236 -		switch (__pu_size) {							\
 137.237 -		      case 1: __put_user_size(__pu_x, __pu_ptr, 1, __pu_err); break;	\
 137.238 -		      case 2: __put_user_size(__pu_x, __pu_ptr, 2, __pu_err); break;	\
 137.239 -		      case 4: __put_user_size(__pu_x, __pu_ptr, 4, __pu_err); break;	\
 137.240 -		      case 8: __put_user_size(__pu_x, __pu_ptr, 8, __pu_err); break;	\
 137.241 -		      default: __put_user_unknown(); break;				\
 137.242 -		}									\
 137.243 -	__pu_err;									\
 137.244 -})
 137.245 -
 137.246 -#define __put_user_nocheck(x, ptr, size)	__do_put_user(0, x, ptr, size, KERNEL_DS)
 137.247 -#define __put_user_check(x, ptr, size, segment)	__do_put_user(1, x, ptr, size, segment)
 137.248 -
 137.249 -/*
 137.250 - * Complex access routines
 137.251 - */
 137.252 -extern unsigned long __must_check __copy_user (void __user *to, const void __user *from,
 137.253 -					       unsigned long count);
 137.254 -
 137.255 -static inline unsigned long
 137.256 -__copy_to_user (void __user *to, const void *from, unsigned long count)
 137.257 -{
 137.258 -	return __copy_user(to, (void __user *) from, count);
 137.259 -}
 137.260 -
 137.261 -static inline unsigned long
 137.262 -__copy_from_user (void *to, const void __user *from, unsigned long count)
 137.263 -{
 137.264 -	return __copy_user((void __user *) to, from, count);
 137.265 -}
 137.266 -
 137.267 -#define __copy_to_user_inatomic		__copy_to_user
 137.268 -#define __copy_from_user_inatomic	__copy_from_user
 137.269 -#define copy_to_user(to, from, n)							\
 137.270 -({											\
 137.271 -	void __user *__cu_to = (to);							\
 137.272 -	const void *__cu_from = (from);							\
 137.273 -	long __cu_len = (n);								\
 137.274 -											\
 137.275 -	if (__access_ok(__cu_to, __cu_len, get_fs()))					\
 137.276 -		__cu_len = __copy_user(__cu_to, (void __user *) __cu_from, __cu_len);	\
 137.277 -	__cu_len;									\
 137.278 -})
 137.279 -
 137.280 -#define copy_from_user(to, from, n)							\
 137.281 -({											\
 137.282 -	void *__cu_to = (to);								\
 137.283 -	const void __user *__cu_from = (from);						\
 137.284 -	long __cu_len = (n);								\
 137.285 -											\
 137.286 -	__chk_user_ptr(__cu_from);							\
 137.287 -	if (__access_ok(__cu_from, __cu_len, get_fs()))					\
 137.288 -		__cu_len = __copy_user((void __user *) __cu_to, __cu_from, __cu_len);	\
 137.289 -	__cu_len;									\
 137.290 -})
 137.291 -
 137.292 -#define __copy_in_user(to, from, size)	__copy_user((to), (from), (size))
 137.293 -
 137.294 -static inline unsigned long
 137.295 -copy_in_user (void __user *to, const void __user *from, unsigned long n)
 137.296 -{
 137.297 -	if (likely(access_ok(VERIFY_READ, from, n) && access_ok(VERIFY_WRITE, to, n)))
 137.298 -		n = __copy_user(to, from, n);
 137.299 -	return n;
 137.300 -}
 137.301 -
 137.302 -extern unsigned long __do_clear_user (void __user *, unsigned long);
 137.303 -
 137.304 -#define __clear_user(to, n)		__do_clear_user(to, n)
 137.305 -
 137.306 -#define clear_user(to, n)					\
 137.307 -({								\
 137.308 -	unsigned long __cu_len = (n);				\
 137.309 -	if (__access_ok(to, __cu_len, get_fs()))		\
 137.310 -		__cu_len = __do_clear_user(to, __cu_len);	\
 137.311 -	__cu_len;						\
 137.312 -})
 137.313 -
 137.314 -
 137.315 -/*
 137.316 - * Returns: -EFAULT if exception before terminator, N if the entire buffer filled, else
 137.317 - * strlen.
 137.318 - */
 137.319 -extern long __must_check __strncpy_from_user (char *to, const char __user *from, long to_len);
 137.320 -
 137.321 -#define strncpy_from_user(to, from, n)					\
 137.322 -({									\
 137.323 -	const char __user * __sfu_from = (from);			\
 137.324 -	long __sfu_ret = -EFAULT;					\
 137.325 -	if (__access_ok(__sfu_from, 0, get_fs()))			\
 137.326 -		__sfu_ret = __strncpy_from_user((to), __sfu_from, (n));	\
 137.327 -	__sfu_ret;							\
 137.328 -})
 137.329 -
 137.330 -/* Returns: 0 if bad, string length+1 (memory size) of string if ok */
 137.331 -extern unsigned long __strlen_user (const char __user *);
 137.332 -
 137.333 -#define strlen_user(str)				\
 137.334 -({							\
 137.335 -	const char __user *__su_str = (str);		\
 137.336 -	unsigned long __su_ret = 0;			\
 137.337 -	if (__access_ok(__su_str, 0, get_fs()))		\
 137.338 -		__su_ret = __strlen_user(__su_str);	\
 137.339 -	__su_ret;					\
 137.340 -})
 137.341 -
 137.342 -/*
 137.343 - * Returns: 0 if exception before NUL or reaching the supplied limit
 137.344 - * (N), a value greater than N if the limit would be exceeded, else
 137.345 - * strlen.
 137.346 - */
 137.347 -extern unsigned long __strnlen_user (const char __user *, long);
 137.348 -
 137.349 -#define strnlen_user(str, len)					\
 137.350 -({								\
 137.351 -	const char __user *__su_str = (str);			\
 137.352 -	unsigned long __su_ret = 0;				\
 137.353 -	if (__access_ok(__su_str, 0, get_fs()))			\
 137.354 -		__su_ret = __strnlen_user(__su_str, len);	\
 137.355 -	__su_ret;						\
 137.356 -})
 137.357 -
 137.358 -/* Generic code can't deal with the location-relative format that we use for compactness.  */
 137.359 -#define ARCH_HAS_SORT_EXTABLE
 137.360 -#define ARCH_HAS_SEARCH_EXTABLE
 137.361 -
 137.362 -struct exception_table_entry {
 137.363 -	int addr;	/* location-relative address of insn this fixup is for */
 137.364 -	int cont;	/* location-relative continuation addr.; if bit 2 is set, r9 is set to 0 */
 137.365 -};
 137.366 -
 137.367 -extern void ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e);
 137.368 -extern const struct exception_table_entry *search_exception_tables (unsigned long addr);
 137.369 -
 137.370 -static inline int
 137.371 -ia64_done_with_exception (struct pt_regs *regs)
 137.372 -{
 137.373 -	const struct exception_table_entry *e;
 137.374 -	e = search_exception_tables(regs->cr_iip + ia64_psr(regs)->ri);
 137.375 -	if (e) {
 137.376 -		ia64_handle_exception(regs, e);
 137.377 -		return 1;
 137.378 -	}
 137.379 -	return 0;
 137.380 -}
 137.381 -
 137.382 -#ifndef XEN
 137.383 -#define ARCH_HAS_TRANSLATE_MEM_PTR	1
 137.384 -static __inline__ char *
 137.385 -xlate_dev_mem_ptr (unsigned long p)
 137.386 -{
 137.387 -	struct page *page;
 137.388 -	char * ptr;
 137.389 -
 137.390 -	page = mfn_to_page(p >> PAGE_SHIFT);
 137.391 -	if (PageUncached(page))
 137.392 -		ptr = (char *)p + __IA64_UNCACHED_OFFSET;
 137.393 -	else
 137.394 -		ptr = __va(p);
 137.395 -
 137.396 -	return ptr;
 137.397 -}
 137.398 -
 137.399 -/*
 137.400 - * Convert a virtual cached kernel memory pointer to an uncached pointer
 137.401 - */
 137.402 -static __inline__ char *
 137.403 -xlate_dev_kmem_ptr (char * p)
 137.404 -{
 137.405 -	struct page *page;
 137.406 -	char * ptr;
 137.407 -
 137.408 -	page = virt_to_page((unsigned long)p >> PAGE_SHIFT);
 137.409 -	if (PageUncached(page))
 137.410 -		ptr = (char *)__pa(p) + __IA64_UNCACHED_OFFSET;
 137.411 -	else
 137.412 -		ptr = p;
 137.413 -
 137.414 -	return ptr;
 137.415 -}
 137.416 -#endif
 137.417 -
 137.418 -#endif /* _ASM_IA64_UACCESS_H */
   138.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   138.2 +++ b/xen/include/asm-ia64/uaccess.h	Wed Mar 01 12:47:25 2006 -0700
   138.3 @@ -0,0 +1,285 @@
   138.4 +#ifndef _ASM_IA64_UACCESS_H
   138.5 +#define _ASM_IA64_UACCESS_H
   138.6 +
   138.7 +/*
   138.8 + * This file defines various macros to transfer memory areas across
   138.9 + * the user/kernel boundary.  This needs to be done carefully because
  138.10 + * this code is executed in kernel mode and uses user-specified
  138.11 + * addresses.  Thus, we need to be careful not to let the user
  138.12 + * trick us into accessing kernel memory that would normally be
  138.13 + * inaccessible.  This code is also fairly performance sensitive,
  138.14 + * so we want to spend as little time doing safety checks as
  138.15 + * possible.
  138.16 + *
  138.17 + * To make matters a bit more interesting, these macros are sometimes also
  138.18 + * called from within the kernel itself, in which case the address
  138.19 + * validity check must be skipped.  The get_fs() macro tells us what
  138.20 + * to do: if get_fs()==USER_DS, checking is performed, if
  138.21 + * get_fs()==KERNEL_DS, checking is bypassed.
  138.22 + *
  138.23 + * Note that even if the memory area specified by the user is in a
  138.24 + * valid address range, it is still possible that we'll get a page
  138.25 + * fault while accessing it.  This is handled by filling out an
  138.26 + * exception handler fixup entry for each instruction that has the
  138.27 + * potential to fault.  When such a fault occurs, the page fault
  138.28 + * handler checks to see whether the faulting instruction has a fixup
  138.29 + * associated and, if so, sets r8 to -EFAULT and clears r9 to 0 and
  138.30 + * then resumes execution at the continuation point.
  138.31 + *
  138.32 + * Based on <asm-alpha/uaccess.h>.
  138.33 + *
  138.34 + * Copyright (C) 1998, 1999, 2001-2004 Hewlett-Packard Co
  138.35 + *	David Mosberger-Tang <davidm@hpl.hp.com>
  138.36 + */
  138.37 +
  138.38 +#include <linux/compiler.h>
  138.39 +#include <linux/errno.h>
  138.40 +#include <linux/sched.h>
  138.41 +#include <linux/page-flags.h>
  138.42 +#include <linux/mm.h>
  138.43 +
  138.44 +#include <asm/intrinsics.h>
  138.45 +#include <asm/pgtable.h>
  138.46 +#include <asm/io.h>
  138.47 +
  138.48 +#define IS_VMM_ADDRESS(addr) ((((addr) >> 60) ^ ((addr) >> 59)) & 1)
  138.49 +#define __access_ok(addr) (!IS_VMM_ADDRESS((unsigned long)(addr)))
  138.50 +#define access_ok(addr, size) (__access_ok(addr))
  138.51 +#define array_access_ok(addr, count, size) (__access_ok(addr))
  138.52 +
  138.53 +/*
  138.54 + * These are the main single-value transfer routines.  They automatically
  138.55 + * use the right size if we just have the right pointer type.
  138.56 + *
  138.57 + * Be careful not to
  138.58 + * (a) re-use the arguments for side effects (sizeof/typeof is ok)
  138.59 + * (b) require any knowledge of processes at this stage
  138.60 + */
  138.61 +#define put_user(x, ptr)	__put_user_check((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)), get_fs())
  138.62 +#define get_user(x, ptr)	__get_user_check((x), (ptr), sizeof(*(ptr)), get_fs())
  138.63 +
  138.64 +/*
  138.65 + * The "__xxx" versions do not do address space checking, useful when
  138.66 + * doing multiple accesses to the same area (the programmer has to do the
  138.67 + * checks by hand with "access_ok()")
  138.68 + */
  138.69 +#define __put_user(x, ptr)	__put_user_nocheck((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)))
  138.70 +#define __get_user(x, ptr)	__get_user_nocheck((x), (ptr), sizeof(*(ptr)))
  138.71 +
  138.72 +extern long __put_user_unaligned_unknown (void);
  138.73 +
  138.74 +#define __put_user_unaligned(x, ptr)								\
  138.75 +({												\
  138.76 +	long __ret;										\
  138.77 +	switch (sizeof(*(ptr))) {								\
  138.78 +		case 1: __ret = __put_user((x), (ptr)); break;					\
  138.79 +		case 2: __ret = (__put_user((x), (u8 __user *)(ptr)))				\
  138.80 +			| (__put_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break;		\
  138.81 +		case 4: __ret = (__put_user((x), (u16 __user *)(ptr)))				\
  138.82 +			| (__put_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break;		\
  138.83 +		case 8: __ret = (__put_user((x), (u32 __user *)(ptr)))				\
  138.84 +			| (__put_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break;		\
  138.85 +		default: __ret = __put_user_unaligned_unknown();				\
  138.86 +	}											\
  138.87 +	__ret;											\
  138.88 +})
  138.89 +
  138.90 +extern long __get_user_unaligned_unknown (void);
  138.91 +
  138.92 +#define __get_user_unaligned(x, ptr)								\
  138.93 +({												\
  138.94 +	long __ret;										\
  138.95 +	switch (sizeof(*(ptr))) {								\
  138.96 +		case 1: __ret = __get_user((x), (ptr)); break;					\
  138.97 +		case 2: __ret = (__get_user((x), (u8 __user *)(ptr)))				\
  138.98 +			| (__get_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break;		\
  138.99 +		case 4: __ret = (__get_user((x), (u16 __user *)(ptr)))				\
 138.100 +			| (__get_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break;		\
 138.101 +		case 8: __ret = (__get_user((x), (u32 __user *)(ptr)))				\
 138.102 +			| (__get_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break;		\
 138.103 +		default: __ret = __get_user_unaligned_unknown();				\
 138.104 +	}											\
 138.105 +	__ret;											\
 138.106 +})
 138.107 +
 138.108 +#ifdef ASM_SUPPORTED
 138.109 +  struct __large_struct { unsigned long buf[100]; };
 138.110 +# define __m(x) (*(struct __large_struct __user *)(x))
 138.111 +
 138.112 +/* We need to declare the __ex_table section before we can use it in .xdata.  */
 138.113 +asm (".section \"__ex_table\", \"a\"\n\t.previous");
 138.114 +
 138.115 +# define __get_user_size(val, addr, n, err)							\
 138.116 +do {												\
 138.117 +	register long __gu_r8 asm ("r8") = 0;							\
 138.118 +	register long __gu_r9 asm ("r9");							\
 138.119 +	asm ("\n[1:]\tld"#n" %0=%2%P2\t// %0 and %1 get overwritten by exception handler\n"	\
 138.120 +	     "\t.xdata4 \"__ex_table\", 1b-., 1f-.+4\n"						\
 138.121 +	     "[1:]"										\
 138.122 +	     : "=r"(__gu_r9), "=r"(__gu_r8) : "m"(__m(addr)), "1"(__gu_r8));			\
 138.123 +	(err) = __gu_r8;									\
 138.124 +	(val) = __gu_r9;									\
 138.125 +} while (0)
 138.126 +
 138.127 +/*
 138.128 + * The "__put_user_size()" macro tells gcc it reads from memory instead of writing it.  This
 138.129 + * is because it does not write to any memory gcc knows about, so there are no aliasing
 138.130 + * issues.
 138.131 + */
 138.132 +# define __put_user_size(val, addr, n, err)							\
 138.133 +do {												\
 138.134 +	register long __pu_r8 asm ("r8") = 0;							\
 138.135 +	asm volatile ("\n[1:]\tst"#n" %1=%r2%P1\t// %0 gets overwritten by exception handler\n"	\
 138.136 +		      "\t.xdata4 \"__ex_table\", 1b-., 1f-.\n"					\
 138.137 +		      "[1:]"									\
 138.138 +		      : "=r"(__pu_r8) : "m"(__m(addr)), "rO"(val), "0"(__pu_r8));		\
 138.139 +	(err) = __pu_r8;									\
 138.140 +} while (0)
 138.141 +
 138.142 +#else /* !ASM_SUPPORTED */
 138.143 +# define RELOC_TYPE	2	/* ip-rel */
 138.144 +# define __get_user_size(val, addr, n, err)				\
 138.145 +do {									\
 138.146 +	__ld_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE);	\
 138.147 +	(err) = ia64_getreg(_IA64_REG_R8);				\
 138.148 +	(val) = ia64_getreg(_IA64_REG_R9);				\
 138.149 +} while (0)
 138.150 +# define __put_user_size(val, addr, n, err)							\
 138.151 +do {												\
 138.152 +	__st_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE, (unsigned long) (val));	\
 138.153 +	(err) = ia64_getreg(_IA64_REG_R8);							\
 138.154 +} while (0)
 138.155 +#endif /* !ASM_SUPPORTED */
 138.156 +
 138.157 +extern void __get_user_unknown (void);
 138.158 +
 138.159 +/*
 138.160 + * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
 138.161 + * could clobber r8 and r9 (among others).  Thus, be careful not to evaluate them while
 138.162 + * using r8/r9.
 138.163 + */
 138.164 +#define __do_get_user(check, x, ptr, size, segment)					\
 138.165 +({											\
 138.166 +	const __typeof__(*(ptr)) __user *__gu_ptr = (ptr);				\
 138.167 +	__typeof__ (size) __gu_size = (size);						\
 138.168 +	long __gu_err = -EFAULT, __gu_val = 0;						\
 138.169 +											\
 138.170 +	if (!check || __access_ok(__gu_ptr))						\
 138.171 +		switch (__gu_size) {							\
 138.172 +		      case 1: __get_user_size(__gu_val, __gu_ptr, 1, __gu_err); break;	\
 138.173 +		      case 2: __get_user_size(__gu_val, __gu_ptr, 2, __gu_err); break;	\
 138.174 +		      case 4: __get_user_size(__gu_val, __gu_ptr, 4, __gu_err); break;	\
 138.175 +		      case 8: __get_user_size(__gu_val, __gu_ptr, 8, __gu_err); break;	\
 138.176 +		      default: __get_user_unknown(); break;				\
 138.177 +		}									\
 138.178 +	(x) = (__typeof__(*(__gu_ptr))) __gu_val;					\
 138.179 +	__gu_err;									\
 138.180 +})
 138.181 +
 138.182 +#define __get_user_nocheck(x, ptr, size)	__do_get_user(0, x, ptr, size, KERNEL_DS)
 138.183 +#define __get_user_check(x, ptr, size, segment)	__do_get_user(1, x, ptr, size, segment)
 138.184 +
 138.185 +extern void __put_user_unknown (void);
 138.186 +
 138.187 +/*
 138.188 + * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
 138.189 + * could clobber r8 (among others).  Thus, be careful not to evaluate them while using r8.
 138.190 + */
 138.191 +#define __do_put_user(check, x, ptr, size, segment)					\
 138.192 +({											\
 138.193 +	__typeof__ (x) __pu_x = (x);							\
 138.194 +	__typeof__ (*(ptr)) __user *__pu_ptr = (ptr);					\
 138.195 +	__typeof__ (size) __pu_size = (size);						\
 138.196 +	long __pu_err = -EFAULT;							\
 138.197 +											\
 138.198 +	if (!check || __access_ok(__pu_ptr))						\
 138.199 +		switch (__pu_size) {							\
 138.200 +		      case 1: __put_user_size(__pu_x, __pu_ptr, 1, __pu_err); break;	\
 138.201 +		      case 2: __put_user_size(__pu_x, __pu_ptr, 2, __pu_err); break;	\
 138.202 +		      case 4: __put_user_size(__pu_x, __pu_ptr, 4, __pu_err); break;	\
 138.203 +		      case 8: __put_user_size(__pu_x, __pu_ptr, 8, __pu_err); break;	\
 138.204 +		      default: __put_user_unknown(); break;				\
 138.205 +		}									\
 138.206 +	__pu_err;									\
 138.207 +})
 138.208 +
 138.209 +#define __put_user_nocheck(x, ptr, size)	__do_put_user(0, x, ptr, size, KERNEL_DS)
 138.210 +#define __put_user_check(x, ptr, size, segment)	__do_put_user(1, x, ptr, size, segment)
 138.211 +
 138.212 +/*
 138.213 + * Complex access routines
 138.214 + */
 138.215 +extern unsigned long __must_check __copy_user (void __user *to, const void __user *from,
 138.216 +					       unsigned long count);
 138.217 +
 138.218 +static inline unsigned long
 138.219 +__copy_to_user (void __user *to, const void *from, unsigned long count)
 138.220 +{
 138.221 +	return __copy_user(to, (void __user *) from, count);
 138.222 +}
 138.223 +
 138.224 +static inline unsigned long
 138.225 +__copy_from_user (void *to, const void __user *from, unsigned long count)
 138.226 +{
 138.227 +	return __copy_user((void __user *) to, from, count);
 138.228 +}
 138.229 +
 138.230 +#define __copy_to_user_inatomic		__copy_to_user
 138.231 +#define __copy_from_user_inatomic	__copy_from_user
 138.232 +#define copy_to_user(to, from, n)							\
 138.233 +({											\
 138.234 +	void __user *__cu_to = (to);							\
 138.235 +	const void *__cu_from = (from);							\
 138.236 +	long __cu_len = (n);								\
 138.237 +											\
 138.238 +	if (__access_ok(__cu_to))							\
 138.239 +		__cu_len = __copy_user(__cu_to, (void __user *) __cu_from, __cu_len);	\
 138.240 +	__cu_len;									\
 138.241 +})
 138.242 +
 138.243 +#define copy_from_user(to, from, n)							\
 138.244 +({											\
 138.245 +	void *__cu_to = (to);								\
 138.246 +	const void __user *__cu_from = (from);						\
 138.247 +	long __cu_len = (n);								\
 138.248 +											\
 138.249 +	__chk_user_ptr(__cu_from);							\
 138.250 +	if (__access_ok(__cu_from))							\
 138.251 +		__cu_len = __copy_user((void __user *) __cu_to, __cu_from, __cu_len);	\
 138.252 +	__cu_len;									\
 138.253 +})
 138.254 +
 138.255 +#define __copy_in_user(to, from, size)	__copy_user((to), (from), (size))
 138.256 +
 138.257 +static inline unsigned long
 138.258 +copy_in_user (void __user *to, const void __user *from, unsigned long n)
 138.259 +{
 138.260 +	if (likely(access_ok(from, n) && access_ok(to, n)))
 138.261 +		n = __copy_user(to, from, n);
 138.262 +	return n;
 138.263 +}
 138.264 +
 138.265 +#define ARCH_HAS_SORT_EXTABLE
 138.266 +#define ARCH_HAS_SEARCH_EXTABLE
 138.267 +
 138.268 +struct exception_table_entry {
 138.269 +	int addr;	/* location-relative address of insn this fixup is for */
 138.270 +	int cont;	/* location-relative continuation addr.; if bit 2 is set, r9 is set to 0 */
 138.271 +};
 138.272 +
 138.273 +extern void ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e);
 138.274 +extern const struct exception_table_entry *search_exception_tables (unsigned long addr);
 138.275 +
 138.276 +static inline int
 138.277 +ia64_done_with_exception (struct pt_regs *regs)
 138.278 +{
 138.279 +	const struct exception_table_entry *e;
 138.280 +	e = search_exception_tables(regs->cr_iip + ia64_psr(regs)->ri);
 138.281 +	if (e) {
 138.282 +		ia64_handle_exception(regs, e);
 138.283 +		return 1;
 138.284 +	}
 138.285 +	return 0;
 138.286 +}
 138.287 +
 138.288 +#endif /* _ASM_IA64_UACCESS_H */
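The accessors above are enough to move whole structures across the guest/Xen boundary. A minimal sketch of a caller follows; do_example_op and example_args_t are illustrative names, not part of this changeset, and rely only on the convention (visible above) that copy_to_user/copy_from_user return the number of bytes left uncopied.

    /* Sketch only: hypothetical hypercall handler using the accessors above. */
    typedef struct example_args {
        unsigned long gpfn;
        unsigned long flags;
    } example_args_t;

    long do_example_op(void *guest_arg)
    {
        example_args_t args;

        /* Nonzero return: the pointer failed __access_ok() or faulted mid-copy. */
        if (copy_from_user(&args, guest_arg, sizeof(args)))
            return -EFAULT;

        args.flags |= 1;

        /* Copy the updated structure back to guest memory. */
        if (copy_to_user(guest_arg, &args, sizeof(args)))
            return -EFAULT;
        return 0;
    }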
   139.1 --- a/xen/include/asm-x86/config.h	Wed Mar 01 10:01:54 2006 -0700
   139.2 +++ b/xen/include/asm-x86/config.h	Wed Mar 01 12:47:25 2006 -0700
   139.3 @@ -37,6 +37,12 @@
   139.4  
   139.5  #define NR_CPUS 32
   139.6  
   139.7 +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
   139.8 +# define supervisor_mode_kernel (1)
   139.9 +#else
  139.10 +# define supervisor_mode_kernel (0)
  139.11 +#endif
  139.12 +
  139.13  /* Linkage for x86 */
  139.14  #define __ALIGN .align 16,0x90
  139.15  #define __ALIGN_STR ".align 16,0x90"
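Defining supervisor_mode_kernel as a constant 0 or 1, rather than leaving the symbol to #ifdef, lets C code branch on it directly: both arms are parsed and type-checked, and the compiler discards the dead one. A sketch of the intended pattern (the call site is hypothetical):

    /* Sketch: an ordinary C test replaces an #ifdef block. */
    if ( supervisor_mode_kernel )
        /* guest kernel shares ring 0 with Xen; skip ring transitions */ ;
    else
        /* conventional setup: guest kernel runs outside ring 0 */ ;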
   140.1 --- a/xen/include/asm-x86/desc.h	Wed Mar 01 10:01:54 2006 -0700
   140.2 +++ b/xen/include/asm-x86/desc.h	Wed Mar 01 12:47:25 2006 -0700
   140.3 @@ -27,10 +27,23 @@
   140.4  #endif
   140.5  
   140.6  /* Fix up the RPL of a guest segment selector. */
   140.7 -#define fixup_guest_selector(sel)                               \
   140.8 +#define __fixup_guest_selector(sel)                             \
   140.9      ((sel) = (((sel) & 3) >= GUEST_KERNEL_RPL) ? (sel) :        \
  140.10       (((sel) & ~3) | GUEST_KERNEL_RPL))
  140.11  
  140.12 +/* Stack selectors don't need fixing up if the kernel runs in ring 0. */
  140.13 +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
  140.14 +#define fixup_guest_stack_selector(ss) ((void)0)
  140.15 +#else
  140.16 +#define fixup_guest_stack_selector(ss) __fixup_guest_selector(ss)
  140.17 +#endif
  140.18 +
  140.19 +/*
  140.20 + * Code selectors are always fixed up. It allows the Xen exit stub to detect
  140.21 + * return to guest context, even when the guest kernel runs in ring 0.
  140.22 + */
  140.23 +#define fixup_guest_code_selector(cs)  __fixup_guest_selector(cs)
  140.24 +
  140.25  /*
  140.26   * We need this function because enforcing the correct guest kernel RPL is
  140.27  * insufficient if the selector is poked into an interrupt, trap or call gate.
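The split above means stack and code selectors are now fixed up separately: when CONFIG_X86_SUPERVISOR_MODE_KERNEL is set, a guest stack selector may keep RPL 0, but a code selector is still forced to GUEST_KERNEL_RPL so the exit stub can tell guest frames from Xen frames. A sketch with made-up selector values:

    /* Sketch: selector values are illustrative only. */
    u16 cs = 0x0008, ss = 0x0010;      /* RPL-0 selectors from the guest */
    fixup_guest_code_selector(cs);     /* always adjusted to GUEST_KERNEL_RPL */
    fixup_guest_stack_selector(ss);    /* no-op if the guest kernel is in ring 0 */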
   141.1 --- a/xen/include/asm-x86/hvm/hvm.h	Wed Mar 01 10:01:54 2006 -0700
   141.2 +++ b/xen/include/asm-x86/hvm/hvm.h	Wed Mar 01 12:47:25 2006 -0700
   141.3 @@ -67,6 +67,9 @@ struct hvm_function_table {
   141.4      int (*paging_enabled)(struct vcpu *v);
   141.5      int (*instruction_length)(struct vcpu *v);
   141.6      unsigned long (*get_guest_ctrl_reg)(struct vcpu *v, unsigned int num);
   141.7 +
   141.8 +    void (*init_ap_context)(struct vcpu_guest_context *ctxt,
   141.9 +                            int vcpuid, int trampoline_vector);
  141.10  };
  141.11  
  141.12  extern struct hvm_function_table hvm_funcs;
  141.13 @@ -173,4 +176,14 @@ hvm_get_guest_ctrl_reg(struct vcpu *v, u
  141.14          return hvm_funcs.get_guest_ctrl_reg(v, num);
  141.15      return 0;                   /* force to fail */
  141.16  }
  141.17 +
  141.18 +static inline void
  141.19 +hvm_init_ap_context(struct vcpu_guest_context *ctxt,
  141.20 +                    int vcpuid, int trampoline_vector)
  141.21 +{
  141.22 +    hvm_funcs.init_ap_context(ctxt, vcpuid, trampoline_vector);
  141.23 +}
  141.24 +
  141.25 +extern int hvm_bringup_ap(int vcpuid, int trampoline_vector);
  141.26 +
  141.27  #endif /* __ASM_X86_HVM_HVM_H__ */
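A sketch of how AP startup might be driven through the new hooks: hvm_bringup_ap() is expected to build a vcpu_guest_context via hvm_init_ap_context() and boot the target VCPU. The vcpuid and vector values below are illustrative; a real caller takes the vector from the guest's STARTUP IPI.

    /* Sketch: bringing up HVM application processor 1. */
    int rc = hvm_bringup_ap(1 /* vcpuid */, 0x9f /* trampoline_vector */);
    if ( rc != 0 )
        printk("AP bringup failed: %d\n", rc);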
   142.1 --- a/xen/include/asm-x86/hvm/svm/emulate.h	Wed Mar 01 10:01:54 2006 -0700
   142.2 +++ b/xen/include/asm-x86/hvm/svm/emulate.h	Wed Mar 01 12:47:25 2006 -0700
   142.3 @@ -83,15 +83,15 @@ extern unsigned long get_effective_addr_
   142.4          struct cpu_user_regs *regs, const u8 prefix, const u8 *operand, 
   142.5          u8 *size);
   142.6  extern OPERATING_MODE get_operating_mode (struct vmcb_struct *vmcb);
   142.7 -extern unsigned int decode_dest_reg(u8 modrm);
   142.8 -extern unsigned int decode_src_reg(u8 modrm);
   142.9 +extern unsigned int decode_dest_reg(u8 prefix, u8 modrm);
  142.10 +extern unsigned int decode_src_reg(u8 prefix, u8 modrm);
  142.11  extern unsigned long svm_rip2pointer(struct vmcb_struct *vmcb);
  142.12 -extern unsigned int __get_instruction_length_from_list(struct vmcb_struct *vmcb,
  142.13 +extern int __get_instruction_length_from_list(struct vmcb_struct *vmcb,
  142.14          enum instruction_index *list, unsigned int list_count, 
  142.15          u8 *guest_eip_buf, enum instruction_index *match);
  142.16  
  142.17  
  142.18 -static inline unsigned int __get_instruction_length(struct vmcb_struct *vmcb, 
  142.19 +static inline int __get_instruction_length(struct vmcb_struct *vmcb, 
  142.20          enum instruction_index instr, u8 *guest_eip_buf)
  142.21  {
  142.22      return __get_instruction_length_from_list(vmcb, &instr, 1, guest_eip_buf, 
  142.23 @@ -138,9 +138,20 @@ static inline unsigned int is_prefix(u8 
  142.24  }
  142.25  
  142.26  
  142.27 +static inline int skip_prefix_bytes(u8 *buf, size_t size)
  142.28 +{
  142.29 +    int index;
  142.30 +    for (index = 0; index < size && is_prefix(buf[index]); index++)
  142.31 +        /* do nothing */ ;
  142.32 +    return index;
  142.33 +}
  142.34 +
  142.35 +
  142.36 +
  142.37  static void inline __update_guest_eip(struct vmcb_struct *vmcb, 
  142.38 -        unsigned long inst_len) 
  142.39 +        int inst_len) 
  142.40  {
  142.41 +    ASSERT(inst_len > 0);
  142.42      vmcb->rip += inst_len;
  142.43  }
  142.44  
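__get_instruction_length() and __get_instruction_length_from_list() now return a signed int so decode failures can be reported instead of silently advancing rip, and __update_guest_eip() ASSERTs on a non-positive length accordingly. A sketch of the expected caller pattern (INSTR_HLT and the NULL buffer argument are assumptions for illustration):

    /* Sketch: check the signed length before touching the guest rip. */
    int inst_len = __get_instruction_length(vmcb, INSTR_HLT, NULL);
    if ( inst_len <= 0 )
        return;                        /* decode failed; leave rip alone */
    __update_guest_eip(vmcb, inst_len);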
   143.1 --- a/xen/include/asm-x86/hvm/svm/svm.h	Wed Mar 01 10:01:54 2006 -0700
   143.2 +++ b/xen/include/asm-x86/hvm/svm/svm.h	Wed Mar 01 12:47:25 2006 -0700
   143.3 @@ -54,6 +54,8 @@ extern int load_vmcb(struct arch_svm_str
   143.4  /* For debugging. Remove when no longer needed. */
   143.5  extern void svm_dump_host_regs(const char *from);
   143.6  
   143.7 +extern void svm_migrate_timers(struct vcpu *v);
   143.8 +
   143.9  /* ASID API */
  143.10  enum {
  143.11      ASID_AVAILABLE = 0,
   144.1 --- a/xen/include/asm-x86/hvm/svm/vmcb.h	Wed Mar 01 10:01:54 2006 -0700
   144.2 +++ b/xen/include/asm-x86/hvm/svm/vmcb.h	Wed Mar 01 12:47:25 2006 -0700
   144.3 @@ -269,21 +269,6 @@ enum {
   144.4  #define SVM_LONG_GUEST(ed)    \
   144.5    (test_bit(SVM_CPU_STATE_LMA_ENABLED, &ed->arch.hvm_svm.cpu_state))
   144.6  
   144.7 -enum {
   144.8 -    SVM_INDEX_MSR_LSTAR = 0,
   144.9 -    SVM_INDEX_MSR_STAR,
  144.10 -    SVM_INDEX_MSR_CSTAR,
  144.11 -    SVM_INDEX_MSR_SYSCALL_MASK,
  144.12 -    SVM_INDEX_MSR_EFER,
  144.13 -
  144.14 -    SVM_MSR_COUNT,
  144.15 -};
  144.16 -
  144.17 -struct svm_msr_state {
  144.18 -    unsigned long flags;
  144.19 -    unsigned long msr_items[SVM_MSR_COUNT];
  144.20 -    unsigned long shadow_gs;
  144.21 -};
  144.22  
  144.23  /* 
  144.24   * Attribute for segment selector. This is a copy of bit 40:47 & 52:55 of the
  144.25 @@ -449,7 +434,7 @@ struct vmcb_struct {
  144.26  
  144.27  struct arch_svm_struct {
  144.28      struct vmcb_struct	*vmcb;
  144.29 -    void		*host_save_area;
  144.30 +    void		        *host_save_area;
  144.31      u64                 host_save_pa;
  144.32      u64                 vmcb_pa;
  144.33      u32                 *iopm;
  144.34 @@ -457,14 +442,15 @@ struct arch_svm_struct {
  144.35      u64                 vmexit_tsc; /* tsc read at #VMEXIT. for TSC_OFFSET */
  144.36      int                 injecting_event;
  144.37      int                 saved_irq_vector;
  144.38 -    u32                 core;        /* cpu of last vmexit */
  144.39 +    u32                 launch_core;
  144.40 +    u32                 asid_core;
  144.41      
  144.42      unsigned long       flags;      /* VMCB flags */
  144.43 -    unsigned long       cpu_shadow_cr0; /* copy of guest read shadow CR0 */
  144.44 +    unsigned long       cpu_shadow_cr0; /* Guest value for CR0 */
  144.45 +    unsigned long       cpu_shadow_cr4; /* Guest value for CR4 */
  144.46      unsigned long       cpu_cr2;
  144.47      unsigned long       cpu_cr3;
  144.48      unsigned long       cpu_state;
  144.49 -    struct svm_msr_state msr_content;
  144.50      struct timer        hlt_timer;  /* hlt ins emulation wakeup timer */
  144.51  };
  144.52  
  144.53 @@ -486,6 +472,14 @@ enum {
  144.54  #define VMCB_EFLAGS_RESERVED_0          0xffc08028 /* bitmap for 0 */
  144.55  #define VMCB_EFLAGS_RESERVED_1          0x00000002 /* bitmap for 1 */
  144.56  
  144.57 +/* These bits in the CR4 are owned by the host */
  144.58 +#ifdef __i386__
  144.59 +#define SVM_CR4_HOST_MASK (0)
  144.60 +#else
  144.61 +#define SVM_CR4_HOST_MASK (X86_CR4_PAE)
  144.62 +#endif
  144.63 +
  144.64 +
  144.65  #endif /* ASM_X86_HVM_SVM_VMCS_H__ */
  144.66  
  144.67  /*
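With the shadow CR4 field added above, a guest CR4 write can be split into the value the guest reads back and the value the hardware actually uses, with SVM_CR4_HOST_MASK naming the host-owned bits. A sketch, assuming a CR4 intercept handler holds the guest's value in 'value':

    /* Sketch: guest CR4 write, host-owned bits forced on. */
    v->arch.hvm_svm.cpu_shadow_cr4 = value;          /* guest-visible copy */
    vmcb->cr4 = value | SVM_CR4_HOST_MASK;           /* real control value */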
   145.1 --- a/xen/include/asm-x86/hvm/vcpu.h	Wed Mar 01 10:01:54 2006 -0700
   145.2 +++ b/xen/include/asm-x86/hvm/vcpu.h	Wed Mar 01 12:47:25 2006 -0700
   145.3 @@ -25,10 +25,15 @@
   145.4  #include <asm/hvm/vmx/vmcs.h>
   145.5  #include <asm/hvm/svm/vmcb.h>
   145.6  
   145.7 +#define HVM_VCPU_INIT_SIPI_SIPI_STATE_NORM          0
   145.8 +#define HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI     1
   145.9 +
  145.10  struct hvm_vcpu {
  145.11 -    unsigned long       ioflags;
  145.12 -    struct mmio_op      mmio_op;
  145.13 -    struct vlapic       *vlapic;
  145.14 +    unsigned long   ioflags;
  145.15 +    struct mmio_op  mmio_op;
  145.16 +    struct vlapic   *vlapic;
  145.17 +    /* For AP startup */
  145.18 +    unsigned long   init_sipi_sipi_state;
  145.19  
  145.20      union {
  145.21          struct arch_vmx_struct vmx;
   146.1 --- a/xen/include/asm-x86/hvm/vlapic.h	Wed Mar 01 10:01:54 2006 -0700
   146.2 +++ b/xen/include/asm-x86/hvm/vlapic.h	Wed Mar 01 12:47:25 2006 -0700
   146.3 @@ -159,9 +159,6 @@ typedef struct direct_intr_info {
   146.4      int source[6];
   146.5  } direct_intr_info_t;
   146.6  
   146.7 -#define VLAPIC_INIT_SIPI_SIPI_STATE_NORM          0
   146.8 -#define VLAPIC_INIT_SIPI_SIPI_STATE_WAIT_SIPI     1
   146.9 -
  146.10  struct vlapic
  146.11  {
  146.12      //FIXME check what would be 64 bit on EM64T
  146.13 @@ -197,7 +194,6 @@ struct vlapic
  146.14      unsigned long      init_ticks;
  146.15      uint32_t           err_write_count;
  146.16      uint64_t           apic_base_msr;
  146.17 -    uint32_t           init_sipi_sipi_state;
  146.18      struct vcpu        *vcpu;
  146.19      struct domain      *domain;
  146.20  };
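The INIT-SIPI-SIPI bookkeeping now lives in struct hvm_vcpu rather than struct vlapic. A sketch of the transition a local-APIC emulator might make on delivering INIT to an AP (the field path v->arch.hvm_vcpu is an assumption):

    /* Sketch: INIT delivered, AP now waits for the first SIPI. */
    v->arch.hvm_vcpu.init_sipi_sipi_state =
        HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI;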
   147.1 --- a/xen/include/asm-x86/mm.h	Wed Mar 01 10:01:54 2006 -0700
   147.2 +++ b/xen/include/asm-x86/mm.h	Wed Mar 01 12:47:25 2006 -0700
   147.3 @@ -337,6 +337,10 @@ void cleanup_writable_pagetable(struct d
   147.4          UNLOCK_BIGLOCK(d);                                      \
   147.5      } while ( 0 )
   147.6  
   147.7 +#define writable_pagetable_in_sync(d)           \
   147.8 +    (!((d)->arch.ptwr[PTWR_PT_ACTIVE].l1va |    \
   147.9 +       (d)->arch.ptwr[PTWR_PT_INACTIVE].l1va))
  147.10 +
  147.11  int audit_adjust_pgtables(struct domain *d, int dir, int noisy);
  147.12  
  147.13  #ifndef NDEBUG
  147.14 @@ -376,7 +380,7 @@ void propagate_page_fault(unsigned long 
  147.15  int __sync_lazy_execstate(void);
  147.16  
  147.17  /* Arch-specific portion of memory_op hypercall. */
  147.18 -long arch_memory_op(int op, void *arg);
  147.19 -long subarch_memory_op(int op, void *arg);
  147.20 +long arch_memory_op(int op, GUEST_HANDLE(void) arg);
  147.21 +long subarch_memory_op(int op, GUEST_HANDLE(void) arg);
  147.22  
  147.23  #endif /* __ASM_X86_MM_H__ */
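writable_pagetable_in_sync(d) is true only when neither ptwr slot holds an outstanding writable mapping, i.e. no deferred pagetable writes are pending. A hypothetical call site, pairing it with the cleanup routine declared above:

    /* Sketch: flush pending writable-pagetable updates before relying on
     * the guest's pagetables being current. */
    if ( !writable_pagetable_in_sync(d) )
        cleanup_writable_pagetable(d);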
   148.1 --- a/xen/include/asm-x86/shadow_64.h	Wed Mar 01 10:01:54 2006 -0700
   148.2 +++ b/xen/include/asm-x86/shadow_64.h	Wed Mar 01 12:47:25 2006 -0700
   148.3 @@ -223,6 +223,7 @@ static inline int __entry(
   148.4      int i;
   148.5      pgentry_64_t *le_e;
   148.6      pgentry_64_t *le_p = NULL;
   148.7 +    pgentry_64_t *phys_vtable = NULL;
   148.8      unsigned long mfn;
   148.9      int index;
  148.10      u32 level = flag & L_MASK;
  148.11 @@ -251,25 +252,35 @@ static inline int __entry(
  148.12      {
  148.13          root_level = PAE_PAGING_LEVELS;
  148.14          index = table_offset_64(va, root_level);
  148.15 -        le_e = (pgentry_64_t *)map_domain_page(
  148.16 +        phys_vtable = (pgentry_64_t *)map_domain_page(
  148.17              pagetable_get_pfn(v->domain->arch.phys_table));
  148.18 +        le_e = &phys_vtable[index];
  148.19      }
  148.20  
  148.21      /*
  148.22       * If it's not external mode, then mfn should be machine physical.
  148.23       */
  148.24 -    for (i = root_level - level; i > 0; i--) {
  148.25 -        if ( unlikely(!(entry_get_flags(*le_e) & _PAGE_PRESENT)) ) {
  148.26 +    for ( i = root_level - level; i > 0; i-- )
  148.27 +    {
  148.28 +        if ( unlikely(!(entry_get_flags(*le_e) & _PAGE_PRESENT)) )
  148.29 +        {
  148.30              if ( le_p )
  148.31                  unmap_domain_page(le_p);
  148.32 +
  148.33 +            if ( phys_vtable )
  148.34 +                unmap_domain_page(phys_vtable);
  148.35 +
  148.36              return 0;
  148.37          }
  148.38 +
  148.39          mfn = entry_get_pfn(*le_e);
  148.40          if ( (flag & GUEST_ENTRY) && shadow_mode_translate(d) )
  148.41              mfn = get_mfn_from_gpfn(mfn);
  148.42 +
  148.43          if ( le_p )
  148.44              unmap_domain_page(le_p);
  148.45          le_p = (pgentry_64_t *)map_domain_page(mfn);
  148.46 +
  148.47          if ( flag & SHADOW_ENTRY )
  148.48              index = table_offset_64(va, (level + i - 1));
  148.49          else
  148.50 @@ -285,8 +296,10 @@ static inline int __entry(
  148.51      if ( le_p )
  148.52          unmap_domain_page(le_p);
  148.53  
  148.54 +    if ( phys_vtable )
  148.55 +        unmap_domain_page(phys_vtable);
  148.56 +
  148.57      return 1;
  148.58 -
  148.59  }
  148.60  
  148.61  static inline int __rw_entry(
   149.1 --- a/xen/include/asm-x86/shadow_public.h	Wed Mar 01 10:01:54 2006 -0700
   149.2 +++ b/xen/include/asm-x86/shadow_public.h	Wed Mar 01 12:47:25 2006 -0700
   149.3 @@ -22,8 +22,6 @@
   149.4  #ifndef _XEN_SHADOW_PUBLIC_H
   149.5  #define _XEN_SHADOW_PUBLIC_H
   149.6  
   149.7 -extern int alloc_p2m_table(struct domain *d);
   149.8 -
   149.9  #if CONFIG_PAGING_LEVELS >= 3
  149.10  #define MFN_PINNED(_x) (mfn_to_page(_x)->u.inuse.type_info & PGT_pinned)
  149.11  
   150.1 --- a/xen/include/asm-x86/x86_32/asm_defns.h	Wed Mar 01 10:01:54 2006 -0700
   150.2 +++ b/xen/include/asm-x86/x86_32/asm_defns.h	Wed Mar 01 12:47:25 2006 -0700
   150.3 @@ -48,11 +48,26 @@
   150.4  
   150.5  #ifdef PERF_COUNTERS
   150.6  #define PERFC_INCR(_name,_idx)                          \
   150.7 -    lock incl perfcounters+_name(,_idx,4)
   150.8 +        lock incl perfcounters+_name(,_idx,4)
   150.9  #else
  150.10  #define PERFC_INCR(_name,_idx)
  150.11  #endif
  150.12  
  150.13 +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
  150.14 +#define FIXUP_RING0_GUEST_STACK                         \
  150.15 +        testl $2,8(%esp);                               \
  150.16 +        jnz 1f; /* rings 2 & 3 permitted */             \
  150.17 +        testl $1,8(%esp);                               \
  150.18 +        jz 2f;                                          \
  150.19 +        ud2; /* ring 1 should not be used */            \
  150.20 +        2:cmpl $(__HYPERVISOR_VIRT_START),%esp;         \
  150.21 +        jge 1f;                                         \
  150.22 +        call fixup_ring0_guest_stack;                   \
  150.23 +        1:
  150.24 +#else
  150.25 +#define FIXUP_RING0_GUEST_STACK
  150.26 +#endif
  150.27 +
  150.28  #define BUILD_SMP_INTERRUPT(x,v) XBUILD_SMP_INTERRUPT(x,v)
  150.29  #define XBUILD_SMP_INTERRUPT(x,v)               \
  150.30  asmlinkage void x(void);                        \
  150.31 @@ -61,6 +76,7 @@ asmlinkage void x(void);                
  150.32      ".globl " STR(x) "\n\t"                     \
  150.33      STR(x) ":\n\t"                              \
  150.34      "pushl $"#v"<<16\n\t"                       \
  150.35 +    STR(FIXUP_RING0_GUEST_STACK)                \
  150.36      STR(SAVE_ALL(a))                            \
  150.37      "movl %esp,%eax\n\t"                        \
  150.38      "pushl %eax\n\t"                            \
  150.39 @@ -72,6 +88,7 @@ asmlinkage void x(void);                
  150.40  __asm__(                                        \
  150.41      "\n" __ALIGN_STR"\n"                        \
  150.42      "common_interrupt:\n\t"                     \
  150.43 +    STR(FIXUP_RING0_GUEST_STACK)                \
  150.44      STR(SAVE_ALL(a))                            \
  150.45      "movl %esp,%eax\n\t"                        \
  150.46      "pushl %eax\n\t"                            \
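For readability, the test performed by FIXUP_RING0_GUEST_STACK can be rendered in C: 8(%esp) is the CS pushed by the interrupt, and the low two bits of CS give the interrupted ring. This is a sketch of the logic only, not code from the changeset; fixup_ring0_guest_stack's real calling convention is assembly-level.

    /* Sketch: C equivalent of the assembler test above. */
    static void check_ring0_stack(unsigned long saved_cs, unsigned long esp)
    {
        if ( (saved_cs & 2) == 0 )             /* not ring 2 or 3 */
        {
            if ( saved_cs & 1 )
                BUG();                         /* ring 1 must never be used */
            if ( esp < __HYPERVISOR_VIRT_START )
                fixup_ring0_guest_stack();     /* interrupted a ring-0 guest */
        }
    }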
   151.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   151.2 +++ b/xen/include/public/features.h	Wed Mar 01 12:47:25 2006 -0700
   151.3 @@ -0,0 +1,53 @@
   151.4 +/******************************************************************************
   151.5 + * features.h
   151.6 + * 
   151.7 + * Feature flags, reported by XENVER_get_features.
   151.8 + * 
   151.9 + * Copyright (c) 2006, Keir Fraser <keir@xensource.com>
  151.10 + */
  151.11 +
  151.12 +#ifndef __XEN_PUBLIC_FEATURES_H__
  151.13 +#define __XEN_PUBLIC_FEATURES_H__
  151.14 +
  151.15 +/*
  151.16 + * If set, the guest does not need to write-protect its pagetables, and can
  151.17 + * update them via direct writes.
  151.18 + */
  151.19 +#define XENFEAT_writable_page_tables       0
  151.20 +
  151.21 +/*
  151.22 + * If set, the guest does not need to write-protect its segment descriptor
  151.23 + * tables, and can update them via direct writes.
  151.24 + */
  151.25 +#define XENFEAT_writable_descriptor_tables 1
  151.26 +
  151.27 +/*
  151.28 + * If set, translation between the guest's 'pseudo-physical' address space
  151.29 + * and the host's machine address space is handled by the hypervisor. In this
  151.30 + * mode the guest does not need to perform phys-to/from-machine translations
  151.31 + * when performing page table operations.
  151.32 + */
  151.33 +#define XENFEAT_auto_translated_physmap    2
  151.34 +
  151.35 +/* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */
  151.36 +#define XENFEAT_supervisor_mode_kernel     3
  151.37 +
  151.38 +/*
  151.39 + * If set, the guest does not need to allocate x86 PAE page directories
  151.40 + * below 4GB. This flag is usually implied by auto_translated_physmap.
  151.41 + */
  151.42 +#define XENFEAT_pae_pgdir_above_4gb        4
  151.43 +
  151.44 +#define XENFEAT_NR_SUBMAPS 1
  151.45 +
  151.46 +#endif /* __XEN_PUBLIC_FEATURES_H__ */
  151.47 +
  151.48 +/*
  151.49 + * Local variables:
  151.50 + * mode: C
  151.51 + * c-set-style: "BSD"
  151.52 + * c-basic-offset: 4
  151.53 + * tab-width: 4
  151.54 + * indent-tabs-mode: nil
  151.55 + * End:
  151.56 + */
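A sketch of how a guest might read these flags, assuming the XENVER_get_features interface (xen_feature_info_t with submap_idx/submap fields) from public/version.h, which is amended below:

    /* Sketch: query submap 0 and test one feature bit. */
    xen_feature_info_t fi = { .submap_idx = 0 };
    if ( HYPERVISOR_xen_version(XENVER_get_features, &fi) == 0 &&
         (fi.submap & (1u << XENFEAT_writable_page_tables)) )
        /* pagetables may be updated by direct writes */ ;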
   152.1 --- a/xen/include/public/memory.h	Wed Mar 01 10:01:54 2006 -0700
   152.2 +++ b/xen/include/public/memory.h	Wed Mar 01 12:47:25 2006 -0700
   152.3 @@ -29,7 +29,7 @@ typedef struct xen_memory_reservation {
   152.4       *   OUT: GMFN bases of extents that were allocated
   152.5       *   (NB. This command also updates the mach_to_phys translation table)
   152.6       */
   152.7 -    unsigned long *extent_start;
   152.8 +    GUEST_HANDLE(xen_ulong) extent_start;
   152.9  
  152.10      /* Number of extents, and size/alignment of each (2^extent_order pages). */
  152.11      unsigned long  nr_extents;
  152.12 @@ -50,6 +50,7 @@ typedef struct xen_memory_reservation {
  152.13      domid_t        domid;
  152.14  
  152.15  } xen_memory_reservation_t;
  152.16 +DEFINE_GUEST_HANDLE(xen_memory_reservation_t);
  152.17  
  152.18  /*
  152.19   * Returns the maximum machine frame number of mapped RAM in this system.
  152.20 @@ -85,7 +86,7 @@ typedef struct xen_machphys_mfn_list {
  152.21       * any large discontiguities in the machine address space, 2MB gaps in
  152.22       * the machphys table will be represented by an MFN base of zero.
  152.23       */
  152.24 -    unsigned long *extent_start;
  152.25 +    GUEST_HANDLE(xen_ulong) extent_start;
  152.26  
  152.27      /*
  152.28       * Number of extents written to the above array. This will be smaller
  152.29 @@ -93,6 +94,7 @@ typedef struct xen_machphys_mfn_list {
  152.30       */
  152.31      unsigned int nr_extents;
  152.32  } xen_machphys_mfn_list_t;
  152.33 +DEFINE_GUEST_HANDLE(xen_machphys_mfn_list_t);
  152.34  
  152.35  /*
  152.36   * Returns the base and size of the specified reserved 'RAM hole' in the
  152.37 @@ -113,6 +115,7 @@ typedef struct xen_reserved_phys_area {
  152.38      /* Base and size of the specified reserved area. */
  152.39      unsigned long first_gpfn, nr_gpfns;
  152.40  } xen_reserved_phys_area_t;
  152.41 +DEFINE_GUEST_HANDLE(xen_reserved_phys_area_t);
  152.42  
  152.43  /*
  152.44   * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error
  152.45 @@ -127,14 +130,15 @@ typedef struct xen_translate_gpfn_list {
  152.46      unsigned long nr_gpfns;
  152.47  
  152.48      /* List of GPFNs to translate. */
  152.49 -    unsigned long *gpfn_list;
  152.50 +    GUEST_HANDLE(xen_ulong) gpfn_list;
  152.51  
  152.52      /*
  152.53       * Output list to contain MFN translations. May be the same as the input
  152.54       * list (in which case each input GPFN is overwritten with the output MFN).
  152.55       */
  152.56 -    unsigned long *mfn_list;
  152.57 +    GUEST_HANDLE(xen_ulong) mfn_list;
  152.58  } xen_translate_gpfn_list_t;
  152.59 +DEFINE_GUEST_HANDLE(xen_translate_gpfn_list_t);
  152.60  
  152.61  #endif /* __XEN_PUBLIC_MEMORY_H__ */
  152.62  
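With extent_start and friends now typed as GUEST_HANDLE(...) rather than bare pointers, callers initialise them through the handle macros instead of plain assignment. A sketch, assuming the set_xen_guest_handle() helper from the public headers:

    /* Sketch: build a reservation using a guest handle. */
    unsigned long frame_list[16];
    struct xen_memory_reservation reservation = {
        .nr_extents   = 16,
        .extent_order = 0,
        .domid        = DOMID_SELF,
    };

    set_xen_guest_handle(reservation.extent_start, frame_list);
    (void)HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation);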
   153.1 --- a/xen/include/public/vcpu.h	Wed Mar 01 10:01:54 2006 -0700
   153.2 +++ b/xen/include/public/vcpu.h	Wed Mar 01 12:47:25 2006 -0700
   153.3 @@ -51,6 +51,61 @@
   153.4  /* Returns 1 if the given VCPU is up. */
   153.5  #define VCPUOP_is_up                3
   153.6  
   153.7 +/*
   153.8 + * Return information about the state and running time of a VCPU.
   153.9 + * @extra_arg == pointer to vcpu_runstate_info structure.
  153.10 + */
  153.11 +#define VCPUOP_get_runstate_info    4
  153.12 +typedef struct vcpu_runstate_info {
  153.13 +    /* VCPU's current state (RUNSTATE_*). */
  153.14 +    int      state;
  153.15 +    /* When was current state entered (system time, ns)? */
  153.16 +    uint64_t state_entry_time;
  153.17 +    /*
  153.18 +     * Time spent in each RUNSTATE_* (ns). The sum of these times is
  153.19 +     * guaranteed not to drift from system time.
  153.20 +     */
  153.21 +    uint64_t time[4];
  153.22 +} vcpu_runstate_info_t;
  153.23 +
  153.24 +/* VCPU is currently running on a physical CPU. */
  153.25 +#define RUNSTATE_running  0
  153.26 +
  153.27 +/* VCPU is runnable, but not currently scheduled on any physical CPU. */
  153.28 +#define RUNSTATE_runnable 1
  153.29 +
  153.30 +/* VCPU is blocked (a.k.a. idle). It is therefore not runnable. */
  153.31 +#define RUNSTATE_blocked  2
  153.32 +
  153.33 +/*
  153.34 + * VCPU is not runnable, but it is not blocked.
  153.35 + * This is a 'catch all' state for things like hotplug and pauses by the
  153.36 + * system administrator (or for critical sections in the hypervisor).
  153.37 + * RUNSTATE_blocked dominates this state (it is the preferred state).
  153.38 + */
  153.39 +#define RUNSTATE_offline  3
  153.40 +
  153.41 +/*
  153.42 + * Register a shared memory area from which the guest may obtain its own
  153.43 + * runstate information without needing to execute a hypercall.
  153.44 + * Notes:
  153.45 + *  1. The registered address may be virtual or physical, depending on the
  153.46 + *     platform. The virtual address should be registered on x86 systems.
  153.47 + *  2. Only one shared area may be registered per VCPU. The shared area is
  153.48 + *     updated by the hypervisor each time the VCPU is scheduled. Thus
  153.49 + *     runstate.state will always be RUNSTATE_running and
  153.50 + *     runstate.state_entry_time will indicate the system time at which the
  153.51 + *     VCPU was last scheduled to run.
  153.52 + * @extra_arg == pointer to vcpu_register_runstate_memory_area structure.
  153.53 + */
  153.54 +#define VCPUOP_register_runstate_memory_area 5
  153.55 +typedef struct vcpu_register_runstate_memory_area {
  153.56 +    union {
  153.57 +        struct vcpu_runstate_info *v;
  153.58 +        uint64_t p;
  153.59 +    } addr;
  153.60 +} vcpu_register_runstate_memory_area_t;
  153.61 +
  153.62  #endif /* __XEN_PUBLIC_VCPU_H__ */
  153.63  
  153.64  /*
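A sketch of a guest registering its runstate area, assuming the usual HYPERVISOR_vcpu_op(cmd, vcpuid, extra_arg) wrapper; per note 1 above, x86 guests register a virtual address.

    /* Sketch: register a runstate area for VCPU 0. */
    static struct vcpu_runstate_info runstate;   /* per-VCPU in real code */
    struct vcpu_register_runstate_memory_area area;

    area.addr.v = &runstate;
    if ( HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, 0, &area) )
        /* fall back to polling VCPUOP_get_runstate_info */ ;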
   154.1 --- a/xen/include/public/version.h	Wed Mar 01 10:01:54 2006 -0700
   154.2 +++ b/xen/include/public/version.h	Wed Mar 01 12:47:25 2006 -0700
   154.3 @@ -48,