ia64/xen-unstable

changeset 3729:f5f2757b3aa2

bitkeeper revision 1.1159.1.545 (4208ec60-ql2CB2KKyZRC_8udlW9kA)

Merge tempest.cl.cam.ac.uk:/auto/groups/xeno-xenod/BK/xeno.bk
into tempest.cl.cam.ac.uk:/local/scratch/smh22/xen-unstable.bk
author smh22@tempest.cl.cam.ac.uk
date Tue Feb 08 16:44:16 2005 +0000 (2005-02-08)
parents 88957a238191 89e86842952a
children 9168fa9e70e0 d21fbb46b9d8
files .rootkeys linux-2.4.29-xen-sparse/mm/memory.c linux-2.6.10-xen-sparse/arch/xen/i386/mm/fault.c linux-2.6.10-xen-sparse/drivers/xen/blkback/blkback.c linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap.c linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap.h linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap_datapath.c linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap_userdev.c linux-2.6.10-xen-sparse/drivers/xen/netback/netback.c linux-2.6.10-xen-sparse/drivers/xen/netfront/netfront.c linux-2.6.10-xen-sparse/drivers/xen/usbback/usbback.c linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable.h linux-2.6.10-xen-sparse/include/asm-xen/hypervisor.h netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c tools/examples/bochsrc tools/examples/vif-nat tools/ioemu/include/config.h tools/ioemu/include/pc_system.h tools/ioemu/iodev/cpu.cc tools/ioemu/iodev/pc_system.cc tools/libxc/xc_linux_build.c tools/libxc/xc_vmx_build.c tools/python/xen/xend/XendDomainInfo.py tools/python/xen/xend/server/SrvDaemon.py xen/arch/x86/boot/mkelf32.c xen/arch/x86/boot/x86_32.S xen/arch/x86/boot/x86_64.S xen/arch/x86/dom0_ops.c xen/arch/x86/domain.c xen/arch/x86/memory.c xen/arch/x86/mm.c xen/arch/x86/setup.c xen/arch/x86/shadow.c xen/arch/x86/smpboot.c xen/arch/x86/traps.c xen/arch/x86/vmx.c xen/arch/x86/vmx_io.c xen/arch/x86/vmx_platform.c xen/arch/x86/vmx_vmcs.c xen/arch/x86/x86_32/domain_build.c xen/arch/x86/x86_32/entry.S xen/arch/x86/x86_32/mm.c xen/arch/x86/x86_32/traps.c xen/arch/x86/x86_64/domain_build.c xen/arch/x86/x86_64/entry.S xen/arch/x86/x86_64/mm.c xen/arch/x86/x86_64/traps.c xen/common/dom_mem_ops.c xen/common/domain.c xen/common/elf.c xen/common/keyhandler.c xen/common/multicall.c xen/common/physdev.c xen/common/resource.c xen/common/sched_bvt.c xen/drivers/pci/Makefile xen/drivers/pci/compat.c xen/include/asm-x86/config.h xen/include/asm-x86/domain.h xen/include/asm-x86/mm.h xen/include/asm-x86/multicall.h xen/include/asm-x86/page.h xen/include/asm-x86/shadow.h xen/include/asm-x86/x86_32/page.h xen/include/asm-x86/x86_32/regs.h xen/include/asm-x86/x86_32/uaccess.h xen/include/asm-x86/x86_64/page.h xen/include/asm-x86/x86_64/regs.h xen/include/asm-x86/x86_64/uaccess.h xen/include/public/arch-x86_64.h xen/include/public/xen.h xen/include/xen/ioport.h xen/include/xen/sched.h
line diff
     1.1 --- a/.rootkeys	Mon Feb 07 08:19:24 2005 +0000
     1.2 +++ b/.rootkeys	Tue Feb 08 16:44:16 2005 +0000
     1.3 @@ -867,8 +867,8 @@ 3ddb79bcCAq6IpdkHueChoVTfXqEQQ xen/arch/
     1.4  3ddb79bcBit4xJXbwtX0kb1hh2uO1Q xen/arch/x86/idle0_task.c
     1.5  3ddb79bcKIkRR0kqWaJhe5VUDkMdxg xen/arch/x86/io_apic.c
     1.6  3ddb79bdqfIcjkz_h9Hvtp8Tk_19Zw xen/arch/x86/irq.c
     1.7 -40ec29ffuOa1ZvmJHzFKyZn4k_RcXg xen/arch/x86/memory.c
     1.8  41d54a76qfpO0VnbL2tYs0Jgt3W3XA xen/arch/x86/microcode.c
     1.9 +40ec29ffuOa1ZvmJHzFKyZn4k_RcXg xen/arch/x86/mm.c
    1.10  3ddb79bdS4UeWWXDH-FaBKqcpMFcnw xen/arch/x86/mpparse.c
    1.11  41aaf566Z4sTDgJ77eEg0TzzQ1ka6Q xen/arch/x86/mtrr/amd.c
    1.12  41aaf566TOpOBXT00wwQGUh20f1rlA xen/arch/x86/mtrr/centaur.c
    1.13 @@ -1038,6 +1038,7 @@ 41c0c412lQ0NVVN9PsOSznQ-qhOiPA xen/inclu
    1.14  418fbcfe_WliJPToeVM-9VStvym-hw xen/include/asm-x86/x86_32/asm_defns.h
    1.15  3ddb79c2ADvRmdexd9y3AYK9_NTx-Q xen/include/asm-x86/x86_32/current.h
    1.16  3e20b82fl1jmQiKdLy7fxMcutfpjWA xen/include/asm-x86/x86_32/domain_page.h
    1.17 +4208e2a3ZNFroNXbX9OYaOB-xtUyDQ xen/include/asm-x86/x86_32/page.h
    1.18  3ddb79c3mbqEM7QQr3zVq7NiBNhouA xen/include/asm-x86/x86_32/regs.h
    1.19  3e7f358aG11EvMI9VJ4_9hD4LUO7rQ xen/include/asm-x86/x86_32/string.h
    1.20  3ddb79c3M2n1ROZH6xk3HbyN4CPDqg xen/include/asm-x86/x86_32/uaccess.h
    1.21 @@ -1045,6 +1046,7 @@ 41bf1717bML6GxpclTWJabiaO5W5vg xen/inclu
    1.22  404f1b9ceJeGVaPNIENm2FkK0AgEOQ xen/include/asm-x86/x86_64/current.h
    1.23  41febc4b1aCGLsm0Y0b_82h7lFtrEA xen/include/asm-x86/x86_64/domain_page.h
    1.24  404f1badfXZJZ2sU8sh9PS2EZvd19Q xen/include/asm-x86/x86_64/ldt.h
    1.25 +4208e2a3Fktw4ZttKdDxbhvTQ6brfQ xen/include/asm-x86/x86_64/page.h
    1.26  404f1bb86rAXB3aLS1vYdcqpJiEcyg xen/include/asm-x86/x86_64/regs.h
    1.27  40e1966azOJZfNI6Ilthe6Q-T3Hewg xen/include/asm-x86/x86_64/string.h
    1.28  404f1bc4tWkB9Qr8RkKtZGW5eMQzhw xen/include/asm-x86/x86_64/uaccess.h
     2.1 --- a/linux-2.4.29-xen-sparse/mm/memory.c	Mon Feb 07 08:19:24 2005 +0000
     2.2 +++ b/linux-2.4.29-xen-sparse/mm/memory.c	Tue Feb 08 16:44:16 2005 +0000
     2.3 @@ -915,7 +915,7 @@ static inline void establish_pte(struct 
     2.4  #ifdef CONFIG_XEN
     2.5  	if ( likely(vma->vm_mm == current->mm) ) {
     2.6  		XEN_flush_page_update_queue();
     2.7 -		HYPERVISOR_update_va_mapping(address>>PAGE_SHIFT, entry, UVMF_INVLPG);
     2.8 +		HYPERVISOR_update_va_mapping(address, entry, UVMF_INVLPG);
     2.9  	} else {
    2.10  		set_pte(page_table, entry);
    2.11  		flush_tlb_page(vma, address);
    2.12 @@ -1191,7 +1191,7 @@ static int do_swap_page(struct mm_struct
    2.13  #ifdef CONFIG_XEN
    2.14  	if ( likely(vma->vm_mm == current->mm) ) {
    2.15  		XEN_flush_page_update_queue();
    2.16 -		HYPERVISOR_update_va_mapping(address>>PAGE_SHIFT, pte, 0);
    2.17 +		HYPERVISOR_update_va_mapping(address, pte, 0);
    2.18  	} else {
    2.19  		set_pte(page_table, pte);
    2.20  		XEN_flush_page_update_queue();
    2.21 @@ -1247,7 +1247,7 @@ static int do_anonymous_page(struct mm_s
    2.22  #ifdef CONFIG_XEN
    2.23  	if ( likely(vma->vm_mm == current->mm) ) {
    2.24  		XEN_flush_page_update_queue();
    2.25 -		HYPERVISOR_update_va_mapping(addr>>PAGE_SHIFT, entry, 0);
    2.26 +		HYPERVISOR_update_va_mapping(addr, entry, 0);
    2.27  	} else {
    2.28  		set_pte(page_table, entry);
    2.29  		XEN_flush_page_update_queue();
    2.30 @@ -1333,7 +1333,7 @@ static int do_no_page(struct mm_struct *
    2.31  #ifdef CONFIG_XEN
    2.32  		if ( likely(vma->vm_mm == current->mm) ) {
    2.33  			XEN_flush_page_update_queue();
    2.34 -			HYPERVISOR_update_va_mapping(address>>PAGE_SHIFT, entry, 0);
    2.35 +			HYPERVISOR_update_va_mapping(address, entry, 0);
    2.36  		} else {
    2.37  			set_pte(page_table, entry);
    2.38  			XEN_flush_page_update_queue();
     3.1 --- a/linux-2.6.10-xen-sparse/arch/xen/i386/mm/fault.c	Mon Feb 07 08:19:24 2005 +0000
     3.2 +++ b/linux-2.6.10-xen-sparse/arch/xen/i386/mm/fault.c	Tue Feb 08 16:44:16 2005 +0000
     3.3 @@ -229,7 +229,9 @@ fastcall void do_page_fault(struct pt_re
     3.4  	/* Set the "privileged fault" bit to something sane. */
     3.5  	error_code &= 3;
     3.6  	error_code |= (regs->xcs & 2) << 1;
     3.7 -
     3.8 +	if (regs->eflags & X86_EFLAGS_VM)
     3.9 +		error_code |= 4;
    3.10 +		
    3.11   	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
    3.12   					SIGSEGV) == NOTIFY_STOP)
    3.13   		return;
     4.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/blkback/blkback.c	Mon Feb 07 08:19:24 2005 +0000
     4.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/blkback/blkback.c	Tue Feb 08 16:44:16 2005 +0000
     4.3 @@ -95,7 +95,7 @@ static void fast_flush_area(int idx, int
     4.4      for ( i = 0; i < nr_pages; i++ )
     4.5      {
     4.6          mcl[i].op = __HYPERVISOR_update_va_mapping;
     4.7 -        mcl[i].args[0] = MMAP_VADDR(idx, i) >> PAGE_SHIFT;
     4.8 +        mcl[i].args[0] = MMAP_VADDR(idx, i);
     4.9          mcl[i].args[1] = 0;
    4.10          mcl[i].args[2] = 0;
    4.11      }
    4.12 @@ -343,14 +343,14 @@ static void dispatch_probe(blkif_t *blki
    4.13  
    4.14  #ifdef CONFIG_XEN_BLKDEV_TAP_BE
    4.15      if ( HYPERVISOR_update_va_mapping_otherdomain(
    4.16 -        MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT,
    4.17 +        MMAP_VADDR(pending_idx, 0),
    4.18          (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
    4.19          0, (blkif->is_blktap ? ID_TO_DOM(req->id) : blkif->domid) ) )
    4.20          
    4.21          goto out;
    4.22  #else
    4.23      if ( HYPERVISOR_update_va_mapping_otherdomain(
    4.24 -        MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT,
    4.25 +        MMAP_VADDR(pending_idx, 0),
    4.26          (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
    4.27          0, blkif->domid) ) 
    4.28          
    4.29 @@ -436,7 +436,7 @@ static void dispatch_rw_block_io(blkif_t
    4.30      for ( i = 0; i < nr_psegs; i++ )
    4.31      {
    4.32          mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain;
    4.33 -        mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT;
    4.34 +        mcl[i].args[0] = MMAP_VADDR(pending_idx, i);
    4.35          mcl[i].args[1] = (phys_seg[i].buffer & PAGE_MASK) | remap_prot;
    4.36          mcl[i].args[2] = 0;
    4.37  #ifdef CONFIG_XEN_BLKDEV_TAP_BE
     5.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap.c	Mon Feb 07 08:19:24 2005 +0000
     5.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap.c	Tue Feb 08 16:44:16 2005 +0000
     5.3 @@ -16,7 +16,7 @@
     5.4  
     5.5  #include "blktap.h"
     5.6  
     5.7 -int __init xlblk_init(void)
     5.8 +int __init xlblktap_init(void)
     5.9  {
    5.10      ctrl_msg_t               cmsg;
    5.11      blkif_fe_driver_status_t fe_st;
    5.12 @@ -64,6 +64,7 @@ int __init xlblk_init(void)
    5.13      return 0;
    5.14  }
    5.15  
    5.16 +#if 0 /* tap doesn't handle suspend/resume */
    5.17  void blkdev_suspend(void)
    5.18  {
    5.19  }
    5.20 @@ -81,6 +82,6 @@ void blkdev_resume(void)
    5.21      memcpy(cmsg.msg, &st, sizeof(st));
    5.22      ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
    5.23  }
    5.24 +#endif
    5.25  
    5.26 -
    5.27 -__initcall(xlblk_init);
    5.28 +__initcall(xlblktap_init);
     6.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap.h	Mon Feb 07 08:19:24 2005 +0000
     6.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap.h	Tue Feb 08 16:44:16 2005 +0000
     6.3 @@ -48,6 +48,12 @@
     6.4  #define WPRINTK(fmt, args...) printk(KERN_WARNING "blk_tap: " fmt, ##args)
     6.5  
     6.6  
     6.7 +/* -------[ state descriptors ]--------------------------------------- */
     6.8 +
     6.9 +#define BLKIF_STATE_CLOSED       0
    6.10 +#define BLKIF_STATE_DISCONNECTED 1
    6.11 +#define BLKIF_STATE_CONNECTED    2
    6.12 +
    6.13  /* -------[ connection tracking ]------------------------------------- */
    6.14  
    6.15  #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
    6.16 @@ -99,7 +105,6 @@ typedef struct {
    6.17      unsigned long  mach_fas[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    6.18      unsigned long  virt_fas[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    6.19      int            next_free;
    6.20 -    int inuse; /* debugging */
    6.21  } active_req_t;
    6.22  
    6.23  typedef unsigned int ACTIVE_RING_IDX;
    6.24 @@ -181,7 +186,7 @@ extern unsigned long mmap_vstart;
    6.25   * for shared memory rings.
    6.26   */
    6.27  
    6.28 -#define RING_PAGES 128 
    6.29 +#define RING_PAGES 3 /* Ctrl, Front, and Back */ 
    6.30  extern unsigned long rings_vstart;
    6.31  
    6.32  
    6.33 @@ -190,11 +195,10 @@ extern unsigned long blktap_mode;
    6.34  
    6.35  /* Connection to a single backend domain. */
    6.36  extern blkif_front_ring_t blktap_be_ring;
    6.37 +extern unsigned int blktap_be_evtchn;
    6.38 +extern unsigned int blktap_be_state;
    6.39  
    6.40 -/* Event channel to backend domain. */
    6.41 -extern unsigned int blkif_ptbe_evtchn;
    6.42 -
    6.43 -/* User ring status... this will soon vanish into a ring struct. */
    6.44 +/* User ring status. */
    6.45  extern unsigned long blktap_ring_ok;
    6.46  
    6.47  /* -------[ ...and function prototypes. ]----------------------------- */
    6.48 @@ -213,8 +217,7 @@ void blktap_kick_user(void);
    6.49  /* user ring access functions: */
    6.50  int blktap_write_fe_ring(blkif_request_t *req);
    6.51  int blktap_write_be_ring(blkif_response_t *rsp);
    6.52 -int blktap_read_fe_ring(void);
    6.53 -int blktap_read_be_ring(void);
    6.54 +int blktap_write_ctrl_ring(ctrl_msg_t *msg);
    6.55  
    6.56  /* fe/be ring access functions: */
    6.57  int write_resp_to_fe_ring(blkif_t *blkif, blkif_response_t *rsp);
     7.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c	Mon Feb 07 08:19:24 2005 +0000
     7.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c	Tue Feb 08 16:44:16 2005 +0000
     7.3 @@ -10,10 +10,6 @@
     7.4   
     7.5  #include "blktap.h"
     7.6  
     7.7 -#define BLKIF_STATE_CLOSED       0
     7.8 -#define BLKIF_STATE_DISCONNECTED 1
     7.9 -#define BLKIF_STATE_CONNECTED    2
    7.10 -
    7.11  static char *blkif_state_name[] = {
    7.12      [BLKIF_STATE_CLOSED]       = "closed",
    7.13      [BLKIF_STATE_DISCONNECTED] = "disconnected",
    7.14 @@ -26,9 +22,10 @@ static char * blkif_status_name[] = {
    7.15      [BLKIF_INTERFACE_STATUS_CONNECTED]    = "connected",
    7.16      [BLKIF_INTERFACE_STATUS_CHANGED]      = "changed",
    7.17  };
    7.18 -static unsigned int blkif_pt_state = BLKIF_STATE_CLOSED;
    7.19 -static unsigned blkif_ptbe_irq;
    7.20 -unsigned int blkif_ptbe_evtchn;
    7.21 +
    7.22 +static unsigned blktap_be_irq;
    7.23 +unsigned int    blktap_be_state = BLKIF_STATE_CLOSED;
    7.24 +unsigned int    blktap_be_evtchn;
    7.25  
    7.26  /*-----[ Control Messages to/from Frontend VMs ]--------------------------*/
    7.27  
    7.28 @@ -306,7 +303,7 @@ static void blkif_ptbe_disconnect(void)
    7.29      sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL);
    7.30      SHARED_RING_INIT(BLKIF_RING, sring);
    7.31      FRONT_RING_INIT(BLKIF_RING, &blktap_be_ring, sring);
    7.32 -    blkif_pt_state  = BLKIF_STATE_DISCONNECTED;
    7.33 +    blktap_be_state  = BLKIF_STATE_DISCONNECTED;
    7.34      DPRINTK("Blkif-Passthrough-BE is now DISCONNECTED.\n");
    7.35      blkif_ptbe_send_interface_connect();
    7.36  }
    7.37 @@ -315,10 +312,10 @@ static void blkif_ptbe_connect(blkif_fe_
    7.38  {
    7.39      int err = 0;
    7.40      
    7.41 -    blkif_ptbe_evtchn = status->evtchn;
    7.42 -    blkif_ptbe_irq    = bind_evtchn_to_irq(blkif_ptbe_evtchn);
    7.43 +    blktap_be_evtchn = status->evtchn;
    7.44 +    blktap_be_irq    = bind_evtchn_to_irq(blktap_be_evtchn);
    7.45  
    7.46 -    err = request_irq(blkif_ptbe_irq, blkif_ptbe_int, 
    7.47 +    err = request_irq(blktap_be_irq, blkif_ptbe_int, 
    7.48                        SA_SAMPLE_RANDOM, "blkif", NULL);
    7.49      if ( err ) {
    7.50  	WPRINTK("blkfront request_irq failed (%d)\n", err);
    7.51 @@ -326,7 +323,7 @@ static void blkif_ptbe_connect(blkif_fe_
    7.52      } else {
    7.53  	/* transtion to connected in case we need to do a 
    7.54             a partion probe on a whole disk */
    7.55 -        blkif_pt_state = BLKIF_STATE_CONNECTED;
    7.56 +        blktap_be_state = BLKIF_STATE_CONNECTED;
    7.57      }
    7.58  }
    7.59  
    7.60 @@ -334,7 +331,7 @@ static void unexpected(blkif_fe_interfac
    7.61  {
    7.62      WPRINTK(" TAP: Unexpected blkif status %s in state %s\n", 
    7.63             blkif_status_name[status->status],
    7.64 -           blkif_state_name[blkif_pt_state]);
    7.65 +           blkif_state_name[blktap_be_state]);
    7.66  }
    7.67  
    7.68  static void blkif_ptbe_status(
    7.69 @@ -352,7 +349,7 @@ static void blkif_ptbe_status(
    7.70      switch ( status->status )
    7.71      {
    7.72      case BLKIF_INTERFACE_STATUS_CLOSED:
    7.73 -        switch ( blkif_pt_state )
    7.74 +        switch ( blktap_be_state )
    7.75          {
    7.76          case BLKIF_STATE_CLOSED:
    7.77              unexpected(status);
    7.78 @@ -366,7 +363,7 @@ static void blkif_ptbe_status(
    7.79          break;
    7.80          
    7.81      case BLKIF_INTERFACE_STATUS_DISCONNECTED:
    7.82 -        switch ( blkif_pt_state )
    7.83 +        switch ( blktap_be_state )
    7.84          {
    7.85          case BLKIF_STATE_CLOSED:
    7.86              blkif_ptbe_disconnect();
    7.87 @@ -380,7 +377,7 @@ static void blkif_ptbe_status(
    7.88          break;
    7.89          
    7.90      case BLKIF_INTERFACE_STATUS_CONNECTED:
    7.91 -        switch ( blkif_pt_state )
    7.92 +        switch ( blktap_be_state )
    7.93          {
    7.94          case BLKIF_STATE_CLOSED:
    7.95              unexpected(status);
    7.96 @@ -398,7 +395,7 @@ static void blkif_ptbe_status(
    7.97          break;
    7.98  
    7.99     case BLKIF_INTERFACE_STATUS_CHANGED:
   7.100 -        switch ( blkif_pt_state )
   7.101 +        switch ( blktap_be_state )
   7.102          {
   7.103          case BLKIF_STATE_CLOSED:
   7.104          case BLKIF_STATE_DISCONNECTED:
   7.105 @@ -440,6 +437,14 @@ void blkif_ctrlif_rx(ctrl_msg_t *msg, un
   7.106  
   7.107      case CMSG_BLKIF_BE:
   7.108          
   7.109 +        /* send a copy of the message to user if wanted */
   7.110 +        
   7.111 +        if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
   7.112 +             (blktap_mode & BLKTAP_MODE_COPY_FE) ) {
   7.113 +            
   7.114 +            blktap_write_ctrl_ring(msg);
   7.115 +        }
   7.116 +        
   7.117          switch ( msg->subtype )
   7.118          {
   7.119          case CMSG_BLKIF_BE_CREATE:
   7.120 @@ -500,11 +505,13 @@ void blkif_ctrlif_rx(ctrl_msg_t *msg, un
   7.121      ctrl_if_send_response(msg);
   7.122  }
   7.123  
   7.124 -/*-----[ All control messages enter here: ]-------------------------------*/
   7.125 +/*-----[ Initialization ]-------------------------------------------------*/
   7.126  
   7.127  void __init blkif_interface_init(void)
   7.128  {
   7.129      blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), 
   7.130                                       0, 0, NULL, NULL);
   7.131      memset(blkif_hash, 0, sizeof(blkif_hash));
   7.132 +    
   7.133 +    blktap_be_ring.sring = NULL;
   7.134  }
     8.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap_datapath.c	Mon Feb 07 08:19:24 2005 +0000
     8.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap_datapath.c	Tue Feb 08 16:44:16 2005 +0000
     8.3 @@ -40,8 +40,6 @@ inline active_req_t *get_active_req(void
     8.4      spin_lock_irqsave(&active_req_lock, flags);
     8.5      idx =  active_req_ring[MASK_ACTIVE_IDX(active_cons++)];
     8.6      ar = &active_reqs[idx];
     8.7 -if (ar->inuse) WPRINTK("AR INUSE! (%lu)\n", ar->id);
     8.8 -ar->inuse = 1;
     8.9      spin_unlock_irqrestore(&active_req_lock, flags);
    8.10      
    8.11      return ar;
    8.12 @@ -52,7 +50,6 @@ inline void free_active_req(active_req_t
    8.13      unsigned long flags;
    8.14          
    8.15      spin_lock_irqsave(&active_req_lock, flags);
    8.16 -ar->inuse = 0;
    8.17      active_req_ring[MASK_ACTIVE_IDX(active_prod++)] = ACTIVE_IDX(ar);
    8.18      spin_unlock_irqrestore(&active_req_lock, flags);
    8.19  }
    8.20 @@ -97,11 +94,8 @@ inline int write_resp_to_fe_ring(blkif_t
    8.21      blkif_response_t *resp_d;
    8.22      active_req_t *ar;
    8.23      
    8.24 -    /* remap id, and free the active req. blkif lookup goes here too.*/
    8.25      ar = &active_reqs[ID_TO_IDX(rsp->id)];
    8.26 -    /* WPRINTK("%3u > %3lu\n", ID_TO_IDX(rsp->id), ar->id); */
    8.27      rsp->id = ar->id;
    8.28 -    free_active_req(ar);
    8.29              
    8.30      resp_d = RING_GET_RESPONSE(BLKIF_RING, &blkif->blk_ring,
    8.31              blkif->blk_ring.rsp_prod_pvt);
    8.32 @@ -109,6 +103,9 @@ inline int write_resp_to_fe_ring(blkif_t
    8.33      wmb();
    8.34      blkif->blk_ring.rsp_prod_pvt++;
    8.35              
    8.36 +    blkif_put(ar->blkif);
    8.37 +    free_active_req(ar);
    8.38 +    
    8.39      return 0;
    8.40  }
    8.41  
    8.42 @@ -116,6 +113,11 @@ inline int write_req_to_be_ring(blkif_re
    8.43  {
    8.44      blkif_request_t *req_d;
    8.45  
    8.46 +    if ( blktap_be_state != BLKIF_STATE_CONNECTED ) {
    8.47 +        WPRINTK("Tap trying to access an unconnected backend!\n");
    8.48 +        return 0;
    8.49 +    }
    8.50 +    
    8.51      req_d = RING_GET_REQUEST(BLKIF_RING, &blktap_be_ring,
    8.52              blktap_be_ring.req_prod_pvt);
    8.53      memcpy(req_d, req, sizeof(blkif_request_t));
    8.54 @@ -135,9 +137,12 @@ inline void kick_fe_domain(blkif_t *blki
    8.55  
    8.56  inline void kick_be_domain(void)
    8.57  {
    8.58 +    if ( blktap_be_state != BLKIF_STATE_CONNECTED ) 
    8.59 +        return;
    8.60 +    
    8.61      wmb(); /* Ensure that the frontend can see the requests. */
    8.62      RING_PUSH_REQUESTS(BLKIF_RING, &blktap_be_ring);
    8.63 -    notify_via_evtchn(blkif_ptbe_evtchn);
    8.64 +    notify_via_evtchn(blktap_be_evtchn);
    8.65      DPRINTK("notified BE\n");
    8.66  }
    8.67  
    8.68 @@ -310,6 +315,7 @@ static int do_block_io_op(blkif_t *blkif
    8.69           */
    8.70          ar = get_active_req();
    8.71          ar->id = req_s->id;
    8.72 +        blkif_get(blkif);
    8.73          ar->blkif = blkif;
    8.74          req_s->id = MAKE_ID(blkif->domid, ACTIVE_IDX(ar));
    8.75          /* WPRINTK("%3u < %3lu\n", ID_TO_IDX(req_s->id), ar->id); */
    8.76 @@ -458,11 +464,13 @@ void print_vm_ring_idxs(void)
    8.77                  blkif->blk_ring.sring->req_prod,
    8.78                  blkif->blk_ring.sring->rsp_prod);
    8.79      }
    8.80 -    WPRINTK("BE Ring: \n--------\n");
    8.81 -    WPRINTK("BE: rsp_cons: %2d, req_prod_prv: %2d "
    8.82 -        "| req_prod: %2d, rsp_prod: %2d\n",
    8.83 -        blktap_be_ring.rsp_cons,
    8.84 -        blktap_be_ring.req_prod_pvt,
    8.85 -        blktap_be_ring.sring->req_prod,
    8.86 -        blktap_be_ring.sring->rsp_prod);
    8.87 +    if (blktap_be_ring.sring != NULL) {
    8.88 +        WPRINTK("BE Ring: \n--------\n");
    8.89 +        WPRINTK("BE: rsp_cons: %2d, req_prod_prv: %2d "
    8.90 +            "| req_prod: %2d, rsp_prod: %2d\n",
    8.91 +            blktap_be_ring.rsp_cons,
    8.92 +            blktap_be_ring.req_prod_pvt,
    8.93 +            blktap_be_ring.sring->req_prod,
    8.94 +            blktap_be_ring.sring->rsp_prod);
    8.95 +    }
    8.96  }        
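
The blktap_datapath.c hunks above change the lifetime of an active request: the tap now takes a reference on the blkif when a request is queued, and only drops it (and recycles the active_req slot) after the response has been copied back into the frontend ring. A rough consolidated sketch of the two paths, paraphrasing the patched code rather than quoting it:

    /* request path (do_block_io_op): pin the interface for the in-flight request */
    ar = get_active_req();
    ar->id = req_s->id;
    blkif_get(blkif);                 /* reference held while the request is outstanding */
    ar->blkif = blkif;
    req_s->id = MAKE_ID(blkif->domid, ACTIVE_IDX(ar));

    /* response path (write_resp_to_fe_ring): copy the response out first,
       then release the reference and free the active_req slot */
    ar = &active_reqs[ID_TO_IDX(rsp->id)];
    rsp->id = ar->id;
    memcpy(resp_d, rsp, sizeof(blkif_response_t));
    blkif->blk_ring.rsp_prod_pvt++;
    blkif_put(ar->blkif);
    free_active_req(ar);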
     9.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap_userdev.c	Mon Feb 07 08:19:24 2005 +0000
     9.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/blktap/blktap_userdev.c	Tue Feb 08 16:44:16 2005 +0000
     9.3 @@ -19,6 +19,7 @@
     9.4  #include <linux/gfp.h>
     9.5  #include <linux/poll.h>
     9.6  #include <asm/pgalloc.h>
     9.7 +#include <asm-xen/xen-public/io/blkif.h> /* for control ring. */
     9.8  
     9.9  #include "blktap.h"
    9.10  
    9.11 @@ -40,6 +41,11 @@ unsigned long rings_vstart;
    9.12  /* Rings up to user space. */
    9.13  static blkif_front_ring_t blktap_ufe_ring;
    9.14  static blkif_back_ring_t  blktap_ube_ring;
    9.15 +static ctrl_front_ring_t  blktap_uctrl_ring;
    9.16 +
    9.17 +/* local prototypes */
    9.18 +static int blktap_read_fe_ring(void);
    9.19 +static int blktap_read_be_ring(void);
    9.20  
    9.21  /* -------[ blktap vm ops ]------------------------------------------- */
    9.22  
    9.23 @@ -66,16 +72,28 @@ struct vm_operations_struct blktap_vm_op
    9.24  static int blktap_open(struct inode *inode, struct file *filp)
    9.25  {
    9.26      blkif_sring_t *sring;
    9.27 +    ctrl_sring_t *csring;
    9.28      
    9.29      if ( test_and_set_bit(0, &blktap_dev_inuse) )
    9.30          return -EBUSY;
    9.31  
    9.32      printk(KERN_ALERT "blktap open.\n");
    9.33 +    
    9.34 +    /* Allocate the ctrl ring. */
    9.35 +    csring = (ctrl_sring_t *)get_zeroed_page(GFP_KERNEL);
    9.36 +    if (csring == NULL)
    9.37 +        goto fail_nomem;
    9.38 +
    9.39 +    SetPageReserved(virt_to_page(csring));
    9.40 +    
    9.41 +    SHARED_RING_INIT(CTRL_RING, csring);
    9.42 +    FRONT_RING_INIT(CTRL_RING, &blktap_uctrl_ring, csring);
    9.43 +
    9.44  
    9.45      /* Allocate the fe ring. */
    9.46      sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
    9.47      if (sring == NULL)
    9.48 -        goto fail_nomem;
    9.49 +        goto fail_free_ctrl;
    9.50  
    9.51      SetPageReserved(virt_to_page(sring));
    9.52      
    9.53 @@ -95,6 +113,9 @@ static int blktap_open(struct inode *ino
    9.54      DPRINTK(KERN_ALERT "blktap open.\n");
    9.55  
    9.56      return 0;
    9.57 +    
    9.58 + fail_free_ctrl:
    9.59 +    free_page( (unsigned long) blktap_uctrl_ring.sring);
    9.60  
    9.61   fail_free_fe:
    9.62      free_page( (unsigned long) blktap_ufe_ring.sring);
    9.63 @@ -111,6 +132,9 @@ static int blktap_release(struct inode *
    9.64      printk(KERN_ALERT "blktap closed.\n");
    9.65  
    9.66      /* Free the ring page. */
    9.67 +    ClearPageReserved(virt_to_page(blktap_uctrl_ring.sring));
    9.68 +    free_page((unsigned long) blktap_uctrl_ring.sring);
    9.69 +
    9.70      ClearPageReserved(virt_to_page(blktap_ufe_ring.sring));
    9.71      free_page((unsigned long) blktap_ufe_ring.sring);
    9.72  
    9.73 @@ -120,6 +144,15 @@ static int blktap_release(struct inode *
    9.74      return 0;
    9.75  }
    9.76  
    9.77 +/* Note on mmap:
    9.78 + * remap_pfn_range sets VM_IO on vma->vm_flags.  In trying to make libaio
    9.79 + * work to do direct page access from userspace, this ended up being a
    9.80 + * problem.  The bigger issue seems to be that there is no way to map
    9.81 + * a foreign page in to user space and have the virtual address of that 
    9.82 + * page map sanely down to a mfn.
    9.83 + * Removing the VM_IO flag results in a loop in get_user_pages, as 
    9.84 + * pfn_valid() always fails on a foreign page.
    9.85 + */
    9.86  static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
    9.87  {
    9.88      int size;
    9.89 @@ -148,20 +181,28 @@ static int blktap_mmap(struct file *filp
    9.90      /* not sure if I really need to do this... */
    9.91      vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
    9.92  
    9.93 +    DPRINTK("Mapping ctrl_ring page %lx.\n", __pa(blktap_uctrl_ring.sring));
    9.94 +    if (remap_pfn_range(vma, vma->vm_start, 
    9.95 +                         __pa(blktap_uctrl_ring.sring) >> PAGE_SHIFT, 
    9.96 +                         PAGE_SIZE, vma->vm_page_prot)) {
    9.97 +        WPRINTK("ctrl_ring: remap_pfn_range failure!\n");
    9.98 +    }
    9.99 +
   9.100 +
   9.101      DPRINTK("Mapping be_ring page %lx.\n", __pa(blktap_ube_ring.sring));
   9.102 -    if (remap_page_range(vma, vma->vm_start, 
   9.103 -                         __pa(blktap_ube_ring.sring), 
   9.104 +    if (remap_pfn_range(vma, vma->vm_start + PAGE_SIZE, 
   9.105 +                         __pa(blktap_ube_ring.sring) >> PAGE_SHIFT, 
   9.106                           PAGE_SIZE, vma->vm_page_prot)) {
   9.107 -        WPRINTK("be_ring: remap_page_range failure!\n");
   9.108 +        WPRINTK("be_ring: remap_pfn_range failure!\n");
   9.109      }
   9.110  
   9.111      DPRINTK("Mapping fe_ring page %lx.\n", __pa(blktap_ufe_ring.sring));
   9.112 -    if (remap_page_range(vma, vma->vm_start + PAGE_SIZE, 
   9.113 -                         __pa(blktap_ufe_ring.sring), 
   9.114 +    if (remap_pfn_range(vma, vma->vm_start + ( 2 * PAGE_SIZE ), 
   9.115 +                         __pa(blktap_ufe_ring.sring) >> PAGE_SHIFT, 
   9.116                           PAGE_SIZE, vma->vm_page_prot)) {
   9.117 -        WPRINTK("fe_ring: remap_page_range failure!\n");
   9.118 +        WPRINTK("fe_ring: remap_pfn_range failure!\n");
   9.119      }
   9.120 -
   9.121 +            
   9.122      blktap_vma = vma;
   9.123      blktap_ring_ok = 1;
   9.124  
   9.125 @@ -211,9 +252,11 @@ static unsigned int blktap_poll(struct f
   9.126  {
   9.127          poll_wait(file, &blktap_wait, wait);
   9.128  
   9.129 -        if ( RING_HAS_UNPUSHED_REQUESTS(BLKIF_RING, &blktap_ufe_ring) ||
   9.130 +        if ( RING_HAS_UNPUSHED_REQUESTS(BLKIF_RING, &blktap_uctrl_ring) ||
   9.131 +             RING_HAS_UNPUSHED_REQUESTS(BLKIF_RING, &blktap_ufe_ring)   ||
   9.132               RING_HAS_UNPUSHED_RESPONSES(BLKIF_RING, &blktap_ube_ring) ) {
   9.133  
   9.134 +            RING_PUSH_REQUESTS(BLKIF_RING, &blktap_uctrl_ring);
   9.135              RING_PUSH_REQUESTS(BLKIF_RING, &blktap_ufe_ring);
   9.136              RING_PUSH_RESPONSES(BLKIF_RING, &blktap_ube_ring);
   9.137              return POLLIN | POLLRDNORM;
   9.138 @@ -260,7 +303,6 @@ int blktap_write_fe_ring(blkif_request_t
   9.139          return 0;
   9.140      }
   9.141  
   9.142 -    //target = RING_NEXT_EMPTY_REQUEST(BLKIF_RING, &blktap_ufe_ring);
   9.143      target = RING_GET_REQUEST(BLKIF_RING, &blktap_ufe_ring,
   9.144              blktap_ufe_ring.req_prod_pvt);
   9.145      memcpy(target, req, sizeof(*req));
   9.146 @@ -270,7 +312,7 @@ int blktap_write_fe_ring(blkif_request_t
   9.147  
   9.148          error = direct_remap_area_pages(blktap_vma->vm_mm, 
   9.149                                          MMAP_VADDR(ID_TO_IDX(req->id), i), 
   9.150 -                                        target->frame_and_sects[0] & PAGE_MASK,
   9.151 +                                        target->frame_and_sects[i] & PAGE_MASK,
   9.152                                          PAGE_SIZE,
   9.153                                          blktap_vma->vm_page_prot,
   9.154                                          ID_TO_DOM(req->id));
   9.155 @@ -302,7 +344,6 @@ int blktap_write_be_ring(blkif_response_
   9.156  
   9.157      /* No test for fullness in the response direction. */
   9.158  
   9.159 -    //target = RING_NEXT_EMPTY_RESPONSE(BLKIF_RING, &blktap_ube_ring);
   9.160      target = RING_GET_RESPONSE(BLKIF_RING, &blktap_ube_ring,
   9.161              blktap_ube_ring.rsp_prod_pvt);
   9.162      memcpy(target, rsp, sizeof(*rsp));
   9.163 @@ -314,7 +355,7 @@ int blktap_write_be_ring(blkif_response_
   9.164      return 0;
   9.165  }
   9.166  
   9.167 -int blktap_read_fe_ring(void)
   9.168 +static int blktap_read_fe_ring(void)
   9.169  {
   9.170      /* This is called to read responses from the UFE ring. */
   9.171  
   9.172 @@ -329,7 +370,6 @@ int blktap_read_fe_ring(void)
   9.173      if (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) {
   9.174  
   9.175          /* for each outstanding message on the UFEring  */
   9.176 -        //RING_FOREACH_RESPONSE(BLKIF_RING, &blktap_ufe_ring, prod, resp_s) {
   9.177          rp = blktap_ufe_ring.sring->rsp_prod;
   9.178          rmb();
   9.179          
   9.180 @@ -349,7 +389,7 @@ int blktap_read_fe_ring(void)
   9.181      return 0;
   9.182  }
   9.183  
   9.184 -int blktap_read_be_ring(void)
   9.185 +static int blktap_read_be_ring(void)
   9.186  {
   9.187      /* This is called to read requests from the UBE ring. */
   9.188  
   9.189 @@ -362,7 +402,6 @@ int blktap_read_be_ring(void)
   9.190      if (blktap_mode & BLKTAP_MODE_INTERCEPT_BE) {
   9.191  
   9.192          /* for each outstanding message on the UFEring  */
   9.193 -        //RING_FOREACH_REQUEST(BLKIF_RING, &blktap_ube_ring, prod, req_s) {
   9.194          rp = blktap_ube_ring.sring->req_prod;
   9.195          rmb();
   9.196          for ( i = blktap_ube_ring.req_cons; i != rp; i++ )
   9.197 @@ -379,6 +418,31 @@ int blktap_read_be_ring(void)
   9.198  
   9.199      return 0;
   9.200  }
   9.201 +
   9.202 +int blktap_write_ctrl_ring(ctrl_msg_t *msg)
   9.203 +{
   9.204 +    ctrl_msg_t *target;
   9.205 +
   9.206 +    if ( ! blktap_ring_ok ) {
   9.207 +        DPRINTK("blktap: be_ring not ready for a request!\n");
   9.208 +        return 0;
   9.209 +    }
   9.210 +
   9.211 +    /* No test for fullness in the response direction. */
   9.212 +
   9.213 +    target = RING_GET_REQUEST(CTRL_RING, &blktap_uctrl_ring,
   9.214 +            blktap_uctrl_ring.req_prod_pvt);
   9.215 +    memcpy(target, msg, sizeof(*msg));
   9.216 +
   9.217 +    blktap_uctrl_ring.req_prod_pvt++;
   9.218 +    
   9.219 +    /* currently treat the ring as unidirectional. */
   9.220 +    blktap_uctrl_ring.rsp_cons = blktap_uctrl_ring.sring->rsp_prod;
   9.221 +    
   9.222 +    return 0;
   9.223 +       
   9.224 +}
   9.225 +
   9.226  /* -------[ blktap module setup ]------------------------------------- */
   9.227  
   9.228  static struct miscdevice blktap_miscdev = {
    10.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/netback/netback.c	Mon Feb 07 08:19:24 2005 +0000
    10.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/netback/netback.c	Tue Feb 08 16:44:16 2005 +0000
    10.3 @@ -234,7 +234,7 @@ static void net_rx_action(unsigned long 
    10.4          mmu[2].val  = MMUEXT_REASSIGN_PAGE;
    10.5  
    10.6          mcl[0].op = __HYPERVISOR_update_va_mapping;
    10.7 -        mcl[0].args[0] = vdata >> PAGE_SHIFT;
    10.8 +        mcl[0].args[0] = vdata;
    10.9          mcl[0].args[1] = (new_mfn << PAGE_SHIFT) | __PAGE_KERNEL;
   10.10          mcl[0].args[2] = 0;
   10.11          mcl[1].op = __HYPERVISOR_mmu_update;
   10.12 @@ -409,7 +409,7 @@ static void net_tx_action(unsigned long 
   10.13      {
   10.14          pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)];
   10.15          mcl[0].op = __HYPERVISOR_update_va_mapping;
   10.16 -        mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT;
   10.17 +        mcl[0].args[0] = MMAP_VADDR(pending_idx);
   10.18          mcl[0].args[1] = 0;
   10.19          mcl[0].args[2] = 0;
   10.20          mcl++;     
   10.21 @@ -546,7 +546,7 @@ static void net_tx_action(unsigned long 
   10.22          skb_reserve(skb, 16);
   10.23  
   10.24          mcl[0].op = __HYPERVISOR_update_va_mapping_otherdomain;
   10.25 -        mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT;
   10.26 +        mcl[0].args[0] = MMAP_VADDR(pending_idx);
   10.27          mcl[0].args[1] = (txreq.addr & PAGE_MASK) | __PAGE_KERNEL;
   10.28          mcl[0].args[2] = 0;
   10.29          mcl[0].args[3] = netif->domid;
    11.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/netfront/netfront.c	Mon Feb 07 08:19:24 2005 +0000
    11.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/netfront/netfront.c	Tue Feb 08 16:44:16 2005 +0000
    11.3 @@ -392,7 +392,7 @@ static void network_alloc_rx_buffers(str
    11.4  	    = INVALID_P2M_ENTRY;
    11.5  
    11.6          rx_mcl[i].op = __HYPERVISOR_update_va_mapping;
    11.7 -        rx_mcl[i].args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
    11.8 +        rx_mcl[i].args[0] = (unsigned long)skb->head;
    11.9          rx_mcl[i].args[1] = 0;
   11.10          rx_mcl[i].args[2] = 0;
   11.11      }
   11.12 @@ -593,7 +593,7 @@ static int netif_poll(struct net_device 
   11.13          mmu->val  = __pa(skb->head) >> PAGE_SHIFT;
   11.14          mmu++;
   11.15          mcl->op = __HYPERVISOR_update_va_mapping;
   11.16 -        mcl->args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
   11.17 +        mcl->args[0] = (unsigned long)skb->head;
   11.18          mcl->args[1] = (rx->addr & PAGE_MASK) | __PAGE_KERNEL;
   11.19          mcl->args[2] = 0;
   11.20          mcl++;
    12.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/usbback/usbback.c	Mon Feb 07 08:19:24 2005 +0000
    12.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/usbback/usbback.c	Tue Feb 08 16:44:16 2005 +0000
    12.3 @@ -191,7 +191,7 @@ static void fast_flush_area(int idx, int
    12.4      for ( i = 0; i < nr_pages; i++ )
    12.5      {
    12.6          mcl[i].op = __HYPERVISOR_update_va_mapping;
    12.7 -        mcl[i].args[0] = MMAP_VADDR(idx, i) >> PAGE_SHIFT;
    12.8 +        mcl[i].args[0] = MMAP_VADDR(idx, i);
    12.9          mcl[i].args[1] = 0;
   12.10          mcl[i].args[2] = 0;
   12.11      }
   12.12 @@ -630,7 +630,7 @@ static void dispatch_usb_io(usbif_priv_t
   12.13            i++, offset += PAGE_SIZE )
   12.14      {
   12.15  	mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain;
   12.16 -	mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT;
   12.17 +	mcl[i].args[0] = MMAP_VADDR(pending_idx, i);
   12.18          mcl[i].args[1] = ((buffer_mach & PAGE_MASK) + offset) | remap_prot;
   12.19          mcl[i].args[2] = 0;
   12.20          mcl[i].args[3] = up->domid;
   12.21 @@ -646,7 +646,7 @@ static void dispatch_usb_io(usbif_priv_t
   12.22      {
   12.23          /* Map in ISO schedule, if necessary. */
   12.24          mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain;
   12.25 -        mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT;
   12.26 +        mcl[i].args[0] = MMAP_VADDR(pending_idx, i);
   12.27          mcl[i].args[1] = (req->iso_schedule & PAGE_MASK) | remap_prot;
   12.28          mcl[i].args[2] = 0;
   12.29          mcl[i].args[3] = up->domid;
    13.1 --- a/linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Mon Feb 07 08:19:24 2005 +0000
    13.2 +++ b/linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Tue Feb 08 16:44:16 2005 +0000
    13.3 @@ -426,7 +426,7 @@ extern pte_t *lookup_address(unsigned lo
    13.4  		if (__dirty) {						  \
    13.5  		        if ( likely((__vma)->vm_mm == current->mm) ) {    \
    13.6  			    xen_flush_page_update_queue();                \
    13.7 -			    HYPERVISOR_update_va_mapping((__address)>>PAGE_SHIFT, (__entry), UVMF_INVLPG); \
    13.8 +			    HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG); \
    13.9  			} else {                                          \
   13.10                              xen_l1_entry_update((__ptep), (__entry).pte_low); \
   13.11  			    flush_tlb_page((__vma), (__address));         \
   13.12 @@ -445,7 +445,7 @@ do {				  					\
   13.13  do {				  					\
   13.14  	if (likely((__vma)->vm_mm == current->mm)) {			\
   13.15  		xen_flush_page_update_queue();				\
   13.16 -		HYPERVISOR_update_va_mapping((__address)>>PAGE_SHIFT,	\
   13.17 +		HYPERVISOR_update_va_mapping((__address),		\
   13.18  					     __entry, 0);		\
   13.19  	} else {							\
   13.20  		xen_l1_entry_update((__ptep), (__entry).pte_low);	\
    14.1 --- a/linux-2.6.10-xen-sparse/include/asm-xen/hypervisor.h	Mon Feb 07 08:19:24 2005 +0000
    14.2 +++ b/linux-2.6.10-xen-sparse/include/asm-xen/hypervisor.h	Tue Feb 08 16:44:16 2005 +0000
    14.3 @@ -438,7 +438,7 @@ HYPERVISOR_multicall(
    14.4  
    14.5  static inline int
    14.6  HYPERVISOR_update_va_mapping(
    14.7 -    unsigned long page_nr, pte_t new_val, unsigned long flags)
     14.8 +    unsigned long va, pte_t new_val, unsigned long flags)
    14.9  {
   14.10      int ret;
   14.11      unsigned long ign1, ign2, ign3;
   14.12 @@ -447,13 +447,13 @@ HYPERVISOR_update_va_mapping(
   14.13          TRAP_INSTR
   14.14          : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3)
   14.15  	: "0" (__HYPERVISOR_update_va_mapping), 
   14.16 -          "1" (page_nr), "2" ((new_val).pte_low), "3" (flags)
   14.17 +          "1" (va), "2" ((new_val).pte_low), "3" (flags)
   14.18  	: "memory" );
   14.19  
   14.20      if ( unlikely(ret < 0) )
   14.21      {
   14.22          printk(KERN_ALERT "Failed update VA mapping: %08lx, %08lx, %08lx\n",
   14.23 -               page_nr, (new_val).pte_low, flags);
   14.24 +               va, (new_val).pte_low, flags);
   14.25          BUG();
   14.26      }
   14.27  
   14.28 @@ -540,7 +540,7 @@ HYPERVISOR_grant_table_op(
   14.29  
   14.30  static inline int
   14.31  HYPERVISOR_update_va_mapping_otherdomain(
   14.32 -    unsigned long page_nr, pte_t new_val, unsigned long flags, domid_t domid)
   14.33 +    unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
   14.34  {
   14.35      int ret;
   14.36      unsigned long ign1, ign2, ign3, ign4;
   14.37 @@ -549,7 +549,7 @@ HYPERVISOR_update_va_mapping_otherdomain
   14.38          TRAP_INSTR
   14.39          : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4)
   14.40  	: "0" (__HYPERVISOR_update_va_mapping_otherdomain),
   14.41 -          "1" (page_nr), "2" ((new_val).pte_low), "3" (flags), "4" (domid) :
   14.42 +          "1" (va), "2" ((new_val).pte_low), "3" (flags), "4" (domid) :
   14.43          "memory" );
   14.44      
   14.45      return ret;
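
Most of the hunks above (memory.c, pgtable.h, blkback, blktap, netback, netfront, usbback, and the NetBSD equivalents below) follow from one interface change: HYPERVISOR_update_va_mapping() and HYPERVISOR_update_va_mapping_otherdomain() now take the virtual address itself rather than a page number. The caller-side pattern, sketched with a caller's own va/pte locals (illustrative, not a line from the patch):

    /* before this changeset: the hypercall argument was a page number */
    HYPERVISOR_update_va_mapping(va >> PAGE_SHIFT, pte, UVMF_INVLPG);

    /* after this changeset: the virtual address is passed through unshifted */
    HYPERVISOR_update_va_mapping(va, pte, UVMF_INVLPG);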
    15.1 --- a/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h	Mon Feb 07 08:19:24 2005 +0000
    15.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h	Tue Feb 08 16:44:16 2005 +0000
    15.3 @@ -398,7 +398,7 @@ HYPERVISOR_multicall(void *call_list, in
    15.4  }
    15.5  
    15.6  static inline int
    15.7 -HYPERVISOR_update_va_mapping(unsigned long page_nr, unsigned long new_val,
    15.8 +HYPERVISOR_update_va_mapping(unsigned long va, unsigned long new_val,
    15.9      unsigned long flags)
   15.10  {
   15.11      int ret;
   15.12 @@ -408,12 +408,12 @@ HYPERVISOR_update_va_mapping(unsigned lo
   15.13          TRAP_INSTR
   15.14          : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3)
   15.15  	: "0" (__HYPERVISOR_update_va_mapping), 
   15.16 -          "1" (page_nr), "2" (new_val), "3" (flags)
   15.17 +          "1" (va), "2" (new_val), "3" (flags)
   15.18  	: "memory" );
   15.19  
   15.20      if (__predict_false(ret < 0))
   15.21          panic("Failed update VA mapping: %08lx, %08lx, %08lx",
   15.22 -              page_nr, new_val, flags);
   15.23 +              va, new_val, flags);
   15.24  
   15.25      return ret;
   15.26  }
   15.27 @@ -494,7 +494,7 @@ HYPERVISOR_grant_table_op(unsigned int c
   15.28  }
   15.29  
   15.30  static inline int
   15.31 -HYPERVISOR_update_va_mapping_otherdomain(unsigned long page_nr,
   15.32 +HYPERVISOR_update_va_mapping_otherdomain(unsigned long va,
   15.33      unsigned long new_val, unsigned long flags, domid_t domid)
   15.34  {
   15.35      int ret;
   15.36 @@ -504,7 +504,7 @@ HYPERVISOR_update_va_mapping_otherdomain
   15.37          TRAP_INSTR
   15.38          : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4)
   15.39  	: "0" (__HYPERVISOR_update_va_mapping_otherdomain),
   15.40 -          "1" (page_nr), "2" (new_val), "3" (flags), "4" (domid) :
   15.41 +          "1" (va), "2" (new_val), "3" (flags), "4" (domid) :
   15.42          "memory" );
   15.43      
   15.44      return ret;
    16.1 --- a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c	Mon Feb 07 08:19:24 2005 +0000
    16.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c	Tue Feb 08 16:44:16 2005 +0000
    16.3 @@ -580,7 +580,7 @@ xennet_rx_push_buffer(struct xennet_soft
    16.4  		INVALID_P2M_ENTRY;
    16.5  
    16.6  	rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping;
    16.7 -	rx_mcl[nr_pfns].args[0] = sc->sc_rx_bufa[id].xb_rx.xbrx_va >> PAGE_SHIFT;
    16.8 +	rx_mcl[nr_pfns].args[0] = sc->sc_rx_bufa[id].xb_rx.xbrx_va;
    16.9  	rx_mcl[nr_pfns].args[1] = 0;
   16.10  	rx_mcl[nr_pfns].args[2] = 0;
   16.11  
   16.12 @@ -679,7 +679,7 @@ xen_network_handler(void *arg)
   16.13  		mmu->val  = (pa - XPMAP_OFFSET) >> PAGE_SHIFT;
   16.14  		mmu++;
   16.15  		mcl->op = __HYPERVISOR_update_va_mapping;
   16.16 -		mcl->args[0] = sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va >> PAGE_SHIFT;
   16.17 +		mcl->args[0] = sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va;
   16.18  		mcl->args[1] = (rx->addr & PG_FRAME) | PG_V|PG_KW;
   16.19  		mcl->args[2] = UVMF_FLUSH_TLB; // 0;
   16.20  		mcl++;
   16.21 @@ -872,7 +872,7 @@ network_alloc_rx_buffers(struct xennet_s
   16.22  			INVALID_P2M_ENTRY;
   16.23  
   16.24  		rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping;
   16.25 -		rx_mcl[nr_pfns].args[0] = va >> PAGE_SHIFT;
   16.26 +		rx_mcl[nr_pfns].args[0] = va;
   16.27  		rx_mcl[nr_pfns].args[1] = 0;
   16.28  		rx_mcl[nr_pfns].args[2] = 0;
   16.29  
    17.1 --- a/tools/examples/bochsrc	Mon Feb 07 08:19:24 2005 +0000
    17.2 +++ b/tools/examples/bochsrc	Tue Feb 08 16:44:16 2005 +0000
    17.3 @@ -3,10 +3,12 @@
    17.4  #vgaromimage: $BXSHARE/VGABIOS-lgpl-latest
    17.5  floppya: 1_44=a.img, status=inserted
    17.6  floppyb: 1_44=b.img, status=inserted
    17.7 -#ata0-master: type=disk, path=minibootable.img, cylinders=900, heads=15, spt=17
    17.8  # if you don't use absolute paths below, bochs looks under the cwd of xend, 
    17.9  # which is usually "/"
   17.10 -ata0-master: type=disk, path=/tmp/min-fc2-i386.img, cylinders=800, heads=4, spt=32
   17.11 +#ata0-master: type=disk, path=/var/images/min-el3-i386.img, cylinders=800, heads=4, spt=32
   17.12 +i440fxsupport: enabled=1
   17.13 +ne2k: ioaddr=0x300, irq=9, mac=b0:c4:22:01:00:00, ethmod=linux, ethdev=eth0
   17.14 +ata0-master: type=disk, path=/var/images/1g-el3-i386.img, mode=flat, cylinders=2048, heads=16, spt=63
   17.15  boot: c
   17.16  
   17.17  log: /tmp/bochsout.txt
   17.18 @@ -16,4 +18,3 @@ error: action=report
   17.19  panic: action=ask
   17.20  
   17.21  mouse: enabled=0
   17.22 -ips: 1500000
    18.1 --- a/tools/examples/vif-nat	Mon Feb 07 08:19:24 2005 +0000
    18.2 +++ b/tools/examples/vif-nat	Tue Feb 08 16:44:16 2005 +0000
    18.3 @@ -37,8 +37,8 @@ domain=${domain:?}
    18.4  vif=${vif:?}
    18.5  ip=${ip:?} 
    18.6  
    18.7 -# better way to strip /netmask from the ip?
    18.8 -vif_ip=`echo ${ip} | awk -F. '{print $1"."$2"."$3"."$4}'`
    18.9 +# strip /netmask
   18.10 +vif_ip=`echo ${ip} | awk -F/ '{print $1}'`
   18.11  
   18.12  main_ip=`ifconfig eth0 | grep "inet addr:" | sed -e 's/.*inet addr:\(\w\w*\.\w\w*\.\w\w*\.\w\w*\).*/\1/'`
   18.13  
    19.1 --- a/tools/ioemu/include/config.h	Mon Feb 07 08:19:24 2005 +0000
    19.2 +++ b/tools/ioemu/include/config.h	Tue Feb 08 16:44:16 2005 +0000
    19.3 @@ -687,13 +687,13 @@ typedef
    19.4  #define BX_NUM_SIMULATORS 1
    19.5  
    19.6  // limited i440FX PCI support
    19.7 -#define BX_PCI_SUPPORT 0
    19.8 +#define BX_PCI_SUPPORT 1
    19.9  
   19.10  // Experimental VGA on PCI
   19.11  #define BX_PCI_VGA_SUPPORT 1
   19.12  
   19.13  // limited USB on PCI
   19.14 -#define BX_PCI_USB_SUPPORT 0
   19.15 +#define BX_PCI_USB_SUPPORT 1
   19.16  
   19.17  #if (BX_PCI_USB_SUPPORT && !BX_PCI_SUPPORT)
   19.18  #error To enable USB, you must also enable PCI
    20.1 --- a/tools/ioemu/include/pc_system.h	Mon Feb 07 08:19:24 2005 +0000
    20.2 +++ b/tools/ioemu/include/pc_system.h	Tue Feb 08 16:44:16 2005 +0000
    20.3 @@ -45,6 +45,13 @@ BOCHSAPI extern class bx_pc_system_c bx_
    20.4  extern double m_ips;
    20.5  #endif
    20.6  
    20.7 +#ifdef BX_USE_VMX
    20.8 +extern unsigned int tsc_per_bx_tick;
    20.9 +
   20.10 +#define rdtscll(val) \
   20.11 +     __asm__ __volatile__("rdtsc" : "=A" (val))
   20.12 +#endif
   20.13 +
   20.14  class BOCHSAPI bx_pc_system_c : private logfunctions {
   20.15  private:
   20.16  
   20.17 @@ -87,6 +94,26 @@ private:
   20.18    double     m_ips; // Millions of Instructions Per Second
   20.19  #endif
   20.20  
   20.21 +#ifdef BX_USE_VMX
   20.22 +  static Bit64s get_clock(void) {
   20.23 +    struct timeval tv;
   20.24 +    gettimeofday(&tv, NULL);
   20.25 +    return tv.tv_sec * 1000000LL + tv.tv_usec;
   20.26 +    }
   20.27 +
   20.28 +  static Bit64u cpu_calibrate_ticks(void) {
   20.29 +    Bit64s usec, t1, t2;
   20.30 +
   20.31 +    usec = get_clock();
   20.32 +    rdtscll(t1);
   20.33 +
   20.34 +    usleep(50 * 1000);
   20.35 +    usec = get_clock() - usec;
   20.36 +    rdtscll(t2);
   20.37 +
   20.38 +    return (((t2 - t1) * 1000000LL + (usec >> 1)) / usec);
   20.39 +    }
   20.40 +#endif
   20.41    // This handler is called when the function which decrements the clock
   20.42    // ticks finds that an event has occurred.
   20.43    void   countdownEvent(void);
    21.1 --- a/tools/ioemu/iodev/cpu.cc	Mon Feb 07 08:19:24 2005 +0000
    21.2 +++ b/tools/ioemu/iodev/cpu.cc	Tue Feb 08 16:44:16 2005 +0000
    21.3 @@ -180,7 +180,8 @@ bx_cpu_c::cpu_loop(int max_instr_count)
    21.4  	FD_ZERO(&rfds);
    21.5  
    21.6  	while (1) {
    21.7 -		unsigned long t1, t2;
    21.8 +                static unsigned long long t1 = 0;
    21.9 +		unsigned long long t2;
   21.10  
   21.11  		/* Wait up to one seconds. */
   21.12  		tv.tv_sec = 0;
   21.13 @@ -188,18 +189,30 @@ bx_cpu_c::cpu_loop(int max_instr_count)
   21.14  		FD_SET(evtchn_fd, &rfds);
   21.15  
   21.16  		send_event = 0;
   21.17 -		rdtscl(t1);		
   21.18 +
   21.19 +		if (t1 == 0) // the first time
   21.20 +			rdtscll(t1);
   21.21 +
   21.22  		retval = select(evtchn_fd+1, &rfds, NULL, NULL, &tv);
   21.23 -		rdtscl(t2);
   21.24  		if (retval == -1) {
   21.25  			perror("select");
   21.26  			return;
   21.27  		}
   21.28 -		//stime_usec = 1000000 * (1 - tv.tv_sec)  - tv.tv_usec;
   21.29 -		if (t2 > t1)
   21.30 -			BX_TICKN((t2 - t1) / 2000);	// should match ips in bochsrc
   21.31 +
   21.32 +		rdtscll(t2);
   21.33 +
   21.34 +#if __WORDSIZE == 32
   21.35 +#define ULONGLONG_MAX   0xffffffffffffffffULL
   21.36 +#else
   21.37 +#define ULONGLONG_MAX   ULONG_MAX
   21.38 +#endif
   21.39 +
   21.40 +		if (t2 <= t1)
   21.41 +			BX_TICKN((t2 + ULONGLONG_MAX - t1) / tsc_per_bx_tick);
   21.42  		else
   21.43 -			BX_TICKN((MAXINT - t1 + t2) / 2000);	// should match ips in bochsrc
   21.44 +			BX_TICKN((t2 - t1) / tsc_per_bx_tick);
   21.45 +		t1 = t2;
   21.46 +
   21.47  		timer_handler();
   21.48  		if (BX_CPU_INTR) {
   21.49  #if BX_SUPPORT_APIC
   21.50 @@ -248,7 +261,7 @@ bx_cpu_c::interrupt(Bit8u vector)
   21.51  	// page.
   21.52  
   21.53  	rdtscl(tscl);
   21.54 -	BX_INFO(("%lx: injecting vector: %x\n", tscl, vector));
   21.55 +	BX_DEBUG(("%lx: injecting vector: %x\n", tscl, vector));
   21.56  	intr = &(((vcpu_iodata_t *) shared_page)->vp_intr[0]);
   21.57  	set_bit(vector, intr);
   21.58  	
    22.1 --- a/tools/ioemu/iodev/pc_system.cc	Mon Feb 07 08:19:24 2005 +0000
    22.2 +++ b/tools/ioemu/iodev/pc_system.cc	Tue Feb 08 16:44:16 2005 +0000
    22.3 @@ -44,6 +44,10 @@ unsigned long ips_count=0;
    22.4  double     m_ips; // Millions of Instructions Per Second
    22.5  #endif
    22.6  
    22.7 +#ifdef BX_USE_VMX
    22.8 +unsigned int tsc_per_bx_tick;
    22.9 +#endif
   22.10 +
   22.11  // Option for turning off BX_TIMER_DEBUG?
   22.12  // Check out m_ips and ips
   22.13  
   22.14 @@ -98,6 +102,16 @@ bx_pc_system_c::init_ips(Bit32u ips)
   22.15    a20_mask   = 0xffffffff;
   22.16  #endif
   22.17  
   22.18 +#ifdef BX_USE_VMX
   22.19 +  Bit64u phy_cpu_freq = cpu_calibrate_ticks();
   22.20 + 
   22.21 +  if (ips == 500000) {  //default ips: we use fixed scaling factor to calulate ips
   22.22 +    tsc_per_bx_tick = 2000;
   22.23 +    ips = phy_cpu_freq / tsc_per_bx_tick;
   22.24 +  } else  //use uesr defined ips to calulate factor
   22.25 +    tsc_per_bx_tick = ((phy_cpu_freq + (ips>>1)) / ips);
   22.26 +#endif
   22.27 +
   22.28    // parameter 'ips' is the processor speed in Instructions-Per-Second
   22.29    m_ips = double(ips) / 1000000.0L;
   22.30  
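
The ioemu changes replace the hard-coded "/ 2000" TSC scaling in cpu.cc with a tsc_per_bx_tick factor derived from the host TSC rate measured by cpu_calibrate_ticks(). A minimal restatement of that arithmetic as a standalone helper (the function name and standalone form are illustrative):

    #include <stdint.h>

    /* phy_cpu_freq: measured host TSC ticks per second (from cpu_calibrate_ticks()).
       ips: requested emulated instructions per second; updated for the default case. */
    static unsigned int calc_tsc_per_bx_tick(uint64_t phy_cpu_freq, uint32_t *ips)
    {
        unsigned int tsc_per_bx_tick;

        if (*ips == 500000) {                 /* default ips: fixed scaling factor */
            tsc_per_bx_tick = 2000;
            *ips = phy_cpu_freq / tsc_per_bx_tick;
        } else {                              /* user-supplied ips: derive the factor */
            tsc_per_bx_tick = (phy_cpu_freq + (*ips >> 1)) / *ips;
        }
        return tsc_per_bx_tick;
    }

cpu_loop() then advances the Bochs tick count by (t2 - t1) / tsc_per_bx_tick between select() wake-ups, as in the cpu.cc hunk above.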
    23.1 --- a/tools/libxc/xc_linux_build.c	Mon Feb 07 08:19:24 2005 +0000
    23.2 +++ b/tools/libxc/xc_linux_build.c	Tue Feb 08 16:44:16 2005 +0000
    23.3 @@ -558,10 +558,10 @@ static int parseelfimage(char *elfbase,
    23.4          phdr = (Elf_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize));
    23.5          if ( !is_loadable_phdr(phdr) )
    23.6              continue;
    23.7 -        if ( phdr->p_vaddr < kernstart )
    23.8 -            kernstart = phdr->p_vaddr;
    23.9 -        if ( (phdr->p_vaddr + phdr->p_memsz) > kernend )
   23.10 -            kernend = phdr->p_vaddr + phdr->p_memsz;
   23.11 +        if ( phdr->p_paddr < kernstart )
   23.12 +            kernstart = phdr->p_paddr;
   23.13 +        if ( (phdr->p_paddr + phdr->p_memsz) > kernend )
   23.14 +            kernend = phdr->p_paddr + phdr->p_memsz;
   23.15      }
   23.16  
   23.17      if ( (kernstart > kernend) || 
   23.18 @@ -611,7 +611,7 @@ loadelfimage(
   23.19          
   23.20          for ( done = 0; done < phdr->p_filesz; done += chunksz )
   23.21          {
   23.22 -            pa = (phdr->p_vaddr + done) - vstart;
   23.23 +            pa = (phdr->p_paddr + done) - vstart;
   23.24              va = xc_map_foreign_range(
   23.25                  xch, dom, PAGE_SIZE, PROT_WRITE, parray[pa>>PAGE_SHIFT]);
   23.26              chunksz = phdr->p_filesz - done;
   23.27 @@ -624,7 +624,7 @@ loadelfimage(
   23.28  
   23.29          for ( ; done < phdr->p_memsz; done += chunksz )
   23.30          {
   23.31 -            pa = (phdr->p_vaddr + done) - vstart;
   23.32 +            pa = (phdr->p_paddr + done) - vstart;
   23.33              va = xc_map_foreign_range(
   23.34                  xch, dom, PAGE_SIZE, PROT_WRITE, parray[pa>>PAGE_SHIFT]);
   23.35              chunksz = phdr->p_memsz - done;
    24.1 --- a/tools/libxc/xc_vmx_build.c	Mon Feb 07 08:19:24 2005 +0000
    24.2 +++ b/tools/libxc/xc_vmx_build.c	Tue Feb 08 16:44:16 2005 +0000
    24.3 @@ -629,10 +629,10 @@ static int parseelfimage(char *elfbase,
    24.4          phdr = (Elf_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize));
    24.5          if ( !is_loadable_phdr(phdr) )
    24.6              continue;
    24.7 -        if ( phdr->p_vaddr < kernstart )
    24.8 -            kernstart = phdr->p_vaddr;
    24.9 -        if ( (phdr->p_vaddr + phdr->p_memsz) > kernend )
   24.10 -            kernend = phdr->p_vaddr + phdr->p_memsz;
   24.11 +        if ( phdr->p_paddr < kernstart )
   24.12 +            kernstart = phdr->p_paddr;
   24.13 +        if ( (phdr->p_paddr + phdr->p_memsz) > kernend )
   24.14 +            kernend = phdr->p_paddr + phdr->p_memsz;
   24.15      }
   24.16  
   24.17      if ( (kernstart > kernend) || 
   24.18 @@ -676,7 +676,7 @@ loadelfimage(
   24.19          
   24.20          for ( done = 0; done < phdr->p_filesz; done += chunksz )
   24.21          {
   24.22 -            pa = (phdr->p_vaddr + done) - vstart - LINUX_PAGE_OFFSET;
   24.23 +            pa = (phdr->p_paddr + done) - vstart - LINUX_PAGE_OFFSET;
   24.24              va = xc_map_foreign_range(
   24.25                  xch, dom, PAGE_SIZE, PROT_WRITE, parray[pa>>PAGE_SHIFT]);
   24.26              chunksz = phdr->p_filesz - done;
   24.27 @@ -689,7 +689,7 @@ loadelfimage(
   24.28  
   24.29          for ( ; done < phdr->p_memsz; done += chunksz )
   24.30          {
   24.31 -            pa = (phdr->p_vaddr + done) - vstart - LINUX_PAGE_OFFSET;
   24.32 +            pa = (phdr->p_paddr + done) - vstart - LINUX_PAGE_OFFSET;
   24.33              va = xc_map_foreign_range(
   24.34                  xch, dom, PAGE_SIZE, PROT_WRITE, parray[pa>>PAGE_SHIFT]);
   24.35              chunksz = phdr->p_memsz - done;
    25.1 --- a/tools/python/xen/xend/XendDomainInfo.py	Mon Feb 07 08:19:24 2005 +0000
    25.2 +++ b/tools/python/xen/xend/XendDomainInfo.py	Tue Feb 08 16:44:16 2005 +0000
    25.3 @@ -1337,6 +1337,7 @@ add_config_handler('memory',     vm_fiel
    25.4  add_config_handler('cpu',        vm_field_ignore)
    25.5  add_config_handler('cpu_weight', vm_field_ignore)
    25.6  add_config_handler('console',    vm_field_ignore)
    25.7 +add_config_handler('restart',    vm_field_ignore)
    25.8  add_config_handler('image',      vm_field_ignore)
    25.9  add_config_handler('device',     vm_field_ignore)
   25.10  add_config_handler('backend',    vm_field_ignore)
    26.1 --- a/tools/python/xen/xend/server/SrvDaemon.py	Mon Feb 07 08:19:24 2005 +0000
    26.2 +++ b/tools/python/xen/xend/server/SrvDaemon.py	Tue Feb 08 16:44:16 2005 +0000
    26.3 @@ -486,10 +486,12 @@ class Daemon:
    26.4              # XXX KAF: Why doesn't this capture output from C extensions that
    26.5              # fprintf(stdout) or fprintf(stderr) ??
    26.6              os.open('/var/log/xend-debug.log', os.O_WRONLY|os.O_CREAT)
    26.7 +            os.dup(1)
    26.8          else:
    26.9              os.open('/dev/null', os.O_RDWR)
   26.10              os.dup(0)
   26.11 -        os.dup(1)
   26.12 +            os.open('/var/log/xend-debug.log', os.O_WRONLY|os.O_CREAT)
   26.13 +
   26.14          
   26.15      def start(self, trace=0):
   26.16          """Attempts to start the daemons.
    27.1 --- a/xen/arch/x86/boot/mkelf32.c	Mon Feb 07 08:19:24 2005 +0000
    27.2 +++ b/xen/arch/x86/boot/mkelf32.c	Tue Feb 08 16:44:16 2005 +0000
    27.3 @@ -245,6 +245,12 @@ int main(int argc, char **argv)
    27.4          return 1;
    27.5      }
    27.6  
    27.7 +    /*
    27.8 +     * End the image on a page boundary. This gets round alignment bugs
    27.9 +     * in the boot- or chain-loader (e.g., kexec on the XenoBoot CD).
   27.10 +     */
   27.11 +    mem_siz += -(loadbase + mem_siz) & 0xfff;
   27.12 +
   27.13      out_ehdr.e_entry = loadbase;
   27.14      out_ehdr.e_shoff = RAW_OFFSET + dat_siz;
   27.15  
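
A small worked example of the rounding expression added above: in unsigned
arithmetic, -(addr) & 0xfff is exactly the padding needed to bring addr up to
the next 4kB boundary (zero if it is already aligned). The values below are
invented:

    #include <stdio.h>

    int main(void)
    {
        unsigned long loadbase = 0x100000, mem_siz = 0x12345;
        mem_siz += -(loadbase + mem_siz) & 0xfff;    /* pad image to 4kB */
        printf("end = %#lx\n", loadbase + mem_siz);  /* end = 0x113000   */
        return 0;
    }
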
    28.1 --- a/xen/arch/x86/boot/x86_32.S	Mon Feb 07 08:19:24 2005 +0000
    28.2 +++ b/xen/arch/x86/boot/x86_32.S	Tue Feb 08 16:44:16 2005 +0000
    28.3 @@ -214,7 +214,7 @@ ENTRY(gdt_table)
    28.4          .org 0x1000
    28.5  ENTRY(idle_pg_table) # Initial page directory is 4kB
    28.6          .org 0x2000
    28.7 -ENTRY(cpu0_stack)    # Initial stack is 8kB
    28.8 -        .org 0x4000
    28.9 +ENTRY(cpu0_stack)
   28.10 +        .org 0x2000 + STACK_SIZE
   28.11  ENTRY(stext)
   28.12  ENTRY(_stext)
    29.1 --- a/xen/arch/x86/boot/x86_64.S	Mon Feb 07 08:19:24 2005 +0000
    29.2 +++ b/xen/arch/x86/boot/x86_64.S	Tue Feb 08 16:44:16 2005 +0000
    29.3 @@ -193,8 +193,8 @@ ENTRY(gdt_table)
    29.4          .quad 0x00af9a000000ffff     /* 0x0810 ring 0 code, 64-bit mode   */
    29.5          .quad 0x00cf92000000ffff     /* 0x0818 ring 0 data                */
    29.6          .quad 0x00cffa000000ffff     /* 0x0823 ring 3 code, compatibility */
    29.7 -        .quad 0x00affa000000ffff     /* 0x082b ring 3 code, 64-bit mode   */
    29.8 -        .quad 0x00cff2000000ffff     /* 0x0833 ring 3 data                */
    29.9 +        .quad 0x00cff2000000ffff     /* 0x082b ring 3 data                */
   29.10 +        .quad 0x00affa000000ffff     /* 0x0833 ring 3 code, 64-bit mode   */
   29.11          .quad 0x0000000000000000     /* unused                            */
   29.12          .fill 4*NR_CPUS,8,0          /* space for TSS and LDT per CPU     */
   29.13  
   29.14 @@ -243,8 +243,8 @@ ENTRY(idle_pg_table_l2)
   29.15          identmap /* Too orangey for crows :-) */
   29.16  
   29.17          .org 0x4000
   29.18 -ENTRY(cpu0_stack)    # Initial stack is 8kB
   29.19 +ENTRY(cpu0_stack)
   29.20  
   29.21 -        .org 0x6000
   29.22 +        .org 0x4000 + STACK_SIZE
   29.23  ENTRY(stext)
   29.24  ENTRY(_stext)
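
The swap above places the ring-3 data descriptor between the compatibility
and 64-bit ring-3 code descriptors, which is the layout the x86-64 SYSRET
instruction assumes: SS is loaded from the SYSRET base selector + 8 and the
64-bit CS from the base + 16. A throwaway sketch of that selector arithmetic,
starting from the 0x0823 compatibility-code selector in the comments; nothing
below is Xen code:

    #include <stdio.h>

    int main(void)
    {
        unsigned int base = 0x0823 & ~3u;              /* ring 3 compat code  */
        printf("SS   = %#x\n", (base + 8)  | 3);       /* 0x082b: ring 3 data */
        printf("CS64 = %#x\n", (base + 16) | 3);       /* 0x0833: 64-bit code */
        return 0;
    }
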
    30.1 --- a/xen/arch/x86/dom0_ops.c	Mon Feb 07 08:19:24 2005 +0000
    30.2 +++ b/xen/arch/x86/dom0_ops.c	Tue Feb 08 16:44:16 2005 +0000
    30.3 @@ -376,7 +376,7 @@ void arch_getdomaininfo_ctxt(
    30.4      {
    30.5          for ( i = 0; i < 16; i++ )
    30.6              c->gdt_frames[i] = 
    30.7 -                l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[i]);
    30.8 +                l1_pgentry_to_pfn(ed->arch.perdomain_ptes[i]);
    30.9          c->gdt_ents = GET_GDT_ENTRIES(ed);
   30.10      }
   30.11      c->guestos_ss  = ed->arch.guestos_ss;
    31.1 --- a/xen/arch/x86/domain.c	Mon Feb 07 08:19:24 2005 +0000
    31.2 +++ b/xen/arch/x86/domain.c	Tue Feb 08 16:44:16 2005 +0000
    31.3 @@ -304,7 +304,7 @@ void arch_vmx_do_launch(struct exec_doma
    31.4  static void monitor_mk_pagetable(struct exec_domain *ed)
    31.5  {
    31.6      unsigned long mpfn;
    31.7 -    l2_pgentry_t *mpl2e;
    31.8 +    l2_pgentry_t *mpl2e, *phys_table;
    31.9      struct pfn_info *mpfn_info;
   31.10      struct domain *d = ed->domain;
   31.11  
   31.12 @@ -312,20 +312,26 @@ static void monitor_mk_pagetable(struct 
   31.13      ASSERT( mpfn_info ); 
   31.14  
   31.15      mpfn = (unsigned long) (mpfn_info - frame_table);
   31.16 -    mpl2e = (l2_pgentry_t *) map_domain_mem(mpfn << L1_PAGETABLE_SHIFT);
   31.17 +    mpl2e = (l2_pgentry_t *) map_domain_mem(mpfn << PAGE_SHIFT);
   31.18      memset(mpl2e, 0, PAGE_SIZE);
   31.19  
   31.20      memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
   31.21             &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
   31.22             HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
   31.23  
   31.24 -    ed->arch.monitor_table = mk_pagetable(mpfn << L1_PAGETABLE_SHIFT);
   31.25 +    ed->arch.monitor_table = mk_pagetable(mpfn << PAGE_SHIFT);
   31.26      d->arch.shadow_mode = SHM_full_32;
   31.27  
   31.28      mpl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
   31.29          mk_l2_pgentry((__pa(d->arch.mm_perdomain_pt) & PAGE_MASK) 
   31.30                        | __PAGE_HYPERVISOR);
   31.31  
   31.32 +    phys_table = (l2_pgentry_t *) map_domain_mem(pagetable_val(
   31.33 +                                        ed->arch.phys_table));
   31.34 +    memcpy(d->arch.mm_perdomain_pt, phys_table,
   31.35 +           ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
   31.36 +
   31.37 +    unmap_domain_mem(phys_table);
   31.38      unmap_domain_mem(mpl2e);
   31.39  }
   31.40  
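
For reference, the shift fix above uses the ordinary frame-number to
physical-address conversion (PAGE_SHIFT, the log2 of the 4kB page size)
rather than a page-table-level constant. A trivial illustration with an
invented frame number:

    #include <stdio.h>
    #define PAGE_SHIFT 12

    int main(void)
    {
        unsigned long mpfn = 0x1a2b;              /* machine frame number */
        printf("%#lx\n", mpfn << PAGE_SHIFT);     /* 0x1a2b000            */
        return 0;
    }
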
   31.41 @@ -466,6 +472,7 @@ int arch_final_setup_guestos(
   31.42      
   31.43      phys_basetab = c->pt_base;
   31.44      d->arch.pagetable = mk_pagetable(phys_basetab);
   31.45 +    d->arch.phys_table = d->arch.pagetable;
   31.46      if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d->domain, 
   31.47                              PGT_base_page_table) )
   31.48          return -EINVAL;
   31.49 @@ -628,12 +635,11 @@ long do_iopl(domid_t domain, unsigned in
   31.50      return 0;
   31.51  }
   31.52  
   31.53 -unsigned long hypercall_create_continuation(
   31.54 +unsigned long __hypercall_create_continuation(
   31.55      unsigned int op, unsigned int nr_args, ...)
   31.56  {
   31.57      struct mc_state *mcs = &mc_state[smp_processor_id()];
   31.58      execution_context_t *ec;
   31.59 -    unsigned long *preg;
   31.60      unsigned int i;
   31.61      va_list args;
   31.62  
   31.63 @@ -653,10 +659,34 @@ unsigned long hypercall_create_continuat
   31.64          ec->eax  = op;
   31.65          ec->eip -= 2;  /* re-execute 'int 0x82' */
   31.66          
   31.67 -        for ( i = 0, preg = &ec->ebx; i < nr_args; i++, preg++ )
   31.68 -            *preg = va_arg(args, unsigned long);
   31.69 -#else
   31.70 -        preg = NULL; /* XXX x86/64 */
   31.71 +        for ( i = 0; i < nr_args; i++ )
   31.72 +        {
   31.73 +            switch ( i )
   31.74 +            {
   31.75 +            case 0: ec->ebx = va_arg(args, unsigned long); break;
   31.76 +            case 1: ec->ecx = va_arg(args, unsigned long); break;
   31.77 +            case 2: ec->edx = va_arg(args, unsigned long); break;
   31.78 +            case 3: ec->esi = va_arg(args, unsigned long); break;
   31.79 +            case 4: ec->edi = va_arg(args, unsigned long); break;
   31.80 +            case 5: ec->ebp = va_arg(args, unsigned long); break;
   31.81 +            }
   31.82 +        }
   31.83 +#elif defined(__x86_64__)
   31.84 +        ec->rax  = op;
   31.85 +        ec->rip -= 2;  /* re-execute 'syscall' */
   31.86 +        
   31.87 +        for ( i = 0; i < nr_args; i++ )
   31.88 +        {
   31.89 +            switch ( i )
   31.90 +            {
   31.91 +            case 0: ec->rdi = va_arg(args, unsigned long); break;
   31.92 +            case 1: ec->rsi = va_arg(args, unsigned long); break;
   31.93 +            case 2: ec->rdx = va_arg(args, unsigned long); break;
   31.94 +            case 3: ec->r10 = va_arg(args, unsigned long); break;
   31.95 +            case 4: ec->r8  = va_arg(args, unsigned long); break;
   31.96 +            case 5: ec->r9  = va_arg(args, unsigned long); break;
   31.97 +            }
   31.98 +        }
   31.99  #endif
  31.100      }
  31.101  
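
The rewritten continuation code above spills each variadic argument into the
register slot it will eventually be reloaded from (ebx..ebp on i386, the
syscall argument registers on x86-64). A hedged userspace sketch of the same
marshalling pattern; the regs struct and its fields are invented stand-ins
for Xen's execution_context_t:

    #include <stdarg.h>
    #include <stdio.h>

    struct regs { unsigned long rdi, rsi, rdx, r10, r8, r9; };

    /* Copy up to six variadic unsigned longs into per-register slots,
     * mirroring the switch-per-index style used in the hunk above. */
    static void save_args(struct regs *r, unsigned int nr_args, ...)
    {
        va_list args;
        va_start(args, nr_args);
        for (unsigned int i = 0; i < nr_args && i < 6; i++) {
            unsigned long v = va_arg(args, unsigned long);
            switch (i) {
            case 0: r->rdi = v; break;
            case 1: r->rsi = v; break;
            case 2: r->rdx = v; break;
            case 3: r->r10 = v; break;
            case 4: r->r8  = v; break;
            case 5: r->r9  = v; break;
            }
        }
        va_end(args);
    }

    int main(void)
    {
        struct regs r = {0};
        save_args(&r, 3, 1UL, 2UL, 3UL);
        printf("%lu %lu %lu\n", r.rdi, r.rsi, r.rdx);    /* 1 2 3 */
        return 0;
    }
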
  31.102 @@ -726,8 +756,6 @@ static void relinquish_list(struct domai
  31.103  #ifdef CONFIG_VMX
  31.104  static void vmx_domain_relinquish_memory(struct exec_domain *ed)
  31.105  {
  31.106 -    struct domain *d = ed->domain;
  31.107 -
  31.108      /*
  31.109       * Free VMCS
  31.110       */
  31.111 @@ -736,22 +764,6 @@ static void vmx_domain_relinquish_memory
  31.112      ed->arch.arch_vmx.vmcs = 0;
  31.113      
  31.114      monitor_rm_pagetable(ed);
  31.115 -
  31.116 -    if (ed == d->exec_domain[0]) {
  31.117 -        int i;
  31.118 -        unsigned long pfn;
  31.119 -
  31.120 -        for (i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++) {
  31.121 -            unsigned long l1e;
  31.122 -            
  31.123 -            l1e = l1_pgentry_val(d->arch.mm_perdomain_pt[i]);
  31.124 -            if (l1e & _PAGE_PRESENT) {
  31.125 -                pfn = l1e >> PAGE_SHIFT;
  31.126 -                free_domheap_page(&frame_table[pfn]);
  31.127 -            }
  31.128 -        }
  31.129 -    }
  31.130 -
  31.131  }
  31.132  #endif
  31.133  
    32.1 --- a/xen/arch/x86/memory.c	Mon Feb 07 08:19:24 2005 +0000
    32.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    32.3 @@ -1,2401 +0,0 @@
    32.4 -/* -*-  Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
    32.5 -/******************************************************************************
    32.6 - * arch/x86/memory.c
    32.7 - * 
    32.8 - * Copyright (c) 2002-2004 K A Fraser
    32.9 - * Copyright (c) 2004 Christian Limpach
   32.10 - * 
   32.11 - * This program is free software; you can redistribute it and/or modify
   32.12 - * it under the terms of the GNU General Public License as published by
   32.13 - * the Free Software Foundation; either version 2 of the License, or
   32.14 - * (at your option) any later version.
   32.15 - * 
   32.16 - * This program is distributed in the hope that it will be useful,
   32.17 - * but WITHOUT ANY WARRANTY; without even the implied warranty of
   32.18 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   32.19 - * GNU General Public License for more details.
   32.20 - * 
   32.21 - * You should have received a copy of the GNU General Public License
   32.22 - * along with this program; if not, write to the Free Software
   32.23 - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   32.24 - */
   32.25 -
   32.26 -/*
   32.27 - * A description of the x86 page table API:
   32.28 - * 
   32.29 - * Domains trap to do_mmu_update with a list of update requests.
   32.30 - * This is a list of (ptr, val) pairs, where the requested operation
   32.31 - * is *ptr = val.
   32.32 - * 
   32.33 - * Reference counting of pages:
   32.34 - * ----------------------------
   32.35 - * Each page has two refcounts: tot_count and type_count.
   32.36 - * 
   32.37 - * TOT_COUNT is the obvious reference count. It counts all uses of a
   32.38 - * physical page frame by a domain, including uses as a page directory,
   32.39 - * a page table, or simple mappings via a PTE. This count prevents a
   32.40 - * domain from releasing a frame back to the free pool when it still holds
   32.41 - * a reference to it.
   32.42 - * 
   32.43 - * TYPE_COUNT is more subtle. A frame can be put to one of three
   32.44 - * mutually-exclusive uses: it might be used as a page directory, or a
   32.45 - * page table, or it may be mapped writable by the domain [of course, a
   32.46 - * frame may not be used in any of these three ways!].
   32.47 - * So, type_count is a count of the number of times a frame is being 
   32.48 - * referred to in its current incarnation. Therefore, a page can only
   32.49 - * change its type when its type count is zero.
   32.50 - * 
   32.51 - * Pinning the page type:
   32.52 - * ----------------------
   32.53 - * The type of a page can be pinned/unpinned with the commands
   32.54 - * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
   32.55 - * pinning is not reference counted, so it can't be nested).
   32.56 - * This is useful to prevent a page's type count falling to zero, at which
   32.57 - * point safety checks would need to be carried out next time the count
   32.58 - * is increased again.
   32.59 - * 
   32.60 - * A further note on writable page mappings:
   32.61 - * -----------------------------------------
   32.62 - * For simplicity, the count of writable mappings for a page may not
   32.63 - * correspond to reality. The 'writable count' is incremented for every
   32.64 - * PTE which maps the page with the _PAGE_RW flag set. However, for
   32.65 - * write access to be possible the page directory entry must also have
   32.66 - * its _PAGE_RW bit set. We do not check this as it complicates the 
   32.67 - * reference counting considerably [consider the case of multiple
   32.68 - * directory entries referencing a single page table, some with the RW
   32.69 - * bit set, others not -- it starts getting a bit messy].
   32.70 - * In normal use, this simplification shouldn't be a problem.
   32.71 - * However, the logic can be added if required.
   32.72 - * 
   32.73 - * One more note on read-only page mappings:
   32.74 - * -----------------------------------------
   32.75 - * We want domains to be able to map pages for read-only access. The
   32.76 - * main reason is that page tables and directories should be readable
   32.77 - * by a domain, but it would not be safe for them to be writable.
   32.78 - * However, domains have free access to rings 1 & 2 of the Intel
   32.79 - * privilege model. In terms of page protection, these are considered
   32.80 - * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
   32.81 - * read-only restrictions are respected in supervisor mode -- if the 
   32.82 - * bit is clear then any mapped page is writable.
   32.83 - * 
   32.84 - * We get round this by always setting the WP bit and disallowing 
   32.85 - * updates to it. This is very unlikely to cause a problem for guest
   32.86 - * OS's, which will generally use the WP bit to simplify copy-on-write
   32.87 - * implementation (in that case, OS wants a fault when it writes to
   32.88 - * an application-supplied buffer).
   32.89 - */
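
A single-threaded toy model of the tot_count/type_count scheme the comment
above describes; every name below is invented and this is not the
hypervisor's implementation (which uses lock-free cmpxchg loops further down
in this file):

    #include <stdio.h>

    enum ptype { PT_NONE, PT_L1, PT_L2, PT_WRITABLE };

    struct frame { unsigned int tot_count, type_count; enum ptype type; };

    /* A frame may only change role once its type count has fallen to zero. */
    static int get_type(struct frame *f, enum ptype t)
    {
        if (f->type_count == 0)
            f->type = t;               /* free to take on a new role */
        else if (f->type != t)
            return 0;                  /* role clash: refuse the reference */
        f->type_count++;
        return 1;
    }

    static void put_type(struct frame *f)
    {
        if (--f->type_count == 0)
            f->type = PT_NONE;         /* role may change again */
    }

    int main(void)
    {
        struct frame f = { 1, 0, PT_NONE };          /* tot ref held elsewhere */
        printf("%d\n", get_type(&f, PT_L1));         /* 1: becomes an L1 table */
        printf("%d\n", get_type(&f, PT_WRITABLE));   /* 0: still in use as L1  */
        put_type(&f);
        printf("%d\n", get_type(&f, PT_WRITABLE));   /* 1: role changed        */
        return 0;
    }
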
   32.90 -
   32.91 -#include <xen/config.h>
   32.92 -#include <xen/init.h>
   32.93 -#include <xen/kernel.h>
   32.94 -#include <xen/lib.h>
   32.95 -#include <xen/mm.h>
   32.96 -#include <xen/sched.h>
   32.97 -#include <xen/errno.h>
   32.98 -#include <xen/perfc.h>
   32.99 -#include <xen/irq.h>
  32.100 -#include <xen/softirq.h>
  32.101 -#include <asm/shadow.h>
  32.102 -#include <asm/page.h>
  32.103 -#include <asm/flushtlb.h>
  32.104 -#include <asm/io.h>
  32.105 -#include <asm/uaccess.h>
  32.106 -#include <asm/domain_page.h>
  32.107 -#include <asm/ldt.h>
  32.108 -
  32.109 -#ifdef VERBOSE
  32.110 -#define MEM_LOG(_f, _a...)                           \
  32.111 -  printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
  32.112 -         current->domain->id , __LINE__ , ## _a )
  32.113 -#else
  32.114 -#define MEM_LOG(_f, _a...) ((void)0)
  32.115 -#endif
  32.116 -
  32.117 -static int alloc_l2_table(struct pfn_info *page);
  32.118 -static int alloc_l1_table(struct pfn_info *page);
  32.119 -static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
  32.120 -static int get_page_and_type_from_pagenr(unsigned long page_nr, 
  32.121 -                                         u32 type,
  32.122 -                                         struct domain *d);
  32.123 -
  32.124 -static void free_l2_table(struct pfn_info *page);
  32.125 -static void free_l1_table(struct pfn_info *page);
  32.126 -
  32.127 -static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
  32.128 -static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
  32.129 -
  32.130 -/* Used to defer flushing of memory structures. */
  32.131 -static struct {
  32.132 -#define DOP_FLUSH_TLB   (1<<0) /* Flush the TLB.                 */
  32.133 -#define DOP_RELOAD_LDT  (1<<1) /* Reload the LDT shadow mapping. */
  32.134 -    unsigned long  deferred_ops;
  32.135 -    /* If non-NULL, specifies a foreign subject domain for some operations. */
  32.136 -    struct domain *foreign;
  32.137 -} __cacheline_aligned percpu_info[NR_CPUS];
  32.138 -
  32.139 -/*
  32.140 - * Returns the current foreign domain; defaults to the currently-executing
  32.141 - * domain if a foreign override hasn't been specified.
  32.142 - */
  32.143 -#define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
  32.144 -
  32.145 -/* Private domain structs for DOMID_XEN and DOMID_IO. */
  32.146 -static struct domain *dom_xen, *dom_io;
  32.147 -
  32.148 -/* Frame table and its size in pages. */
  32.149 -struct pfn_info *frame_table;
  32.150 -unsigned long frame_table_size;
  32.151 -unsigned long max_page;
  32.152 -
  32.153 -void __init init_frametable(void)
  32.154 -{
  32.155 -    unsigned long i, p;
  32.156 -
  32.157 -    frame_table      = (struct pfn_info *)FRAMETABLE_VIRT_START;
  32.158 -    frame_table_size = max_page * sizeof(struct pfn_info);
  32.159 -    frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
  32.160 -
  32.161 -    for ( i = 0; i < frame_table_size; i += (4UL << 20) )
  32.162 -    {
  32.163 -        p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20);
  32.164 -        if ( p == 0 )
  32.165 -            panic("Not enough memory for frame table\n");
  32.166 -        map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p, 
  32.167 -                  4UL << 20, PAGE_HYPERVISOR);
  32.168 -    }
  32.169 -
  32.170 -    memset(frame_table, 0, frame_table_size);
  32.171 -}
  32.172 -
  32.173 -void arch_init_memory(void)
  32.174 -{
  32.175 -    extern void subarch_init_memory(struct domain *);
  32.176 -
  32.177 -    memset(percpu_info, 0, sizeof(percpu_info));
  32.178 -
  32.179 -    /*
  32.180 -     * Initialise our DOMID_XEN domain.
  32.181 -     * Any Xen-heap pages that we will allow to be mapped will have
  32.182 -     * their domain field set to dom_xen.
  32.183 -     */
  32.184 -    dom_xen = alloc_domain_struct();
  32.185 -    atomic_set(&dom_xen->refcnt, 1);
  32.186 -    dom_xen->id = DOMID_XEN;
  32.187 -
  32.188 -    /*
  32.189 -     * Initialise our DOMID_IO domain.
  32.190 -     * This domain owns no pages but is considered a special case when
  32.191 -     * mapping I/O pages, as the mappings occur at the priv of the caller.
  32.192 -     */
  32.193 -    dom_io = alloc_domain_struct();
  32.194 -    atomic_set(&dom_io->refcnt, 1);
  32.195 -    dom_io->id = DOMID_IO;
  32.196 -
  32.197 -    subarch_init_memory(dom_xen);
  32.198 -}
  32.199 -
  32.200 -void write_ptbase(struct exec_domain *ed)
  32.201 -{
  32.202 -    struct domain *d = ed->domain;
  32.203 -    unsigned long pa;
  32.204 -
  32.205 -#ifdef CONFIG_VMX
  32.206 -    if ( unlikely(d->arch.shadow_mode) )
  32.207 -        pa = ((d->arch.shadow_mode == SHM_full_32) ?
  32.208 -              pagetable_val(ed->arch.monitor_table) :
  32.209 -              pagetable_val(ed->arch.shadow_table));
  32.210 -    else
  32.211 -        pa = pagetable_val(ed->arch.pagetable);
  32.212 -#else
  32.213 -    if ( unlikely(d->arch.shadow_mode) )
  32.214 -        pa = pagetable_val(ed->arch.shadow_table);    
  32.215 -    else
  32.216 -        pa = pagetable_val(ed->arch.pagetable);
  32.217 -#endif
  32.218 -
  32.219 -    write_cr3(pa);
  32.220 -}
  32.221 -
  32.222 -static void __invalidate_shadow_ldt(struct exec_domain *d)
  32.223 -{
  32.224 -    int i;
  32.225 -    unsigned long pfn;
  32.226 -    struct pfn_info *page;
  32.227 -    
  32.228 -    d->arch.shadow_ldt_mapcnt = 0;
  32.229 -
  32.230 -    for ( i = 16; i < 32; i++ )
  32.231 -    {
  32.232 -        pfn = l1_pgentry_to_pagenr(d->arch.perdomain_ptes[i]);
  32.233 -        if ( pfn == 0 ) continue;
  32.234 -        d->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
  32.235 -        page = &frame_table[pfn];
  32.236 -        ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
  32.237 -        ASSERT_PAGE_IS_DOMAIN(page, d->domain);
  32.238 -        put_page_and_type(page);
  32.239 -    }
  32.240 -
  32.241 -    /* Dispose of the (now possibly invalid) mappings from the TLB.  */
  32.242 -    percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
  32.243 -}
  32.244 -
  32.245 -
  32.246 -static inline void invalidate_shadow_ldt(struct exec_domain *d)
  32.247 -{
  32.248 -    if ( d->arch.shadow_ldt_mapcnt != 0 )
  32.249 -        __invalidate_shadow_ldt(d);
  32.250 -}
  32.251 -
  32.252 -
  32.253 -static int alloc_segdesc_page(struct pfn_info *page)
  32.254 -{
  32.255 -    unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
  32.256 -    int i;
  32.257 -
  32.258 -    for ( i = 0; i < 512; i++ )
  32.259 -        if ( unlikely(!check_descriptor(&descs[i*2])) )
  32.260 -            goto fail;
  32.261 -
  32.262 -    unmap_domain_mem(descs);
  32.263 -    return 1;
  32.264 -
  32.265 - fail:
  32.266 -    unmap_domain_mem(descs);
  32.267 -    return 0;
  32.268 -}
  32.269 -
  32.270 -
  32.271 -/* Map shadow page at offset @off. */
  32.272 -int map_ldt_shadow_page(unsigned int off)
  32.273 -{
  32.274 -    struct exec_domain *ed = current;
  32.275 -    struct domain *d = ed->domain;
  32.276 -    unsigned long l1e;
  32.277 -
  32.278 -    if ( unlikely(in_irq()) )
  32.279 -        BUG();
  32.280 -
  32.281 -    __get_user(l1e, (unsigned long *)&linear_pg_table[(ed->arch.ldt_base >> 
  32.282 -                                                       PAGE_SHIFT) + off]);
  32.283 -
  32.284 -    if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
  32.285 -         unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT], 
  32.286 -                                     d, PGT_ldt_page)) )
  32.287 -        return 0;
  32.288 -
  32.289 -    ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
  32.290 -    ed->arch.shadow_ldt_mapcnt++;
  32.291 -
  32.292 -    return 1;
  32.293 -}
  32.294 -
  32.295 -
  32.296 -static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
  32.297 -{
  32.298 -    struct pfn_info *page = &frame_table[page_nr];
  32.299 -
  32.300 -    if ( unlikely(!pfn_is_ram(page_nr)) )
  32.301 -    {
  32.302 -        MEM_LOG("Pfn %08lx is not RAM", page_nr);
  32.303 -        return 0;
  32.304 -    }
  32.305 -
  32.306 -    if ( unlikely(!get_page(page, d)) )
  32.307 -    {
  32.308 -        MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
  32.309 -        return 0;
  32.310 -    }
  32.311 -
  32.312 -    return 1;
  32.313 -}
  32.314 -
  32.315 -
  32.316 -static int get_page_and_type_from_pagenr(unsigned long page_nr, 
  32.317 -                                         u32 type,
  32.318 -                                         struct domain *d)
  32.319 -{
  32.320 -    struct pfn_info *page = &frame_table[page_nr];
  32.321 -
  32.322 -    if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
  32.323 -        return 0;
  32.324 -
  32.325 -    if ( unlikely(!get_page_type(page, type)) )
  32.326 -    {
  32.327 -#ifdef VERBOSE
  32.328 -        if ( (type & PGT_type_mask) != PGT_l1_page_table )
  32.329 -            MEM_LOG("Bad page type for pfn %08lx (%08x)", 
  32.330 -                    page_nr, page->u.inuse.type_info);
  32.331 -#endif
  32.332 -        put_page(page);
  32.333 -        return 0;
  32.334 -    }
  32.335 -
  32.336 -    return 1;
  32.337 -}
  32.338 -
  32.339 -
  32.340 -/*
   32.341 - * We allow L2 tables to map each other (a.k.a. linear page tables). This
   32.342 - * needs some special care with reference counts and access permissions:
  32.343 - *  1. The mapping entry must be read-only, or the guest may get write access
  32.344 - *     to its own PTEs.
  32.345 - *  2. We must only bump the reference counts for an *already validated*
  32.346 - *     L2 table, or we can end up in a deadlock in get_page_type() by waiting
  32.347 - *     on a validation that is required to complete that validation.
  32.348 - *  3. We only need to increment the reference counts for the mapped page
  32.349 - *     frame if it is mapped by a different L2 table. This is sufficient and
  32.350 - *     also necessary to allow validation of an L2 table mapping itself.
  32.351 - */
  32.352 -static int 
  32.353 -get_linear_pagetable(
  32.354 -    l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
  32.355 -{
  32.356 -    u32 x, y;
  32.357 -    struct pfn_info *page;
  32.358 -
  32.359 -    if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
  32.360 -    {
  32.361 -        MEM_LOG("Attempt to create linear p.t. with write perms");
  32.362 -        return 0;
  32.363 -    }
  32.364 -
  32.365 -    if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
  32.366 -    {
  32.367 -        /* Make sure the mapped frame belongs to the correct domain. */
  32.368 -        if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
  32.369 -            return 0;
  32.370 -
  32.371 -        /*
  32.372 -         * Make sure that the mapped frame is an already-validated L2 table. 
  32.373 -         * If so, atomically increment the count (checking for overflow).
  32.374 -         */
  32.375 -        page = &frame_table[l2_pgentry_to_pagenr(l2e)];
  32.376 -        y = page->u.inuse.type_info;
  32.377 -        do {
  32.378 -            x = y;
  32.379 -            if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
  32.380 -                 unlikely((x & (PGT_type_mask|PGT_validated)) != 
  32.381 -                          (PGT_l2_page_table|PGT_validated)) )
  32.382 -            {
  32.383 -                put_page(page);
  32.384 -                return 0;
  32.385 -            }
  32.386 -        }
  32.387 -        while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
  32.388 -    }
  32.389 -
  32.390 -    return 1;
  32.391 -}
  32.392 -
  32.393 -
  32.394 -static int
  32.395 -get_page_from_l1e(
  32.396 -    l1_pgentry_t l1e, struct domain *d)
  32.397 -{
  32.398 -    unsigned long l1v = l1_pgentry_val(l1e);
  32.399 -    unsigned long pfn = l1_pgentry_to_pagenr(l1e);
  32.400 -    struct pfn_info *page = &frame_table[pfn];
  32.401 -    extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
  32.402 -
  32.403 -    if ( !(l1v & _PAGE_PRESENT) )
  32.404 -        return 1;
  32.405 -
  32.406 -    if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
  32.407 -    {
  32.408 -        MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
  32.409 -        return 0;
  32.410 -    }
  32.411 -
  32.412 -    if ( unlikely(!pfn_is_ram(pfn)) )
  32.413 -    {
  32.414 -        /* Revert to caller privileges if FD == DOMID_IO. */
  32.415 -        if ( d == dom_io )
  32.416 -            d = current->domain;
  32.417 -
  32.418 -        if ( IS_PRIV(d) )
  32.419 -            return 1;
  32.420 -
  32.421 -        if ( IS_CAPABLE_PHYSDEV(d) )
  32.422 -            return domain_iomem_in_pfn(d, pfn);
  32.423 -
  32.424 -        MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
  32.425 -        return 0;
  32.426 -    }
  32.427 -
  32.428 -    return ((l1v & _PAGE_RW) ?
  32.429 -            get_page_and_type(page, d, PGT_writable_page) :
  32.430 -            get_page(page, d));
  32.431 -}
  32.432 -
  32.433 -
  32.434 -/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
  32.435 -static int 
  32.436 -get_page_from_l2e(
  32.437 -    l2_pgentry_t l2e, unsigned long pfn,
  32.438 -    struct domain *d, unsigned long va_idx)
  32.439 -{
  32.440 -    int rc;
  32.441 -
  32.442 -    if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
  32.443 -        return 1;
  32.444 -
  32.445 -    if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
  32.446 -    {
  32.447 -        MEM_LOG("Bad L2 page type settings %04lx",
  32.448 -                l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
  32.449 -        return 0;
  32.450 -    }
  32.451 -
  32.452 -    rc = get_page_and_type_from_pagenr(
  32.453 -        l2_pgentry_to_pagenr(l2e), 
  32.454 -        PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
  32.455 -
  32.456 -    if ( unlikely(!rc) )
  32.457 -        return get_linear_pagetable(l2e, pfn, d);
  32.458 -
  32.459 -    return 1;
  32.460 -}
  32.461 -
  32.462 -
  32.463 -static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
  32.464 -{
  32.465 -    unsigned long    l1v  = l1_pgentry_val(l1e);
  32.466 -    unsigned long    pfn  = l1_pgentry_to_pagenr(l1e);
  32.467 -    struct pfn_info *page = &frame_table[pfn];
  32.468 -    struct domain   *e;
  32.469 -
  32.470 -    if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
  32.471 -        return;
  32.472 -
  32.473 -    e = page_get_owner(page);
  32.474 -    if ( unlikely(e != d) )
  32.475 -    {
  32.476 -        /*
  32.477 -         * Unmap a foreign page that may have been mapped via a grant table.
  32.478 -         * Note that this can fail for a privileged domain that can map foreign
  32.479 -         * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
  32.480 -         * counted via a grant entry and some counted directly in the page
  32.481 -         * structure's reference count. Note that reference counts won't get
  32.482 -         * dangerously confused as long as we always try to decrement the
  32.483 -         * grant entry first. We may end up with a mismatch between which
  32.484 -         * mappings and which unmappings are counted via the grant entry, but
  32.485 -         * really it doesn't matter as privileged domains have carte blanche.
  32.486 -         */
  32.487 -        if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
  32.488 -            return;
  32.489 -        /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
  32.490 -    }
  32.491 -
  32.492 -    if ( l1v & _PAGE_RW )
  32.493 -    {
  32.494 -        put_page_and_type(page);
  32.495 -    }
  32.496 -    else
  32.497 -    {
  32.498 -        /* We expect this is rare so we blow the entire shadow LDT. */
  32.499 -        if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) == 
  32.500 -                       PGT_ldt_page)) &&
  32.501 -             unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
  32.502 -            invalidate_shadow_ldt(e->exec_domain[0]);
  32.503 -        put_page(page);
  32.504 -    }
  32.505 -}
  32.506 -
  32.507 -
  32.508 -/*
  32.509 - * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
  32.510 - * Note also that this automatically deals correctly with linear p.t.'s.
  32.511 - */
  32.512 -static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
  32.513 -{
  32.514 -    if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) && 
  32.515 -         ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
  32.516 -        put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
  32.517 -}
  32.518 -
  32.519 -
  32.520 -static int alloc_l2_table(struct pfn_info *page)
  32.521 -{
  32.522 -    struct domain *d = page_get_owner(page);
  32.523 -    unsigned long  page_nr = page_to_pfn(page);
  32.524 -    l2_pgentry_t  *pl2e;
  32.525 -    int            i;
  32.526 -   
  32.527 -    pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
  32.528 -
  32.529 -    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  32.530 -        if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
  32.531 -            goto fail;
  32.532 -
  32.533 -#if defined(__i386__)
  32.534 -    /* Now we add our private high mappings. */
  32.535 -    memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
  32.536 -           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
  32.537 -           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
  32.538 -    pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
  32.539 -        mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  32.540 -    pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
  32.541 -        mk_l2_pgentry(__pa(page_get_owner(page)->arch.mm_perdomain_pt) | 
  32.542 -                      __PAGE_HYPERVISOR);
  32.543 -#endif
  32.544 -
  32.545 -    unmap_domain_mem(pl2e);
  32.546 -    return 1;
  32.547 -
  32.548 - fail:
  32.549 -    while ( i-- > 0 )
  32.550 -        put_page_from_l2e(pl2e[i], page_nr);
  32.551 -
  32.552 -    unmap_domain_mem(pl2e);
  32.553 -    return 0;
  32.554 -}
  32.555 -
  32.556 -
  32.557 -static int alloc_l1_table(struct pfn_info *page)
  32.558 -{
  32.559 -    struct domain *d = page_get_owner(page);
  32.560 -    unsigned long  page_nr = page_to_pfn(page);
  32.561 -    l1_pgentry_t  *pl1e;
  32.562 -    int            i;
  32.563 -
  32.564 -    pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
  32.565 -
  32.566 -    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
  32.567 -        if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
  32.568 -            goto fail;
  32.569 -
  32.570 -    unmap_domain_mem(pl1e);
  32.571 -    return 1;
  32.572 -
  32.573 - fail:
  32.574 -    while ( i-- > 0 )
  32.575 -        put_page_from_l1e(pl1e[i], d);
  32.576 -
  32.577 -    unmap_domain_mem(pl1e);
  32.578 -    return 0;
  32.579 -}
  32.580 -
  32.581 -
  32.582 -static void free_l2_table(struct pfn_info *page)
  32.583 -{
  32.584 -    unsigned long page_nr = page - frame_table;
  32.585 -    l2_pgentry_t *pl2e;
  32.586 -    int i;
  32.587 -
  32.588 -    pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
  32.589 -
  32.590 -    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  32.591 -        put_page_from_l2e(pl2e[i], page_nr);
  32.592 -
  32.593 -    unmap_domain_mem(pl2e);
  32.594 -}
  32.595 -
  32.596 -
  32.597 -static void free_l1_table(struct pfn_info *page)
  32.598 -{
  32.599 -    struct domain *d = page_get_owner(page);
  32.600 -    unsigned long page_nr = page - frame_table;
  32.601 -    l1_pgentry_t *pl1e;
  32.602 -    int i;
  32.603 -
  32.604 -    pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
  32.605 -
  32.606 -    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
  32.607 -        put_page_from_l1e(pl1e[i], d);
  32.608 -
  32.609 -    unmap_domain_mem(pl1e);
  32.610 -}
  32.611 -
  32.612 -
  32.613 -static inline int update_l2e(l2_pgentry_t *pl2e, 
  32.614 -                             l2_pgentry_t  ol2e, 
  32.615 -                             l2_pgentry_t  nl2e)
  32.616 -{
  32.617 -    unsigned long o = cmpxchg((unsigned long *)pl2e, 
  32.618 -                              l2_pgentry_val(ol2e), 
  32.619 -                              l2_pgentry_val(nl2e));
  32.620 -    if ( o != l2_pgentry_val(ol2e) )
  32.621 -        MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
  32.622 -                l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
  32.623 -    return (o == l2_pgentry_val(ol2e));
  32.624 -}
  32.625 -
  32.626 -
  32.627 -/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
  32.628 -static int mod_l2_entry(l2_pgentry_t *pl2e, 
  32.629 -                        l2_pgentry_t nl2e, 
  32.630 -                        unsigned long pfn)
  32.631 -{
  32.632 -    l2_pgentry_t ol2e;
  32.633 -    unsigned long _ol2e;
  32.634 -
  32.635 -    if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
  32.636 -                  DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
  32.637 -    {
  32.638 -        MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
  32.639 -        return 0;
  32.640 -    }
  32.641 -
  32.642 -    if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
  32.643 -        return 0;
  32.644 -    ol2e = mk_l2_pgentry(_ol2e);
  32.645 -
  32.646 -    if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
  32.647 -    {
  32.648 -        /* Differ in mapping (bits 12-31) or presence (bit 0)? */
  32.649 -        if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
  32.650 -            return update_l2e(pl2e, ol2e, nl2e);
  32.651 -
  32.652 -        if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
  32.653 -                                        ((unsigned long)pl2e & 
  32.654 -                                         ~PAGE_MASK) >> 2)) )
  32.655 -            return 0;
  32.656 -
  32.657 -        if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
  32.658 -        {
  32.659 -            put_page_from_l2e(nl2e, pfn);
  32.660 -            return 0;
  32.661 -        }
  32.662 -        
  32.663 -        put_page_from_l2e(ol2e, pfn);
  32.664 -        return 1;
  32.665 -    }
  32.666 -
  32.667 -    if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
  32.668 -        return 0;
  32.669 -
  32.670 -    put_page_from_l2e(ol2e, pfn);
  32.671 -    return 1;
  32.672 -}
  32.673 -
  32.674 -
  32.675 -static inline int update_l1e(l1_pgentry_t *pl1e, 
  32.676 -                             l1_pgentry_t  ol1e, 
  32.677 -                             l1_pgentry_t  nl1e)
  32.678 -{
  32.679 -    unsigned long o = l1_pgentry_val(ol1e);
  32.680 -    unsigned long n = l1_pgentry_val(nl1e);
  32.681 -
  32.682 -    if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
  32.683 -         unlikely(o != l1_pgentry_val(ol1e)) )
  32.684 -    {
  32.685 -        MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
  32.686 -                l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
  32.687 -        return 0;
  32.688 -    }
  32.689 -
  32.690 -    return 1;
  32.691 -}
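
A hedged sketch of the compare-and-swap discipline that update_l2e/update_l1e
above implement: the entry is replaced only if it still holds the value that
was validated, so a racing writer forces the caller to back out its reference
counts. C11 atomics stand in for Xen's cmpxchg/cmpxchg_user; the values are
invented:

    #include <stdatomic.h>
    #include <stdio.h>

    static int update_entry(_Atomic unsigned long *pte,
                            unsigned long old_val, unsigned long new_val)
    {
        /* Succeeds only if *pte still equals old_val; otherwise no change. */
        return atomic_compare_exchange_strong(pte, &old_val, new_val);
    }

    int main(void)
    {
        _Atomic unsigned long pte = 0x1000 | 1;                     /* present */
        printf("%d\n", update_entry(&pte, 0x1000 | 1, 0x2000 | 1)); /* 1: swapped */
        printf("%d\n", update_entry(&pte, 0x1000 | 1, 0x3000 | 1)); /* 0: raced   */
        return 0;
    }
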
  32.692 -
  32.693 -
  32.694 -/* Update the L1 entry at pl1e to new value nl1e. */
  32.695 -static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
  32.696 -{
  32.697 -    l1_pgentry_t ol1e;
  32.698 -    unsigned long _ol1e;
  32.699 -    struct domain *d = current->domain;
  32.700 -
  32.701 -    if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
  32.702 -    {
  32.703 -        MEM_LOG("Bad get_user\n");
  32.704 -        return 0;
  32.705 -    }
  32.706 -    
  32.707 -    ol1e = mk_l1_pgentry(_ol1e);
  32.708 -
  32.709 -    if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
  32.710 -    {
  32.711 -        /* Differ in mapping (bits 12-31), r/w (bit 1), or presence (bit 0)? */
  32.712 -        if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
  32.713 -            return update_l1e(pl1e, ol1e, nl1e);
  32.714 -
  32.715 -        if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
  32.716 -            return 0;
  32.717 -        
  32.718 -        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
  32.719 -        {
  32.720 -            put_page_from_l1e(nl1e, d);
  32.721 -            return 0;
  32.722 -        }
  32.723 -        
  32.724 -        put_page_from_l1e(ol1e, d);
  32.725 -        return 1;
  32.726 -    }
  32.727 -
  32.728 -    if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
  32.729 -        return 0;
  32.730 -    
  32.731 -    put_page_from_l1e(ol1e, d);
  32.732 -    return 1;
  32.733 -}
  32.734 -
  32.735 -
  32.736 -int alloc_page_type(struct pfn_info *page, unsigned int type)
  32.737 -{
  32.738 -    switch ( type )
  32.739 -    {
  32.740 -    case PGT_l1_page_table:
  32.741 -        return alloc_l1_table(page);
  32.742 -    case PGT_l2_page_table:
  32.743 -        return alloc_l2_table(page);
  32.744 -    case PGT_gdt_page:
  32.745 -    case PGT_ldt_page:
  32.746 -        return alloc_segdesc_page(page);
  32.747 -    default:
  32.748 -        printk("Bad type in alloc_page_type %x t=%x c=%x\n", 
  32.749 -               type, page->u.inuse.type_info,
  32.750 -               page->count_info);
  32.751 -        BUG();
  32.752 -    }
  32.753 -
  32.754 -    return 0;
  32.755 -}
  32.756 -
  32.757 -
  32.758 -void free_page_type(struct pfn_info *page, unsigned int type)
  32.759 -{
  32.760 -    struct domain *d = page_get_owner(page);
  32.761 -
  32.762 -    switch ( type )
  32.763 -    {
  32.764 -    case PGT_l1_page_table:
  32.765 -        free_l1_table(page);
  32.766 -        break;
  32.767 -
  32.768 -    case PGT_l2_page_table:
  32.769 -        free_l2_table(page);
  32.770 -        break;
  32.771 -
  32.772 -    default:
  32.773 -        BUG();
  32.774 -    }
  32.775 -
  32.776 -    if ( unlikely(d->arch.shadow_mode) && 
  32.777 -         (get_shadow_status(d, page_to_pfn(page)) & PSH_shadowed) )
  32.778 -    {
  32.779 -        unshadow_table(page_to_pfn(page), type);
  32.780 -        put_shadow_status(d);
  32.781 -    }
  32.782 -}
  32.783 -
  32.784 -
  32.785 -void put_page_type(struct pfn_info *page)
  32.786 -{
  32.787 -    u32 nx, x, y = page->u.inuse.type_info;
  32.788 -
  32.789 - again:
  32.790 -    do {
  32.791 -        x  = y;
  32.792 -        nx = x - 1;
  32.793 -
  32.794 -        ASSERT((x & PGT_count_mask) != 0);
  32.795 -
  32.796 -        /*
  32.797 -         * The page should always be validated while a reference is held. The 
  32.798 -         * exception is during domain destruction, when we forcibly invalidate 
  32.799 -         * page-table pages if we detect a referential loop.
  32.800 -         * See domain.c:relinquish_list().
  32.801 -         */
  32.802 -        ASSERT((x & PGT_validated) || 
  32.803 -               test_bit(DF_DYING, &page_get_owner(page)->d_flags));
  32.804 -
  32.805 -        if ( unlikely((nx & PGT_count_mask) == 0) )
  32.806 -        {
  32.807 -            /* Record TLB information for flush later. Races are harmless. */
  32.808 -            page->tlbflush_timestamp = tlbflush_current_time();
  32.809 -            
  32.810 -            if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
  32.811 -                 likely(nx & PGT_validated) )
  32.812 -            {
  32.813 -                /*
  32.814 -                 * Page-table pages must be unvalidated when count is zero. The
  32.815 -                 * 'free' is safe because the refcnt is non-zero and validated
  32.816 -                 * bit is clear => other ops will spin or fail.
  32.817 -                 */
  32.818 -                if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 
  32.819 -                                           x & ~PGT_validated)) != x) )
  32.820 -                    goto again;
  32.821 -                /* We cleared the 'valid bit' so we do the clear up. */
  32.822 -                free_page_type(page, x & PGT_type_mask);
  32.823 -                /* Carry on, but with the 'valid bit' now clear. */
  32.824 -                x  &= ~PGT_validated;
  32.825 -                nx &= ~PGT_validated;
  32.826 -            }
  32.827 -        }
  32.828 -        else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == 
  32.829 -                           (PGT_pinned | 1)) )
  32.830 -        {
  32.831 -            /* Page is now only pinned. Make the back pointer mutable again. */
  32.832 -            nx |= PGT_va_mutable;
  32.833 -        }
  32.834 -    }
  32.835 -    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
  32.836 -}
  32.837 -
  32.838 -
  32.839 -int get_page_type(struct pfn_info *page, u32 type)
  32.840 -{
  32.841 -    u32 nx, x, y = page->u.inuse.type_info;
  32.842 -
  32.843 - again:
  32.844 -    do {
  32.845 -        x  = y;
  32.846 -        nx = x + 1;
  32.847 -        if ( unlikely((nx & PGT_count_mask) == 0) )
  32.848 -        {
  32.849 -            MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
  32.850 -            return 0;
  32.851 -        }
  32.852 -        else if ( unlikely((x & PGT_count_mask) == 0) )
  32.853 -        {
  32.854 -            if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
  32.855 -            {
  32.856 -                /*
  32.857 -                 * On type change we check to flush stale TLB entries. This 
  32.858 -                 * may be unnecessary (e.g., page was GDT/LDT) but those
  32.859 -                 * circumstances should be very rare.
  32.860 -                 */
  32.861 -                struct domain *d = page_get_owner(page);
  32.862 -                if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor],
  32.863 -                                         page->tlbflush_timestamp)) )
  32.864 -                {
  32.865 -                    perfc_incr(need_flush_tlb_flush);
  32.866 -                    flush_tlb_cpu(d->exec_domain[0]->processor);
  32.867 -                }
  32.868 -
  32.869 -                /* We lose existing type, back pointer, and validity. */
  32.870 -                nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
  32.871 -                nx |= type;
  32.872 -
  32.873 -                /* No special validation needed for writable pages. */
  32.874 -                /* Page tables and GDT/LDT need to be scanned for validity. */
  32.875 -                if ( type == PGT_writable_page )
  32.876 -                    nx |= PGT_validated;
  32.877 -            }
  32.878 -        }
  32.879 -        else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
  32.880 -        {
  32.881 -            if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
  32.882 -            {
  32.883 -                if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
  32.884 -                     ((type & PGT_type_mask) != PGT_l1_page_table) )
  32.885 -                    MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
  32.886 -                            x & PGT_type_mask, type, page_to_pfn(page));
  32.887 -                return 0;
  32.888 -            }
  32.889 -            else if ( (x & PGT_va_mask) == PGT_va_mutable )
  32.890 -            {
  32.891 -                /* The va backpointer is mutable, hence we update it. */
  32.892 -                nx &= ~PGT_va_mask;
  32.893 -                nx |= type; /* we know the actual type is correct */
  32.894 -            }
  32.895 -            else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
  32.896 -            {
  32.897 -                /* This table is potentially mapped at multiple locations. */
  32.898 -                nx &= ~PGT_va_mask;
  32.899 -                nx |= PGT_va_unknown;
  32.900 -            }
  32.901 -        }
  32.902 -        else if ( unlikely(!(x & PGT_validated)) )
  32.903 -        {
  32.904 -            /* Someone else is updating validation of this page. Wait... */
  32.905 -            while ( (y = page->u.inuse.type_info) == x )
  32.906 -            {
  32.907 -                rep_nop();
  32.908 -                barrier();
  32.909 -            }
  32.910 -            goto again;
  32.911 -        }
  32.912 -    }
  32.913 -    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
  32.914 -
  32.915 -    if ( unlikely(!(nx & PGT_validated)) )
  32.916 -    {
  32.917 -        /* Try to validate page type; drop the new reference on failure. */
  32.918 -        if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
  32.919 -        {
  32.920 -            MEM_LOG("Error while validating pfn %08lx for type %08x."
  32.921 -                    " caf=%08x taf=%08x\n",
  32.922 -                    page_to_pfn(page), type,
  32.923 -                    page->count_info,
  32.924 -                    page->u.inuse.type_info);
   32.925 -            /* No one else can get a reference. We hold the only ref. */
  32.926 -            page->u.inuse.type_info = 0;
  32.927 -            return 0;
  32.928 -        }
  32.929 -
   32.930 -        /* No one else is updating simultaneously. */
  32.931 -        __set_bit(_PGT_validated, &page->u.inuse.type_info);
  32.932 -    }
  32.933 -
  32.934 -    return 1;
  32.935 -}
  32.936 -
  32.937 -
  32.938 -int new_guest_cr3(unsigned long pfn)
  32.939 -{
  32.940 -    struct exec_domain *ed = current;
  32.941 -    struct domain *d = ed->domain;
  32.942 -    int okay, cpu = smp_processor_id();
  32.943 -    unsigned long old_base_pfn;
  32.944 -    
  32.945 -    okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
  32.946 -    if ( likely(okay) )
  32.947 -    {
  32.948 -        invalidate_shadow_ldt(ed);
  32.949 -
  32.950 -        percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
  32.951 -        old_base_pfn = pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT;
  32.952 -        ed->arch.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
  32.953 -
  32.954 -        shadow_mk_pagetable(ed);
  32.955 -
  32.956 -        write_ptbase(ed);
  32.957 -
  32.958 -        put_page_and_type(&frame_table[old_base_pfn]);
  32.959 -    }
  32.960 -    else
  32.961 -    {
  32.962 -        MEM_LOG("Error while installing new baseptr %08lx", pfn);
  32.963 -    }
  32.964 -
  32.965 -    return okay;
  32.966 -}
  32.967 -
  32.968 -static int do_extended_command(unsigned long ptr, unsigned long val)
  32.969 -{
  32.970 -    int okay = 1, cpu = smp_processor_id();
  32.971 -    unsigned int cmd = val & MMUEXT_CMD_MASK;
  32.972 -    unsigned long pfn = ptr >> PAGE_SHIFT;
  32.973 -    struct pfn_info *page = &frame_table[pfn];
  32.974 -    struct exec_domain *ed = current;
  32.975 -    struct domain *d = ed->domain, *nd, *e;
  32.976 -    u32 x, y;
  32.977 -    domid_t domid;
  32.978 -    grant_ref_t gntref;
  32.979 -
  32.980 -    switch ( cmd )
  32.981 -    {
  32.982 -    case MMUEXT_PIN_L1_TABLE:
  32.983 -    case MMUEXT_PIN_L2_TABLE:
  32.984 -        /*
  32.985 -         * We insist that, if you pin an L1 page, it's the first thing that
  32.986 -         * you do to it. This is because we require the backptr to still be
  32.987 -         * mutable. This assumption seems safe.
  32.988 -         */
  32.989 -        okay = get_page_and_type_from_pagenr(
  32.990 -            pfn, 
  32.991 -            ((cmd==MMUEXT_PIN_L2_TABLE) ? 
  32.992 -             PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
  32.993 -            FOREIGNDOM);
  32.994 -
  32.995 -        if ( unlikely(!okay) )
  32.996 -        {
  32.997 -            MEM_LOG("Error while pinning pfn %08lx", pfn);
  32.998 -            break;
  32.999 -        }
 32.1000 -
 32.1001 -        if ( unlikely(test_and_set_bit(_PGT_pinned,
 32.1002 -                                       &page->u.inuse.type_info)) )
 32.1003 -        {
 32.1004 -            MEM_LOG("Pfn %08lx already pinned", pfn);
 32.1005 -            put_page_and_type(page);
 32.1006 -            okay = 0;
 32.1007 -            break;
 32.1008 -        }
 32.1009 -
 32.1010 -        break;
 32.1011 -
 32.1012 -    case MMUEXT_UNPIN_TABLE:
 32.1013 -        if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
 32.1014 -        {
 32.1015 -            MEM_LOG("Page %08lx bad domain (dom=%p)",
 32.1016 -                    ptr, page_get_owner(page));
 32.1017 -        }
 32.1018 -        else if ( likely(test_and_clear_bit(_PGT_pinned, 
 32.1019 -                                            &page->u.inuse.type_info)) )
 32.1020 -        {
 32.1021 -            put_page_and_type(page);
 32.1022 -            put_page(page);
 32.1023 -        }
 32.1024 -        else
 32.1025 -        {
 32.1026 -            okay = 0;
 32.1027 -            put_page(page);
 32.1028 -            MEM_LOG("Pfn %08lx not pinned", pfn);
 32.1029 -        }
 32.1030 -        break;
 32.1031 -
 32.1032 -    case MMUEXT_NEW_BASEPTR:
 32.1033 -        okay = new_guest_cr3(pfn);
 32.1034 -        break;
 32.1035 -        
 32.1036 -    case MMUEXT_TLB_FLUSH:
 32.1037 -        percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
 32.1038 -        break;
 32.1039 -    
 32.1040 -    case MMUEXT_INVLPG:
 32.1041 -        __flush_tlb_one(ptr);
 32.1042 -        break;
 32.1043 -
 32.1044 -    case MMUEXT_FLUSH_CACHE:
 32.1045 -        if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
 32.1046 -        {
 32.1047 -            MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
 32.1048 -            okay = 0;
 32.1049 -        }
 32.1050 -        else
 32.1051 -        {
 32.1052 -            wbinvd();
 32.1053 -        }
 32.1054 -        break;
 32.1055 -
 32.1056 -    case MMUEXT_SET_LDT:
 32.1057 -    {
 32.1058 -        unsigned long ents = val >> MMUEXT_CMD_SHIFT;
 32.1059 -        if ( ((ptr & (PAGE_SIZE-1)) != 0) || 
 32.1060 -             (ents > 8192) ||
 32.1061 -             ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
 32.1062 -             ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
 32.1063 -        {
 32.1064 -            okay = 0;
 32.1065 -            MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
 32.1066 -        }
 32.1067 -        else if ( (ed->arch.ldt_ents != ents) || 
 32.1068 -                  (ed->arch.ldt_base != ptr) )
 32.1069 -        {
 32.1070 -            invalidate_shadow_ldt(ed);
 32.1071 -            ed->arch.ldt_base = ptr;
 32.1072 -            ed->arch.ldt_ents = ents;
 32.1073 -            load_LDT(ed);
 32.1074 -            percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
 32.1075 -            if ( ents != 0 )
 32.1076 -                percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
 32.1077 -        }
 32.1078 -        break;
 32.1079 -    }
 32.1080 -
 32.1081 -    case MMUEXT_SET_FOREIGNDOM:
 32.1082 -        domid = (domid_t)(val >> 16);
 32.1083 -
 32.1084 -        if ( (e = percpu_info[cpu].foreign) != NULL )
 32.1085 -            put_domain(e);
 32.1086 -        percpu_info[cpu].foreign = NULL;
 32.1087 -
 32.1088 -        if ( !IS_PRIV(d) )
 32.1089 -        {
 32.1090 -            switch ( domid )
 32.1091 -            {
 32.1092 -            case DOMID_IO:
 32.1093 -                get_knownalive_domain(dom_io);
 32.1094 -                percpu_info[cpu].foreign = dom_io;
 32.1095 -                break;
 32.1096 -            default:
 32.1097 -                MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
 32.1098 -                okay = 0;
 32.1099 -                break;
 32.1100 -            }
 32.1101 -        }
 32.1102 -        else
 32.1103 -        {
 32.1104 -            percpu_info[cpu].foreign = e = find_domain_by_id(domid);
 32.1105 -            if ( e == NULL )
 32.1106 -            {
 32.1107 -                switch ( domid )
 32.1108 -                {
 32.1109 -                case DOMID_XEN:
 32.1110 -                    get_knownalive_domain(dom_xen);
 32.1111 -                    percpu_info[cpu].foreign = dom_xen;
 32.1112 -                    break;
 32.1113 -                case DOMID_IO:
 32.1114 -                    get_knownalive_domain(dom_io);
 32.1115 -                    percpu_info[cpu].foreign = dom_io;
 32.1116 -                    break;
 32.1117 -                default:
 32.1118 -                    MEM_LOG("Unknown domain '%u'", domid);
 32.1119 -                    okay = 0;
 32.1120 -                    break;
 32.1121 -                }
 32.1122 -            }
 32.1123 -        }
 32.1124 -        break;
 32.1125 -
 32.1126 -    case MMUEXT_TRANSFER_PAGE:
 32.1127 -        domid  = (domid_t)(val >> 16);
 32.1128 -        gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
 32.1129 -        
 32.1130 -        if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
 32.1131 -             unlikely(!pfn_is_ram(pfn)) ||
 32.1132 -             unlikely((e = find_domain_by_id(domid)) == NULL) )
 32.1133 -        {
 32.1134 -            MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
 32.1135 -            okay = 0;
 32.1136 -            break;
 32.1137 -        }
 32.1138 -
 32.1139 -        spin_lock(&d->page_alloc_lock);
 32.1140 -
 32.1141 -        /*
 32.1142 -         * The tricky bit: atomically release ownership while there is just one
 32.1143 -         * benign reference to the page (PGC_allocated). If that reference
 32.1144 -         * disappears then the deallocation routine will safely spin.
 32.1145 -         */
 32.1146 -        nd = page_get_owner(page);
 32.1147 -        y  = page->count_info;
 32.1148 -        do {
 32.1149 -            x = y;
 32.1150 -            if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != 
 32.1151 -                          (1|PGC_allocated)) ||
 32.1152 -                 unlikely(nd != d) )
 32.1153 -            {
 32.1154 -                MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
 32.1155 -                        " caf=%08x, taf=%08x\n", page_to_pfn(page),
 32.1156 -                        d, d->id, nd, x, page->u.inuse.type_info);
 32.1157 -                spin_unlock(&d->page_alloc_lock);
 32.1158 -                put_domain(e);
 32.1159 -                return 0;
 32.1160 -            }
 32.1161 -            __asm__ __volatile__(
 32.1162 -                LOCK_PREFIX "cmpxchg8b %2"
 32.1163 -                : "=d" (nd), "=a" (y),
 32.1164 -                "=m" (*(volatile u64 *)(&page->count_info))
 32.1165 -                : "0" (d), "1" (x), "c" (NULL), "b" (x) );
 32.1166 -        } 
 32.1167 -        while ( unlikely(nd != d) || unlikely(y != x) );
 32.1168 -
 32.1169 -        /*
 32.1170 -         * Unlink from 'd'. At least one reference remains (now anonymous), so
  32.1171 -         * no one else is spinning to try to delete this page from 'd'.
 32.1172 -         */
 32.1173 -        d->tot_pages--;
 32.1174 -        list_del(&page->list);
 32.1175 -        
 32.1176 -        spin_unlock(&d->page_alloc_lock);
 32.1177 -
 32.1178 -        spin_lock(&e->page_alloc_lock);
 32.1179 -
 32.1180 -        /*
 32.1181 -         * Check that 'e' will accept the page and has reservation headroom.
 32.1182 -         * Also, a domain mustn't have PGC_allocated pages when it is dying.
 32.1183 -         */
 32.1184 -        ASSERT(e->tot_pages <= e->max_pages);
 32.1185 -        if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
 32.1186 -             unlikely(e->tot_pages == e->max_pages) ||
 32.1187 -             unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
 32.1188 -        {
 32.1189 -            MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
 32.1190 -                    "provided a bad grant ref, or is dying (%08lx).\n",
 32.1191 -                    e->tot_pages, e->max_pages, e->d_flags);
 32.1192 -            spin_unlock(&e->page_alloc_lock);
 32.1193 -            put_domain(e);
 32.1194 -            okay = 0;
 32.1195 -            break;
 32.1196 -        }
 32.1197 -
 32.1198 -        /* Okay, add the page to 'e'. */
 32.1199 -        if ( unlikely(e->tot_pages++ == 0) )
 32.1200 -            get_knownalive_domain(e);
 32.1201 -        list_add_tail(&page->list, &e->page_list);
 32.1202 -        page_set_owner(page, e);
 32.1203 -
 32.1204 -        spin_unlock(&e->page_alloc_lock);
 32.1205 -
 32.1206 -        /* Transfer is all done: tell the guest about its new page frame. */
 32.1207 -        gnttab_notify_transfer(e, gntref, pfn);
 32.1208 -        
 32.1209 -        put_domain(e);
 32.1210 -        break;
 32.1211 -
 32.1212 -    case MMUEXT_REASSIGN_PAGE:
 32.1213 -        if ( unlikely(!IS_PRIV(d)) )
 32.1214 -        {
 32.1215 -            MEM_LOG("Dom %u has no reassignment priv", d->id);
 32.1216 -            okay = 0;
 32.1217 -            break;
 32.1218 -        }
 32.1219 -
 32.1220 -        e = percpu_info[cpu].foreign;
 32.1221 -        if ( unlikely(e == NULL) )
 32.1222 -        {
 32.1223 -            MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
 32.1224 -            okay = 0;
 32.1225 -            break;
 32.1226 -        }
 32.1227 -
 32.1228 -        /*
 32.1229 -         * Grab both page_list locks, in order. This prevents the page from
 32.1230 -         * disappearing elsewhere while we modify the owner, and we'll need
 32.1231 -         * both locks if we're successful so that we can change lists.
 32.1232 -         */
 32.1233 -        if ( d < e )
 32.1234 -        {
 32.1235 -            spin_lock(&d->page_alloc_lock);
 32.1236 -            spin_lock(&e->page_alloc_lock);
 32.1237 -        }
 32.1238 -        else
 32.1239 -        {
 32.1240 -            spin_lock(&e->page_alloc_lock);
 32.1241 -            spin_lock(&d->page_alloc_lock);
 32.1242 -        }
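(The pointer comparison above is the usual deadlock-avoidance rule: every path
that needs both page_alloc locks takes them in one global order. Stated as a
small stand-alone helper, purely for illustration and assuming the two locks
are distinct:)

    /* Acquire two spinlocks in a fixed, address-based order (a != b). */
    static void lock_pair(spinlock_t *a, spinlock_t *b)
    {
        if ( a < b ) { spin_lock(a); spin_lock(b); }
        else         { spin_lock(b); spin_lock(a); }
    }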
 32.1243 -
 32.1244 -        /* A domain shouldn't have PGC_allocated pages when it is dying. */
 32.1245 -        if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
 32.1246 -             unlikely(IS_XEN_HEAP_FRAME(page)) )
 32.1247 -        {
 32.1248 -            MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
 32.1249 -            okay = 0;
 32.1250 -            goto reassign_fail;
 32.1251 -        }
 32.1252 -
 32.1253 -        /*
 32.1254 -         * The tricky bit: atomically change owner while there is just one
 32.1255 -         * benign reference to the page (PGC_allocated). If that reference
 32.1256 -         * disappears then the deallocation routine will safely spin.
 32.1257 -         */
 32.1258 -        nd = page_get_owner(page);
 32.1259 -        y  = page->count_info;
 32.1260 -        do {
 32.1261 -            x = y;
 32.1262 -            if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != 
 32.1263 -                          (1|PGC_allocated)) ||
 32.1264 -                 unlikely(nd != d) )
 32.1265 -            {
 32.1266 -                MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
 32.1267 -                        " caf=%08x, taf=%08x\n", page_to_pfn(page),
 32.1268 -                        d, d->id, nd, x, page->u.inuse.type_info);
 32.1269 -                okay = 0;
 32.1270 -                goto reassign_fail;
 32.1271 -            }
 32.1272 -            __asm__ __volatile__(
 32.1273 -                LOCK_PREFIX "cmpxchg8b %3"
 32.1274 -                : "=d" (nd), "=a" (y), "=c" (e),
 32.1275 -                "=m" (*(volatile u64 *)(&page->count_info))
 32.1276 -                : "0" (d), "1" (x), "c" (e), "b" (x) );
 32.1277 -        } 
 32.1278 -        while ( unlikely(nd != d) || unlikely(y != x) );
 32.1279 -        
 32.1280 -        /*
 32.1281 -         * Unlink from 'd'. We transferred at least one reference to 'e', so
  32.1282 -         * no one else is spinning to try to delete this page from 'd'.
 32.1283 -         */
 32.1284 -        d->tot_pages--;
 32.1285 -        list_del(&page->list);
 32.1286 -        
 32.1287 -        /*
 32.1288 -         * Add the page to 'e'. Someone may already have removed the last
 32.1289 -         * reference and want to remove the page from 'e'. However, we have
 32.1290 -         * the lock so they'll spin waiting for us.
 32.1291 -         */
 32.1292 -        if ( unlikely(e->tot_pages++ == 0) )
 32.1293 -            get_knownalive_domain(e);
 32.1294 -        list_add_tail(&page->list, &e->page_list);
 32.1295 -
 32.1296 -    reassign_fail:        
 32.1297 -        spin_unlock(&d->page_alloc_lock);
 32.1298 -        spin_unlock(&e->page_alloc_lock);
 32.1299 -        break;
 32.1300 -
 32.1301 -    case MMUEXT_CLEAR_FOREIGNDOM:
 32.1302 -        if ( (e = percpu_info[cpu].foreign) != NULL )
 32.1303 -            put_domain(e);
 32.1304 -        percpu_info[cpu].foreign = NULL;
 32.1305 -        break;
 32.1306 -
 32.1307 -    default:
 32.1308 -        MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
 32.1309 -        okay = 0;
 32.1310 -        break;
 32.1311 -    }
 32.1312 -
 32.1313 -    return okay;
 32.1314 -}
 32.1315 -
 32.1316 -int do_mmu_update(
 32.1317 -    mmu_update_t *ureqs, unsigned int count, unsigned int *pdone)
 32.1318 -{
 32.1319 -/*
 32.1320 - * We steal the m.s.b. of the @count parameter to indicate whether this
 32.1321 - * invocation of do_mmu_update() is resuming a previously preempted call.
 32.1322 - * We steal the next 15 bits to remember the current FOREIGNDOM.
 32.1323 - */
 32.1324 -#define MMU_UPDATE_PREEMPTED          (~(~0U>>1))
 32.1325 -#define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16)
 32.1326 -#define MMU_UPDATE_PREEMPT_FDOM_MASK  (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
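(Illustrative only, mirroring the macros above on a 32-bit unsigned int:
the single 'count' argument carries the number of outstanding updates, a
continuation flag, and the FOREIGNDOM in force when the call was preempted.
'remaining' and 'foreign_id' below are placeholder variables.)

    /* Building the continuation argument, as the preemption path below does: */
    unsigned int resume = remaining
                        | ((unsigned int)foreign_id << MMU_UPDATE_PREEMPT_FDOM_SHIFT)
                        | MMU_UPDATE_PREEMPTED;

    /* Recovering the pieces on re-entry: */
    domid_t      fdom = (resume & MMU_UPDATE_PREEMPT_FDOM_MASK)
                        >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
    unsigned int left = resume & ~(MMU_UPDATE_PREEMPTED | MMU_UPDATE_PREEMPT_FDOM_MASK);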
 32.1327 -
 32.1328 -    mmu_update_t req;
 32.1329 -    unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
 32.1330 -    struct pfn_info *page;
 32.1331 -    int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
 32.1332 -    unsigned int cmd, done = 0;
 32.1333 -    unsigned long prev_spfn = 0;
 32.1334 -    l1_pgentry_t *prev_spl1e = 0;
 32.1335 -    struct exec_domain *ed = current;
 32.1336 -    struct domain *d = ed->domain;
 32.1337 -    u32 type_info;
 32.1338 -    domid_t domid;
 32.1339 -
 32.1340 -    LOCK_BIGLOCK(d);
 32.1341 -
 32.1342 -    cleanup_writable_pagetable(d);
 32.1343 -
 32.1344 -    /*
 32.1345 -     * If we are resuming after preemption, read how much work we have already
 32.1346 -     * done. This allows us to set the @done output parameter correctly.
 32.1347 -     * We also reset FOREIGNDOM here.
 32.1348 -     */
 32.1349 -    if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
 32.1350 -    {
 32.1351 -        if ( !(count & MMU_UPDATE_PREEMPTED) )
 32.1352 -        {
 32.1353 -            /* Count overflow into private FOREIGNDOM field. */
 32.1354 -            MEM_LOG("do_mmu_update count is too large");
 32.1355 -            rc = -EINVAL;
 32.1356 -            goto out;
 32.1357 -        }
 32.1358 -        count &= ~MMU_UPDATE_PREEMPTED;
 32.1359 -        domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
 32.1360 -        count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
 32.1361 -        if ( unlikely(pdone != NULL) )
 32.1362 -            (void)get_user(done, pdone);
 32.1363 -        if ( (domid != current->domain->id) &&
 32.1364 -             !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
 32.1365 -        {
 32.1366 -            rc = -EINVAL;
 32.1367 -            goto out;
 32.1368 -        }
 32.1369 -    }
 32.1370 -
 32.1371 -    perfc_incrc(calls_to_mmu_update); 
 32.1372 -    perfc_addc(num_page_updates, count);
 32.1373 -
 32.1374 -    if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
 32.1375 -    {
 32.1376 -        rc = -EFAULT;
 32.1377 -        goto out;
 32.1378 -    }
 32.1379 -
 32.1380 -    for ( i = 0; i < count; i++ )
 32.1381 -    {
 32.1382 -        if ( hypercall_preempt_check() )
 32.1383 -        {
 32.1384 -            rc = hypercall_create_continuation(
 32.1385 -                __HYPERVISOR_mmu_update, 3, ureqs, 
 32.1386 -                (count - i) |
 32.1387 -                (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) | 
 32.1388 -                MMU_UPDATE_PREEMPTED, pdone);
 32.1389 -            break;
 32.1390 -        }
 32.1391 -
 32.1392 -        if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
 32.1393 -        {
 32.1394 -            MEM_LOG("Bad __copy_from_user");
 32.1395 -            rc = -EFAULT;
 32.1396 -            break;
 32.1397 -        }
 32.1398 -
 32.1399 -        cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
 32.1400 -        pfn = req.ptr >> PAGE_SHIFT;
 32.1401 -
 32.1402 -        okay = 0;
 32.1403 -
 32.1404 -        switch ( cmd )
 32.1405 -        {
 32.1406 -            /*
 32.1407 -             * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
 32.1408 -             */
 32.1409 -        case MMU_NORMAL_PT_UPDATE:
 32.1410 -            if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
 32.1411 -            {
 32.1412 -                MEM_LOG("Could not get page for normal update");
 32.1413 -                break;
 32.1414 -            }
 32.1415 -
 32.1416 -            if ( likely(prev_pfn == pfn) )
 32.1417 -            {
 32.1418 -                va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
 32.1419 -            }
 32.1420 -            else
 32.1421 -            {
 32.1422 -                if ( prev_pfn != 0 )
 32.1423 -                    unmap_domain_mem((void *)va);
 32.1424 -                va = (unsigned long)map_domain_mem(req.ptr);
 32.1425 -                prev_pfn = pfn;
 32.1426 -            }
 32.1427 -
 32.1428 -            page = &frame_table[pfn];
 32.1429 -            switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
 32.1430 -            {
 32.1431 -            case PGT_l1_page_table: 
 32.1432 -                if ( likely(get_page_type(
 32.1433 -                    page, type_info & (PGT_type_mask|PGT_va_mask))) )
 32.1434 -                {
 32.1435 -                    okay = mod_l1_entry((l1_pgentry_t *)va, 
 32.1436 -                                        mk_l1_pgentry(req.val)); 
 32.1437 -
 32.1438 -                    if ( unlikely(d->arch.shadow_mode) && okay &&
 32.1439 -                         (get_shadow_status(d, page-frame_table) &
 32.1440 -                          PSH_shadowed) )
 32.1441 -                    {
 32.1442 -                        shadow_l1_normal_pt_update(
 32.1443 -                            req.ptr, req.val, &prev_spfn, &prev_spl1e);
 32.1444 -                        put_shadow_status(d);
 32.1445 -                    }
 32.1446 -
 32.1447 -                    put_page_type(page);
 32.1448 -                }
 32.1449 -                break;
 32.1450 -            case PGT_l2_page_table:
 32.1451 -                if ( likely(get_page_type(page, PGT_l2_page_table)) )
 32.1452 -                {
 32.1453 -                    okay = mod_l2_entry((l2_pgentry_t *)va, 
 32.1454 -                                        mk_l2_pgentry(req.val),
 32.1455 -                                        pfn); 
 32.1456 -
 32.1457 -                    if ( unlikely(d->arch.shadow_mode) && okay &&
 32.1458 -                         (get_shadow_status(d, page-frame_table) & 
 32.1459 -                          PSH_shadowed) )
 32.1460 -                    {
 32.1461 -                        shadow_l2_normal_pt_update(req.ptr, req.val);
 32.1462 -                        put_shadow_status(d);
 32.1463 -                    }
 32.1464 -
 32.1465 -                    put_page_type(page);
 32.1466 -                }
 32.1467 -                break;
 32.1468 -            default:
 32.1469 -                if ( likely(get_page_type(page, PGT_writable_page)) )
 32.1470 -                {
 32.1471 -                    *(unsigned long *)va = req.val;
 32.1472 -                    okay = 1;
 32.1473 -                    put_page_type(page);
 32.1474 -                }
 32.1475 -                break;
 32.1476 -            }
 32.1477 -
 32.1478 -            put_page(page);
 32.1479 -            break;
 32.1480 -
 32.1481 -        case MMU_MACHPHYS_UPDATE:
 32.1482 -            if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
 32.1483 -            {
 32.1484 -                MEM_LOG("Could not get page for mach->phys update");
 32.1485 -                break;
 32.1486 -            }
 32.1487 -
 32.1488 -            machine_to_phys_mapping[pfn] = req.val;
 32.1489 -            okay = 1;
 32.1490 -
 32.1491 -            /*
 32.1492 -             * If in log-dirty mode, mark the corresponding pseudo-physical
 32.1493 -             * page as dirty.
 32.1494 -             */
 32.1495 -            if ( unlikely(d->arch.shadow_mode == SHM_logdirty) && 
 32.1496 -                 mark_dirty(d, pfn) )
 32.1497 -                d->arch.shadow_dirty_block_count++;
 32.1498 -
 32.1499 -            put_page(&frame_table[pfn]);
 32.1500 -            break;
 32.1501 -
 32.1502 -            /*
 32.1503 -             * MMU_EXTENDED_COMMAND: Extended command is specified
  32.1504 -             * in the least-significant bits of the 'value' field.
 32.1505 -             */
 32.1506 -        case MMU_EXTENDED_COMMAND:
 32.1507 -            req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
 32.1508 -            okay = do_extended_command(req.ptr, req.val);
 32.1509 -            break;
 32.1510 -
 32.1511 -        default:
 32.1512 -            MEM_LOG("Invalid page update command %08lx", req.ptr);
 32.1513 -            break;
 32.1514 -        }
 32.1515 -
 32.1516 -        if ( unlikely(!okay) )
 32.1517 -        {
 32.1518 -            rc = -EINVAL;
 32.1519 -            break;
 32.1520 -        }
 32.1521 -
 32.1522 -        ureqs++;
 32.1523 -    }
 32.1524 -
 32.1525 - out:
 32.1526 -    if ( prev_pfn != 0 )
 32.1527 -        unmap_domain_mem((void *)va);
 32.1528 -
 32.1529 -    if ( unlikely(prev_spl1e != 0) ) 
 32.1530 -        unmap_domain_mem((void *)prev_spl1e);
 32.1531 -
 32.1532 -    deferred_ops = percpu_info[cpu].deferred_ops;
 32.1533 -    percpu_info[cpu].deferred_ops = 0;
 32.1534 -
 32.1535 -    if ( deferred_ops & DOP_FLUSH_TLB )
 32.1536 -        local_flush_tlb();
 32.1537 -        
 32.1538 -    if ( deferred_ops & DOP_RELOAD_LDT )
 32.1539 -        (void)map_ldt_shadow_page(0);
 32.1540 -
 32.1541 -    if ( unlikely(percpu_info[cpu].foreign != NULL) )
 32.1542 -    {
 32.1543 -        put_domain(percpu_info[cpu].foreign);
 32.1544 -        percpu_info[cpu].foreign = NULL;
 32.1545 -    }
 32.1546 -
 32.1547 -    /* Add incremental work we have done to the @done output parameter. */
 32.1548 -    if ( unlikely(pdone != NULL) )
 32.1549 -        __put_user(done + i, pdone);
 32.1550 -
 32.1551 -    UNLOCK_BIGLOCK(d);
 32.1552 -    return rc;
 32.1553 -}
 32.1554 -
 32.1555 -
 32.1556 -int do_update_va_mapping(unsigned long page_nr, 
 32.1557 -                         unsigned long val, 
 32.1558 -                         unsigned long flags)
 32.1559 -{
 32.1560 -    struct exec_domain *ed = current;
 32.1561 -    struct domain *d = ed->domain;
 32.1562 -    int err = 0;
 32.1563 -    unsigned int cpu = ed->processor;
 32.1564 -    unsigned long deferred_ops;
 32.1565 -
 32.1566 -    perfc_incrc(calls_to_update_va);
 32.1567 -
 32.1568 -    if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
 32.1569 -        return -EINVAL;
 32.1570 -
 32.1571 -    LOCK_BIGLOCK(d);
 32.1572 -
 32.1573 -    cleanup_writable_pagetable(d);
 32.1574 -
 32.1575 -    /*
 32.1576 -     * XXX When we make this support 4MB superpages we should also deal with 
 32.1577 -     * the case of updating L2 entries.
 32.1578 -     */
 32.1579 -
 32.1580 -    if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr], 
 32.1581 -                                mk_l1_pgentry(val))) )
 32.1582 -        err = -EINVAL;
 32.1583 -
 32.1584 -    if ( unlikely(d->arch.shadow_mode) )
 32.1585 -    {
 32.1586 -        unsigned long sval;
 32.1587 -
 32.1588 -        l1pte_propagate_from_guest(d, &val, &sval);
 32.1589 -
 32.1590 -        if ( unlikely(__put_user(sval, ((unsigned long *)(
 32.1591 -            &shadow_linear_pg_table[page_nr])))) )
 32.1592 -        {
 32.1593 -            /*
  32.1594 -             * Since L2s are guaranteed RW, failure indicates the page was not 
 32.1595 -             * shadowed, so ignore.
 32.1596 -             */
 32.1597 -            perfc_incrc(shadow_update_va_fail);
 32.1598 -        }
 32.1599 -
 32.1600 -        /*
 32.1601 -         * If we're in log-dirty mode then we need to note that we've updated
 32.1602 -         * the PTE in the PT-holding page. We need the machine frame number
 32.1603 -         * for this.
 32.1604 -         */
 32.1605 -        if ( d->arch.shadow_mode == SHM_logdirty )
 32.1606 -            mark_dirty(d, va_to_l1mfn(page_nr << PAGE_SHIFT));  
 32.1607 -  
 32.1608 -        check_pagetable(d, ed->arch.pagetable, "va"); /* debug */
 32.1609 -    }
 32.1610 -
 32.1611 -    deferred_ops = percpu_info[cpu].deferred_ops;
 32.1612 -    percpu_info[cpu].deferred_ops = 0;
 32.1613 -
 32.1614 -    if ( unlikely(deferred_ops & DOP_FLUSH_TLB) || 
 32.1615 -         unlikely(flags & UVMF_FLUSH_TLB) )
 32.1616 -        local_flush_tlb();
 32.1617 -    else if ( unlikely(flags & UVMF_INVLPG) )
 32.1618 -        __flush_tlb_one(page_nr << PAGE_SHIFT);
 32.1619 -
 32.1620 -    if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
 32.1621 -        (void)map_ldt_shadow_page(0);
 32.1622 -    
 32.1623 -    UNLOCK_BIGLOCK(d);
 32.1624 -
 32.1625 -    return err;
 32.1626 -}
 32.1627 -
 32.1628 -int do_update_va_mapping_otherdomain(unsigned long page_nr, 
 32.1629 -                                     unsigned long val, 
 32.1630 -                                     unsigned long flags,
 32.1631 -                                     domid_t domid)
 32.1632 -{
 32.1633 -    unsigned int cpu = smp_processor_id();
 32.1634 -    struct domain *d;
 32.1635 -    int rc;
 32.1636 -
 32.1637 -    if ( unlikely(!IS_PRIV(current->domain)) )
 32.1638 -        return -EPERM;
 32.1639 -
 32.1640 -    percpu_info[cpu].foreign = d = find_domain_by_id(domid);
 32.1641 -    if ( unlikely(d == NULL) )
 32.1642 -    {
 32.1643 -        MEM_LOG("Unknown domain '%u'", domid);
 32.1644 -        return -ESRCH;
 32.1645 -    }
 32.1646 -
 32.1647 -    rc = do_update_va_mapping(page_nr, val, flags);
 32.1648 -
 32.1649 -    put_domain(d);
 32.1650 -    percpu_info[cpu].foreign = NULL;
 32.1651 -
 32.1652 -    return rc;
 32.1653 -}
 32.1654 -
 32.1655 -
 32.1656 -
 32.1657 -/*************************
 32.1658 - * Writable Pagetables
 32.1659 - */
 32.1660 -
 32.1661 -ptwr_info_t ptwr_info[NR_CPUS];
 32.1662 -
 32.1663 -#ifdef VERBOSE
 32.1664 -int ptwr_debug = 0x0;
 32.1665 -#define PTWR_PRINTK(_f, _a...) \
 32.1666 - do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
 32.1667 -#define PTWR_PRINT_WHICH (which ? 'I' : 'A')
 32.1668 -#else
 32.1669 -#define PTWR_PRINTK(_f, _a...) ((void)0)
 32.1670 -#endif
 32.1671 -
 32.1672 -/* Flush the given writable p.t. page and write-protect it again. */
 32.1673 -void ptwr_flush(const int which)
 32.1674 -{
 32.1675 -    unsigned long  sstat, spte, pte, *ptep, l1va;
 32.1676 -    l1_pgentry_t  *sl1e = NULL, *pl1e, ol1e, nl1e;
 32.1677 -    l2_pgentry_t  *pl2e;
 32.1678 -    int            i, cpu = smp_processor_id();
 32.1679 -    struct exec_domain *ed = current;
 32.1680 -    struct domain *d = ed->domain;
 32.1681 -
 32.1682 -    l1va = ptwr_info[cpu].ptinfo[which].l1va;
 32.1683 -    ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
 32.1684 -
 32.1685 -    /*
 32.1686 -     * STEP 1. Write-protect the p.t. page so no more updates can occur.
 32.1687 -     */
 32.1688 -
 32.1689 -    if ( unlikely(__get_user(pte, ptep)) )
 32.1690 -    {
 32.1691 -        MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
 32.1692 -        /*
 32.1693 -         * Really a bug. We could read this PTE during the initial fault,
  32.1694 -         * and pagetables can't have changed in the meantime. XXX Multi-CPU guests?
 32.1695 -         */
 32.1696 -        BUG();
 32.1697 -    }
 32.1698 -    PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
 32.1699 -                PTWR_PRINT_WHICH, ptep, pte);
 32.1700 -    pte &= ~_PAGE_RW;
 32.1701 -
 32.1702 -    if ( unlikely(d->arch.shadow_mode) )
 32.1703 -    {
 32.1704 -        /* Write-protect the p.t. page in the shadow page table. */
 32.1705 -        l1pte_propagate_from_guest(d, &pte, &spte);
 32.1706 -        __put_user(
 32.1707 -            spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
 32.1708 -
 32.1709 -        /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
 32.1710 -        sstat = get_shadow_status(d, pte >> PAGE_SHIFT);
 32.1711 -        if ( sstat & PSH_shadowed )
 32.1712 -            sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
 32.1713 -    }
 32.1714 -
 32.1715 -    /* Write-protect the p.t. page in the guest page table. */
 32.1716 -    if ( unlikely(__put_user(pte, ptep)) )
 32.1717 -    {
 32.1718 -        MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
 32.1719 -        /*
 32.1720 -         * Really a bug. We could write this PTE during the initial fault,
  32.1721 -         * and pagetables can't have changed in the meantime. XXX Multi-CPU guests?
 32.1722 -         */
 32.1723 -        BUG();
 32.1724 -    }
 32.1725 -
 32.1726 -    /* Ensure that there are no stale writable mappings in any TLB. */
 32.1727 -    /* NB. INVLPG is a serialising instruction: flushes pending updates. */
 32.1728 -#if 1
 32.1729 -    __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
 32.1730 -#else
 32.1731 -    flush_tlb_all();
 32.1732 -#endif
 32.1733 -    PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
 32.1734 -                PTWR_PRINT_WHICH, ptep, pte);
 32.1735 -
 32.1736 -    /*
 32.1737 -     * STEP 2. Validate any modified PTEs.
 32.1738 -     */
 32.1739 -
 32.1740 -    pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
 32.1741 -    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
 32.1742 -    {
 32.1743 -        ol1e = ptwr_info[cpu].ptinfo[which].page[i];
 32.1744 -        nl1e = pl1e[i];
 32.1745 -
 32.1746 -        if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
 32.1747 -            continue;
 32.1748 -
 32.1749 -        /*
 32.1750 -         * Fast path for PTEs that have merely been write-protected
 32.1751 -         * (e.g., during a Unix fork()). A strict reduction in privilege.
 32.1752 -         */
 32.1753 -        if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
 32.1754 -        {
 32.1755 -            if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
 32.1756 -            {
 32.1757 -                if ( unlikely(sl1e != NULL) )
 32.1758 -                    l1pte_propagate_from_guest(
 32.1759 -                        d, &l1_pgentry_val(nl1e), 
 32.1760 -                        &l1_pgentry_val(sl1e[i]));
 32.1761 -                put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]);
 32.1762 -            }
 32.1763 -            continue;
 32.1764 -        }
 32.1765 -
 32.1766 -        if ( unlikely(!get_page_from_l1e(nl1e, d)) )
 32.1767 -        {
 32.1768 -            MEM_LOG("ptwr: Could not re-validate l1 page\n");
 32.1769 -            /*
  32.1770 -             * Make the remaining p.t. entries consistent before crashing, so the
 32.1771 -             * reference counts are correct.
 32.1772 -             */
 32.1773 -            memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
 32.1774 -                   (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
 32.1775 -            unmap_domain_mem(pl1e);
 32.1776 -            ptwr_info[cpu].ptinfo[which].l1va = 0;
 32.1777 -            UNLOCK_BIGLOCK(d);
 32.1778 -            domain_crash();
 32.1779 -        }
 32.1780 -        
 32.1781 -        if ( unlikely(sl1e != NULL) )
 32.1782 -            l1pte_propagate_from_guest(
 32.1783 -                d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
 32.1784 -
 32.1785 -        if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
 32.1786 -            put_page_from_l1e(ol1e, d);
 32.1787 -    }
 32.1788 -    unmap_domain_mem(pl1e);
 32.1789 -
 32.1790 -    /*
 32.1791 -     * STEP 3. Reattach the L1 p.t. page into the current address space.
 32.1792 -     */
 32.1793 -
 32.1794 -    if ( (which == PTWR_PT_ACTIVE) && likely(!d->arch.shadow_mode) )
 32.1795 -    {
 32.1796 -        pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
 32.1797 -        *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT); 
 32.1798 -    }
 32.1799 -
 32.1800 -    /*
 32.1801 -     * STEP 4. Final tidy-up.
 32.1802 -     */
 32.1803 -
 32.1804 -    ptwr_info[cpu].ptinfo[which].l1va = 0;
 32.1805 -
 32.1806 -    if ( unlikely(sl1e != NULL) )
 32.1807 -    {
 32.1808 -        unmap_domain_mem(sl1e);
 32.1809 -        put_shadow_status(d);
 32.1810 -    }
 32.1811 -}
 32.1812 -
 32.1813 -/* Write page fault handler: check if guest is trying to modify a PTE. */
 32.1814 -int ptwr_do_page_fault(unsigned long addr)
 32.1815 -{
 32.1816 -    unsigned long    pte, pfn, l2e;
 32.1817 -    struct pfn_info *page;
 32.1818 -    l2_pgentry_t    *pl2e;
 32.1819 -    int              which, cpu = smp_processor_id();
 32.1820 -    u32              l2_idx;
 32.1821 -
 32.1822 -    /*
 32.1823 -     * Attempt to read the PTE that maps the VA being accessed. By checking for
 32.1824 -     * PDE validity in the L2 we avoid many expensive fixups in __get_user().
 32.1825 -     */
 32.1826 -    if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
 32.1827 -           _PAGE_PRESENT) ||
 32.1828 -         __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
 32.1829 -    {
 32.1830 -        return 0;
 32.1831 -    }
 32.1832 -
 32.1833 -    pfn  = pte >> PAGE_SHIFT;
 32.1834 -    page = &frame_table[pfn];
 32.1835 -
 32.1836 -    /* We are looking only for read-only mappings of p.t. pages. */
 32.1837 -    if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
 32.1838 -         ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
 32.1839 -    {
 32.1840 -        return 0;
 32.1841 -    }
 32.1842 -    
 32.1843 -    /* Get the L2 index at which this L1 p.t. is always mapped. */
 32.1844 -    l2_idx = page->u.inuse.type_info & PGT_va_mask;
 32.1845 -    if ( unlikely(l2_idx >= PGT_va_unknown) )
 32.1846 -    {
 32.1847 -        domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
 32.1848 -    }
 32.1849 -    l2_idx >>= PGT_va_shift;
 32.1850 -
 32.1851 -    if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
 32.1852 -    {
 32.1853 -        MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr);
 32.1854 -        domain_crash();
 32.1855 -    }
 32.1856 -
 32.1857 -    /*
 32.1858 -     * Is the L1 p.t. mapped into the current address space? If so we call it
 32.1859 -     * an ACTIVE p.t., otherwise it is INACTIVE.
 32.1860 -     */
 32.1861 -    pl2e = &linear_l2_table[l2_idx];
 32.1862 -    l2e  = l2_pgentry_val(*pl2e);
 32.1863 -    which = PTWR_PT_INACTIVE;
 32.1864 -    if ( (l2e >> PAGE_SHIFT) == pfn )
 32.1865 -    {
 32.1866 -        /* Check the PRESENT bit to set ACTIVE. */
 32.1867 -        if ( likely(l2e & _PAGE_PRESENT) )
 32.1868 -            which = PTWR_PT_ACTIVE;
 32.1869 -        else {
 32.1870 -            /*
 32.1871 -             * If the PRESENT bit is clear, we may be conflicting with
 32.1872 -             * the current ACTIVE p.t. (it may be the same p.t. mapped
 32.1873 -             * at another virt addr).
 32.1874 -             * The ptwr_flush call below will restore the PRESENT bit.
 32.1875 -             */
 32.1876 -            if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
 32.1877 -                 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
 32.1878 -                which = PTWR_PT_ACTIVE;
 32.1879 -        }
 32.1880 -    }
 32.1881 -    
 32.1882 -    PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
 32.1883 -                "pfn %08lx\n", PTWR_PRINT_WHICH,
 32.1884 -                addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
 32.1885 -    
 32.1886 -    /*
  32.1887 -     * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
 32.1888 -     * time. If there is already one, we must flush it out.
 32.1889 -     */
 32.1890 -    if ( ptwr_info[cpu].ptinfo[which].l1va )
 32.1891 -        ptwr_flush(which);
 32.1892 -
 32.1893 -    ptwr_info[cpu].ptinfo[which].l1va   = addr | 1;
 32.1894 -    ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
 32.1895 -    
 32.1896 -    /* For safety, disconnect the L1 p.t. page from current space. */
 32.1897 -    if ( (which == PTWR_PT_ACTIVE) && 
 32.1898 -         likely(!current->domain->arch.shadow_mode) )
 32.1899 -    {
 32.1900 -        *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
 32.1901 -#if 1
 32.1902 -        flush_tlb(); /* XXX Multi-CPU guests? */
 32.1903 -#else
 32.1904 -        flush_tlb_all();
 32.1905 -#endif
 32.1906 -    }
 32.1907 -    
 32.1908 -    /* Temporarily map the L1 page, and make a copy of it. */
 32.1909 -    ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
 32.1910 -    memcpy(ptwr_info[cpu].ptinfo[which].page,
 32.1911 -           ptwr_info[cpu].ptinfo[which].pl1e,
 32.1912 -           ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
 32.1913 -    
 32.1914 -    /* Finally, make the p.t. page writable by the guest OS. */
 32.1915 -    pte |= _PAGE_RW;
 32.1916 -    PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
 32.1917 -                &linear_pg_table[addr>>PAGE_SHIFT], pte);
 32.1918 -    if ( unlikely(__put_user(pte, (unsigned long *)
 32.1919 -                             &linear_pg_table[addr>>PAGE_SHIFT])) )
 32.1920 -    {
 32.1921 -        MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
 32.1922 -                &linear_pg_table[addr>>PAGE_SHIFT]);
 32.1923 -        /* Toss the writable pagetable state and crash. */
 32.1924 -        unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
 32.1925 -        ptwr_info[cpu].ptinfo[which].l1va = 0;
 32.1926 -        domain_crash();
 32.1927 -    }
 32.1928 -    
 32.1929 -    return EXCRET_fault_fixed;
 32.1930 -}
 32.1931 -
 32.1932 -static __init int ptwr_init(void)
 32.1933 -{
 32.1934 -    int i;
 32.1935 -
 32.1936 -    for ( i = 0; i < smp_num_cpus; i++ )
 32.1937 -    {
 32.1938 -        ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
 32.1939 -            (void *)alloc_xenheap_page();
 32.1940 -        ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
 32.1941 -            (void *)alloc_xenheap_page();
 32.1942 -    }
 32.1943 -
 32.1944 -    return 0;
 32.1945 -}
 32.1946 -__initcall(ptwr_init);
 32.1947 -
 32.1948 -
 32.1949 -
 32.1950 -
 32.1951 -/************************************************************************/
 32.1952 -/************************************************************************/
 32.1953 -/************************************************************************/
 32.1954 -
 32.1955 -#ifndef NDEBUG
 32.1956 -
 32.1957 -void ptwr_status(void)
 32.1958 -{
 32.1959 -    unsigned long pte, *ptep, pfn;
 32.1960 -    struct pfn_info *page;
 32.1961 -    int cpu = smp_processor_id();
 32.1962 -
 32.1963 -    ptep = (unsigned long *)&linear_pg_table
 32.1964 -        [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
 32.1965 -
 32.1966 -    if ( __get_user(pte, ptep) ) {
 32.1967 -        MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
 32.1968 -        domain_crash();
 32.1969 -    }
 32.1970 -
 32.1971 -    pfn = pte >> PAGE_SHIFT;
 32.1972 -    page = &frame_table[pfn];
 32.1973 -    printk("need to alloc l1 page %p\n", page);
 32.1974 -    /* make pt page writable */
  32.1975 -    printk("need to make l1 page at %p read-only (pte=%08lx)\n",
 32.1976 -           ptep, pte);
 32.1977 -
 32.1978 -    if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
 32.1979 -        return;
 32.1980 -
 32.1981 -    if ( __get_user(pte, (unsigned long *)
 32.1982 -                    ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
 32.1983 -        MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
 32.1984 -                ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
 32.1985 -        domain_crash();
 32.1986 -    }
 32.1987 -    pfn = pte >> PAGE_SHIFT;
 32.1988 -    page = &frame_table[pfn];
 32.1989 -}
 32.1990 -
 32.1991 -void audit_domain(struct domain *d)
 32.1992 -{
 32.1993 -    int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
 32.1994 -
 32.1995 -    void adjust (struct pfn_info *page, int dir, int adjtype)
 32.1996 -    {
 32.1997 -        int count = page->count_info & PGC_count_mask;
 32.1998 -
 32.1999 -        if ( adjtype )
 32.2000 -        {
 32.2001 -            int tcount = page->u.inuse.type_info & PGT_count_mask;
 32.2002 -            
 32.2003 -            ttot++;
 32.2004 -
 32.2005 -            tcount += dir;
 32.2006 -
 32.2007 -            if ( tcount < 0 )
 32.2008 -            {
 32.2009 -                /* This will only come out once. */
  32.2010 -                printk("Audit %d: type count went below zero pfn=%x "
 32.2011 -                       "taf=%x otaf=%x\n",
 32.2012 -                       d->id, page-frame_table,
 32.2013 -                       page->u.inuse.type_info,
 32.2014 -                       page->tlbflush_timestamp);
 32.2015 -            }
 32.2016 -            
 32.2017 -            page->u.inuse.type_info =
 32.2018 -                (page->u.inuse.type_info & ~PGT_count_mask) | 
 32.2019 -                (tcount & PGT_count_mask);
 32.2020 -        }
 32.2021 -
 32.2022 -        ctot++;
 32.2023 -        count += dir;
 32.2024 -        if ( count < 0 )
 32.2025 -        {
 32.2026 -            /* This will only come out once. */
  32.2027 -            printk("Audit %d: general count went below zero pfn=%x "
 32.2028 -                   "taf=%x otaf=%x\n",
 32.2029 -                   d->id, page-frame_table,
 32.2030 -                   page->u.inuse.type_info,
 32.2031 -                   page->tlbflush_timestamp);
 32.2032 -        }
 32.2033 -            
 32.2034 -        page->count_info =
 32.2035 -            (page->count_info & ~PGC_count_mask) | 
 32.2036 -            (count & PGC_count_mask);            
 32.2037 -
 32.2038 -    }
 32.2039 -
 32.2040 -    void scan_for_pfn(struct domain *d, unsigned long xpfn)
 32.2041 -    {
 32.2042 -        unsigned long pfn, *pt;
 32.2043 -        struct list_head *list_ent;
 32.2044 -        struct pfn_info *page;
 32.2045 -        int i;
 32.2046 -
 32.2047 -        list_ent = d->page_list.next;
 32.2048 -        for ( i = 0; (list_ent != &d->page_list); i++ )
 32.2049 -        {
 32.2050 -            pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
 32.2051 -            page = &frame_table[pfn];
 32.2052 -            
 32.2053 -            switch ( page->u.inuse.type_info & PGT_type_mask )
 32.2054 -            {
 32.2055 -            case PGT_l1_page_table:
 32.2056 -            case PGT_l2_page_table:
 32.2057 -                pt = map_domain_mem(pfn<<PAGE_SHIFT);
 32.2058 -                for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
 32.2059 -                    if ( (pt[i] & _PAGE_PRESENT) &&
 32.2060 -                         ((pt[i] >> PAGE_SHIFT) == xpfn) )
 32.2061 -                        printk("     found dom=%d i=%x pfn=%lx t=%x c=%x\n",
 32.2062 -                               d->id, i, pfn, page->u.inuse.type_info,
 32.2063 -                               page->count_info);
 32.2064 -                unmap_domain_mem(pt);           
 32.2065 -            }
 32.2066 -
 32.2067 -            list_ent = frame_table[pfn].list.next;
 32.2068 -        }
 32.2069 -
 32.2070 -    }
 32.2071 -
 32.2072 -    void scan_for_pfn_remote(unsigned long xpfn)
 32.2073 -    {
 32.2074 -        struct domain *e;
 32.2075 -        for_each_domain ( e )
 32.2076 -            scan_for_pfn( e, xpfn );            
 32.2077 -    }   
 32.2078 -
 32.2079 -    int i;
 32.2080 -    unsigned long pfn;
 32.2081 -    struct list_head *list_ent;
 32.2082 -    struct pfn_info *page;
 32.2083 -
 32.2084 -    if ( d != current->domain )
 32.2085 -        domain_pause(d);
 32.2086 -    synchronise_pagetables(~0UL);
 32.2087 -
 32.2088 -    printk("pt base=%lx sh_info=%x\n",
 32.2089 -           pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT,
 32.2090 -           virt_to_page(d->shared_info)-frame_table);
 32.2091 -           
 32.2092 -    spin_lock(&d->page_alloc_lock);
 32.2093 -
 32.2094 -    /* PHASE 0 */
 32.2095 -
 32.2096 -    list_ent = d->page_list.next;
 32.2097 -    for ( i = 0; (list_ent != &d->page_list); i++ )
 32.2098 -    {
 32.2099 -        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
 32.2100 -        page = &frame_table[pfn];
 32.2101 -
 32.2102 -        if ( page_get_owner(page) != d )
 32.2103 -            BUG();
 32.2104 -
 32.2105 -        if ( (page->u.inuse.type_info & PGT_count_mask) >
 32.2106 -             (page->count_info & PGC_count_mask) )
 32.2107 -            printk("taf > caf %x %x pfn=%lx\n",
 32.2108 -                   page->u.inuse.type_info, page->count_info, pfn );
 32.2109 - 
 32.2110 -#if 0   /* SYSV shared memory pages plus writeable files. */
 32.2111 -        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page && 
 32.2112 -             (page->u.inuse.type_info & PGT_count_mask) > 1 )
 32.2113 -        {
 32.2114 -            printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
 32.2115 -                  pfn,
 32.2116 -                  page->u.inuse.type_info,
 32.2117 -                  page->count_info );
 32.2118 -            scan_for_pfn_remote(pfn);
 32.2119 -        }
 32.2120 -#endif
 32.2121 -        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none && 
 32.2122 -             (page->u.inuse.type_info & PGT_count_mask) > 1 )
 32.2123 -        {
 32.2124 -            printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
 32.2125 -                  pfn,
 32.2126 -                  page->u.inuse.type_info,
 32.2127 -                  page->count_info );
 32.2128 -        }
 32.2129 -
 32.2130 -        /* Use tlbflush_timestamp to store original type_info. */
 32.2131 -        page->tlbflush_timestamp = page->u.inuse.type_info;
 32.2132 -
 32.2133 -        list_ent = frame_table[pfn].list.next;
 32.2134 -    }
 32.2135 -
 32.2136 -
 32.2137 -    /* PHASE 1 */
 32.2138 -
 32.2139 -    adjust(&frame_table[pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], -1, 1);
 32.2140 -
 32.2141 -    list_ent = d->page_list.next;
 32.2142 -    for ( i = 0; (list_ent != &d->page_list); i++ )
 32.2143 -    {
 32.2144 -        unsigned long *pt;
 32.2145 -        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
 32.2146 -        page = &frame_table[pfn];
 32.2147 -
 32.2148 -        if ( page_get_owner(page) != d )
 32.2149 -            BUG();
 32.2150 -
 32.2151 -        switch ( page->u.inuse.type_info & PGT_type_mask )
 32.2152 -        {
 32.2153 -        case PGT_l2_page_table:
 32.2154 -
 32.2155 -            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
 32.2156 -                printk("Audit %d: L2 not validated %x\n",
 32.2157 -                       d->id, page->u.inuse.type_info);
 32.2158 -
 32.2159 -            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
 32.2160 -                printk("Audit %d: L2 not pinned %x\n",
 32.2161 -                       d->id, page->u.inuse.type_info);
 32.2162 -            else
 32.2163 -                adjust( page, -1, 1 );
 32.2164 -           
 32.2165 -            pt = map_domain_mem( pfn<<PAGE_SHIFT );
 32.2166 -
 32.2167 -            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
 32.2168 -            {
 32.2169 -                if ( pt[i] & _PAGE_PRESENT )
 32.2170 -                {
 32.2171 -                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
 32.2172 -                    struct pfn_info *l1page = &frame_table[l1pfn];
 32.2173 -
 32.2174 -                    if ( page_get_owner(l1page) != d )
 32.2175 -                    {
 32.2176 -                        printk("L2: Skip bizarre page belonging to other "
 32.2177 -                               "dom %p\n", page_get_owner(l1page));
 32.2178 -                        continue;
 32.2179 -                    }
 32.2180 -                    
 32.2181 -                    if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
 32.2182 -                         PGT_l2_page_table )
 32.2183 -                        printk("Audit %d: [%x] Found %s Linear PT "
 32.2184 -                               "t=%x pfn=%lx\n", d->id, i, 
 32.2185 -                               (l1pfn==pfn) ? "Self" : "Other",
 32.2186 -                               l1page->u.inuse.type_info,
 32.2187 -                               l1pfn);
 32.2188 -                    else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
 32.2189 -                              PGT_l1_page_table )
 32.2190 -                        printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
 32.2191 -                               d->id, i,
 32.2192 -                               l1page->u.inuse.type_info,
 32.2193 -                               l1pfn);
 32.2194 -
 32.2195 -                    adjust(l1page, -1, 1);
 32.2196 -                }
 32.2197 -            }
 32.2198 -
 32.2199 -            unmap_domain_mem(pt);
 32.2200 -
 32.2201 -            break;
 32.2202 -
 32.2203 -
 32.2204 -        case PGT_l1_page_table:
 32.2205 -            
 32.2206 -            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
 32.2207 -                adjust( page, -1, 1 );
 32.2208 -
 32.2209 -            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
 32.2210 -                printk("Audit %d: L1 not validated %x\n",
 32.2211 -                       d->id, page->u.inuse.type_info);
 32.2212 -#if 0
 32.2213 -            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
 32.2214 -                printk("Audit %d: L1 not pinned %x\n",
 32.2215 -                       d->id, page->u.inuse.type_info);
 32.2216 -#endif
 32.2217 -            pt = map_domain_mem( pfn<<PAGE_SHIFT );
 32.2218 -
 32.2219 -            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
 32.2220 -            {
 32.2221 -                if ( pt[i] & _PAGE_PRESENT )
 32.2222 -                {
 32.2223 -                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
 32.2224 -                    struct pfn_info *l1page = &frame_table[l1pfn];
 32.2225 -
 32.2226 -                    if ( l1pfn < 0x100 )
 32.2227 -                    {
 32.2228 -                        lowmem_mappings++;
 32.2229 -                        continue;
 32.2230 -                    }
 32.2231 -
 32.2232 -                    if ( l1pfn > max_page )
 32.2233 -                    {
 32.2234 -                        io_mappings++;
 32.2235 -                        continue;
 32.2236 -                    }
 32.2237 -
 32.2238 -                    if ( pt[i] & _PAGE_RW )
 32.2239 -                    {
 32.2240 -
 32.2241 -                        if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
 32.2242 -                             PGT_l1_page_table ||
 32.2243 -                             (l1page->u.inuse.type_info & PGT_type_mask) ==
 32.2244 -                             PGT_l2_page_table )
  32.2245 -                            printk("Audit %d: [%x] Illegal RW t=%x pfn=%lx\n",
 32.2246 -                                   d->id, i,
 32.2247 -                                   l1page->u.inuse.type_info,
 32.2248 -                                   l1pfn);
 32.2249 -
 32.2250 -                    }
 32.2251 -
 32.2252 -                    if ( page_get_owner(l1page) != d )
 32.2253 -                    {
 32.2254 -                        printk("Audit %d: [%lx,%x] Skip foreign page dom=%p "
 32.2255 -                               "pfn=%lx c=%08x t=%08x m2p=%lx\n",
 32.2256 -                               d->id, pfn, i,
 32.2257 -                               page_get_owner(l1page),
 32.2258 -                               l1pfn,
 32.2259 -                               l1page->count_info,
 32.2260 -                               l1page->u.inuse.type_info,
 32.2261 -                               machine_to_phys_mapping[l1pfn]);    
 32.2262 -                        continue;
 32.2263 -                    }
 32.2264 -
 32.2265 -                    adjust(l1page, -1, 0);
 32.2266 -                }
 32.2267 -            }
 32.2268 -
 32.2269 -            unmap_domain_mem(pt);
 32.2270 -
 32.2271 -            break;
 32.2272 -        }       
 32.2273 -
 32.2274 -        list_ent = frame_table[pfn].list.next;
 32.2275 -    }
 32.2276 -
 32.2277 -    if ( (io_mappings > 0) || (lowmem_mappings > 0) )
 32.2278 -        printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
 32.2279 -               d->id, lowmem_mappings, io_mappings);
 32.2280 -
 32.2281 -    /* PHASE 2 */
 32.2282 -
 32.2283 -    ctot = ttot = 0;
 32.2284 -    list_ent = d->page_list.next;
 32.2285 -    for ( i = 0; (list_ent != &d->page_list); i++ )
 32.2286 -    {
 32.2287 -        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
 32.2288 -        page = &frame_table[pfn];
 32.2289 -
 32.2290 -        switch ( page->u.inuse.type_info & PGT_type_mask)
 32.2291 -        {
 32.2292 -        case PGT_l1_page_table:
 32.2293 -        case PGT_l2_page_table:
 32.2294 -            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
 32.2295 -            {
 32.2296 -                printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
 32.2297 -                       d->id, page->u.inuse.type_info, 
 32.2298 -                       page->tlbflush_timestamp,
 32.2299 -                       page->count_info, pfn );
 32.2300 -                scan_for_pfn_remote(pfn);
 32.2301 -            }
 32.2302 -        default:
 32.2303 -            if ( (page->count_info & PGC_count_mask) != 1 )
 32.2304 -            {
 32.2305 -                printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
 32.2306 -                       d->id, 
 32.2307 -                       page->count_info,
 32.2308 -                       page->u.inuse.type_info, 
 32.2309 -                       page->tlbflush_timestamp, pfn );
 32.2310 -                scan_for_pfn_remote(pfn);
 32.2311 -            }
 32.2312 -            break;
 32.2313 -        }
 32.2314 -
 32.2315 -        list_ent = frame_table[pfn].list.next;
 32.2316 -    }
 32.2317 -
 32.2318 -    /* PHASE 3 */
 32.2319 -
 32.2320 -    list_ent = d->page_list.next;
 32.2321 -    for ( i = 0; (list_ent != &d->page_list); i++ )
 32.2322 -    {
 32.2323 -        unsigned long *pt;
 32.2324 -        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
 32.2325 -        page = &frame_table[pfn];
 32.2326 -
 32.2327 -        switch ( page->u.inuse.type_info & PGT_type_mask )
 32.2328 -        {
 32.2329 -        case PGT_l2_page_table:
 32.2330 -            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
 32.2331 -                adjust( page, 1, 1 );          
 32.2332 -
 32.2333 -            pt = map_domain_mem( pfn<<PAGE_SHIFT );
 32.2334 -
 32.2335 -            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
 32.2336 -            {
 32.2337 -                if ( pt[i] & _PAGE_PRESENT )
 32.2338 -                {
 32.2339 -                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
 32.2340 -                    struct pfn_info *l1page = &frame_table[l1pfn];
 32.2341 -
 32.2342 -                    if ( page_get_owner(l1page) == d )
 32.2343 -                        adjust(l1page, 1, 1);
 32.2344 -                }
 32.2345 -            }
 32.2346 -
 32.2347 -            unmap_domain_mem(pt);
 32.2348 -            break;
 32.2349 -
 32.2350 -        case PGT_l1_page_table:
 32.2351 -            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
 32.2352 -                adjust( page, 1, 1 );
 32.2353 -
 32.2354 -            pt = map_domain_mem( pfn<<PAGE_SHIFT );
 32.2355 -
 32.2356 -            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
 32.2357 -            {
 32.2358 -                if ( pt[i] & _PAGE_PRESENT )
 32.2359 -                {
 32.2360 -                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
 32.2361 -                    struct pfn_info *l1page = &frame_table[l1pfn];
 32.2362 -
 32.2363 -                    if ( (page_get_owner(l1page) != d) ||
 32.2364 -                         (l1pfn < 0x100) || (l1pfn > max_page) )
 32.2365 -                        continue;
 32.2366 -
 32.2367 -                    adjust(l1page, 1, 0);
 32.2368 -                }
 32.2369 -            }
 32.2370 -
 32.2371 -            unmap_domain_mem(pt);
 32.2372 -            break;
 32.2373 -        }
 32.2374 -
 32.2375 -
 32.2376 -        page->tlbflush_timestamp = 0;
 32.2377 -
 32.2378 -        list_ent = frame_table[pfn].list.next;
 32.2379 -    }
 32.2380 -
 32.2381 -    spin_unlock(&d->page_alloc_lock);
 32.2382 -
 32.2383 -    adjust(&frame_table[pagetable_val(
 32.2384 -        d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], 1, 1);
 32.2385 -
 32.2386 -    printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot );
 32.2387 -
 32.2388 -    if ( d != current->domain )
 32.2389 -        domain_unpause(d);
 32.2390 -}
 32.2391 -
 32.2392 -void audit_domains(void)
 32.2393 -{
 32.2394 -    struct domain *d;
 32.2395 -    for_each_domain ( d )
 32.2396 -        audit_domain(d);
 32.2397 -}
 32.2398 -
 32.2399 -void audit_domains_key(unsigned char key)
 32.2400 -{
 32.2401 -    audit_domains();
 32.2402 -}
 32.2403 -
 32.2404 -#endif
    33.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    33.2 +++ b/xen/arch/x86/mm.c	Tue Feb 08 16:44:16 2005 +0000
    33.3 @@ -0,0 +1,2598 @@
    33.4 +/* -*-  Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
    33.5 +/******************************************************************************
    33.6 + * arch/x86/mm.c
    33.7 + * 
    33.8 + * Copyright (c) 2002-2005 K A Fraser
    33.9 + * Copyright (c) 2004 Christian Limpach
   33.10 + * 
   33.11 + * This program is free software; you can redistribute it and/or modify
   33.12 + * it under the terms of the GNU General Public License as published by
   33.13 + * the Free Software Foundation; either version 2 of the License, or
   33.14 + * (at your option) any later version.
   33.15 + * 
   33.16 + * This program is distributed in the hope that it will be useful,
   33.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   33.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   33.19 + * GNU General Public License for more details.
   33.20 + * 
   33.21 + * You should have received a copy of the GNU General Public License
   33.22 + * along with this program; if not, write to the Free Software
   33.23 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   33.24 + */
   33.25 +
   33.26 +/*
   33.27 + * A description of the x86 page table API:
   33.28 + * 
   33.29 + * Domains trap to do_mmu_update with a list of update requests.
   33.30 + * This is a list of (ptr, val) pairs, where the requested operation
   33.31 + * is *ptr = val.
   33.32 + * 
   33.33 + * Reference counting of pages:
   33.34 + * ----------------------------
   33.35 + * Each page has two refcounts: tot_count and type_count.
   33.36 + * 
   33.37 + * TOT_COUNT is the obvious reference count. It counts all uses of a
   33.38 + * physical page frame by a domain, including uses as a page directory,
   33.39 + * a page table, or simple mappings via a PTE. This count prevents a
   33.40 + * domain from releasing a frame back to the free pool when it still holds
   33.41 + * a reference to it.
   33.42 + * 
   33.43 + * TYPE_COUNT is more subtle. A frame can be put to one of three
   33.44 + * mutually-exclusive uses: it might be used as a page directory, or a
   33.45 + * page table, or it may be mapped writable by the domain [of course, a
    33.46 + * frame may also be used in none of these three ways!].
   33.47 + * So, type_count is a count of the number of times a frame is being 
   33.48 + * referred to in its current incarnation. Therefore, a page can only
   33.49 + * change its type when its type count is zero.
   33.50 + * 
   33.51 + * Pinning the page type:
   33.52 + * ----------------------
   33.53 + * The type of a page can be pinned/unpinned with the commands
   33.54 + * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
   33.55 + * pinning is not reference counted, so it can't be nested).
   33.56 + * This is useful to prevent a page's type count falling to zero, at which
   33.57 + * point safety checks would need to be carried out next time the count
   33.58 + * is increased again.
   33.59 + * 
   33.60 + * A further note on writable page mappings:
   33.61 + * -----------------------------------------
   33.62 + * For simplicity, the count of writable mappings for a page may not
   33.63 + * correspond to reality. The 'writable count' is incremented for every
   33.64 + * PTE which maps the page with the _PAGE_RW flag set. However, for
   33.65 + * write access to be possible the page directory entry must also have
   33.66 + * its _PAGE_RW bit set. We do not check this as it complicates the 
   33.67 + * reference counting considerably [consider the case of multiple
   33.68 + * directory entries referencing a single page table, some with the RW
   33.69 + * bit set, others not -- it starts getting a bit messy].
   33.70 + * In normal use, this simplification shouldn't be a problem.
   33.71 + * However, the logic can be added if required.
   33.72 + * 
   33.73 + * One more note on read-only page mappings:
   33.74 + * -----------------------------------------
   33.75 + * We want domains to be able to map pages for read-only access. The
   33.76 + * main reason is that page tables and directories should be readable
   33.77 + * by a domain, but it would not be safe for them to be writable.
   33.78 + * However, domains have free access to rings 1 & 2 of the Intel
   33.79 + * privilege model. In terms of page protection, these are considered
   33.80 + * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
   33.81 + * read-only restrictions are respected in supervisor mode -- if the 
   33.82 + * bit is clear then any mapped page is writable.
   33.83 + * 
   33.84 + * We get round this by always setting the WP bit and disallowing 
   33.85 + * updates to it. This is very unlikely to cause a problem for guest
   33.86 + * OS's, which will generally use the WP bit to simplify copy-on-write
    33.87 + * implementation (in that case, the OS wants a fault when it writes to
   33.88 + * an application-supplied buffer).
   33.89 + */
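(A sketch of what the interface described above looks like from the guest side,
for orientation only: the machine addresses and new entry values are
placeholders, and the guest-side wrapper is assumed here to take
(reqs, count, success_count).)

    mmu_update_t req[2];

    /* Two batched page-table writes: each request means *ptr = val, and the
     * checks in this file decide whether the write is permitted. */
    req[0].ptr = pte0_machine_addr | MMU_NORMAL_PT_UPDATE;
    req[0].val = new_l1_entry_0;
    req[1].ptr = pte1_machine_addr | MMU_NORMAL_PT_UPDATE;
    req[1].val = new_l1_entry_1;

    if ( HYPERVISOR_mmu_update(req, 2, NULL) < 0 )
        /* at least one request failed validation */;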
   33.90 +
   33.91 +#include <xen/config.h>
   33.92 +#include <xen/init.h>
   33.93 +#include <xen/kernel.h>
   33.94 +#include <xen/lib.h>
   33.95 +#include <xen/mm.h>
   33.96 +#include <xen/sched.h>
   33.97 +#include <xen/errno.h>
   33.98 +#include <xen/perfc.h>
   33.99 +#include <xen/irq.h>
  33.100 +#include <xen/softirq.h>
  33.101 +#include <asm/shadow.h>
  33.102 +#include <asm/page.h>
  33.103 +#include <asm/flushtlb.h>
  33.104 +#include <asm/io.h>
  33.105 +#include <asm/uaccess.h>
  33.106 +#include <asm/domain_page.h>
  33.107 +#include <asm/ldt.h>
  33.108 +
  33.109 +#ifdef VERBOSE
  33.110 +#define MEM_LOG(_f, _a...)                           \
   33.111 +  printk("DOM%u: (file=mm.c, line=%d) " _f "\n",     \
  33.112 +         current->domain->id , __LINE__ , ## _a )
  33.113 +#else
  33.114 +#define MEM_LOG(_f, _a...) ((void)0)
  33.115 +#endif
  33.116 +
  33.117 +static int alloc_l2_table(struct pfn_info *page);
  33.118 +static int alloc_l1_table(struct pfn_info *page);
  33.119 +static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
  33.120 +static int get_page_and_type_from_pagenr(unsigned long page_nr, 
  33.121 +                                         u32 type,
  33.122 +                                         struct domain *d);
  33.123 +
  33.124 +static void free_l2_table(struct pfn_info *page);
  33.125 +static void free_l1_table(struct pfn_info *page);
  33.126 +
  33.127 +static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
  33.128 +static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
  33.129 +
  33.130 +/* Used to defer flushing of memory structures. */
  33.131 +static struct {
  33.132 +#define DOP_FLUSH_TLB   (1<<0) /* Flush the TLB.                 */
  33.133 +#define DOP_RELOAD_LDT  (1<<1) /* Reload the LDT shadow mapping. */
  33.134 +    unsigned long  deferred_ops;
  33.135 +    /* If non-NULL, specifies a foreign subject domain for some operations. */
  33.136 +    struct domain *foreign;
  33.137 +} __cacheline_aligned percpu_info[NR_CPUS];
  33.138 +
  33.139 +/*
  33.140 + * Returns the current foreign domain; defaults to the currently-executing
  33.141 + * domain if a foreign override hasn't been specified.
  33.142 + */
  33.143 +#define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
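
[Editor's note: FOREIGNDOM relies on the GNU C conditional-with-omitted-middle-operand extension, `x ?: y`, which evaluates `x` once and yields it if non-zero/non-NULL, otherwise `y`. A minimal stand-alone demo of that extension (illustrative only, needs GCC or Clang):]

    #include <stdio.h>

    static const char *pick(const char *override, const char *fallback)
    {
        /* 'override ?: fallback' is GNU C for 'override ? override : fallback',
         * but evaluates 'override' only once. */
        return override ?: fallback;
    }

    int main(void)
    {
        printf("%s\n", pick(NULL, "current domain"));              /* current domain */
        printf("%s\n", pick("foreign domain", "current domain"));  /* foreign domain */
        return 0;
    }
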
  33.144 +
  33.145 +/* Private domain structs for DOMID_XEN and DOMID_IO. */
  33.146 +static struct domain *dom_xen, *dom_io;
  33.147 +
  33.148 +/* Frame table and its size in pages. */
  33.149 +struct pfn_info *frame_table;
  33.150 +unsigned long frame_table_size;
  33.151 +unsigned long max_page;
  33.152 +
  33.153 +void __init init_frametable(void)
  33.154 +{
  33.155 +    unsigned long i, p;
  33.156 +
  33.157 +    frame_table      = (struct pfn_info *)FRAMETABLE_VIRT_START;
  33.158 +    frame_table_size = max_page * sizeof(struct pfn_info);
  33.159 +    frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
  33.160 +
  33.161 +    for ( i = 0; i < frame_table_size; i += (4UL << 20) )
  33.162 +    {
  33.163 +        p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20);
  33.164 +        if ( p == 0 )
  33.165 +            panic("Not enough memory for frame table\n");
  33.166 +        map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p, 
  33.167 +                  4UL << 20, PAGE_HYPERVISOR);
  33.168 +    }
  33.169 +
  33.170 +    memset(frame_table, 0, frame_table_size);
  33.171 +}
  33.172 +
  33.173 +void arch_init_memory(void)
  33.174 +{
  33.175 +    extern void subarch_init_memory(struct domain *);
  33.176 +
  33.177 +    memset(percpu_info, 0, sizeof(percpu_info));
  33.178 +
  33.179 +    /*
  33.180 +     * Initialise our DOMID_XEN domain.
  33.181 +     * Any Xen-heap pages that we will allow to be mapped will have
  33.182 +     * their domain field set to dom_xen.
  33.183 +     */
  33.184 +    dom_xen = alloc_domain_struct();
  33.185 +    atomic_set(&dom_xen->refcnt, 1);
  33.186 +    dom_xen->id = DOMID_XEN;
  33.187 +
  33.188 +    /*
  33.189 +     * Initialise our DOMID_IO domain.
  33.190 +     * This domain owns no pages but is considered a special case when
   33.191 +     * mapping I/O pages: such mappings are made with the caller's privileges.
  33.192 +     */
  33.193 +    dom_io = alloc_domain_struct();
  33.194 +    atomic_set(&dom_io->refcnt, 1);
  33.195 +    dom_io->id = DOMID_IO;
  33.196 +
  33.197 +    subarch_init_memory(dom_xen);
  33.198 +}
  33.199 +
  33.200 +void write_ptbase(struct exec_domain *ed)
  33.201 +{
  33.202 +    struct domain *d = ed->domain;
  33.203 +    unsigned long pa;
  33.204 +
  33.205 +#ifdef CONFIG_VMX
  33.206 +    if ( unlikely(shadow_mode(d)) )
  33.207 +        pa = ((shadow_mode(d) == SHM_full_32) ?
  33.208 +              pagetable_val(ed->arch.monitor_table) :
  33.209 +              pagetable_val(ed->arch.shadow_table));
  33.210 +    else
  33.211 +        pa = pagetable_val(ed->arch.pagetable);
  33.212 +#else
  33.213 +    if ( unlikely(shadow_mode(d)) )
  33.214 +        pa = pagetable_val(ed->arch.shadow_table);    
  33.215 +    else
  33.216 +        pa = pagetable_val(ed->arch.pagetable);
  33.217 +#endif
  33.218 +
  33.219 +    write_cr3(pa);
  33.220 +}
  33.221 +
  33.222 +static void __invalidate_shadow_ldt(struct exec_domain *d)
  33.223 +{
  33.224 +    int i;
  33.225 +    unsigned long pfn;
  33.226 +    struct pfn_info *page;
  33.227 +    
  33.228 +    d->arch.shadow_ldt_mapcnt = 0;
  33.229 +
  33.230 +    for ( i = 16; i < 32; i++ )
  33.231 +    {
  33.232 +        pfn = l1_pgentry_to_pfn(d->arch.perdomain_ptes[i]);
  33.233 +        if ( pfn == 0 ) continue;
  33.234 +        d->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
  33.235 +        page = &frame_table[pfn];
  33.236 +        ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
  33.237 +        ASSERT_PAGE_IS_DOMAIN(page, d->domain);
  33.238 +        put_page_and_type(page);
  33.239 +    }
  33.240 +
  33.241 +    /* Dispose of the (now possibly invalid) mappings from the TLB.  */
  33.242 +    percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
  33.243 +}
  33.244 +
  33.245 +
  33.246 +static inline void invalidate_shadow_ldt(struct exec_domain *d)
  33.247 +{
  33.248 +    if ( d->arch.shadow_ldt_mapcnt != 0 )
  33.249 +        __invalidate_shadow_ldt(d);
  33.250 +}
  33.251 +
  33.252 +
  33.253 +static int alloc_segdesc_page(struct pfn_info *page)
  33.254 +{
  33.255 +    struct desc_struct *descs;
  33.256 +    int i;
  33.257 +
  33.258 +    descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
  33.259 +
  33.260 +    for ( i = 0; i < 512; i++ )
  33.261 +        if ( unlikely(!check_descriptor(&descs[i])) )
  33.262 +            goto fail;
  33.263 +
  33.264 +    unmap_domain_mem(descs);
  33.265 +    return 1;
  33.266 +
  33.267 + fail:
  33.268 +    unmap_domain_mem(descs);
  33.269 +    return 0;
  33.270 +}
  33.271 +
  33.272 +
   33.273 +/* Map a page of the guest's LDT into the shadow LDT, at slot @off. */
  33.274 +int map_ldt_shadow_page(unsigned int off)
  33.275 +{
  33.276 +    struct exec_domain *ed = current;
  33.277 +    struct domain *d = ed->domain;
  33.278 +    unsigned long l1e;
  33.279 +
  33.280 +    if ( unlikely(in_irq()) )
  33.281 +        BUG();
  33.282 +
  33.283 +    __get_user(l1e, (unsigned long *)
  33.284 +               &linear_pg_table[l1_linear_offset(ed->arch.ldt_base) + off]);
  33.285 +
  33.286 +    if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
  33.287 +         unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT], 
  33.288 +                                     d, PGT_ldt_page)) )
  33.289 +        return 0;
  33.290 +
  33.291 +    ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
  33.292 +    ed->arch.shadow_ldt_mapcnt++;
  33.293 +
  33.294 +    return 1;
  33.295 +}
  33.296 +
  33.297 +
  33.298 +static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
  33.299 +{
  33.300 +    struct pfn_info *page = &frame_table[page_nr];
  33.301 +
  33.302 +    if ( unlikely(!pfn_is_ram(page_nr)) )
  33.303 +    {
  33.304 +        MEM_LOG("Pfn %08lx is not RAM", page_nr);
  33.305 +        return 0;
  33.306 +    }
  33.307 +
  33.308 +    if ( unlikely(!get_page(page, d)) )
  33.309 +    {
  33.310 +        MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
  33.311 +        return 0;
  33.312 +    }
  33.313 +
  33.314 +    return 1;
  33.315 +}
  33.316 +
  33.317 +
  33.318 +static int get_page_and_type_from_pagenr(unsigned long page_nr, 
  33.319 +                                         u32 type,
  33.320 +                                         struct domain *d)
  33.321 +{
  33.322 +    struct pfn_info *page = &frame_table[page_nr];
  33.323 +
  33.324 +    if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
  33.325 +        return 0;
  33.326 +
  33.327 +    if ( unlikely(!get_page_type(page, type)) )
  33.328 +    {
  33.329 +#ifdef VERBOSE
  33.330 +        if ( (type & PGT_type_mask) != PGT_l1_page_table )
  33.331 +            MEM_LOG("Bad page type for pfn %08lx (%08x)", 
  33.332 +                    page_nr, page->u.inuse.type_info);
  33.333 +#endif
  33.334 +        put_page(page);
  33.335 +        return 0;
  33.336 +    }
  33.337 +
  33.338 +    return 1;
  33.339 +}
  33.340 +
  33.341 +
  33.342 +/*
   33.343 + * We allow L2 tables to map each other (a.k.a. linear page tables). This
   33.344 + * needs some special care with reference counts and access permissions:
  33.345 + *  1. The mapping entry must be read-only, or the guest may get write access
  33.346 + *     to its own PTEs.
   33.347 + *  2. We must only bump the reference counts for an *already validated*
   33.348 + *     L2 table, or we can deadlock in get_page_type(), waiting for a
   33.349 + *     validation that can only complete after our own validation does.
  33.350 + *  3. We only need to increment the reference counts for the mapped page
  33.351 + *     frame if it is mapped by a different L2 table. This is sufficient and
  33.352 + *     also necessary to allow validation of an L2 table mapping itself.
  33.353 + */
  33.354 +static int 
  33.355 +get_linear_pagetable(
  33.356 +    l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
  33.357 +{
  33.358 +    u32 x, y;
  33.359 +    struct pfn_info *page;
  33.360 +
  33.361 +    if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
  33.362 +    {
  33.363 +        MEM_LOG("Attempt to create linear p.t. with write perms");
  33.364 +        return 0;
  33.365 +    }
  33.366 +
  33.367 +    if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
  33.368 +    {
  33.369 +        /* Make sure the mapped frame belongs to the correct domain. */
  33.370 +        if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pfn(l2e), d)) )
  33.371 +            return 0;
  33.372 +
  33.373 +        /*
  33.374 +         * Make sure that the mapped frame is an already-validated L2 table. 
  33.375 +         * If so, atomically increment the count (checking for overflow).
  33.376 +         */
  33.377 +        page = &frame_table[l2_pgentry_to_pfn(l2e)];
  33.378 +        y = page->u.inuse.type_info;
  33.379 +        do {
  33.380 +            x = y;
  33.381 +            if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
  33.382 +                 unlikely((x & (PGT_type_mask|PGT_validated)) != 
  33.383 +                          (PGT_l2_page_table|PGT_validated)) )
  33.384 +            {
  33.385 +                put_page(page);
  33.386 +                return 0;
  33.387 +            }
  33.388 +        }
  33.389 +        while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
  33.390 +    }
  33.391 +
  33.392 +    return 1;
  33.393 +}
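
[Editor's note: get_linear_pagetable() above uses the "read the type word, check its flags, then compare-and-swap the incremented count, retrying on a race" pattern that recurs throughout this file. A minimal sketch of that pattern with C11 atomics; the bit layout and names here are invented for illustration and do not match the real PGT_* layout.]

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    #define F_VALIDATED  (1u << 31)
    #define F_COUNT_MASK 0x00ffffffu

    /* Bump the count in the low bits, but only while the word still carries
     * F_VALIDATED; retry if another CPU changed the word underneath us. */
    static int get_ref_if_validated(_Atomic uint32_t *word)
    {
        uint32_t x = atomic_load(word);
        do {
            if (!(x & F_VALIDATED) || (x & F_COUNT_MASK) == F_COUNT_MASK)
                return 0;                   /* wrong state, or count overflow */
        } while (!atomic_compare_exchange_weak(word, &x, x + 1));
        return 1;
    }

    int main(void)
    {
        _Atomic uint32_t type_info = F_VALIDATED | 1;
        printf("%d\n", get_ref_if_validated(&type_info));  /* 1: count bumped  */
        atomic_store(&type_info, 0);
        printf("%d\n", get_ref_if_validated(&type_info));  /* 0: not validated */
        return 0;
    }
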
  33.394 +
  33.395 +
  33.396 +static int
  33.397 +get_page_from_l1e(
  33.398 +    l1_pgentry_t l1e, struct domain *d)
  33.399 +{
  33.400 +    unsigned long l1v = l1_pgentry_val(l1e);
  33.401 +    unsigned long pfn = l1_pgentry_to_pfn(l1e);
  33.402 +    struct pfn_info *page = &frame_table[pfn];
  33.403 +    extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
  33.404 +
  33.405 +    if ( !(l1v & _PAGE_PRESENT) )
  33.406 +        return 1;
  33.407 +
  33.408 +    if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
  33.409 +    {
  33.410 +        MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
  33.411 +        return 0;
  33.412 +    }
  33.413 +
  33.414 +    if ( unlikely(!pfn_is_ram(pfn)) )
  33.415 +    {
  33.416 +        /* Revert to caller privileges if FD == DOMID_IO. */
  33.417 +        if ( d == dom_io )
  33.418 +            d = current->domain;
  33.419 +
  33.420 +        if ( IS_PRIV(d) )
  33.421 +            return 1;
  33.422 +
  33.423 +        if ( IS_CAPABLE_PHYSDEV(d) )
  33.424 +            return domain_iomem_in_pfn(d, pfn);
  33.425 +
  33.426 +        MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
  33.427 +        return 0;
  33.428 +    }
  33.429 +
  33.430 +    return ((l1v & _PAGE_RW) ?
  33.431 +            get_page_and_type(page, d, PGT_writable_page) :
  33.432 +            get_page(page, d));
  33.433 +}
  33.434 +
  33.435 +
   33.436 +/* NB. 'pfn' is the frame containing the page table in which this l2e resides. */
  33.437 +static int 
  33.438 +get_page_from_l2e(
  33.439 +    l2_pgentry_t l2e, unsigned long pfn,
  33.440 +    struct domain *d, unsigned long va_idx)
  33.441 +{
  33.442 +    int rc;
  33.443 +
  33.444 +    if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
  33.445 +        return 1;
  33.446 +
  33.447 +    if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
  33.448 +    {
  33.449 +        MEM_LOG("Bad L2 page type settings %04lx",
  33.450 +                l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
  33.451 +        return 0;
  33.452 +    }
  33.453 +
  33.454 +    rc = get_page_and_type_from_pagenr(
  33.455 +        l2_pgentry_to_pfn(l2e), 
  33.456 +        PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
  33.457 +
  33.458 +    if ( unlikely(!rc) )
  33.459 +        return get_linear_pagetable(l2e, pfn, d);
  33.460 +
  33.461 +    return 1;
  33.462 +}
  33.463 +
  33.464 +
  33.465 +static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
  33.466 +{
  33.467 +    unsigned long    l1v  = l1_pgentry_val(l1e);
  33.468 +    unsigned long    pfn  = l1_pgentry_to_pfn(l1e);
  33.469 +    struct pfn_info *page = &frame_table[pfn];
  33.470 +    struct domain   *e;
  33.471 +
  33.472 +    if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
  33.473 +        return;
  33.474 +
  33.475 +    e = page_get_owner(page);
  33.476 +    if ( unlikely(e != d) )
  33.477 +    {
  33.478 +        /*
  33.479 +         * Unmap a foreign page that may have been mapped via a grant table.
  33.480 +         * Note that this can fail for a privileged domain that can map foreign
  33.481 +         * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
  33.482 +         * counted via a grant entry and some counted directly in the page
  33.483 +         * structure's reference count. Note that reference counts won't get
  33.484 +         * dangerously confused as long as we always try to decrement the
  33.485 +         * grant entry first. We may end up with a mismatch between which
  33.486 +         * mappings and which unmappings are counted via the grant entry, but
  33.487 +         * really it doesn't matter as privileged domains have carte blanche.
  33.488 +         */
  33.489 +        if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
  33.490 +            return;
  33.491 +        /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
  33.492 +    }
  33.493 +
  33.494 +    if ( l1v & _PAGE_RW )
  33.495 +    {
  33.496 +        put_page_and_type(page);
  33.497 +    }
  33.498 +    else
  33.499 +    {
   33.500 +        /* We expect this to be rare, so we blow away the entire shadow LDT. */
  33.501 +        if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) == 
  33.502 +                       PGT_ldt_page)) &&
  33.503 +             unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
  33.504 +            invalidate_shadow_ldt(e->exec_domain[0]);
  33.505 +        put_page(page);
  33.506 +    }
  33.507 +}
  33.508 +
  33.509 +
  33.510 +/*
   33.511 + * NB. 'pfn' is the frame containing the page table in which this l2e resides.
   33.512 + * Note also that this automatically deals correctly with linear page tables.
  33.513 + */
  33.514 +static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
  33.515 +{
  33.516 +    if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) && 
  33.517 +         ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
  33.518 +        put_page_and_type(&frame_table[l2_pgentry_to_pfn(l2e)]);
  33.519 +}
  33.520 +
  33.521 +
  33.522 +static int alloc_l2_table(struct pfn_info *page)
  33.523 +{
  33.524 +    struct domain *d = page_get_owner(page);
  33.525 +    unsigned long  page_nr = page_to_pfn(page);
  33.526 +    l2_pgentry_t  *pl2e;
  33.527 +    int            i;
  33.528 +   
  33.529 +    pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
  33.530 +
  33.531 +    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  33.532 +        if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
  33.533 +            goto fail;
  33.534 +
  33.535 +#if defined(__i386__)
  33.536 +    /* Now we add our private high mappings. */
  33.537 +    memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
  33.538 +           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
  33.539 +           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
  33.540 +    pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
  33.541 +        mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  33.542 +    pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
  33.543 +        mk_l2_pgentry(__pa(page_get_owner(page)->arch.mm_perdomain_pt) | 
  33.544 +                      __PAGE_HYPERVISOR);
  33.545 +#endif
  33.546 +
  33.547 +    unmap_domain_mem(pl2e);
  33.548 +    return 1;
  33.549 +
  33.550 + fail:
  33.551 +    while ( i-- > 0 )
  33.552 +        put_page_from_l2e(pl2e[i], page_nr);
  33.553 +
  33.554 +    unmap_domain_mem(pl2e);
  33.555 +    return 0;
  33.556 +}
  33.557 +
  33.558 +
  33.559 +static int alloc_l1_table(struct pfn_info *page)
  33.560 +{
  33.561 +    struct domain *d = page_get_owner(page);
  33.562 +    unsigned long  page_nr = page_to_pfn(page);
  33.563 +    l1_pgentry_t  *pl1e;
  33.564 +    int            i;
  33.565 +
  33.566 +    pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
  33.567 +
  33.568 +    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
  33.569 +        if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
  33.570 +            goto fail;
  33.571 +
  33.572 +    unmap_domain_mem(pl1e);
  33.573 +    return 1;
  33.574 +
  33.575 + fail:
  33.576 +    while ( i-- > 0 )
  33.577 +        put_page_from_l1e(pl1e[i], d);
  33.578 +
  33.579 +    unmap_domain_mem(pl1e);
  33.580 +    return 0;
  33.581 +}
  33.582 +
  33.583 +
  33.584 +static void free_l2_table(struct pfn_info *page)
  33.585 +{
  33.586 +    unsigned long page_nr = page - frame_table;
  33.587 +    l2_pgentry_t *pl2e;
  33.588 +    int i;
  33.589 +
  33.590 +    pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
  33.591 +
  33.592 +    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  33.593 +        put_page_from_l2e(pl2e[i], page_nr);
  33.594 +
  33.595 +    unmap_domain_mem(pl2e);
  33.596 +}
  33.597 +
  33.598 +
  33.599 +static void free_l1_table(struct pfn_info *page)
  33.600 +{
  33.601 +    struct domain *d = page_get_owner(page);
  33.602 +    unsigned long page_nr = page - frame_table;
  33.603 +    l1_pgentry_t *pl1e;
  33.604 +    int i;
  33.605 +
  33.606 +    pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
  33.607 +
  33.608 +    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
  33.609 +        put_page_from_l1e(pl1e[i], d);
  33.610 +
  33.611 +    unmap_domain_mem(pl1e);
  33.612 +}
  33.613 +
  33.614 +
  33.615 +static inline int update_l2e(l2_pgentry_t *pl2e, 
  33.616 +                             l2_pgentry_t  ol2e, 
  33.617 +                             l2_pgentry_t  nl2e)
  33.618 +{
  33.619 +    unsigned long o = cmpxchg((unsigned long *)pl2e, 
  33.620 +                              l2_pgentry_val(ol2e), 
  33.621 +                              l2_pgentry_val(nl2e));
  33.622 +    if ( o != l2_pgentry_val(ol2e) )
  33.623 +        MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
  33.624 +                l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
  33.625 +    return (o == l2_pgentry_val(ol2e));
  33.626 +}
  33.627 +
  33.628 +
  33.629 +/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
  33.630 +static int mod_l2_entry(l2_pgentry_t *pl2e, 
  33.631 +                        l2_pgentry_t nl2e, 
  33.632 +                        unsigned long pfn)
  33.633 +{
  33.634 +    l2_pgentry_t ol2e;
  33.635 +    unsigned long _ol2e;
  33.636 +
  33.637 +    if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
  33.638 +                  DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
  33.639 +    {
  33.640 +        MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
  33.641 +        return 0;
  33.642 +    }
  33.643 +
  33.644 +    if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
  33.645 +        return 0;
  33.646 +    ol2e = mk_l2_pgentry(_ol2e);
  33.647 +
  33.648 +    if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
  33.649 +    {
   33.650 +        /* Same mapping (bits 12-31) and presence (bit 0)? Just update in place. */
  33.651 +        if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
  33.652 +            return update_l2e(pl2e, ol2e, nl2e);
  33.653 +
  33.654 +        if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
  33.655 +                                        ((unsigned long)pl2e & 
  33.656 +                                         ~PAGE_MASK) >> 2)) )
  33.657 +            return 0;
  33.658 +
  33.659 +        if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
  33.660 +        {
  33.661 +            put_page_from_l2e(nl2e, pfn);
  33.662 +            return 0;
  33.663 +        }
  33.664 +        
  33.665 +        put_page_from_l2e(ol2e, pfn);
  33.666 +        return 1;
  33.667 +    }
  33.668 +
  33.669 +    if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
  33.670 +        return 0;
  33.671 +
  33.672 +    put_page_from_l2e(ol2e, pfn);
  33.673 +    return 1;
  33.674 +}
  33.675 +
  33.676 +
  33.677 +static inline int update_l1e(l1_pgentry_t *pl1e, 
  33.678 +                             l1_pgentry_t  ol1e, 
  33.679 +                             l1_pgentry_t  nl1e)
  33.680 +{
  33.681 +    unsigned long o = l1_pgentry_val(ol1e);
  33.682 +    unsigned long n = l1_pgentry_val(nl1e);
  33.683 +
  33.684 +    if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
  33.685 +         unlikely(o != l1_pgentry_val(ol1e)) )
  33.686 +    {
  33.687 +        MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
  33.688 +                l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
  33.689 +        return 0;
  33.690 +    }
  33.691 +
  33.692 +    return 1;
  33.693 +}
  33.694 +
  33.695 +
  33.696 +/* Update the L1 entry at pl1e to new value nl1e. */
  33.697 +static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
  33.698 +{
  33.699 +    l1_pgentry_t ol1e;
  33.700 +    unsigned long _ol1e;
  33.701 +    struct domain *d = current->domain;
  33.702 +
  33.703 +    if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
  33.704 +    {
  33.705 +        MEM_LOG("Bad get_user\n");
  33.706 +        return 0;
  33.707 +    }
  33.708 +    
  33.709 +    ol1e = mk_l1_pgentry(_ol1e);
  33.710 +
  33.711 +    if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
  33.712 +    {
  33.713 +        /* Same mapping (bits 12-31), r/w (bit 1), and presence (bit 0)? */
  33.714 +        if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
  33.715 +            return update_l1e(pl1e, ol1e, nl1e);
  33.716 +
  33.717 +        if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
  33.718 +            return 0;
  33.719 +        
  33.720 +        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
  33.721 +        {
  33.722 +            put_page_from_l1e(nl1e, d);
  33.723 +            return 0;
  33.724 +        }
  33.725 +        
  33.726 +        put_page_from_l1e(ol1e, d);
  33.727 +        return 1;
  33.728 +    }
  33.729 +
  33.730 +    if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
  33.731 +        return 0;
  33.732 +    
  33.733 +    put_page_from_l1e(ol1e, d);
  33.734 +    return 1;
  33.735 +}
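
[Editor's note: mod_l1_entry() and mod_l2_entry() share one ordering discipline: take a reference on the new target first, attempt the atomic update of the entry, then drop whichever reference is no longer needed (the new one on a race, the old one on success). A small stand-alone sketch of that ordering, with toy refcounts and invented names:]

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static int refs[4];                                  /* toy per-frame refcounts */
    static bool get_ref(int f)  { refs[f]++; return true; }
    static void put_ref(int f)  { refs[f]--; }

    static bool update_slot(_Atomic int *slot, int expected_old, int new_frame)
    {
        if (!get_ref(new_frame))                         /* 1. reference new target */
            return false;
        if (!atomic_compare_exchange_strong(slot, &expected_old, new_frame)) {
            put_ref(new_frame);                          /* 2a. raced: undo new ref */
            return false;
        }
        put_ref(expected_old);                           /* 2b. won: drop old ref   */
        return true;
    }

    int main(void)
    {
        _Atomic int slot = 1;
        refs[1] = 1;                                     /* frame 1 mapped once     */
        printf("%d\n", update_slot(&slot, 1, 2));        /* 1                       */
        printf("slot=%d refs[1]=%d refs[2]=%d\n",
               atomic_load(&slot), refs[1], refs[2]);    /* slot=2 refs[1]=0 refs[2]=1 */
        return 0;
    }
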
  33.736 +
  33.737 +
  33.738 +int alloc_page_type(struct pfn_info *page, unsigned int type)
  33.739 +{
  33.740 +    switch ( type )
  33.741 +    {
  33.742 +    case PGT_l1_page_table:
  33.743 +        return alloc_l1_table(page);
  33.744 +    case PGT_l2_page_table:
  33.745 +        return alloc_l2_table(page);
  33.746 +    case PGT_gdt_page:
  33.747 +    case PGT_ldt_page:
  33.748 +        return alloc_segdesc_page(page);
  33.749 +    default:
  33.750 +        printk("Bad type in alloc_page_type %x t=%x c=%x\n", 
  33.751 +               type, page->u.inuse.type_info,
  33.752 +               page->count_info);
  33.753 +        BUG();
  33.754 +    }
  33.755 +
  33.756 +    return 0;
  33.757 +}
  33.758 +
  33.759 +
  33.760 +void free_page_type(struct pfn_info *page, unsigned int type)
  33.761 +{
  33.762 +    struct domain *d = page_get_owner(page);
  33.763 +
  33.764 +    switch ( type )
  33.765 +    {
  33.766 +    case PGT_l1_page_table:
  33.767 +        free_l1_table(page);
  33.768 +        break;
  33.769 +
  33.770 +    case PGT_l2_page_table:
  33.771 +        free_l2_table(page);
  33.772 +        break;
  33.773 +
  33.774 +    default:
  33.775 +        BUG();
  33.776 +    }
  33.777 +
  33.778 +    if ( unlikely(shadow_mode(d)) && 
  33.779 +         (get_shadow_status(d, page_to_pfn(page)) & PSH_shadowed) )
  33.780 +    {
  33.781 +        unshadow_table(page_to_pfn(page), type);
  33.782 +        put_shadow_status(d);
  33.783 +    }
  33.784 +}
  33.785 +
  33.786 +
  33.787 +void put_page_type(struct pfn_info *page)
  33.788 +{
  33.789 +    u32 nx, x, y = page->u.inuse.type_info;
  33.790 +
  33.791 + again:
  33.792 +    do {
  33.793 +        x  = y;
  33.794 +        nx = x - 1;
  33.795 +
  33.796 +        ASSERT((x & PGT_count_mask) != 0);
  33.797 +
  33.798 +        /*
  33.799 +         * The page should always be validated while a reference is held. The 
  33.800 +         * exception is during domain destruction, when we forcibly invalidate 
  33.801 +         * page-table pages if we detect a referential loop.
  33.802 +         * See domain.c:relinquish_list().
  33.803 +         */
  33.804 +        ASSERT((x & PGT_validated) || 
  33.805 +               test_bit(DF_DYING, &page_get_owner(page)->d_flags));
  33.806 +
  33.807 +        if ( unlikely((nx & PGT_count_mask) == 0) )
  33.808 +        {
  33.809 +            /* Record TLB information for flush later. Races are harmless. */
  33.810 +            page->tlbflush_timestamp = tlbflush_current_time();
  33.811 +            
  33.812 +            if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
  33.813 +                 likely(nx & PGT_validated) )
  33.814 +            {
  33.815 +                /*
  33.816 +                 * Page-table pages must be unvalidated when count is zero. The
  33.817 +                 * 'free' is safe because the refcnt is non-zero and validated
  33.818 +                 * bit is clear => other ops will spin or fail.
  33.819 +                 */
  33.820 +                if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 
  33.821 +                                           x & ~PGT_validated)) != x) )
  33.822 +                    goto again;
   33.823 +                /* We cleared the 'valid bit', so we do the clean-up ourselves. */
  33.824 +                free_page_type(page, x & PGT_type_mask);
  33.825 +                /* Carry on, but with the 'valid bit' now clear. */
  33.826 +                x  &= ~PGT_validated;
  33.827 +                nx &= ~PGT_validated;
  33.828 +            }
  33.829 +        }
  33.830 +        else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == 
  33.831 +                           (PGT_pinned | 1)) )
  33.832 +        {
  33.833 +            /* Page is now only pinned. Make the back pointer mutable again. */
  33.834 +            nx |= PGT_va_mutable;
  33.835 +        }
  33.836 +    }
  33.837 +    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
  33.838 +}
  33.839 +
  33.840 +
  33.841 +int get_page_type(struct pfn_info *page, u32 type)
  33.842 +{
  33.843 +    u32 nx, x, y = page->u.inuse.type_info;
  33.844 +
  33.845 + again:
  33.846 +    do {
  33.847 +        x  = y;
  33.848 +        nx = x + 1;
  33.849 +        if ( unlikely((nx & PGT_count_mask) == 0) )
  33.850 +        {
  33.851 +            MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
  33.852 +            return 0;
  33.853 +        }
  33.854 +        else if ( unlikely((x & PGT_count_mask) == 0) )
  33.855 +        {
  33.856 +            if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
  33.857 +            {
  33.858 +                /*
   33.859 +                 * On a type change we check whether stale TLB entries need
   33.860 +                 * flushing. This may be unnecessary (e.g., the page was a
   33.861 +                 * GDT/LDT), but such circumstances should be very rare.
  33.862 +                 */
  33.863 +                struct domain *d = page_get_owner(page);
  33.864 +                if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor],
  33.865 +                                         page->tlbflush_timestamp)) )
  33.866 +                {
  33.867 +                    perfc_incr(need_flush_tlb_flush);
  33.868 +                    flush_tlb_cpu(d->exec_domain[0]->processor);
  33.869 +                }
  33.870 +
  33.871 +                /* We lose existing type, back pointer, and validity. */
  33.872 +                nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
  33.873 +                nx |= type;
  33.874 +
  33.875 +                /* No special validation needed for writable pages. */
  33.876 +                /* Page tables and GDT/LDT need to be scanned for validity. */
  33.877 +                if ( type == PGT_writable_page )
  33.878 +                    nx |= PGT_validated;
  33.879 +            }
  33.880 +        }
  33.881 +        else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
  33.882 +        {
  33.883 +            if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
  33.884 +            {
  33.885 +                if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
  33.886 +                     ((type & PGT_type_mask) != PGT_l1_page_table) )
  33.887 +                    MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
  33.888 +                            x & PGT_type_mask, type, page_to_pfn(page));
  33.889 +                return 0;
  33.890 +            }
  33.891 +            else if ( (x & PGT_va_mask) == PGT_va_mutable )
  33.892 +            {
  33.893 +                /* The va backpointer is mutable, hence we update it. */
  33.894 +                nx &= ~PGT_va_mask;
  33.895 +                nx |= type; /* we know the actual type is correct */
  33.896 +            }
  33.897 +            else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
  33.898 +            {
  33.899 +                /* This table is potentially mapped at multiple locations. */
  33.900 +                nx &= ~PGT_va_mask;
  33.901 +                nx |= PGT_va_unknown;
  33.902 +            }
  33.903 +        }
  33.904 +        else if ( unlikely(!(x & PGT_validated)) )
  33.905 +        {
  33.906 +            /* Someone else is updating validation of this page. Wait... */
  33.907 +            while ( (y = page->u.inuse.type_info) == x )
  33.908 +            {
  33.909 +                rep_nop();
  33.910 +                barrier();
  33.911 +            }
  33.912 +            goto again;
  33.913 +        }
  33.914 +    }
  33.915 +    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
  33.916 +
  33.917 +    if ( unlikely(!(nx & PGT_validated)) )
  33.918 +    {
  33.919 +        /* Try to validate page type; drop the new reference on failure. */
  33.920 +        if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
  33.921 +        {
  33.922 +            MEM_LOG("Error while validating pfn %08lx for type %08x."
  33.923 +                    " caf=%08x taf=%08x\n",
  33.924 +                    page_to_pfn(page), type,
  33.925 +                    page->count_info,
  33.926 +                    page->u.inuse.type_info);
   33.927 +            /* No one else can get a reference: we hold the only one. */
  33.928 +            page->u.inuse.type_info = 0;
  33.929 +            return 0;
  33.930 +        }
  33.931 +
   33.932 +        /* No one else is updating simultaneously. */
  33.933 +        __set_bit(_PGT_validated, &page->u.inuse.type_info);
  33.934 +    }
  33.935 +
  33.936 +    return 1;
  33.937 +}
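
[Editor's note: get_page_type() implements a two-phase handshake: the first taker installs the new type with the validated bit clear, scans the page contents outside the atomic loop (alloc_page_type), then publishes by setting the validated bit; concurrent takers that see the bit clear spin until it is resolved. A sequential sketch of the state transitions only; the bit layout and names are invented and do not match the real type_info encoding.]

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define T_COUNT_MASK 0x0000ffffu
    #define T_TYPE_MASK  0x00ff0000u
    #define T_VALIDATED  0x80000000u

    /* First reference: install the new type with VALIDATED clear, count = 1. */
    static uint32_t claim_type(uint32_t word, uint32_t type)
    {
        return ((word & ~(T_TYPE_MASK | T_VALIDATED)) | type) + 1;
    }

    int main(void)
    {
        uint32_t w = 0;                           /* free page, no typed refs      */
        w = claim_type(w, 0x00010000u);           /* claimed, still unvalidated    */
        printf("claimed:   %08" PRIx32 "\n", w);
        /* ... page contents scanned for safety here (cf. alloc_page_type) ...     */
        w |= T_VALIDATED;                         /* publish: later takers succeed */
        printf("published: %08" PRIx32 "\n", w);
        /* A second taker that saw VALIDATED clear would spin-wait instead.        */
        return 0;
    }
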
  33.938 +
  33.939 +
  33.940 +int new_guest_cr3(unsigned long pfn)
  33.941 +{
  33.942 +    struct exec_domain *ed = current;
  33.943 +    struct domain *d = ed->domain;
  33.944 +    int okay, cpu = smp_processor_id();
  33.945 +    unsigned long old_base_pfn;
  33.946 +    
  33.947 +    okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
  33.948 +    if ( likely(okay) )
  33.949 +    {
  33.950 +        invalidate_shadow_ldt(ed);
  33.951 +
  33.952 +        percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
  33.953 +        old_base_pfn = pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT;
  33.954 +        ed->arch.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
  33.955 +
  33.956 +        shadow_mk_pagetable(ed);
  33.957 +
  33.958 +        write_ptbase(ed);
  33.959 +
  33.960 +        put_page_and_type(&frame_table[old_base_pfn]);
  33.961 +    }
  33.962 +    else
  33.963 +    {
  33.964 +        MEM_LOG("Error while installing new baseptr %08lx", pfn);
  33.965 +    }
  33.966 +
  33.967 +    return okay;
  33.968 +}
  33.969 +
  33.970 +static int do_extended_command(unsigned long ptr, unsigned long val)
  33.971 +{
  33.972 +    int okay = 1, cpu = smp_processor_id();
  33.973 +    unsigned int cmd = val & MMUEXT_CMD_MASK;
  33.974 +    unsigned long pfn = ptr >> PAGE_SHIFT;
  33.975 +    struct pfn_info *page = &frame_table[pfn];
  33.976 +    struct exec_domain *ed = current;
  33.977 +    struct domain *d = ed->domain, *nd, *e;
  33.978 +    u32 x, y;
  33.979 +    domid_t domid;
  33.980 +    grant_ref_t gntref;
  33.981 +
  33.982 +    switch ( cmd )
  33.983 +    {
  33.984 +    case MMUEXT_PIN_L1_TABLE:
  33.985 +    case MMUEXT_PIN_L2_TABLE:
  33.986 +        /*
  33.987 +         * We insist that, if you pin an L1 page, it's the first thing that
  33.988 +         * you do to it. This is because we require the backptr to still be
  33.989 +         * mutable. This assumption seems safe.
  33.990 +         */
  33.991 +        okay = get_page_and_type_from_pagenr(
  33.992 +            pfn, 
  33.993 +            ((cmd==MMUEXT_PIN_L2_TABLE) ? 
  33.994 +             PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
  33.995 +            FOREIGNDOM);
  33.996 +
  33.997 +        if ( unlikely(!okay) )
  33.998 +        {
  33.999 +            MEM_LOG("Error while pinning pfn %08lx", pfn);
 33.1000 +            break;
 33.1001 +        }
 33.1002 +
 33.1003 +        if ( unlikely(test_and_set_bit(_PGT_pinned,
 33.1004 +                                       &page->u.inuse.type_info)) )
 33.1005 +        {
 33.1006 +            MEM_LOG("Pfn %08lx already pinned", pfn);
 33.1007 +            put_page_and_type(page);
 33.1008 +            okay = 0;
 33.1009 +            break;
 33.1010 +        }
 33.1011 +
 33.1012 +        break;
 33.1013 +
 33.1014 +    case MMUEXT_UNPIN_TABLE:
 33.1015 +        if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
 33.1016 +        {
 33.1017 +            MEM_LOG("Page %08lx bad domain (dom=%p)",
 33.1018 +                    ptr, page_get_owner(page));
 33.1019 +        }
 33.1020 +        else if ( likely(test_and_clear_bit(_PGT_pinned, 
 33.1021 +                                            &page->u.inuse.type_info)) )
 33.1022 +        {
 33.1023 +            put_page_and_type(page);
 33.1024 +            put_page(page);
 33.1025 +        }
 33.1026 +        else
 33.1027 +        {
 33.1028 +            okay = 0;
 33.1029 +            put_page(page);
 33.1030 +            MEM_LOG("Pfn %08lx not pinned", pfn);
 33.1031 +        }
 33.1032 +        break;
 33.1033 +
 33.1034 +    case MMUEXT_NEW_BASEPTR:
 33.1035 +        okay = new_guest_cr3(pfn);
 33.1036 +        break;
 33.1037 +        
 33.1038 +    case MMUEXT_TLB_FLUSH:
 33.1039 +        percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
 33.1040 +        break;
 33.1041 +    
 33.1042 +    case MMUEXT_INVLPG:
 33.1043 +        __flush_tlb_one(ptr);
 33.1044 +        break;
 33.1045 +
 33.1046 +    case MMUEXT_FLUSH_CACHE:
 33.1047 +        if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
 33.1048 +        {
 33.1049 +            MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
 33.1050 +            okay = 0;
 33.1051 +        }
 33.1052 +        else
 33.1053 +        {
 33.1054 +            wbinvd();
 33.1055 +        }
 33.1056 +        break;
 33.1057 +
 33.1058 +    case MMUEXT_SET_LDT:
 33.1059 +    {
 33.1060 +        unsigned long ents = val >> MMUEXT_CMD_SHIFT;
 33.1061 +        if ( ((ptr & (PAGE_SIZE-1)) != 0) || 
 33.1062 +             (ents > 8192) ||
 33.1063 +             ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
 33.1064 +             ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
 33.1065 +        {
 33.1066 +            okay = 0;
 33.1067 +            MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
 33.1068 +        }
 33.1069 +        else if ( (ed->arch.ldt_ents != ents) || 
 33.1070 +                  (ed->arch.ldt_base != ptr) )
 33.1071 +        {
 33.1072 +            invalidate_shadow_ldt(ed);
 33.1073 +            ed->arch.ldt_base = ptr;
 33.1074 +            ed->arch.ldt_ents = ents;
 33.1075 +            load_LDT(ed);
 33.1076 +            percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
 33.1077 +            if ( ents != 0 )
 33.1078 +                percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
 33.1079 +        }
 33.1080 +        break;
 33.1081 +    }
 33.1082 +
 33.1083 +    case MMUEXT_SET_FOREIGNDOM:
 33.1084 +        domid = (domid_t)(val >> 16);
 33.1085 +
 33.1086 +        if ( (e = percpu_info[cpu].foreign) != NULL )
 33.1087 +            put_domain(e);
 33.1088 +        percpu_info[cpu].foreign = NULL;
 33.1089 +
 33.1090 +        if ( !IS_PRIV(d) )
 33.1091 +        {
 33.1092 +            switch ( domid )
 33.1093 +            {
 33.1094 +            case DOMID_IO:
 33.1095 +                get_knownalive_domain(dom_io);
 33.1096 +                percpu_info[cpu].foreign = dom_io;
 33.1097 +                break;
 33.1098 +            default:
 33.1099 +                MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
 33.1100 +                okay = 0;
 33.1101 +                break;
 33.1102 +            }
 33.1103 +        }
 33.1104 +        else
 33.1105 +        {
 33.1106 +            percpu_info[cpu].foreign = e = find_domain_by_id(domid);
 33.1107 +            if ( e == NULL )
 33.1108 +            {
 33.1109 +                switch ( domid )
 33.1110 +                {
 33.1111 +                case DOMID_XEN:
 33.1112 +                    get_knownalive_domain(dom_xen);
 33.1113 +                    percpu_info[cpu].foreign = dom_xen;
 33.1114 +                    break;
 33.1115 +                case DOMID_IO:
 33.1116 +                    get_knownalive_domain(dom_io);
 33.1117 +                    percpu_info[cpu].foreign = dom_io;
 33.1118 +                    break;
 33.1119 +                default:
 33.1120 +                    MEM_LOG("Unknown domain '%u'", domid);
 33.1121 +                    okay = 0;
 33.1122 +                    break;
 33.1123 +                }
 33.1124 +            }
 33.1125 +        }
 33.1126 +        break;
 33.1127 +
 33.1128 +    case MMUEXT_TRANSFER_PAGE:
 33.1129 +        domid  = (domid_t)(val >> 16);
 33.1130 +        gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
 33.1131 +        
 33.1132 +        if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
 33.1133 +             unlikely(!pfn_is_ram(pfn)) ||
 33.1134 +             unlikely((e = find_domain_by_id(domid)) == NULL) )
 33.1135 +        {
 33.1136 +            MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
 33.1137 +            okay = 0;
 33.1138 +            break;
 33.1139 +        }
 33.1140 +
 33.1141 +        spin_lock(&d->page_alloc_lock);
 33.1142 +
 33.1143 +        /*
 33.1144 +         * The tricky bit: atomically release ownership while there is just one
 33.1145 +         * benign reference to the page (PGC_allocated). If that reference
 33.1146 +         * disappears then the deallocation routine will safely spin.
 33.1147 +         */
 33.1148 +        nd = page_get_owner(page);
 33.1149 +        y  = page->count_info;
 33.1150 +        do {
 33.1151 +            x = y;
 33.1152 +            if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != 
 33.1153 +                          (1|PGC_allocated)) ||
 33.1154 +                 unlikely(nd != d) )
 33.1155 +            {
 33.1156 +                MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
 33.1157 +                        " caf=%08x, taf=%08x\n", page_to_pfn(page),
 33.1158 +                        d, d->id, nd, x, page->u.inuse.type_info);
 33.1159 +                spin_unlock(&d->page_alloc_lock);
 33.1160 +                put_domain(e);
 33.1161 +                return 0;
 33.1162 +            }
 33.1163 +            __asm__ __volatile__(
 33.1164 +                LOCK_PREFIX "cmpxchg8b %2"
 33.1165 +                : "=d" (nd), "=a" (y),
 33.1166 +                "=m" (*(volatile u64 *)(&page->count_info))
 33.1167 +                : "0" (d), "1" (x), "c" (NULL), "b" (x) );
 33.1168 +        } 
 33.1169 +        while ( unlikely(nd != d) || unlikely(y != x) );
 33.1170 +
 33.1171 +        /*
 33.1172 +         * Unlink from 'd'. At least one reference remains (now anonymous), so
  33.1173 +         * no one else is spinning to try to delete this page from 'd'.
 33.1174 +         */
 33.1175 +        d->tot_pages--;
 33.1176 +        list_del(&page->list);
 33.1177 +        
 33.1178 +        spin_unlock(&d->page_alloc_lock);
 33.1179 +
 33.1180 +        spin_lock(&e->page_alloc_lock);
 33.1181 +
 33.1182 +        /*
 33.1183 +         * Check that 'e' will accept the page and has reservation headroom.
 33.1184 +         * Also, a domain mustn't have PGC_allocated pages when it is dying.
 33.1185 +         */
 33.1186 +        ASSERT(e->tot_pages <= e->max_pages);
 33.1187 +        if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
 33.1188 +             unlikely(e->tot_pages == e->max_pages) ||
 33.1189 +             unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
 33.1190 +        {
 33.1191 +            MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
 33.1192 +                    "provided a bad grant ref, or is dying (%08lx).\n",
 33.1193 +                    e->tot_pages, e->max_pages, e->d_flags);
 33.1194 +            spin_unlock(&e->page_alloc_lock);
 33.1195 +            put_domain(e);
 33.1196 +            okay = 0;
 33.1197 +            break;
 33.1198 +        }
 33.1199 +
 33.1200 +        /* Okay, add the page to 'e'. */
 33.1201 +        if ( unlikely(e->tot_pages++ == 0) )
 33.1202 +            get_knownalive_domain(e);
 33.1203 +        list_add_tail(&page->list, &e->page_list);
 33.1204 +        page_set_owner(page, e);
 33.1205 +
 33.1206 +        spin_unlock(&e->page_alloc_lock);
 33.1207 +
 33.1208 +        /* Transfer is all done: tell the guest about its new page frame. */
 33.1209 +        gnttab_notify_transfer(e, gntref, pfn);
 33.1210 +        
 33.1211 +        put_domain(e);
 33.1212 +        break;
 33.1213 +
 33.1214 +    case MMUEXT_REASSIGN_PAGE:
 33.1215 +        if ( unlikely(!IS_PRIV(d)) )
 33.1216 +        {
 33.1217 +            MEM_LOG("Dom %u has no reassignment priv", d->id);
 33.1218 +            okay = 0;
 33.1219 +            break;
 33.1220 +        }
 33.1221 +
 33.1222 +        e = percpu_info[cpu].foreign;
 33.1223 +        if ( unlikely(e == NULL) )
 33.1224 +        {
 33.1225 +            MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
 33.1226 +            okay = 0;
 33.1227 +            break;
 33.1228 +        }
 33.1229 +
 33.1230 +        /*
 33.1231 +         * Grab both page_list locks, in order. This prevents the page from
 33.1232 +         * disappearing elsewhere while we modify the owner, and we'll need
 33.1233 +         * both locks if we're successful so that we can change lists.
 33.1234 +         */
 33.1235 +        if ( d < e )
 33.1236 +        {
 33.1237 +            spin_lock(&d->page_alloc_lock);
 33.1238 +            spin_lock(&e->page_alloc_lock);
 33.1239 +        }
 33.1240 +        else
 33.1241 +        {
 33.1242 +            spin_lock(&e->page_alloc_lock);
 33.1243 +            spin_lock(&d->page_alloc_lock);
 33.1244 +        }
 33.1245 +
 33.1246 +        /* A domain shouldn't have PGC_allocated pages when it is dying. */
 33.1247 +        if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
 33.1248 +             unlikely(IS_XEN_HEAP_FRAME(page)) )
 33.1249 +        {
 33.1250 +            MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
 33.1251 +            okay = 0;
 33.1252 +            goto reassign_fail;
 33.1253 +        }
 33.1254 +
 33.1255 +        /*
 33.1256 +         * The tricky bit: atomically change owner while there is just one
 33.1257 +         * benign reference to the page (PGC_allocated). If that reference
 33.1258 +         * disappears then the deallocation routine will safely spin.
 33.1259 +         */
 33.1260 +        nd = page_get_owner(page);
 33.1261 +        y  = page->count_info;
 33.1262 +        do {
 33.1263 +            x = y;
 33.1264 +            if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != 
 33.1265 +                          (1|PGC_allocated)) ||
 33.1266 +                 unlikely(nd != d) )
 33.1267 +            {
 33.1268 +                MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
 33.1269 +                        " caf=%08x, taf=%08x\n", page_to_pfn(page),
 33.1270 +                        d, d->id, nd, x, page->u.inuse.type_info);
 33.1271 +                okay = 0;
 33.1272 +                goto reassign_fail;
 33.1273 +            }
 33.1274 +            __asm__ __volatile__(
 33.1275 +                LOCK_PREFIX "cmpxchg8b %3"
 33.1276 +                : "=d" (nd), "=a" (y), "=c" (e),
 33.1277 +                "=m" (*(volatile u64 *)(&page->count_info))
 33.1278 +                : "0" (d), "1" (x), "c" (e), "b" (x) );
 33.1279 +        } 
 33.1280 +        while ( unlikely(nd != d) || unlikely(y != x) );
 33.1281 +        
 33.1282 +        /*
 33.1283 +         * Unlink from 'd'. We transferred at least one reference to 'e', so
  33.1284 +         * no one else is spinning to try to delete this page from 'd'.
 33.1285 +         */
 33.1286 +        d->tot_pages--;
 33.1287 +        list_del(&page->list);
 33.1288 +        
 33.1289 +        /*
 33.1290 +         * Add the page to 'e'. Someone may already have removed the last
 33.1291 +         * reference and want to remove the page from 'e'. However, we have
 33.1292 +         * the lock so they'll spin waiting for us.
 33.1293 +         */
 33.1294 +        if ( unlikely(e->tot_pages++ == 0) )
 33.1295 +            get_knownalive_domain(e);
 33.1296 +        list_add_tail(&page->list, &e->page_list);
 33.1297 +
 33.1298 +    reassign_fail:        
 33.1299 +        spin_unlock(&d->page_alloc_lock);
 33.1300 +        spin_unlock(&e->page_alloc_lock);
 33.1301 +        break;
 33.1302 +
 33.1303 +    case MMUEXT_CLEAR_FOREIGNDOM:
 33.1304 +        if ( (e = percpu_info[cpu].foreign) != NULL )
 33.1305 +            put_domain(e);
 33.1306 +        percpu_info[cpu].foreign = NULL;
 33.1307 +        break;
 33.1308 +
 33.1309 +    default:
 33.1310 +        MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
 33.1311 +        okay = 0;
 33.1312 +        break;
 33.1313 +    }
 33.1314 +
 33.1315 +    return okay;
 33.1316 +}
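
[Editor's note: the MMUEXT_TRANSFER_PAGE and MMUEXT_REASSIGN_PAGE cases above change a page's owner with a 64-bit cmpxchg8b so that the owner pointer and the count word are checked and swapped as one unit, succeeding only while exactly one benign reference remains. A stand-alone sketch of the same idea with a packed 64-bit word and C11 atomics; the packing and names are invented, and the real code also checks PGC_allocated.]

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Pack (owner id, reference count) into one word so both are CAS'd at once. */
    #define PACK(owner, count)  (((uint64_t)(owner) << 32) | (uint32_t)(count))

    static int reassign(_Atomic uint64_t *word, uint32_t from, uint32_t to)
    {
        uint64_t expect = PACK(from, 1);          /* old owner, exactly one ref    */
        return atomic_compare_exchange_strong(word, &expect, PACK(to, 1));
    }

    int main(void)
    {
        _Atomic uint64_t page = PACK(1, 1);       /* owned by domain 1, one ref    */
        printf("%d\n", reassign(&page, 1, 2));    /* 1: ownership moved to dom 2   */
        printf("%d\n", reassign(&page, 1, 3));    /* 0: owner no longer matches    */
        return 0;
    }
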
 33.1317 +
 33.1318 +int do_mmu_update(
 33.1319 +    mmu_update_t *ureqs, unsigned int count, unsigned int *pdone)
 33.1320 +{
 33.1321 +/*
 33.1322 + * We steal the m.s.b. of the @count parameter to indicate whether this
 33.1323 + * invocation of do_mmu_update() is resuming a previously preempted call.
 33.1324 + * We steal the next 15 bits to remember the current FOREIGNDOM.
 33.1325 + */
 33.1326 +#define MMU_UPDATE_PREEMPTED          (~(~0U>>1))
 33.1327 +#define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16)
 33.1328 +#define MMU_UPDATE_PREEMPT_FDOM_MASK  (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
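
[Editor's note, worked example of this encoding (illustrative only): a call preempted with 100 requests still to do while FOREIGNDOM was domain 5 is resumed with count = 100 | (5 << MMU_UPDATE_PREEMPT_FDOM_SHIFT) | MMU_UPDATE_PREEMPTED, i.e. domid in bits 16-30 and the preempted flag in bit 31. The decode below strips MMU_UPDATE_PREEMPTED, extracts domid = 5 from bits 16-30, and leaves count = 100.]
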
 33.1329 +
 33.1330 +    mmu_update_t req;
 33.1331 +    unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
 33.1332 +    struct pfn_info *page;
 33.1333 +    int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
 33.1334 +    unsigned int cmd, done = 0;
 33.1335 +    unsigned long prev_smfn = 0;
 33.1336 +    l1_pgentry_t *prev_spl1e = 0;
 33.1337 +    struct exec_domain *ed = current;
 33.1338 +    struct domain *d = ed->domain;
 33.1339 +    u32 type_info;
 33.1340 +    domid_t domid;
 33.1341 +
 33.1342 +    LOCK_BIGLOCK(d);
 33.1343 +
 33.1344 +    cleanup_writable_pagetable(d);
 33.1345 +
 33.1346 +    if ( unlikely(shadow_mode(d)) )
 33.1347 +        check_pagetable(d, ed->arch.pagetable, "pre-mmu"); /* debug */
 33.1348 +
 33.1349 +    /*
 33.1350 +     * If we are resuming after preemption, read how much work we have already
 33.1351 +     * done. This allows us to set the @done output parameter correctly.
 33.1352 +     * We also reset FOREIGNDOM here.
 33.1353 +     */
 33.1354 +    if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
 33.1355 +    {
 33.1356 +        if ( !(count & MMU_UPDATE_PREEMPTED) )
 33.1357 +        {
 33.1358 +            /* Count overflow into private FOREIGNDOM field. */
 33.1359 +            MEM_LOG("do_mmu_update count is too large");
 33.1360 +            rc = -EINVAL;
 33.1361 +            goto out;
 33.1362 +        }
 33.1363 +        count &= ~MMU_UPDATE_PREEMPTED;
 33.1364 +        domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
 33.1365 +        count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
 33.1366 +        if ( unlikely(pdone != NULL) )
 33.1367 +            (void)get_user(done, pdone);
 33.1368 +        if ( (domid != current->domain->id) &&
 33.1369 +             !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
 33.1370 +        {
 33.1371 +            rc = -EINVAL;
 33.1372 +            goto out;
 33.1373 +        }
 33.1374 +    }
 33.1375 +
 33.1376 +    perfc_incrc(calls_to_mmu_update); 
 33.1377 +    perfc_addc(num_page_updates, count);
 33.1378 +
 33.1379 +    if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
 33.1380 +    {
 33.1381 +        rc = -EFAULT;
 33.1382 +        goto out;
 33.1383 +    }
 33.1384 +
 33.1385 +    for ( i = 0; i < count; i++ )
 33.1386 +    {
 33.1387 +        if ( hypercall_preempt_check() )
 33.1388 +        {
 33.1389 +            rc = hypercall3_create_continuation(
 33.1390 +                __HYPERVISOR_mmu_update, ureqs, 
 33.1391 +                (count - i) |
 33.1392 +                (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) | 
 33.1393 +                MMU_UPDATE_PREEMPTED, pdone);
 33.1394 +            break;
 33.1395 +        }
 33.1396 +
 33.1397 +        if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
 33.1398 +        {
 33.1399 +            MEM_LOG("Bad __copy_from_user");
 33.1400 +            rc = -EFAULT;
 33.1401 +            break;
 33.1402 +        }
 33.1403 +
 33.1404 +        cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
 33.1405 +        pfn = req.ptr >> PAGE_SHIFT;
 33.1406 +
 33.1407 +        okay = 0;
 33.1408 +
 33.1409 +        switch ( cmd )
 33.1410 +        {
 33.1411 +            /*
 33.1412 +             * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
 33.1413 +             */
 33.1414 +        case MMU_NORMAL_PT_UPDATE:
 33.1415 +            if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
 33.1416 +            {
 33.1417 +                MEM_LOG("Could not get page for normal update");
 33.1418 +                break;
 33.1419 +            }
 33.1420 +
 33.1421 +            if ( likely(prev_pfn == pfn) )
 33.1422 +            {
 33.1423 +                va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
 33.1424 +            }
 33.1425 +            else
 33.1426 +            {
 33.1427 +                if ( prev_pfn != 0 )
 33.1428 +                    unmap_domain_mem((void *)va);
 33.1429 +                va = (unsigned long)map_domain_mem(req.ptr);
 33.1430 +                prev_pfn = pfn;
 33.1431 +            }
 33.1432 +
 33.1433 +            page = &frame_table[pfn];
 33.1434 +            switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
 33.1435 +            {
 33.1436 +            case PGT_l1_page_table: 
 33.1437 +                if ( likely(get_page_type(
 33.1438 +                    page, type_info & (PGT_type_mask|PGT_va_mask))) )
 33.1439 +                {
 33.1440 +                    okay = mod_l1_entry((l1_pgentry_t *)va, 
 33.1441 +                                        mk_l1_pgentry(req.val)); 
 33.1442 +
 33.1443 +                    if ( unlikely(shadow_mode(d)) && okay &&
 33.1444 +                         (get_shadow_status(d, page-frame_table) &
 33.1445 +                          PSH_shadowed) )
 33.1446 +                    {
 33.1447 +                        shadow_l1_normal_pt_update(
 33.1448 +                            req.ptr, req.val, &prev_smfn, &prev_spl1e);
 33.1449 +                        put_shadow_status(d);
 33.1450 +                    }
 33.1451 +
 33.1452 +                    put_page_type(page);
 33.1453 +                }
 33.1454 +                break;
 33.1455 +            case PGT_l2_page_table:
 33.1456 +                if ( likely(get_page_type(page, PGT_l2_page_table)) )
 33.1457 +                {
 33.1458 +                    okay = mod_l2_entry((l2_pgentry_t *)va, 
 33.1459 +                                        mk_l2_pgentry(req.val),
 33.1460 +                                        pfn); 
 33.1461 +
 33.1462 +                    if ( unlikely(shadow_mode(d)) && okay &&
 33.1463 +                         (get_shadow_status(d, page-frame_table) & 
 33.1464 +                          PSH_shadowed) )
 33.1465 +                    {
 33.1466 +                        shadow_l2_normal_pt_update(req.ptr, req.val);
 33.1467 +                        put_shadow_status(d);
 33.1468 +                    }
 33.1469 +
 33.1470 +                    put_page_type(page);
 33.1471 +                }
 33.1472 +                break;
 33.1473 +            default:
 33.1474 +                if ( likely(get_page_type(page, PGT_writable_page)) )
 33.1475 +                {
 33.1476 +                    *(unsigned long *)va = req.val;
 33.1477 +                    okay = 1;
 33.1478 +                    put_page_type(page);
 33.1479 +                }
 33.1480 +                break;
 33.1481 +            }
 33.1482 +
 33.1483 +            put_page(page);
 33.1484 +            break;
 33.1485 +
 33.1486 +        case MMU_MACHPHYS_UPDATE:
 33.1487 +            if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
 33.1488 +            {
 33.1489 +                MEM_LOG("Could not get page for mach->phys update");
 33.1490 +                break;
 33.1491 +            }
 33.1492 +
 33.1493 +            machine_to_phys_mapping[pfn] = req.val;
 33.1494 +            okay = 1;
 33.1495 +
 33.1496 +            /*
 33.1497 +             * If in log-dirty mode, mark the corresponding pseudo-physical
 33.1498 +             * page as dirty.
 33.1499 +             */
 33.1500 +            if ( unlikely(shadow_mode(d) == SHM_logdirty) && 
 33.1501 +                 mark_dirty(d, pfn) )
 33.1502 +                d->arch.shadow_dirty_block_count++;
 33.1503 +
 33.1504 +            put_page(&frame_table[pfn]);
 33.1505 +            break;
 33.1506 +
 33.1507 +            /*
 33.1508 +             * MMU_EXTENDED_COMMAND: Extended command is specified
 33.1509 +             * in the least-significant bits of the 'value' field.
 33.1510 +             */
 33.1511 +        case MMU_EXTENDED_COMMAND:
 33.1512 +            req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
 33.1513 +            okay = do_extended_command(req.ptr, req.val);
 33.1514 +            break;
 33.1515 +
 33.1516 +        default:
 33.1517 +            MEM_LOG("Invalid page update command %08lx", req.ptr);
 33.1518 +            break;
 33.1519 +        }
 33.1520 +
 33.1521 +        if ( unlikely(!okay) )
 33.1522 +        {
 33.1523 +            rc = -EINVAL;
 33.1524 +            break;
 33.1525 +        }
 33.1526 +
 33.1527 +        ureqs++;
 33.1528 +    }
 33.1529 +
 33.1530 + out:
 33.1531 +    if ( prev_pfn != 0 )
 33.1532 +        unmap_domain_mem((void *)va);
 33.1533 +
 33.1534 +    if ( unlikely(prev_spl1e != 0) ) 
 33.1535 +        unmap_domain_mem((void *)prev_spl1e);
 33.1536 +
 33.1537 +    deferred_ops = percpu_info[cpu].deferred_ops;
 33.1538 +    percpu_info[cpu].deferred_ops = 0;
 33.1539 +
 33.1540 +    if ( deferred_ops & DOP_FLUSH_TLB )
 33.1541 +        local_flush_tlb();
 33.1542 +        
 33.1543 +    if ( deferred_ops & DOP_RELOAD_LDT )
 33.1544 +        (void)map_ldt_shadow_page(0);
 33.1545 +
 33.1546 +    if ( unlikely(percpu_info[cpu].foreign != NULL) )
 33.1547 +    {
 33.1548 +        put_domain(percpu_info[cpu].foreign);
 33.1549 +        percpu_info[cpu].foreign = NULL;
 33.1550 +    }
 33.1551 +
 33.1552 +    /* Add incremental work we have done to the @done output parameter. */
 33.1553 +    if ( unlikely(pdone != NULL) )
 33.1554 +        __put_user(done + i, pdone);
 33.1555 +
 33.1556 +    if ( unlikely(shadow_mode(d)) )
 33.1557 +        check_pagetable(d, ed->arch.pagetable, "post-mmu"); /* debug */
 33.1558 +
 33.1559 +    UNLOCK_BIGLOCK(d);
 33.1560 +    return rc;
 33.1561 +}
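do_mmu_update() above consumes an array of two-word requests; the low bits of each 'ptr' select the sub-command (the MMU_EXTENDED_COMMAND case strips them before use) and the remaining bits give the machine address to operate on. A minimal caller-side sketch, assuming the request layout implied by the handler's req.ptr/req.val accesses and constant names from the public interface headers; the helper itself is hypothetical:

    /* Hypothetical helper: queue one normal page-table write for do_mmu_update().
     * The low bits of 'ptr' carry the sub-command; the rest is the machine
     * address of the PTE to be written with 'val'. */
    typedef struct { unsigned long ptr, val; } mmu_update_t;

    static void queue_pt_write(mmu_update_t *req,
                               unsigned long pte_machine_addr,
                               unsigned long new_pte)
    {
        req->ptr = pte_machine_addr | MMU_NORMAL_PT_UPDATE;
        req->val = new_pte;
    }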
 33.1562 +
 33.1563 +
 33.1564 +int do_update_va_mapping(unsigned long va,
 33.1565 +                         unsigned long val, 
 33.1566 +                         unsigned long flags)
 33.1567 +{
 33.1568 +    struct exec_domain *ed = current;
 33.1569 +    struct domain *d = ed->domain;
 33.1570 +    int err = 0;
 33.1571 +    unsigned int cpu = ed->processor;
 33.1572 +    unsigned long deferred_ops;
 33.1573 +
 33.1574 +    perfc_incrc(calls_to_update_va);
 33.1575 +
 33.1576 +    if ( unlikely(!__addr_ok(va)) )
 33.1577 +        return -EINVAL;
 33.1578 +
 33.1579 +    LOCK_BIGLOCK(d);
 33.1580 +
 33.1581 +    cleanup_writable_pagetable(d);
 33.1582 +
 33.1583 +    /*
 33.1584 +     * XXX When we make this support 4MB superpages we should also deal with 
 33.1585 +     * the case of updating L2 entries.
 33.1586 +     */
 33.1587 +
 33.1588 +    if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
 33.1589 +                                mk_l1_pgentry(val))) )
 33.1590 +        err = -EINVAL;
 33.1591 +
 33.1592 +    if ( unlikely(shadow_mode(d)) )
 33.1593 +    {
 33.1594 +        unsigned long sval = 0;
 33.1595 +
 33.1596 +        l1pte_propagate_from_guest(d, &val, &sval);
 33.1597 +
 33.1598 +        if ( unlikely(__put_user(sval, ((unsigned long *)(
 33.1599 +            &shadow_linear_pg_table[l1_linear_offset(va)])))) )
 33.1600 +        {
 33.1601 +            /*
 33.1602 +             * Since L2s are guaranteed RW, failure indicates the page was not
 33.1603 +             * shadowed, so ignore.
 33.1604 +             */
 33.1605 +            perfc_incrc(shadow_update_va_fail);
 33.1606 +        }
 33.1607 +
 33.1608 +        /*
 33.1609 +         * If we're in log-dirty mode then we need to note that we've updated
 33.1610 +         * the PTE in the PT-holding page. We need the machine frame number
 33.1611 +         * for this.
 33.1612 +         */
 33.1613 +        if ( shadow_mode(d) == SHM_logdirty )
 33.1614 +            mark_dirty(d, va_to_l1mfn(va));
 33.1615 +  
 33.1616 +        check_pagetable(d, ed->arch.pagetable, "va"); /* debug */
 33.1617 +    }
 33.1618 +
 33.1619 +    deferred_ops = percpu_info[cpu].deferred_ops;
 33.1620 +    percpu_info[cpu].deferred_ops = 0;
 33.1621 +
 33.1622 +    if ( unlikely(deferred_ops & DOP_FLUSH_TLB) || 
 33.1623 +         unlikely(flags & UVMF_FLUSH_TLB) )
 33.1624 +        local_flush_tlb();
 33.1625 +    else if ( unlikely(flags & UVMF_INVLPG) )
 33.1626 +        __flush_tlb_one(va);
 33.1627 +
 33.1628 +    if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
 33.1629 +        (void)map_ldt_shadow_page(0);
 33.1630 +    
 33.1631 +    UNLOCK_BIGLOCK(d);
 33.1632 +
 33.1633 +    return err;
 33.1634 +}
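The flags argument lets the caller choose the cheapest TLB maintenance: UVMF_INVLPG for a single-entry invalidation, UVMF_FLUSH_TLB for a full flush after a batch of changes. A guest-side usage sketch; the xen_update_va_mapping wrapper below is assumed for illustration and is not part of this changeset:

    /* Assumed guest-side wrapper around the update_va_mapping hypercall. */
    extern int xen_update_va_mapping(unsigned long va, unsigned long new_pte,
                                     unsigned long flags);

    /* One PTE changed: a single-entry invalidation is sufficient. */
    static int remap_one_page(unsigned long va, unsigned long new_pte)
    {
        return xen_update_va_mapping(va, new_pte, UVMF_INVLPG);
    }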
 33.1635 +
 33.1636 +int do_update_va_mapping_otherdomain(unsigned long va,
 33.1637 +                                     unsigned long val, 
 33.1638 +                                     unsigned long flags,
 33.1639 +                                     domid_t domid)
 33.1640 +{
 33.1641 +    unsigned int cpu = smp_processor_id();
 33.1642 +    struct domain *d;
 33.1643 +    int rc;
 33.1644 +
 33.1645 +    if ( unlikely(!IS_PRIV(current->domain)) )
 33.1646 +        return -EPERM;
 33.1647 +
 33.1648 +    percpu_info[cpu].foreign = d = find_domain_by_id(domid);
 33.1649 +    if ( unlikely(d == NULL) )
 33.1650 +    {
 33.1651 +        MEM_LOG("Unknown domain '%u'", domid);
 33.1652 +        return -ESRCH;
 33.1653 +    }
 33.1654 +
 33.1655 +    rc = do_update_va_mapping(va, val, flags);
 33.1656 +
 33.1657 +    put_domain(d);
 33.1658 +    percpu_info[cpu].foreign = NULL;
 33.1659 +
 33.1660 +    return rc;
 33.1661 +}
 33.1662 +
 33.1663 +
 33.1664 +
 33.1665 +/*************************
 33.1666 + * Descriptor Tables
 33.1667 + */
 33.1668 +
 33.1669 +void destroy_gdt(struct exec_domain *ed)
 33.1670 +{
 33.1671 +    int i;
 33.1672 +    unsigned long pfn;
 33.1673 +
 33.1674 +    for ( i = 0; i < 16; i++ )
 33.1675 +    {
 33.1676 +        if ( (pfn = l1_pgentry_to_pfn(ed->arch.perdomain_ptes[i])) != 0 )
 33.1677 +            put_page_and_type(&frame_table[pfn]);
 33.1678 +        ed->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
 33.1679 +    }
 33.1680 +}
 33.1681 +
 33.1682 +
 33.1683 +long set_gdt(struct exec_domain *ed, 
 33.1684 +             unsigned long *frames,
 33.1685 +             unsigned int entries)
 33.1686 +{
 33.1687 +    struct domain *d = ed->domain;
 33.1688 +    /* NB. There are 512 8-byte entries per GDT page. */
 33.1689 +    int i = 0, nr_pages = (entries + 511) / 512;
 33.1690 +    struct desc_struct *vgdt;
 33.1691 +    unsigned long pfn;
 33.1692 +
 33.1693 +    /* Check the first page in the new GDT. */
 33.1694 +    if ( (pfn = frames[0]) >= max_page )
 33.1695 +        goto fail;
 33.1696 +
 33.1697 +    /* The first page is special because Xen owns a range of entries in it. */
 33.1698 +    if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
 33.1699 +    {
 33.1700 +        /* GDT checks failed: try zapping the Xen reserved entries. */
 33.1701 +        if ( !get_page_and_type(&frame_table[pfn], d, PGT_writable_page) )
 33.1702 +            goto fail;
 33.1703 +        vgdt = map_domain_mem(pfn << PAGE_SHIFT);
 33.1704 +        memset(vgdt + FIRST_RESERVED_GDT_ENTRY, 0,
 33.1705 +               NR_RESERVED_GDT_ENTRIES*8);
 33.1706 +        unmap_domain_mem(vgdt);
 33.1707 +        put_page_and_type(&frame_table[pfn]);
 33.1708 +
 33.1709 +        /* Okay, we zapped the entries. Now try the GDT checks again. */
 33.1710 +        if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
 33.1711 +            goto fail;
 33.1712 +    }
 33.1713 +
 33.1714 +    /* Check the remaining pages in the new GDT. */
 33.1715 +    for ( i = 1; i < nr_pages; i++ )
 33.1716 +        if ( ((pfn = frames[i]) >= max_page) ||
 33.1717 +             !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
 33.1718 +            goto fail;
 33.1719 +
 33.1720 +    /* Copy reserved GDT entries to the new GDT. */
 33.1721 +    vgdt = map_domain_mem(frames[0] << PAGE_SHIFT);
 33.1722 +    memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY, 
 33.1723 +           gdt_table + FIRST_RESERVED_GDT_ENTRY, 
 33.1724 +           NR_RESERVED_GDT_ENTRIES*8);
 33.1725 +    unmap_domain_mem(vgdt);
 33.1726 +
 33.1727 +    /* Tear down the old GDT. */
 33.1728 +    destroy_gdt(ed);
 33.1729 +
 33.1730 +    /* Install the new GDT. */
 33.1731 +    for ( i = 0; i < nr_pages; i++ )
 33.1732 +        ed->arch.perdomain_ptes[i] =
 33.1733 +            mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR);
 33.1734 +
 33.1735 +    SET_GDT_ADDRESS(ed, GDT_VIRT_START(ed));
 33.1736 +    SET_GDT_ENTRIES(ed, entries);
 33.1737 +
 33.1738 +    return 0;
 33.1739 +
 33.1740 + fail:
 33.1741 +    while ( i-- > 0 )
 33.1742 +        put_page_and_type(&frame_table[frames[i]]);
 33.1743 +    return -EINVAL;
 33.1744 +}
 33.1745 +
 33.1746 +
 33.1747 +long do_set_gdt(unsigned long *frame_list, unsigned int entries)
 33.1748 +{
 33.1749 +    int nr_pages = (entries + 511) / 512;
 33.1750 +    unsigned long frames[16];
 33.1751 +    long ret;
 33.1752 +
 33.1753 +    if ( (entries <= LAST_RESERVED_GDT_ENTRY) || (entries > 8192) ) 
 33.1754 +        return -EINVAL;
 33.1755 +    
 33.1756 +    if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
 33.1757 +        return -EFAULT;
 33.1758 +
 33.1759 +    LOCK_BIGLOCK(current->domain);
 33.1760 +
 33.1761 +    if ( (ret = set_gdt(current, frames, entries)) == 0 )
 33.1762 +    {
 33.1763 +        local_flush_tlb();
 33.1764 +        __asm__ __volatile__ ("lgdt %0" : "=m" (*current->arch.gdt));
 33.1765 +    }
 33.1766 +
 33.1767 +    UNLOCK_BIGLOCK(current->domain);
 33.1768 +
 33.1769 +    return ret;
 33.1770 +}
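A quick check of the sizing used above: descriptors are 8 bytes, so a 4KB page holds 512 of them, and do_set_gdt() rejects more than 8192 entries, so the frame list never needs more than the 16 slots it declares:

    /* GDT sizing arithmetic, restating the constants used in do_set_gdt(). */
    #define GDT_DESCRIPTORS_PER_PAGE 512   /* 4096-byte page / 8-byte descriptor */
    #define GDT_MAX_ENTRIES          8192  /* upper bound enforced above */
    /* nr_pages = (8192 + 511) / 512 = 16, matching 'unsigned long frames[16]'. */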
 33.1771 +
 33.1772 +
 33.1773 +long do_update_descriptor(
 33.1774 +    unsigned long pa, unsigned long word1, unsigned long word2)
 33.1775 +{
 33.1776 +    unsigned long pfn = pa >> PAGE_SHIFT;
 33.1777 +    struct desc_struct *gdt_pent, d;
 33.1778 +    struct pfn_info *page;
 33.1779 +    struct exec_domain *ed;
 33.1780 +    long ret = -EINVAL;
 33.1781 +
 33.1782 +    d.a = (u32)word1;
 33.1783 +    d.b = (u32)word2;
 33.1784 +
 33.1785 +    LOCK_BIGLOCK(current->domain);
 33.1786 +
 33.1787 +    if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(&d) ) {
 33.1788 +        UNLOCK_BIGLOCK(current->domain);
 33.1789 +        return -EINVAL;
 33.1790 +    }
 33.1791 +
 33.1792 +    page = &frame_table[pfn];
 33.1793 +    if ( unlikely(!get_page(page, current->domain)) ) {
 33.1794 +        UNLOCK_BIGLOCK(current->domain);
 33.1795 +        return -EINVAL;
 33.1796 +    }
 33.1797 +
 33.1798 +    /* Check if the given frame is in use in an unsafe context. */
 33.1799 +    switch ( page->u.inuse.type_info & PGT_type_mask )
 33.1800 +    {
 33.1801 +    case PGT_gdt_page:
 33.1802 +        /* Disallow updates of Xen-reserved descriptors in the current GDT. */
 33.1803 +        for_each_exec_domain(current->domain, ed) {
 33.1804 +            if ( (l1_pgentry_to_pfn(ed->arch.perdomain_ptes[0]) == pfn) &&
 33.1805 +                 (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) &&
 33.1806 +                 (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) )
 33.1807 +                goto out;
 33.1808 +        }
 33.1809 +        if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
 33.1810 +            goto out;
 33.1811 +        break;
 33.1812 +    case PGT_ldt_page:
 33.1813 +        if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
 33.1814 +            goto out;
 33.1815 +        break;
 33.1816 +    default:
 33.1817 +        if ( unlikely(!get_page_type(page, PGT_writable_page)) )
 33.1818 +            goto out;
 33.1819 +        break;
 33.1820 +    }
 33.1821 +
 33.1822 +    /* All is good so make the update. */
 33.1823 +    gdt_pent = map_domain_mem(pa);
 33.1824 +    memcpy(gdt_pent, &d, 8);
 33.1825 +    unmap_domain_mem(gdt_pent);
 33.1826 +
 33.1827 +    put_page_type(page);
 33.1828 +
 33.1829 +    ret = 0; /* success */
 33.1830 +
 33.1831 + out:
 33.1832 +    put_page(page);
 33.1833 +
 33.1834 +    UNLOCK_BIGLOCK(current->domain);
 33.1835 +
 33.1836 +    return ret;
 33.1837 +}
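do_update_descriptor() receives the descriptor as two 32-bit halves. A sketch of how a caller splits a standard x86 segment descriptor into that pair; the struct and helper names are illustrative only:

    /* Low half: limit 15:0 and base 15:0.  High half: base 23:16, type/DPL/P,
     * limit 19:16, flags, base 31:24.  These map onto d.a and d.b above. */
    struct seg_desc { unsigned int a, b; };

    static void descriptor_to_words(const struct seg_desc *desc,
                                    unsigned long *word1, unsigned long *word2)
    {
        *word1 = desc->a;
        *word2 = desc->b;
    }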
 33.1838 +
 33.1839 +
 33.1840 +
 33.1841 +/*************************
 33.1842 + * Writable Pagetables
 33.1843 + */
 33.1844 +
 33.1845 +ptwr_info_t ptwr_info[NR_CPUS];
 33.1846 +
 33.1847 +#ifdef VERBOSE
 33.1848 +int ptwr_debug = 0x0;
 33.1849 +#define PTWR_PRINTK(_f, _a...) \
 33.1850 + do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
 33.1851 +#define PTWR_PRINT_WHICH (which ? 'I' : 'A')
 33.1852 +#else
 33.1853 +#define PTWR_PRINTK(_f, _a...) ((void)0)
 33.1854 +#endif
 33.1855 +
 33.1856 +/* Flush the given writable p.t. page and write-protect it again. */
 33.1857 +void ptwr_flush(const int which)
 33.1858 +{
 33.1859 +    unsigned long  sstat, spte, pte, *ptep, l1va;
 33.1860 +    l1_pgentry_t  *sl1e = NULL, *pl1e, ol1e, nl1e;
 33.1861 +    l2_pgentry_t  *pl2e;
 33.1862 +    int            i, cpu = smp_processor_id();
 33.1863 +    struct exec_domain *ed = current;
 33.1864 +    struct domain *d = ed->domain;
 33.1865 +
 33.1866 +    l1va = ptwr_info[cpu].ptinfo[which].l1va;
 33.1867 +    ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
 33.1868 +
 33.1869 +    /*
 33.1870 +     * STEP 1. Write-protect the p.t. page so no more updates can occur.
 33.1871 +     */
 33.1872 +
 33.1873 +    if ( unlikely(__get_user(pte, ptep)) )
 33.1874 +    {
 33.1875 +        MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
 33.1876 +        /*
 33.1877 +         * Really a bug. We could read this PTE during the initial fault,
 33.1878 +         * and pagetables can't have changed meantime. XXX Multi-CPU guests?
 33.1879 +         */
 33.1880 +        BUG();
 33.1881 +    }
 33.1882 +    PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
 33.1883 +                PTWR_PRINT_WHICH, ptep, pte);
 33.1884 +    pte &= ~_PAGE_RW;
 33.1885 +
 33.1886 +    if ( unlikely(shadow_mode(d)) )
 33.1887 +    {
 33.1888 +        /* Write-protect the p.t. page in the shadow page table. */
 33.1889 +        l1pte_propagate_from_guest(d, &pte, &spte);
 33.1890 +        __put_user(
 33.1891 +            spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
 33.1892 +
 33.1893 +        /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
 33.1894 +        sstat = get_shadow_status(d, pte >> PAGE_SHIFT);
 33.1895 +        if ( sstat & PSH_shadowed )
 33.1896 +            sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
 33.1897 +    }
 33.1898 +
 33.1899 +    /* Write-protect the p.t. page in the guest page table. */
 33.1900 +    if ( unlikely(__put_user(pte, ptep)) )
 33.1901 +    {
 33.1902 +        MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
 33.1903 +        /*
 33.1904 +         * Really a bug. We could write this PTE during the initial fault,
 33.1905 +         * and pagetables can't have changed meantime. XXX Multi-CPU guests?
 33.1906 +         */
 33.1907 +        BUG();
 33.1908 +    }
 33.1909 +
 33.1910 +    /* Ensure that there are no stale writable mappings in any TLB. */
 33.1911 +    /* NB. INVLPG is a serialising instruction: flushes pending updates. */
 33.1912 +#if 1
 33.1913 +    __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
 33.1914 +#else
 33.1915 +    flush_tlb_all();
 33.1916 +#endif
 33.1917 +    PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
 33.1918 +                PTWR_PRINT_WHICH, ptep, pte);
 33.1919 +
 33.1920 +    /*
 33.1921 +     * STEP 2. Validate any modified PTEs.
 33.1922 +     */
 33.1923 +
 33.1924 +    pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
 33.1925 +    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
 33.1926 +    {
 33.1927 +        ol1e = ptwr_info[cpu].ptinfo[which].page[i];
 33.1928 +        nl1e = pl1e[i];
 33.1929 +
 33.1930 +        if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
 33.1931 +            continue;
 33.1932 +
 33.1933 +        /*
 33.1934 +         * Fast path for PTEs that have merely been write-protected
 33.1935 +         * (e.g., during a Unix fork()). A strict reduction in privilege.
 33.1936 +         */
 33.1937 +        if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
 33.1938 +        {
 33.1939 +            if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
 33.1940 +            {
 33.1941 +                if ( unlikely(sl1e != NULL) )
 33.1942 +                    l1pte_propagate_from_guest(
 33.1943 +                        d, &l1_pgentry_val(nl1e), 
 33.1944 +                        &l1_pgentry_val(sl1e[i]));
 33.1945 +                put_page_type(&frame_table[l1_pgentry_to_pfn(nl1e)]);
 33.1946 +            }
 33.1947 +            continue;
 33.1948 +        }
 33.1949 +
 33.1950 +        if ( unlikely(!get_page_from_l1e(nl1e, d)) )
 33.1951 +        {
 33.1952 +            MEM_LOG("ptwr: Could not re-validate l1 page\n");
 33.1953 +            /*
 33.1954 +             * Make the remaining p.t's consistent before crashing, so the
 33.1955 +             * reference counts are correct.
 33.1956 +             */
 33.1957 +            memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
 33.1958 +                   (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
 33.1959 +            unmap_domain_mem(pl1e);
 33.1960 +            ptwr_info[cpu].ptinfo[which].l1va = 0;
 33.1961 +            UNLOCK_BIGLOCK(d);
 33.1962 +            domain_crash();
 33.1963 +        }
 33.1964 +        
 33.1965 +        if ( unlikely(sl1e != NULL) )
 33.1966 +            l1pte_propagate_from_guest(
 33.1967 +                d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
 33.1968 +
 33.1969 +        if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
 33.1970 +            put_page_from_l1e(ol1e, d);
 33.1971 +    }
 33.1972 +    unmap_domain_mem(pl1e);
 33.1973 +
 33.1974 +    /*
 33.1975 +     * STEP 3. Reattach the L1 p.t. page into the current address space.
 33.1976 +     */
 33.1977 +
 33.1978 +    if ( (which == PTWR_PT_ACTIVE) && likely(!shadow_mode(d)) )
 33.1979 +    {
 33.1980 +        pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
 33.1981 +        *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT); 
 33.1982 +    }
 33.1983 +
 33.1984 +    /*
 33.1985 +     * STEP 4. Final tidy-up.
 33.1986 +     */
 33.1987 +
 33.1988 +    ptwr_info[cpu].ptinfo[which].l1va = 0;
 33.1989 +
 33.1990 +    if ( unlikely(sl1e != NULL) )
 33.1991 +    {
 33.1992 +        unmap_domain_mem(sl1e);
 33.1993 +        put_shadow_status(d);
 33.1994 +    }
 33.1995 +}
 33.1996 +
 33.1997 +/* Write page fault handler: check if guest is trying to modify a PTE. */
 33.1998 +int ptwr_do_page_fault(unsigned long addr)
 33.1999 +{
 33.2000 +    unsigned long    pte, pfn, l2e;
 33.2001 +    struct pfn_info *page;
 33.2002 +    l2_pgentry_t    *pl2e;
 33.2003 +    int              which, cpu = smp_processor_id();
 33.2004 +    u32              l2_idx;
 33.2005 +
 33.2006 +#ifdef __x86_64__
 33.2007 +    return 0; /* Writable pagetables need fixing for x86_64. */
 33.2008 +#endif
 33.2009 +
 33.2010 +    /*
 33.2011 +     * Attempt to read the PTE that maps the VA being accessed. By checking for
 33.2012 +     * PDE validity in the L2 we avoid many expensive fixups in __get_user().
 33.2013 +     */
 33.2014 +    if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
 33.2015 +           _PAGE_PRESENT) ||
 33.2016 +         __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
 33.2017 +    {
 33.2018 +        return 0;
 33.2019 +    }
 33.2020 +
 33.2021 +    pfn  = pte >> PAGE_SHIFT;
 33.2022 +    page = &frame_table[pfn];
 33.2023 +
 33.2024 +    /* We are looking only for read-only mappings of p.t. pages. */
 33.2025 +    if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
 33.2026 +         ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
 33.2027 +    {
 33.2028 +        return 0;
 33.2029 +    }
 33.2030 +    
 33.2031 +    /* Get the L2 index at which this L1 p.t. is always mapped. */
 33.2032 +    l2_idx = page->u.inuse.type_info & PGT_va_mask;
 33.2033 +    if ( unlikely(l2_idx >= PGT_va_unknown) )
 33.2034 +    {
 33.2035 +        domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
 33.2036 +    }
 33.2037 +    l2_idx >>= PGT_va_shift;
 33.2038 +
 33.2039 +    if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
 33.2040 +    {
 33.2041 +        MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr);
 33.2042 +        domain_crash();
 33.2043 +    }
 33.2044 +
 33.2045 +    /*
 33.2046 +     * Is the L1 p.t. mapped into the current address space? If so we call it
 33.2047 +     * an ACTIVE p.t., otherwise it is INACTIVE.
 33.2048 +     */
 33.2049 +    pl2e = &linear_l2_table[l2_idx];
 33.2050 +    l2e  = l2_pgentry_val(*pl2e);
 33.2051 +    which = PTWR_PT_INACTIVE;
 33.2052 +    if ( (l2e >> PAGE_SHIFT) == pfn )
 33.2053 +    {
 33.2054 +        /* Check the PRESENT bit to set ACTIVE. */
 33.2055 +        if ( likely(l2e & _PAGE_PRESENT) )
 33.2056 +            which = PTWR_PT_ACTIVE;
 33.2057 +        else {
 33.2058 +            /*
 33.2059 +             * If the PRESENT bit is clear, we may be conflicting with
 33.2060 +             * the current ACTIVE p.t. (it may be the same p.t. mapped
 33.2061 +             * at another virt addr).
 33.2062 +             * The ptwr_flush call below will restore the PRESENT bit.
 33.2063 +             */
 33.2064 +            if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
 33.2065 +                 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
 33.2066 +                which = PTWR_PT_ACTIVE;
 33.2067 +        }
 33.2068 +    }
 33.2069 +    
 33.2070 +    PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
 33.2071 +                "pfn %08lx\n", PTWR_PRINT_WHICH,
 33.2072 +                addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
 33.2073 +    
 33.2074 +    /*
 33.2075 +     * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
 33.2076 +     * time. If there is already one, we must flush it out.
 33.2077 +     */
 33.2078 +    if ( ptwr_info[cpu].ptinfo[which].l1va )
 33.2079 +        ptwr_flush(which);
 33.2080 +
 33.2081 +    ptwr_info[cpu].ptinfo[which].l1va   = addr | 1;
 33.2082 +    ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
 33.2083 +    
 33.2084 +    /* For safety, disconnect the L1 p.t. page from current space. */
 33.2085 +    if ( (which == PTWR_PT_ACTIVE) && 
 33.2086 +         likely(!shadow_mode(current->domain)) )
 33.2087 +    {
 33.2088 +        *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
 33.2089 +#if 1
 33.2090 +        flush_tlb(); /* XXX Multi-CPU guests? */
 33.2091 +#else
 33.2092 +        flush_tlb_all();
 33.2093 +#endif
 33.2094 +    }
 33.2095 +    
 33.2096 +    /* Temporarily map the L1 page, and make a copy of it. */
 33.2097 +    ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
 33.2098 +    memcpy(ptwr_info[cpu].ptinfo[which].page,
 33.2099 +           ptwr_info[cpu].ptinfo[which].pl1e,
 33.2100 +           ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
 33.2101 +    
 33.2102 +    /* Finally, make the p.t. page writable by the guest OS. */
 33.2103 +    pte |= _PAGE_RW;
 33.2104 +    PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
 33.2105 +                &linear_pg_table[addr>>PAGE_SHIFT], pte);
 33.2106 +    if ( unlikely(__put_user(pte, (unsigned long *)
 33.2107 +                             &linear_pg_table[addr>>PAGE_SHIFT])) )
 33.2108 +    {
 33.2109 +        MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
 33.2110 +                &linear_pg_table[addr>>PAGE_SHIFT]);
 33.2111 +        /* Toss the writable pagetable state and crash. */
 33.2112 +        unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
 33.2113 +        ptwr_info[cpu].ptinfo[which].l1va = 0;
 33.2114 +        domain_crash();
 33.2115 +    }
 33.2116 +    
 33.2117 +    return EXCRET_fault_fixed;
 33.2118 +}
 33.2119 +
 33.2120 +static __init int ptwr_init(void)
 33.2121 +{
 33.2122 +    int i;
 33.2123 +
 33.2124 +    for ( i = 0; i < smp_num_cpus; i++ )
 33.2125 +    {
 33.2126 +        ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
 33.2127 +            (void *)alloc_xenheap_page();
 33.2128 +        ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
 33.2129 +            (void *)alloc_xenheap_page();
 33.2130 +    }
 33.2131 +
 33.2132 +    return 0;
 33.2133 +}
 33.2134 +__initcall(ptwr_init);
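The writable-pagetable code above lets a paravirtualised guest update its own L1 tables with ordinary stores: the first write faults into ptwr_do_page_fault(), which snapshots the page and grants temporary write access, and ptwr_flush() later revalidates every entry that changed. A guest-side sketch of what this enables (names are hypothetical):

    /* Illustrative only: the guest writes the entry directly; Xen traps the
     * first write, makes the page writable, and audits the changes on flush. */
    static void guest_set_l1e(unsigned long *guest_l1, int idx,
                              unsigned long mfn, unsigned long prot)
    {
        guest_l1[idx] = (mfn << PAGE_SHIFT) | prot;   /* plain store, no hypercall */
    }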
 33.2135 +
 33.2136 +
 33.2137 +
 33.2138 +
 33.2139 +/************************************************************************/
 33.2140 +/************************************************************************/
 33.2141 +/************************************************************************/
 33.2142 +
 33.2143 +#ifndef NDEBUG
 33.2144 +
 33.2145 +void ptwr_status(void)
 33.2146 +{
 33.2147 +    unsigned long pte, *ptep, pfn;
 33.2148 +    struct pfn_info *page;
 33.2149 +    int cpu = smp_processor_id();
 33.2150 +
 33.2151 +    ptep = (unsigned long *)&linear_pg_table
 33.2152 +        [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
 33.2153 +
 33.2154 +    if ( __get_user(pte, ptep) ) {
 33.2155 +        MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
 33.2156 +        domain_crash();
 33.2157 +    }
 33.2158 +
 33.2159 +    pfn = pte >> PAGE_SHIFT;
 33.2160 +    page = &frame_table[pfn];
 33.2161 +    printk("need to alloc l1 page %p\n", page);
 33.2162 +    /* make pt page writable */
 33.2163 +    printk("need to make read-only l1-page at %p is %08lx\n",
 33.2164 +           ptep, pte);
 33.2165 +
 33.2166 +    if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
 33.2167 +        return;
 33.2168 +
 33.2169 +    if ( __get_user(pte, (unsigned long *)
 33.2170 +                    ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
 33.2171 +        MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
 33.2172 +                ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
 33.2173 +        domain_crash();
 33.2174 +    }
 33.2175 +    pfn = pte >> PAGE_SHIFT;
 33.2176 +    page = &frame_table[pfn];
 33.2177 +}
 33.2178 +
 33.2179 +void audit_domain(struct domain *d)
 33.2180 +{
 33.2181 +    int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
 33.2182 +
 33.2183 +    void adjust (struct pfn_info *page, int dir, int adjtype)
 33.2184 +    {
 33.2185 +        int count = page->count_info & PGC_count_mask;
 33.2186 +
 33.2187 +        if ( adjtype )
 33.2188 +        {
 33.2189 +            int tcount = page->u.inuse.type_info & PGT_count_mask;
 33.2190 +            
 33.2191 +            ttot++;
 33.2192 +
 33.2193 +            tcount += dir;
 33.2194 +
 33.2195 +            if ( tcount < 0 )
 33.2196 +            {
 33.2197 +                /* This will only come out once. */
 33.2198 +                printk("Audit %d: type count went below zero pfn=%x "
 33.2199 +                       "taf=%x otaf=%x\n",
 33.2200 +                       d->id, page-frame_table,
 33.2201 +                       page->u.inuse.type_info,
 33.2202 +                       page->tlbflush_timestamp);
 33.2203 +            }
 33.2204 +            
 33.2205 +            page->u.inuse.type_info =
 33.2206 +                (page->u.inuse.type_info & ~PGT_count_mask) | 
 33.2207 +                (tcount & PGT_count_mask);
 33.2208 +        }
 33.2209 +
 33.2210 +        ctot++;
 33.2211 +        count += dir;
 33.2212 +        if ( count < 0 )
 33.2213 +        {
 33.2214 +            /* This will only come out once. */
 33.2215 +            printk("Audit %d: general count went below zero pfn=%x "
 33.2216 +                   "taf=%x otaf=%x\n",
 33.2217 +                   d->id, page-frame_table,
 33.2218 +                   page->u.inuse.type_info,
 33.2219 +                   page->tlbflush_timestamp);
 33.2220 +        }
 33.2221 +            
 33.2222 +        page->count_info =
 33.2223 +            (page->count_info & ~PGC_count_mask) | 
 33.2224 +            (count & PGC_count_mask);            
 33.2225 +
 33.2226 +    }
 33.2227 +
 33.2228 +    void scan_for_pfn(struct domain *d, unsigned long xpfn)
 33.2229 +    {
 33.2230 +        unsigned long pfn, *pt;
 33.2231 +        struct list_head *list_ent;
 33.2232 +        struct pfn_info *page;
 33.2233 +        int i;
 33.2234 +
 33.2235 +        list_ent = d->page_list.next;
 33.2236 +        for ( i = 0; (list_ent != &d->page_list); i++ )
 33.2237 +        {
 33.2238 +            pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
 33.2239 +            page = &frame_table[pfn];
 33.2240 +            
 33.2241 +            switch ( page->u.inuse.type_info & PGT_type_mask )
 33.2242 +            {
 33.2243 +            case PGT_l1_page_table:
 33.2244 +            case PGT_l2_page_table:
 33.2245 +                pt = map_domain_mem(pfn<<PAGE_SHIFT);
 33.2246 +                for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
 33.2247 +                    if ( (pt[i] & _PAGE_PRESENT) &&
 33.2248 +                         ((pt[i] >> PAGE_SHIFT) == xpfn) )
 33.2249 +                        printk("     found dom=%d i=%x pfn=%lx t=%x c=%x\n",
 33.2250 +                               d->id, i, pfn, page->u.inuse.type_info,
 33.2251 +                               page->count_info);
 33.2252 +                unmap_domain_mem(pt);           
 33.2253 +            }
 33.2254 +
 33.2255 +            list_ent = frame_table[pfn].list.next;
 33.2256 +        }
 33.2257 +
 33.2258 +    }
 33.2259 +
 33.2260 +    void scan_for_pfn_remote(unsigned long xpfn)
 33.2261 +    {
 33.2262 +        struct domain *e;
 33.2263 +        for_each_domain ( e )
 33.2264 +            scan_for_pfn( e, xpfn );            
 33.2265 +    }   
 33.2266 +
 33.2267 +    int i;
 33.2268 +    unsigned long pfn;
 33.2269 +    struct list_head *list_ent;
 33.2270 +    struct pfn_info *page;
 33.2271 +
 33.2272 +    if ( d != current->domain )
 33.2273 +        domain_pause(d);
 33.2274 +    synchronise_pagetables(~0UL);
 33.2275 +
 33.2276 +    printk("pt base=%lx sh_info=%x\n",
 33.2277 +           pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT,
 33.2278 +           virt_to_page(d->shared_info)-frame_table);
 33.2279 +           
 33.2280 +    spin_lock(&d->page_alloc_lock);
 33.2281 +
 33.2282 +    /* PHASE 0 */
 33.2283 +
 33.2284 +    list_ent = d->page_list.next;
 33.2285 +    for ( i = 0; (list_ent != &d->page_list); i++ )
 33.2286 +    {
 33.2287 +        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
 33.2288 +        page = &frame_table[pfn];
 33.2289 +
 33.2290 +        if ( page_get_owner(page) != d )
 33.2291 +            BUG();
 33.2292 +
 33.2293 +        if ( (page->u.inuse.type_info & PGT_count_mask) >
 33.2294 +             (page->count_info & PGC_count_mask) )
 33.2295 +            printk("taf > caf %x %x pfn=%lx\n",
 33.2296 +                   page->u.inuse.type_info, page->count_info, pfn );
 33.2297 + 
 33.2298 +#if 0   /* SYSV shared memory pages plus writeable files. */
 33.2299 +        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page && 
 33.2300 +             (page->u.inuse.type_info & PGT_count_mask) > 1 )
 33.2301 +        {
 33.2302 +            printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
 33.2303 +                  pfn,
 33.2304 +                  page->u.inuse.type_info,
 33.2305 +                  page->count_info );
 33.2306 +            scan_for_pfn_remote(pfn);
 33.2307 +        }
 33.2308 +#endif
 33.2309 +        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none && 
 33.2310 +             (page->u.inuse.type_info & PGT_count_mask) > 1 )
 33.2311 +        {
 33.2312 +            printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
 33.2313 +                  pfn,
 33.2314 +                  page->u.inuse.type_info,
 33.2315 +                  page->count_info );
 33.2316 +        }
 33.2317 +
 33.2318 +        /* Use tlbflush_timestamp to store original type_info. */
 33.2319 +        page->tlbflush_timestamp = page->u.inuse.type_info;
 33.2320 +
 33.2321 +        list_ent = frame_table[pfn].list.next;
 33.2322 +    }
 33.2323 +
 33.2324 +
 33.2325 +    /* PHASE 1 */
 33.2326 +
 33.2327 +    adjust(&frame_table[pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], -1, 1);
 33.2328 +
 33.2329 +    list_ent = d->page_list.next;
 33.2330 +    for ( i = 0; (list_ent != &d->page_list); i++ )
 33.2331 +    {
 33.2332 +        unsigned long *pt;
 33.2333 +        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
 33.2334 +        page = &frame_table[pfn];
 33.2335 +
 33.2336 +        if ( page_get_owner(page) != d )
 33.2337 +            BUG();
 33.2338 +
 33.2339 +        switch ( page->u.inuse.type_info & PGT_type_mask )
 33.2340 +        {
 33.2341 +        case PGT_l2_page_table:
 33.2342 +
 33.2343 +            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
 33.2344 +                printk("Audit %d: L2 not validated %x\n",
 33.2345 +                       d->id, page->u.inuse.type_info);
 33.2346 +
 33.2347 +            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
 33.2348 +                printk("Audit %d: L2 not pinned %x\n",
 33.2349 +                       d->id, page->u.inuse.type_info);
 33.2350 +            else
 33.2351 +                adjust( page, -1, 1 );
 33.2352 +           
 33.2353 +            pt = map_domain_mem( pfn<<PAGE_SHIFT );
 33.2354 +
 33.2355 +            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
 33.2356 +            {
 33.2357 +                if ( pt[i] & _PAGE_PRESENT )
 33.2358 +                {
 33.2359 +                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
 33.2360 +                    struct pfn_info *l1page = &frame_table[l1pfn];
 33.2361 +
 33.2362 +                    if ( page_get_owner(l1page) != d )
 33.2363 +                    {
 33.2364 +                        printk("L2: Skip bizarre page belonging to other "
 33.2365 +                               "dom %p\n", page_get_owner(l1page));
 33.2366 +                        continue;
 33.2367 +                    }
 33.2368 +                    
 33.2369 +                    if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
 33.2370 +                         PGT_l2_page_table )
 33.2371 +                        printk("Audit %d: [%x] Found %s Linear PT "
 33.2372 +                               "t=%x pfn=%lx\n", d->id, i, 
 33.2373 +                               (l1pfn==pfn) ? "Self" : "Other",
 33.2374 +                               l1page->u.inuse.type_info,
 33.2375 +                               l1pfn);
 33.2376 +                    else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
 33.2377 +                              PGT_l1_page_table )
 33.2378 +                        printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
 33.2379 +                               d->id, i,
 33.2380 +                               l1page->u.inuse.type_info,
 33.2381 +                               l1pfn);
 33.2382 +
 33.2383 +                    adjust(l1page, -1, 1);
 33.2384 +                }
 33.2385 +            }
 33.2386 +
 33.2387 +            unmap_domain_mem(pt);
 33.2388 +
 33.2389 +            break;
 33.2390 +
 33.2391 +
 33.2392 +        case PGT_l1_page_table:
 33.2393 +            
 33.2394 +            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
 33.2395 +                adjust( page, -1, 1 );
 33.2396 +
 33.2397 +            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
 33.2398 +                printk("Audit %d: L1 not validated %x\n",
 33.2399 +                       d->id, page->u.inuse.type_info);
 33.2400 +#if 0
 33.2401 +            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
 33.2402 +                printk("Audit %d: L1 not pinned %x\n",
 33.2403 +                       d->id, page->u.inuse.type_info);
 33.2404 +#endif
 33.2405 +            pt = map_domain_mem( pfn<<PAGE_SHIFT );
 33.2406 +
 33.2407 +            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
 33.2408 +            {
 33.2409 +                if ( pt[i] & _PAGE_PRESENT )
 33.2410 +                {
 33.2411 +                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
 33.2412 +                    struct pfn_info *l1page = &frame_table[l1pfn];
 33.2413 +
 33.2414 +                    if ( l1pfn < 0x100 )
 33.2415 +                    {
 33.2416 +                        lowmem_mappings++;
 33.2417 +                        continue;
 33.2418 +                    }
 33.2419 +
 33.2420 +                    if ( l1pfn > max_page )
 33.2421 +                    {
 33.2422 +                        io_mappings++;
 33.2423 +                        continue;
 33.2424 +                    }
 33.2425 +
 33.2426 +                    if ( pt[i] & _PAGE_RW )
 33.2427 +                    {
 33.2428 +
 33.2429 +                        if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
 33.2430 +                             PGT_l1_page_table ||
 33.2431 +                             (l1page->u.inuse.type_info & PGT_type_mask) ==
 33.2432 +                             PGT_l2_page_table )
 33.2433 +                            printk("Audit %d: [%x] Illegal RW t=%x pfn=%lx\n",
 33.2434 +                                   d->id, i,
 33.2435 +                                   l1page->u.inuse.type_info,
 33.2436 +                                   l1pfn);
 33.2437 +
 33.2438 +                    }
 33.2439 +
 33.2440 +                    if ( page_get_owner(l1page) != d )
 33.2441 +                    {
 33.2442 +                        printk("Audit %d: [%lx,%x] Skip foreign page dom=%p "
 33.2443 +                               "pfn=%lx c=%08x t=%08x m2p=%lx\n",
 33.2444 +                               d->id, pfn, i,
 33.2445 +                               page_get_owner(l1page),
 33.2446 +                               l1pfn,
 33.2447 +                               l1page->count_info,
 33.2448 +                               l1page->u.inuse.type_info,
 33.2449 +                               machine_to_phys_mapping[l1pfn]);    
 33.2450 +                        continue;
 33.2451 +                    }
 33.2452 +
 33.2453 +                    adjust(l1page, -1, 0);
 33.2454 +                }
 33.2455 +            }
 33.2456 +
 33.2457 +            unmap_domain_mem(pt);
 33.2458 +
 33.2459 +            break;
 33.2460 +        }       
 33.2461 +
 33.2462 +        list_ent = frame_table[pfn].list.next;
 33.2463 +    }
 33.2464 +
 33.2465 +    if ( (io_mappings > 0) || (lowmem_mappings > 0) )
 33.2466 +        printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
 33.2467 +               d->id, lowmem_mappings, io_mappings);
 33.2468 +
 33.2469 +    /* PHASE 2 */
 33.2470 +
 33.2471 +    ctot = ttot = 0;
 33.2472 +    list_ent = d->page_list.next;
 33.2473 +    for ( i = 0; (list_ent != &d->page_list); i++ )
 33.2474 +    {
 33.2475 +        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
 33.2476 +        page = &frame_table[pfn];
 33.2477 +
 33.2478 +        switch ( page->u.inuse.type_info & PGT_type_mask)
 33.2479 +        {
 33.2480 +        case PGT_l1_page_table:
 33.2481 +        case PGT_l2_page_table:
 33.2482 +            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
 33.2483 +            {
 33.2484 +                printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
 33.2485 +                       d->id, page->u.inuse.type_info, 
 33.2486 +                       page->tlbflush_timestamp,
 33.2487 +                       page->count_info, pfn );
 33.2488 +                scan_for_pfn_remote(pfn);
 33.2489 +            }
 33.2490 +        default:
 33.2491 +            if ( (page->count_info & PGC_count_mask) != 1 )
 33.2492 +            {
 33.2493 +                printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
 33.2494 +                       d->id, 
 33.2495 +                       page->count_info,
 33.2496 +                       page->u.inuse.type_info, 
 33.2497 +                       page->tlbflush_timestamp, pfn );
 33.2498 +                scan_for_pfn_remote(pfn);
 33.2499 +            }
 33.2500 +            break;
 33.2501 +        }
 33.2502 +
 33.2503 +        list_ent = frame_table[pfn].list.next;
 33.2504 +    }
 33.2505 +
 33.2506 +    /* PHASE 3 */
 33.2507 +    list_ent = d->page_list.next;
 33.2508 +    for ( i = 0; (list_ent != &d->page_list); i++ )
 33.2509 +    {
 33.2510 +        unsigned long *pt;
 33.2511 +        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
 33.2512 +        page = &frame_table[pfn];
 33.2513 +
 33.2514 +        switch ( page->u.inuse.type_info & PGT_type_mask )
 33.2515 +        {
 33.2516 +        case PGT_l2_page_table:
 33.2517 +            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
 33.2518 +                adjust( page, 1, 1 );          
 33.2519 +
 33.2520 +            pt = map_domain_mem( pfn<<PAGE_SHIFT );
 33.2521 +
 33.2522 +            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
 33.2523 +            {
 33.2524 +                if ( pt[i] & _PAGE_PRESENT )
 33.2525 +                {
 33.2526 +                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
 33.2527 +                    struct pfn_info *l1page;
 33.2528 +
 33.2529 +                    if (l1pfn>max_page)
 33.2530 +                        continue;
 33.2531 +
 33.2532 +                    l1page = &frame_table[l1pfn];
 33.2533 +
 33.2534 +                    if ( page_get_owner(l1page) == d )
 33.2535 +                        adjust(l1page, 1, 1);
 33.2536 +                }
 33.2537 +            }
 33.2538 +
 33.2539 +            unmap_domain_mem(pt);
 33.2540 +            break;
 33.2541 +
 33.2542 +        case PGT_l1_page_table:
 33.2543 +            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
 33.2544 +                adjust( page, 1, 1 );
 33.2545 +
 33.2546 +            pt = map_domain_mem( pfn<<PAGE_SHIFT );
 33.2547 +
 33.2548 +            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
 33.2549 +            {
 33.2550 +                if ( pt[i] & _PAGE_PRESENT )
 33.2551 +                {
 33.2552 +                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
 33.2553 +                    struct pfn_info *l1page;
 33.2554 +
 33.2555 +                    if (l1pfn>max_page)
 33.2556 +                        continue;
 33.2557 +
 33.2558 +                    l1page = &frame_table[l1pfn];
 33.2559 +
 33.2560 +                    if ( (page_get_owner(l1page) != d) ||
 33.2561 +                         (l1pfn < 0x100) || (l1pfn > max_page) )
 33.2562 +                        continue;
 33.2563 +
 33.2564 +                    adjust(l1page, 1, 0);
 33.2565 +                }
 33.2566 +            }
 33.2567 +
 33.2568 +            unmap_domain_mem(pt);
 33.2569 +            break;
 33.2570 +        }
 33.2571 +
 33.2572 +
 33.2573 +        page->tlbflush_timestamp = 0;
 33.2574 +
 33.2575 +        list_ent = frame_table[pfn].list.next;
 33.2576 +    }
 33.2577 +
 33.2578 +    spin_unlock(&d->page_alloc_lock);
 33.2579 +
 33.2580 +    adjust(&frame_table[pagetable_val(
 33.2581 +        d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], 1, 1);
 33.2582 +
 33.2583 +    printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot );
 33.2584 +
 33.2585 +    if ( d != current->domain )
 33.2586 +        domain_unpause(d);
 33.2587 +}
 33.2588 +
 33.2589 +void audit_domains(void)
 33.2590 +{
 33.2591 +    struct domain *d;
 33.2592 +    for_each_domain ( d )
 33.2593 +        audit_domain(d);
 33.2594 +}
 33.2595 +
 33.2596 +void audit_domains_key(unsigned char key)
 33.2597 +{
 33.2598 +    audit_domains();
 33.2599 +}
 33.2600 +
 33.2601 +#endif
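One relation audit_domain() checks repeatedly is that a page's masked type count never exceeds its masked general count (the "taf > caf" message above). Expressed as a small predicate over the same masks:

    /* Consistency check corresponding to the 'taf > caf' audit message. */
    static inline int page_counts_consistent(unsigned int type_info,
                                             unsigned int count_info)
    {
        return (type_info & PGT_count_mask) <= (count_info & PGC_count_mask);
    }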
    34.1 --- a/xen/arch/x86/setup.c	Mon Feb 07 08:19:24 2005 +0000
    34.2 +++ b/xen/arch/x86/setup.c	Tue Feb 08 16:44:16 2005 +0000
    34.3 @@ -298,19 +298,21 @@ void __init identify_cpu(struct cpuinfo_
    34.4  unsigned long cpu_initialized;
    34.5  void __init cpu_init(void)
    34.6  {
    34.7 -    extern void percpu_traps_init(void);
    34.8      int nr = smp_processor_id();
    34.9      struct tss_struct *t = &init_tss[nr];
   34.10 +    unsigned char idt_load[10];
   34.11  
   34.12      if ( test_and_set_bit(nr, &cpu_initialized) )
   34.13          panic("CPU#%d already initialized!!!\n", nr);
   34.14      printk("Initializing CPU#%d\n", nr);
   34.15  
   34.16 -    /* Set up GDT and IDT. */
   34.17      SET_GDT_ENTRIES(current, DEFAULT_GDT_ENTRIES);
   34.18      SET_GDT_ADDRESS(current, DEFAULT_GDT_ADDRESS);
   34.19      __asm__ __volatile__ ( "lgdt %0" : "=m" (*current->arch.gdt) );
   34.20 -    __asm__ __volatile__ ( "lidt %0" : "=m" (idt_descr) );
   34.21 +
   34.22 +    *(unsigned short *)(&idt_load[0]) = (IDT_ENTRIES*sizeof(idt_entry_t))-1;
   34.23 +    *(unsigned long  *)(&idt_load[2]) = (unsigned long)idt_tables[nr];
   34.24 +    __asm__ __volatile__ ( "lidt %0" : "=m" (idt_load) );
   34.25  
   34.26      /* No nested task. */
   34.27      __asm__ __volatile__ ( "pushf ; andw $0xbfff,(%"__OP"sp) ; popf" );
   34.28 @@ -336,8 +338,6 @@ void __init cpu_init(void)
   34.29      CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7);
   34.30  #undef CD
   34.31  
   34.32 -    percpu_traps_init();
   34.33 -
   34.34      /* Install correct page table. */
   34.35      write_ptbase(current);
   34.36  
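The cpu_init() change above replaces the shared idt_descr with a per-CPU pseudo-descriptor assembled in a 10-byte buffer: a 16-bit limit at offset 0 followed by the linear base of that CPU's IDT at offset 2. The same layout written as a struct, for reference only:

    /* x86 lidt/lgdt pseudo-descriptor: 16-bit limit followed by the base
     * address (4 bytes on i386; the 10-byte buffer also fits an 8-byte base). */
    struct desc_ptr {
        unsigned short limit;   /* IDT_ENTRIES * sizeof(idt_entry_t) - 1 */
        unsigned long  base;    /* idt_tables[nr] for this CPU */
    } __attribute__((packed));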
    35.1 --- a/xen/arch/x86/shadow.c	Mon Feb 07 08:19:24 2005 +0000
    35.2 +++ b/xen/arch/x86/shadow.c	Tue Feb 08 16:44:16 2005 +0000
    35.3 @@ -73,11 +73,11 @@ static void free_shadow_state(struct dom
    35.4  
    35.5          /* Free the head page. */
    35.6          free_shadow_page(
    35.7 -            d, &frame_table[x->spfn_and_flags & PSH_pfn_mask]);
    35.8 +            d, &frame_table[x->smfn_and_flags & PSH_pfn_mask]);
    35.9  
   35.10          /* Reinitialise the head node. */
   35.11          x->pfn            = 0;
   35.12 -        x->spfn_and_flags = 0;
   35.13 +        x->smfn_and_flags = 0;
   35.14          n                 = x->next;
   35.15          x->next           = NULL;
   35.16  
   35.17 @@ -88,11 +88,11 @@ static void free_shadow_state(struct dom
   35.18          { 
   35.19              /* Free the shadow page. */
   35.20              free_shadow_page(
   35.21 -                d, &frame_table[x->spfn_and_flags & PSH_pfn_mask]);
   35.22 +                d, &frame_table[x->smfn_and_flags & PSH_pfn_mask]);
   35.23  
   35.24              /* Re-initialise the chain node. */
   35.25              x->pfn            = 0;
   35.26 -            x->spfn_and_flags = 0;
   35.27 +            x->smfn_and_flags = 0;
   35.28  
   35.29              /* Add to the free list. */
   35.30              n                 = x->next;
   35.31 @@ -113,14 +113,14 @@ static inline int clear_shadow_page(
   35.32  {
   35.33      unsigned long   *p;
   35.34      int              restart = 0;
   35.35 -    struct pfn_info *spage = &frame_table[x->spfn_and_flags & PSH_pfn_mask];
   35.36 +    struct pfn_info *spage = &frame_table[x->smfn_and_flags & PSH_pfn_mask];
   35.37  
   35.38      switch ( spage->u.inuse.type_info & PGT_type_mask )
   35.39      {
   35.40          /* We clear L2 pages by zeroing the guest entries. */
   35.41      case PGT_l2_page_table:
   35.42          p = map_domain_mem((spage - frame_table) << PAGE_SHIFT);
   35.43 -        if (d->arch.shadow_mode == SHM_full_32)
   35.44 +        if ( shadow_mode(d) == SHM_full_32 )
   35.45              memset(p, 0, ENTRIES_PER_L2_PAGETABLE * sizeof(*p));
   35.46          else 
   35.47              memset(p, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(*p));
   35.48 @@ -419,7 +419,7 @@ static inline struct pfn_info *alloc_sha
   35.49  
   35.50  void unshadow_table(unsigned long gpfn, unsigned int type)
   35.51  {
   35.52 -    unsigned long  spfn;
   35.53 +    unsigned long  smfn;
   35.54      struct domain *d = page_get_owner(&frame_table[gpfn]);
   35.55  
   35.56      SH_VLOG("unshadow_table type=%08x gpfn=%08lx", type, gpfn);
   35.57 @@ -431,15 +431,15 @@ void unshadow_table(unsigned long gpfn, 
   35.58       * guests there won't be a race here as this CPU was the one that 
   35.59       * cmpxchg'ed the page to invalid.
   35.60       */
   35.61 -    spfn = __shadow_status(d, gpfn) & PSH_pfn_mask;
   35.62 +    smfn = __shadow_status(d, gpfn) & PSH_pfn_mask;
   35.63      delete_shadow_status(d, gpfn);
   35.64 -    free_shadow_page(d, &frame_table[spfn]);
   35.65 +    free_shadow_page(d, &frame_table[smfn]);
   35.66  }
   35.67  
   35.68  #ifdef CONFIG_VMX
   35.69  void vmx_shadow_clear_state(struct domain *d)
   35.70  {
   35.71 -    SH_VVLOG("vmx_clear_shadow_state: \n");
   35.72 +    SH_VVLOG("vmx_clear_shadow_state:");
   35.73      clear_shadow_state(d);
   35.74  }
   35.75  #endif
   35.76 @@ -453,7 +453,7 @@ unsigned long shadow_l2_table(
   35.77      l2_pgentry_t    *spl2e = 0;
   35.78      unsigned long guest_gpfn;
   35.79  
   35.80 -    __get_machine_to_phys(d, guest_gpfn, gpfn);
   35.81 +    guest_gpfn = __mfn_to_gpfn(d, gpfn);
   35.82  
   35.83      SH_VVLOG("shadow_l2_table( %08lx )", gpfn);
   35.84  
   35.85 @@ -471,9 +471,13 @@ unsigned long shadow_l2_table(
   35.86   
   35.87  #ifdef __i386__
   35.88      /* Install hypervisor and 2x linear p.t. mappings. */
   35.89 -    if ( d->arch.shadow_mode == SHM_full_32 )
   35.90 +    if ( shadow_mode(d) == SHM_full_32 )
   35.91      {
   35.92 +#ifdef CONFIG_VMX
   35.93          vmx_update_shadow_state(d->exec_domain[0], gpfn, spfn);
   35.94 +#else
   35.95 +        panic("Shadow Full 32 not yet implemented without VMX\n");
   35.96 +#endif
   35.97      }
   35.98      else
   35.99      {
  35.100 @@ -499,7 +503,7 @@ unsigned long shadow_l2_table(
  35.101      }
  35.102  #endif
  35.103  
  35.104 -    if ( d->arch.shadow_mode != SHM_full_32 ) 
  35.105 +    if ( shadow_mode(d) != SHM_full_32 ) 
  35.106          unmap_domain_mem(spl2e);
  35.107  
  35.108      SH_VLOG("shadow_l2_table( %08lx -> %08lx)", gpfn, spfn);
  35.109 @@ -510,13 +514,13 @@ static void shadow_map_l1_into_current_l
  35.110  { 
  35.111      struct exec_domain *ed = current;
  35.112      struct domain *d = ed->domain;
  35.113 -    unsigned long    *gpl1e, *spl1e, gpl2e, spl2e, gl1pfn, sl1pfn=0, sl1ss;
  35.114 +    unsigned long    *gpl1e, *spl1e, gl2e, sl2e, gl1pfn, sl1pfn=0, sl1ss;
  35.115      struct pfn_info  *sl1pfn_info;
  35.116      int               i;
  35.117  
  35.118 -    __guest_get_pl2e(ed, va, &gpl2e);
  35.119 +    __guest_get_l2e(ed, va, &gl2e);
  35.120  
  35.121 -    gl1pfn = gpl2e >> PAGE_SHIFT;
  35.122 +    gl1pfn = gl2e >> PAGE_SHIFT;
  35.123  
  35.124      sl1ss = __shadow_status(d, gl1pfn);
  35.125      if ( !(sl1ss & PSH_shadowed) )
  35.126 @@ -534,10 +538,10 @@ static void shadow_map_l1_into_current_l
  35.127  
  35.128          set_shadow_status(d, gl1pfn, PSH_shadowed | sl1pfn);
  35.129  
  35.130 -        l2pde_general(d, &gpl2e, &spl2e, sl1pfn);
  35.131 +        l2pde_general(d, &gl2e, &sl2e, sl1pfn);
  35.132  
  35.133 -        __guest_set_pl2e(ed, va, gpl2e);
  35.134 -        __shadow_set_pl2e(ed, va, spl2e);
  35.135 +        __guest_set_l2e(ed, va, gl2e);
  35.136 +        __shadow_set_l2e(ed, va, sl2e);
  35.137  
  35.138          gpl1e = (unsigned long *) &(linear_pg_table[
  35.139              (va>>L1_PAGETABLE_SHIFT) & ~(ENTRIES_PER_L1_PAGETABLE-1)]);
  35.140 @@ -554,9 +558,9 @@ static void shadow_map_l1_into_current_l
  35.141          SH_VVLOG("4b: was shadowed, l2 missing ( %08lx )", sl1pfn);
  35.142  
  35.143          sl1pfn = sl1ss & PSH_pfn_mask;
  35.144 -        l2pde_general(d, &gpl2e, &spl2e, sl1pfn);
  35.145 -        __guest_set_pl2e(ed, va, gpl2e);
  35.146 -        __shadow_set_pl2e(ed, va, spl2e);
  35.147 +        l2pde_general(d, &gl2e, &sl2e, sl1pfn);
  35.148 +        __guest_set_l2e(ed, va, gl2e);
  35.149 +        __shadow_set_l2e(ed, va, sl2e);
  35.150      }              
  35.151  }
  35.152  
  35.153 @@ -576,7 +580,7 @@ void vmx_shadow_invlpg(struct domain *d,
  35.154          return;
  35.155      }
  35.156  
  35.157 -    host_pfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT];
  35.158 +    host_pfn = phys_to_machine_mapping(gpte >> PAGE_SHIFT);
  35.159      spte = (host_pfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
  35.160  
  35.161      if (__put_user(spte, (unsigned long *)
  35.162 @@ -588,7 +592,7 @@ void vmx_shadow_invlpg(struct domain *d,
  35.163  
  35.164  int shadow_fault(unsigned long va, long error_code)
  35.165  {
  35.166 -    unsigned long gpte, spte;
  35.167 +    unsigned long gpte, spte = 0;
  35.168      struct exec_domain *ed = current;
  35.169      struct domain *d = ed->domain;
  35.170  
  35.171 @@ -628,14 +632,14 @@ int shadow_fault(unsigned long va, long 
  35.172      if ( unlikely(__get_user(gpte, (unsigned long *)
  35.173                               &linear_pg_table[va >> PAGE_SHIFT])) )
  35.174      {
  35.175 -        SH_VVLOG("shadow_fault - EXIT: read gpte faulted" );
  35.176 +        SH_VVLOG("shadow_fault - EXIT: read gpte faulted2" );
  35.177          shadow_unlock(d);
  35.178          return 0;
  35.179      }
  35.180  
  35.181      if ( unlikely(!(gpte & _PAGE_PRESENT)) )
  35.182      {
  35.183 -        SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte );
  35.184 +        SH_VVLOG("shadow_fault - EXIT: gpte not present2 (%lx)",gpte );
  35.185          shadow_unlock(d);
  35.186          return 0;
  35.187      }
  35.188 @@ -691,20 +695,20 @@ int shadow_fault(unsigned long va, long 
  35.189  
  35.190  void shadow_l1_normal_pt_update(
  35.191      unsigned long pa, unsigned long gpte,
  35.192 -    unsigned long *prev_spfn_ptr,
  35.193 +    unsigned long *prev_smfn_ptr,
  35.194      l1_pgentry_t **prev_spl1e_ptr)
  35.195  {
  35.196 -    unsigned long spfn, spte, prev_spfn = *prev_spfn_ptr;    
  35.197 +    unsigned long smfn, spte, prev_smfn = *prev_smfn_ptr;    
  35.198      l1_pgentry_t *spl1e, *prev_spl1e = *prev_spl1e_ptr;
  35.199  
  35.200      /* N.B. To get here, we know the l1 page *must* be shadowed. */
  35.201      SH_VVLOG("shadow_l1_normal_pt_update pa=%08lx, gpte=%08lx, "
  35.202 -             "prev_spfn=%08lx, prev_spl1e=%p\n",
  35.203 -             pa, gpte, prev_spfn, prev_spl1e);
  35.204 +             "prev_smfn=%08lx, prev_spl1e=%p",
  35.205 +             pa, gpte, prev_smfn, prev_spl1e);
  35.206  
  35.207 -    spfn = __shadow_status(current->domain, pa >> PAGE_SHIFT) & PSH_pfn_mask;
  35.208 +    smfn = __shadow_status(current->domain, pa >> PAGE_SHIFT) & PSH_pfn_mask;
  35.209  
  35.210 -    if ( spfn == prev_spfn )
  35.211 +    if ( smfn == prev_smfn )
  35.212      {
  35.213          spl1e = prev_spl1e;
  35.214      }
  35.215 @@ -712,8 +716,8 @@ void shadow_l1_normal_pt_update(
  35.216      {
  35.217          if ( prev_spl1e != NULL )
  35.218              unmap_domain_mem( prev_spl1e );
  35.219 -        spl1e = (l1_pgentry_t *)map_domain_mem(spfn << PAGE_SHIFT);
  35.220 -        *prev_spfn_ptr  = spfn;
  35.221 +        spl1e = (l1_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT);
  35.222 +        *prev_smfn_ptr  = smfn;
  35.223          *prev_spl1e_ptr = spl1e;
  35.224      }
  35.225  
  35.226 @@ -721,24 +725,24 @@ void shadow_l1_normal_pt_update(
  35.227      spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = mk_l1_pgentry(spte);
  35.228  }
  35.229  
  35.230 -void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpte)
  35.231 +void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpde)
  35.232  {
  35.233 -    unsigned long spfn, spte;
  35.234 +    unsigned long sl2mfn, spde;
  35.235      l2_pgentry_t *spl2e;
  35.236 -    unsigned long s_sh;
  35.237 +    unsigned long sl1mfn;
  35.238  
  35.239      /* N.B. To get here, we know the l2 page *must* be shadowed. */
  35.240 -    SH_VVLOG("shadow_l2_normal_pt_update pa=%08lx, gpte=%08lx",pa,gpte);
  35.241 +    SH_VVLOG("shadow_l2_normal_pt_update pa=%08lx, gpde=%08lx",pa,gpde);
  35.242  
  35.243 -    spfn = __shadow_status(current->domain, pa >> PAGE_SHIFT) & PSH_pfn_mask;
  35.244 +    sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT) & PSH_pfn_mask;
  35.245  
  35.246 -    s_sh = (gpte & _PAGE_PRESENT) ?
  35.247 -        __shadow_status(current->domain, gpte >> PAGE_SHIFT) : 0;
  35.248 +    sl1mfn = (gpde & _PAGE_PRESENT) ?
  35.249 +        __shadow_status(current->domain, gpde >> PAGE_SHIFT) : 0;
  35.250  
  35.251      /* XXXX Should mark guest pte as DIRTY and ACCESSED too! */
  35.252 -    l2pde_general(current->domain, &gpte, &spte, s_sh);
  35.253 -    spl2e = (l2_pgentry_t *)map_domain_mem(spfn << PAGE_SHIFT);
  35.254 -    spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)] = mk_l2_pgentry(spte);
  35.255 +    l2pde_general(current->domain, &gpde, &spde, sl1mfn);
  35.256 +    spl2e = (l2_pgentry_t *)map_domain_mem(sl2mfn << PAGE_SHIFT);
  35.257 +    spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)] = mk_l2_pgentry(spde);
  35.258      unmap_domain_mem(spl2e);
  35.259  }
  35.260  
  35.261 @@ -751,23 +755,36 @@ void shadow_l2_normal_pt_update(unsigned
  35.262  
  35.263  #if SHADOW_DEBUG
  35.264  
  35.265 +// BUG: these are not SMP safe...
  35.266  static int sh_l2_present;
  35.267  static int sh_l1_present;
  35.268 +static int errors;
  35.269  char * sh_check_name;
  35.270  
  35.271 -#define FAIL(_f, _a...)                                        \
  35.272 -    do {                                                       \
  35.273 -        printk("XXX %s-FAIL (%d,%d)" _f " g=%08lx s=%08lx\n",  \
  35.274 -               sh_check_name, level, i, ## _a , gpte, spte);   \
  35.275 -        BUG();                                                 \
  35.276 +#define virt_to_phys2(adr) ({                                            \
  35.277 +    unsigned long _a = (unsigned long)(adr);                             \
  35.278 +    unsigned long _pte = l1_pgentry_val(                                 \
  35.279 +                            shadow_linear_pg_table[_a >> PAGE_SHIFT]);   \
  35.280 +    unsigned long _pa = _pte & PAGE_MASK;                                \
  35.281 +    _pa | (_a & ~PAGE_MASK);                                             \
  35.282 +})
  35.283 +
  35.284 +#define FAIL(_f, _a...)                                                      \
  35.285 +    do {                                                                     \
  35.286 +        printk("XXX %s-FAIL (%d,%d)" _f " g=%08lx s=%08lx &g=%08lx &s=%08lx" \
  35.287 +               " pa(&g)=%08lx pa(&s)=%08lx\n",                               \
  35.288 +               sh_check_name, level, i, ## _a , gpte, spte, pgpte, pspte,    \
  35.289 +               virt_to_phys2(pgpte), virt_to_phys2(pspte));                  \
  35.290 +        errors++;                                                            \
  35.291      } while ( 0 )
  35.292  
  35.293  static int check_pte(
  35.294 -    struct domain *d, unsigned long gpte, unsigned long spte, 
  35.295 +    struct domain *d, unsigned long *pgpte, unsigned long *pspte, 
  35.296      int level, int i)
  35.297  {
  35.298 -    unsigned long mask, gpfn, spfn;
  35.299 -    unsigned long guest_gpfn;
   35.300 +    unsigned long gpte = *pgpte;
   35.301 +    unsigned long spte = *pspte;
  35.302 +    unsigned long mask, gpfn, smfn;
  35.303  
  35.304      if ( (spte == 0) || (spte == 0xdeadface) || (spte == 0x00000E00) )
  35.305          return 1;  /* always safe */
  35.306 @@ -781,7 +798,7 @@ static int check_pte(
  35.307      if ( !(gpte & _PAGE_PRESENT) )
  35.308          FAIL("Guest not present yet shadow is");
  35.309  
  35.310 -    mask = ~(_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|0xFFFFF000);
  35.311 +    mask = ~(_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|PAGE_MASK);
  35.312  
  35.313      if ( (spte & mask) != (gpte & mask) )
  35.314          FAIL("Corrupt?");
  35.315 @@ -798,10 +815,10 @@ static int check_pte(
  35.316      if ( (spte & _PAGE_RW ) && !((gpte & _PAGE_RW) && (gpte & _PAGE_DIRTY)) )
  35.317          FAIL("RW2 coherence");
  35.318   
  35.319 -    spfn = spte >> PAGE_SHIFT;
  35.320 +    smfn = spte >> PAGE_SHIFT;
  35.321      gpfn = gpte >> PAGE_SHIFT;
  35.322  
  35.323 -    if ( gpfn == spfn )
  35.324 +    if ( gpfn == smfn )
  35.325      {
  35.326          if ( level > 1 )
  35.327              FAIL("Linear map ???");    /* XXX this will fail on BSD */
  35.328 @@ -811,20 +828,9 @@ static int check_pte(
  35.329          if ( level < 2 )
  35.330              FAIL("Shadow in L1 entry?");
  35.331  
  35.332 -        if (d->arch.shadow_mode == SHM_full_32) {
  35.333 -
  35.334 -            guest_gpfn = phys_to_machine_mapping[gpfn];
  35.335 -
  35.336 -            if ( __shadow_status(d, guest_gpfn) != (PSH_shadowed | spfn) )
  35.337 -                FAIL("spfn problem g.sf=%08lx", 
  35.338 -                     __shadow_status(d, guest_gpfn) );
  35.339 -            
  35.340 -        } else {
  35.341 -            if ( __shadow_status(d, gpfn) != (PSH_shadowed | spfn) )
  35.342 -                FAIL("spfn problem g.sf=%08lx", 
  35.343 -                     __shadow_status(d, gpfn) );
  35.344 -        }
  35.345 -
  35.346 +        if ( __shadow_status(d, gpfn) != (PSH_shadowed | smfn) )
  35.347 +            FAIL("smfn problem g.sf=%08lx", 
  35.348 +                 __shadow_status(d, gpfn) );
  35.349      }
  35.350  
  35.351      return 1;
  35.352 @@ -832,17 +838,17 @@ static int check_pte(
  35.353  
  35.354  
  35.355  static int check_l1_table(
  35.356 -    struct domain *d, unsigned long va, 
  35.357 -    unsigned long g2, unsigned long s2)
  35.358 +    struct domain *d,
  35.359 +    unsigned long g2mfn, unsigned long s2mfn)
  35.360  {
  35.361      int i;
  35.362      unsigned long *gpl1e, *spl1e;
  35.363  
  35.364 -    gpl1e = map_domain_mem(g2 << PAGE_SHIFT);
  35.365 -    spl1e = map_domain_mem(s2 << PAGE_SHIFT);
  35.366 +    gpl1e = map_domain_mem(g2mfn << PAGE_SHIFT);
  35.367 +    spl1e = map_domain_mem(s2mfn << PAGE_SHIFT);
  35.368  
  35.369      for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
  35.370 -        check_pte(d, gpl1e[i], spl1e[i], 1, i);
  35.371 +        check_pte(d, &gpl1e[i], &spl1e[i], 1, i);
  35.372   
  35.373      unmap_domain_mem(spl1e);
  35.374      unmap_domain_mem(gpl1e);
  35.375 @@ -853,49 +859,46 @@ static int check_l1_table(
  35.376  #define FAILPT(_f, _a...)                                      \
  35.377      do {                                                       \
  35.378          printk("XXX FAIL %s-PT" _f "\n", s, ## _a );           \
  35.379 -        BUG();                                                 \
  35.380 +        errors++;                                              \
  35.381      } while ( 0 )
  35.382  
  35.383 -int check_pagetable(struct domain *d, pagetable_t pt, char *s)
  35.384 +void check_pagetable(struct domain *d, pagetable_t pt, char *s)
  35.385  {
  35.386      unsigned long gptbase = pagetable_val(pt);
  35.387 -    unsigned long gpfn, spfn;
  35.388 +    unsigned long ptbase_pfn, smfn, ss;
  35.389      unsigned long i;
  35.390      l2_pgentry_t *gpl2e, *spl2e;
  35.391 -    unsigned long host_gpfn = 0;
  35.392 +    unsigned long ptbase_mfn = 0;
  35.393 +    int cpu = current->processor;
  35.394  
  35.395 +    errors = 0;
  35.396      sh_check_name = s;
  35.397  
  35.398      SH_VVLOG("%s-PT Audit", s);
  35.399  
  35.400      sh_l2_present = sh_l1_present = 0;
  35.401  
  35.402 -    gpfn = gptbase >> PAGE_SHIFT;
  35.403 +    ptbase_pfn = gptbase >> PAGE_SHIFT;
  35.404 +    ptbase_mfn = __gpfn_to_mfn(d, ptbase_pfn);
  35.405  
  35.406 -    __get_phys_to_machine(d, host_gpfn, gpfn);
  35.407 +    ss = __shadow_status(d, ptbase_pfn);
  35.408    
  35.409 -    if ( ! (__shadow_status(d, gpfn) & PSH_shadowed) )
  35.410 +    if ( ! (ss & PSH_shadowed) )
  35.411      {
  35.412          printk("%s-PT %08lx not shadowed\n", s, gptbase);
  35.413  
  35.414 -        if( __shadow_status(d, gpfn) != 0 ) BUG();
  35.415 -            return 0;
  35.416 +        if ( ss != 0 )
  35.417 +            BUG();
  35.418 +        return;
  35.419      }   
  35.420   
  35.421 -    spfn = __shadow_status(d, gpfn) & PSH_pfn_mask;
  35.422 -
  35.423 -    if ( ! __shadow_status(d, gpfn) == (PSH_shadowed | spfn) )
  35.424 -            FAILPT("ptbase shadow inconsistent1");
  35.425 +    smfn = ss & PSH_pfn_mask;
  35.426  
  35.427 -    if (d->arch.shadow_mode == SHM_full_32) 
  35.428 -    {
  35.429 -        host_gpfn = phys_to_machine_mapping[gpfn];
  35.430 -        gpl2e = (l2_pgentry_t *) map_domain_mem( host_gpfn << PAGE_SHIFT );
  35.431 +    if ( ss != (PSH_shadowed | smfn) )
  35.432 +        FAILPT("ptbase shadow inconsistent1");
  35.433  
  35.434 -    } else
  35.435 -        gpl2e = (l2_pgentry_t *) map_domain_mem( gpfn << PAGE_SHIFT );
  35.436 -
  35.437 -    spl2e = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
  35.438 +    gpl2e = (l2_pgentry_t *) map_domain_mem( ptbase_mfn << PAGE_SHIFT );
  35.439 +    spl2e = (l2_pgentry_t *) map_domain_mem( smfn << PAGE_SHIFT );
  35.440  
  35.441      if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
  35.442                  &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
  35.443 @@ -916,40 +919,60 @@ int check_pagetable(struct domain *d, pa
  35.444  
  35.445      if ( (l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> 
  35.446                                L2_PAGETABLE_SHIFT]) != 
  35.447 -          ((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR)) )
  35.448 +          ((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR)) )
  35.449          FAILPT("hypervisor shadow linear map inconsistent %08lx %08lx",
  35.450                 l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >>
  35.451                                     L2_PAGETABLE_SHIFT]),
  35.452 -               (spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  35.453 +               (smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  35.454  
  35.455 -    if (d->arch.shadow_mode != SHM_full_32) {
  35.456 +    if ( shadow_mode(d) != SHM_full_32 ) {
  35.457 +        // BUG: this shouldn't be using exec_domain[0] here...
  35.458          if ( (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
  35.459 -              ((__pa(page_get_owner(&frame_table[gpfn])->arch.mm_perdomain_pt) | 
  35.460 +              ((__pa(page_get_owner(&frame_table[ptbase_pfn])->arch.mm_perdomain_pt) | 
  35.461              __PAGE_HYPERVISOR))) )
  35.462              FAILPT("hypervisor per-domain map inconsistent");
  35.463      }
  35.464  
  35.465      /* Check the whole L2. */
  35.466      for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  35.467 -        check_pte(d, l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]), 2, i);
  35.468 +        check_pte(d, &l2_pgentry_val(gpl2e[i]), &l2_pgentry_val(spl2e[i]), 2, i);
  35.469  
  35.470      /* Go back and recurse. */
  35.471      for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  35.472      {
  35.473 +        unsigned long gl1pfn = l2_pgentry_val(gpl2e[i]) >> PAGE_SHIFT;
  35.474 +        unsigned long gl1mfn = __gpfn_to_mfn(d, gl1pfn);
  35.475 +        unsigned long sl1mfn = l2_pgentry_val(spl2e[i]) >> PAGE_SHIFT;
  35.476 +
  35.477          if ( l2_pgentry_val(spl2e[i]) != 0 )
  35.478 -            check_l1_table(
  35.479 -                d, i << L2_PAGETABLE_SHIFT,
  35.480 -                l2_pgentry_val(gpl2e[i]) >> PAGE_SHIFT, 
  35.481 -                l2_pgentry_val(spl2e[i]) >> PAGE_SHIFT);
  35.482 +        {
  35.483 +            // First check to see if this guest page is currently the active
  35.484 +            // PTWR page.  If so, then we compare the (old) cached copy of the
  35.485 +            // guest page to the shadow, and not the currently writable (and
  35.486 +            // thus potentially out-of-sync) guest page.
  35.487 +            //
  35.488 +            if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
  35.489 +                 (i == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx) &&
  35.490 +                 likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
  35.491 +            {
  35.492 +                gl1mfn = (__pa(ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].page) >>
  35.493 +                          PAGE_SHIFT);
  35.494 +            }
  35.495 +
  35.496 +            check_l1_table(d, gl1mfn, sl1mfn);
  35.497 +        }
  35.498      }
  35.499  
  35.500      unmap_domain_mem(spl2e);
  35.501      unmap_domain_mem(gpl2e);
  35.502  
  35.503 -    SH_VVLOG("PT verified : l2_present = %d, l1_present = %d\n",
  35.504 +    SH_VVLOG("PT verified : l2_present = %d, l1_present = %d",
  35.505               sh_l2_present, sh_l1_present);
  35.506   
  35.507 -    return 1;
  35.508 +    if ( errors )
  35.509 +        BUG();
  35.510 +
  35.511 +    return;
  35.512  }
  35.513  
  35.514 -#endif
  35.515 +#endif // SHADOW_DEBUG
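
The audit code above compares each guest PTE with its shadow after masking out the bits the shadow code is allowed to differ on (dirty, accessed, writable, and the frame number). The following is a minimal stand-alone sketch of that comparison only; the flag values follow the x86 PTE layout, and the function name and example values are invented for illustration rather than taken from the Xen sources.

    #include <stdio.h>

    #define _PAGE_RW       0x002UL
    #define _PAGE_ACCESSED 0x020UL
    #define _PAGE_DIRTY    0x040UL
    #define PAGE_MASK      (~0xfffUL)

    /* Guest and shadow entries must agree on every flag except those the
     * shadow code legitimately rewrites, and the frame field is ignored. */
    static int pte_flags_consistent(unsigned long gpte, unsigned long spte)
    {
        unsigned long mask =
            ~(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | PAGE_MASK);
        return (gpte & mask) == (spte & mask);
    }

    int main(void)
    {
        unsigned long gpte = 0x00123067UL; /* present, rw, user, accessed, dirty */
        unsigned long spte = 0x00456025UL; /* different frame, rw cleared        */
        printf("flags consistent: %d\n", pte_flags_consistent(gpte, spte));
        return 0;
    }
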
    36.1 --- a/xen/arch/x86/smpboot.c	Mon Feb 07 08:19:24 2005 +0000
    36.2 +++ b/xen/arch/x86/smpboot.c	Tue Feb 08 16:44:16 2005 +0000
    36.3 @@ -388,33 +388,27 @@ static int cpucount;
    36.4  void __init start_secondary(void)
    36.5  {
    36.6      unsigned int cpu = cpucount;
    36.7 -    /* 6 bytes suitable for passing to LIDT instruction. */
    36.8 -    unsigned char idt_load[6];
    36.9  
   36.10 +    extern void percpu_traps_init(void);
   36.11      extern void cpu_init(void);
   36.12  
   36.13      set_current(idle_task[cpu]);
   36.14  
   36.15      /*
   36.16 -     * Dont put anything before smp_callin(), SMP
   36.17 -     * booting is too fragile that we want to limit the
   36.18 -     * things done here to the most necessary things.
   36.19 -     */
   36.20 -    cpu_init();
   36.21 -    smp_callin();
   36.22 -
   36.23 -    while (!atomic_read(&smp_commenced))
   36.24 -        rep_nop();
   36.25 -
   36.26 -    /*
   36.27       * At this point, boot CPU has fully initialised the IDT. It is
   36.28       * now safe to make ourselves a private copy.
   36.29       */
   36.30      idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES);
   36.31      memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES*sizeof(idt_entry_t));
   36.32 -    *(unsigned short *)(&idt_load[0]) = (IDT_ENTRIES*sizeof(idt_entry_t))-1;
   36.33 -    *(unsigned long  *)(&idt_load[2]) = (unsigned long)idt_tables[cpu];
   36.34 -    __asm__ __volatile__ ( "lidt %0" : "=m" (idt_load) );
   36.35 +
   36.36 +    percpu_traps_init();
   36.37 +
   36.38 +    cpu_init();
   36.39 +
   36.40 +    smp_callin();
   36.41 +
   36.42 +    while (!atomic_read(&smp_commenced))
   36.43 +        rep_nop();
   36.44  
   36.45      /*
   36.46       * low-memory mappings have been cleared, flush them from the local TLBs 
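
The deleted lines above built the 6-byte operand for the LIDT instruction by hand (a 16-bit limit followed by the 32-bit base of the per-CPU IDT copy); that work now happens inside percpu_traps_init(). As a hedged sketch, the same pseudo-descriptor can be expressed as a packed structure, assuming an i386-style 32-bit base; the names here are illustrative, not the Xen ones.

    #include <stdint.h>

    /* 16-bit limit followed by a 32-bit linear base: the 6 bytes LIDT expects. */
    struct idt_ptr {
        uint16_t limit;
        uint32_t base;
    } __attribute__((packed));

    static inline void load_idt_example(void *idt, unsigned int entries,
                                        unsigned int entry_size)
    {
        struct idt_ptr p = {
            .limit = (uint16_t)(entries * entry_size - 1),
            .base  = (uint32_t)(uintptr_t)idt,
        };
        __asm__ __volatile__ ( "lidt %0" : : "m" (p) );
    }
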
    37.1 --- a/xen/arch/x86/traps.c	Mon Feb 07 08:19:24 2005 +0000
    37.2 +++ b/xen/arch/x86/traps.c	Tue Feb 08 16:44:16 2005 +0000
    37.3 @@ -149,6 +149,11 @@ static inline int do_trap(int trapnr, ch
    37.4      if ( !GUEST_FAULT(regs) )
    37.5          goto xen_fault;
    37.6  
    37.7 +#ifndef NDEBUG
    37.8 +    if ( (ed->arch.traps[trapnr].address == 0) && (ed->domain->id == 0) )
    37.9 +        goto xen_fault;
   37.10 +#endif
   37.11 +
   37.12      ti = current->arch.traps + trapnr;
   37.13      tb->flags = TBF_EXCEPTION;
   37.14      tb->cs    = ti->cs;
   37.15 @@ -267,6 +272,12 @@ asmlinkage int do_page_fault(struct xen_
   37.16  
   37.17      perfc_incrc(page_faults);
   37.18  
   37.19 +#if 0
   37.20 +    printk("do_page_fault(addr=0x%08lx, error_code=%d)\n",
   37.21 +           addr, regs->error_code);
   37.22 +    show_registers(regs);
   37.23 +#endif
   37.24 +
   37.25      if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
   37.26      {
   37.27          LOCK_BIGLOCK(d);
   37.28 @@ -314,6 +325,11 @@ asmlinkage int do_page_fault(struct xen_
   37.29      if ( !GUEST_FAULT(regs) )
   37.30          goto xen_fault;
   37.31  
   37.32 +#ifndef NDEBUG
   37.33 +    if ( (ed->arch.traps[TRAP_page_fault].address == 0) && (d->id == 0) )
   37.34 +        goto xen_fault;
   37.35 +#endif
   37.36 +
   37.37      propagate_page_fault(addr, regs->error_code);
   37.38      return 0; 
   37.39  
   37.40 @@ -512,7 +528,7 @@ asmlinkage int do_general_protection(str
   37.41  
   37.42      /* Emulate some simple privileged instructions when exec'ed in ring 1. */
   37.43      if ( (regs->error_code == 0) &&
   37.44 -         RING_1(regs) &&
   37.45 +         GUESTOS_FAULT(regs) &&
   37.46           emulate_privileged_op(regs) )
   37.47          return 0;
   37.48  
   37.49 @@ -523,6 +539,12 @@ asmlinkage int do_general_protection(str
   37.50          return 0;
   37.51  #endif
   37.52  
   37.53 +#ifndef NDEBUG
   37.54 +    if ( (ed->arch.traps[TRAP_gp_fault].address == 0) &&
   37.55 +         (ed->domain->id == 0) )
   37.56 +        goto gp_in_kernel;
   37.57 +#endif
   37.58 +
   37.59      /* Pass on GPF as is. */
   37.60      ti = current->arch.traps + 13;
   37.61      tb->flags      = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
   37.62 @@ -553,19 +575,55 @@ asmlinkage int do_general_protection(str
   37.63      return 0;
   37.64  }
   37.65  
   37.66 +unsigned long nmi_softirq_reason;
   37.67 +static void nmi_softirq(void)
   37.68 +{
   37.69 +    if ( dom0 == NULL )
   37.70 +        return;
   37.71 +
   37.72 +    if ( test_and_clear_bit(0, &nmi_softirq_reason) )
   37.73 +        send_guest_virq(dom0->exec_domain[0], VIRQ_PARITY_ERR);
   37.74 +
   37.75 +    if ( test_and_clear_bit(1, &nmi_softirq_reason) )
   37.76 +        send_guest_virq(dom0->exec_domain[0], VIRQ_IO_ERR);
   37.77 +}
   37.78 +
   37.79  asmlinkage void mem_parity_error(struct xen_regs *regs)
   37.80  {
   37.81 -    console_force_unlock();
   37.82 -    printk("\n\nNMI - MEMORY ERROR\n");
   37.83 -    fatal_trap(TRAP_nmi, regs);
   37.84 +    /* Clear and disable the parity-error line. */
   37.85 +    outb((inb(0x61)&15)|4,0x61);
   37.86 +
   37.87 +    switch ( opt_nmi[0] )
   37.88 +    {
   37.89 +    case 'd': /* 'dom0' */
   37.90 +        set_bit(0, &nmi_softirq_reason);
   37.91 +        raise_softirq(NMI_SOFTIRQ);
   37.92 +    case 'i': /* 'ignore' */
   37.93 +        break;
   37.94 +    default:  /* 'fatal' */
   37.95 +        console_force_unlock();
   37.96 +        printk("\n\nNMI - MEMORY ERROR\n");
   37.97 +        fatal_trap(TRAP_nmi, regs);
   37.98 +    }
   37.99  }
  37.100  
  37.101  asmlinkage void io_check_error(struct xen_regs *regs)
  37.102  {
  37.103 -    console_force_unlock();
  37.104 +    /* Clear and disable the I/O-error line. */
  37.105 +    outb((inb(0x61)&15)|8,0x61);
  37.106  
  37.107 -    printk("\n\nNMI - I/O ERROR\n");
  37.108 -    fatal_trap(TRAP_nmi, regs);
  37.109 +    switch ( opt_nmi[0] )
  37.110 +    {
  37.111 +    case 'd': /* 'dom0' */
   37.112 +        set_bit(1, &nmi_softirq_reason);
  37.113 +        raise_softirq(NMI_SOFTIRQ);
  37.114 +    case 'i': /* 'ignore' */
  37.115 +        break;
  37.116 +    default:  /* 'fatal' */
  37.117 +        console_force_unlock();
  37.118 +        printk("\n\nNMI - I/O ERROR\n");
  37.119 +        fatal_trap(TRAP_nmi, regs);
  37.120 +    }
  37.121  }
  37.122  
  37.123  static void unknown_nmi_error(unsigned char reason)
  37.124 @@ -579,25 +637,15 @@ asmlinkage void do_nmi(struct xen_regs *
  37.125  {
  37.126      ++nmi_count(smp_processor_id());
  37.127  
  37.128 -#if CONFIG_X86_LOCAL_APIC
  37.129      if ( nmi_watchdog )
  37.130          nmi_watchdog_tick(regs);
  37.131 -    else
  37.132 -#endif
  37.133 -        unknown_nmi_error((unsigned char)(reason&0xff));
  37.134 -}
  37.135  
  37.136 -unsigned long nmi_softirq_reason;
  37.137 -static void nmi_softirq(void)
  37.138 -{
  37.139 -    if ( dom0 == NULL )
  37.140 -        return;
  37.141 -
  37.142 -    if ( test_and_clear_bit(0, &nmi_softirq_reason) )
  37.143 -        send_guest_virq(dom0->exec_domain[0], VIRQ_PARITY_ERR);
  37.144 -
  37.145 -    if ( test_and_clear_bit(1, &nmi_softirq_reason) )
  37.146 -        send_guest_virq(dom0->exec_domain[0], VIRQ_IO_ERR);
  37.147 +    if ( reason & 0x80 )
  37.148 +        mem_parity_error(regs);
  37.149 +    else if ( reason & 0x40 )
  37.150 +        io_check_error(regs);
  37.151 +    else if ( !nmi_watchdog )
  37.152 +        unknown_nmi_error((unsigned char)(reason&0xff));
  37.153  }
  37.154  
  37.155  asmlinkage int math_state_restore(struct xen_regs *regs)
  37.156 @@ -706,8 +754,8 @@ void set_tss_desc(unsigned int n, void *
  37.157  
  37.158  void __init trap_init(void)
  37.159  {
  37.160 -    extern void doublefault_init(void);
  37.161 -    doublefault_init();
  37.162 +    extern void percpu_traps_init(void);
  37.163 +    extern void cpu_init(void);
  37.164  
  37.165      /*
  37.166       * Note that interrupt gates are always used, rather than trap gates. We 
  37.167 @@ -745,13 +793,9 @@ void __init trap_init(void)
  37.168      /* CPU0 uses the master IDT. */
  37.169      idt_tables[0] = idt_table;
  37.170  
  37.171 -    /*
  37.172 -     * Should be a barrier for any external CPU state.
  37.173 -     */
  37.174 -    {
  37.175 -        extern void cpu_init(void);
  37.176 -        cpu_init();
  37.177 -    }
  37.178 +    percpu_traps_init();
  37.179 +
  37.180 +    cpu_init();
  37.181  
  37.182      open_softirq(NMI_SOFTIRQ, nmi_softirq);
  37.183  }
  37.184 @@ -769,8 +813,8 @@ long do_set_trap_table(trap_info_t *trap
  37.185          if ( hypercall_preempt_check() )
  37.186          {
  37.187              UNLOCK_BIGLOCK(current->domain);
  37.188 -            return hypercall_create_continuation(
  37.189 -                __HYPERVISOR_set_trap_table, 1, traps);
  37.190 +            return hypercall1_create_continuation(
  37.191 +                __HYPERVISOR_set_trap_table, traps);
  37.192          }
  37.193  
  37.194          if ( copy_from_user(&cur, traps, sizeof(cur)) ) return -EFAULT;
  37.195 @@ -816,6 +860,13 @@ long do_fpu_taskswitch(void)
  37.196  }
  37.197  
  37.198  
  37.199 +#if defined(__i386__)
  37.200 +#define DB_VALID_ADDR(_a) \
  37.201 +    ((_a) <= (PAGE_OFFSET - 4))
  37.202 +#elif defined(__x86_64__)
  37.203 +#define DB_VALID_ADDR(_a) \
   37.204 +    (((_a) >= HYPERVISOR_VIRT_END) || ((_a) <= (HYPERVISOR_VIRT_START-8)))
  37.205 +#endif
  37.206  long set_debugreg(struct exec_domain *p, int reg, unsigned long value)
  37.207  {
  37.208      int i;
  37.209 @@ -823,22 +874,22 @@ long set_debugreg(struct exec_domain *p,
  37.210      switch ( reg )
  37.211      {
  37.212      case 0: 
  37.213 -        if ( value > (PAGE_OFFSET-4) ) return -EPERM;
  37.214 +        if ( !DB_VALID_ADDR(value) ) return -EPERM;
  37.215          if ( p == current ) 
  37.216              __asm__ ( "mov %0, %%db0" : : "r" (value) );
  37.217          break;
  37.218      case 1: 
  37.219 -        if ( value > (PAGE_OFFSET-4) ) return -EPERM;
  37.220 +        if ( !DB_VALID_ADDR(value) ) return -EPERM;
  37.221          if ( p == current ) 
  37.222              __asm__ ( "mov %0, %%db1" : : "r" (value) );
  37.223          break;
  37.224      case 2: 
  37.225 -        if ( value > (PAGE_OFFSET-4) ) return -EPERM;
  37.226 +        if ( !DB_VALID_ADDR(value) ) return -EPERM;
  37.227          if ( p == current ) 
  37.228              __asm__ ( "mov %0, %%db2" : : "r" (value) );
  37.229          break;
  37.230      case 3:
  37.231 -        if ( value > (PAGE_OFFSET-4) ) return -EPERM;
  37.232 +        if ( !DB_VALID_ADDR(value) ) return -EPERM;
  37.233          if ( p == current ) 
  37.234              __asm__ ( "mov %0, %%db3" : : "r" (value) );
  37.235          break;
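
The reworked NMI path above reads the legacy system-control port 0x61, where bit 7 reports a memory parity error and bit 6 an I/O check error, and then either treats the event as fatal, ignores it, or defers it to dom0 through a softirq according to the 'nmi=' boot option. The stand-alone sketch below captures only that classification step; the enum and function names are invented for illustration.

    /* Bit 7 of port 0x61 = memory parity error, bit 6 = I/O check error;
     * the policy character mirrors the 'nmi=' boot option. */
    enum nmi_action { NMI_FATAL, NMI_IGNORE, NMI_FORWARD_TO_DOM0 };

    static enum nmi_action classify_nmi(unsigned char reason, char policy)
    {
        if ( (reason & 0xc0) == 0 )       /* neither error line asserted */
            return NMI_IGNORE;            /* left to the watchdog / unknown path */

        switch ( policy )
        {
        case 'd': return NMI_FORWARD_TO_DOM0; /* raise softirq, notify dom0 */
        case 'i': return NMI_IGNORE;
        default:  return NMI_FATAL;           /* dump state and stop */
        }
    }
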
    38.1 --- a/xen/arch/x86/vmx.c	Mon Feb 07 08:19:24 2005 +0000
    38.2 +++ b/xen/arch/x86/vmx.c	Tue Feb 08 16:44:16 2005 +0000
    38.3 @@ -36,6 +36,8 @@
    38.4  #include <asm/vmx_vmcs.h>
    38.5  #include <public/io/ioreq.h>
    38.6  
    38.7 +#ifdef CONFIG_VMX
    38.8 +
    38.9  int vmcs_size;
   38.10  unsigned int opt_vmx_debug_level;
   38.11  
   38.12 @@ -123,13 +125,13 @@ static int vmx_do_page_fault(unsigned lo
   38.13      /*
   38.14       * Set up guest page directory cache to make linear_pt_table[] work.
   38.15       */
   38.16 -    __guest_get_pl2e(ed, va, &gpde);
   38.17 +    __guest_get_l2e(ed, va, &gpde);
   38.18      if (!(gpde & _PAGE_PRESENT))
   38.19          return 0;
   38.20  
   38.21      index = (va >> L2_PAGETABLE_SHIFT);
   38.22      if (!l2_pgentry_val(ed->arch.guest_pl2e_cache[index])) {
   38.23 -        pfn = phys_to_machine_mapping[gpde >> PAGE_SHIFT];
   38.24 +        pfn = phys_to_machine_mapping(gpde >> PAGE_SHIFT);
   38.25  
   38.26          VMX_DBG_LOG(DBG_LEVEL_VMMU, "vmx_do_page_fault: pagetable = %lx\n",
   38.27                  pagetable_val(ed->arch.pagetable));
   38.28 @@ -301,10 +303,10 @@ inline unsigned long gva_to_gpa(unsigned
   38.29      unsigned long gpde, gpte, pfn, index;
   38.30      struct exec_domain *ed = current;
   38.31  
   38.32 -    __guest_get_pl2e(ed, gva, &gpde);
   38.33 +    __guest_get_l2e(ed, gva, &gpde);
   38.34      index = (gva >> L2_PAGETABLE_SHIFT);
   38.35  
   38.36 -    pfn = phys_to_machine_mapping[gpde >> PAGE_SHIFT];
   38.37 +    pfn = phys_to_machine_mapping(gpde >> PAGE_SHIFT);
   38.38  
   38.39      ed->arch.guest_pl2e_cache[index] = 
   38.40              mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
   38.41 @@ -451,8 +453,8 @@ static void mov_to_cr(int gp, int cr, st
   38.42              /*
   38.43               * The guest CR3 must be pointing to the guest physical.
   38.44               */
   38.45 -            if (!(pfn = phys_to_machine_mapping[
   38.46 -                      d->arch.arch_vmx.cpu_cr3 >> PAGE_SHIFT])) 
   38.47 +            if (!(pfn = phys_to_machine_mapping(
   38.48 +                      d->arch.arch_vmx.cpu_cr3 >> PAGE_SHIFT))) 
   38.49              {
   38.50                  VMX_DBG_LOG(DBG_LEVEL_VMMU, "Invalid CR3 value = %lx\n", 
   38.51                          d->arch.arch_vmx.cpu_cr3);
   38.52 @@ -504,7 +506,7 @@ static void mov_to_cr(int gp, int cr, st
   38.53               * removed some translation or changed page attributes.
   38.54               * We simply invalidate the shadow.
   38.55               */
   38.56 -            pfn = phys_to_machine_mapping[value >> PAGE_SHIFT];
   38.57 +            pfn = phys_to_machine_mapping(value >> PAGE_SHIFT);
   38.58              if ((pfn << PAGE_SHIFT) != pagetable_val(d->arch.pagetable))
   38.59                  __vmx_bug(regs);
   38.60              vmx_shadow_clear_state(d->domain);
   38.61 @@ -521,7 +523,7 @@ static void mov_to_cr(int gp, int cr, st
   38.62                          "Invalid CR3 value=%lx\n", value);
   38.63                  domain_crash(); /* need to take a clean path */
   38.64              }
   38.65 -            pfn = phys_to_machine_mapping[value >> PAGE_SHIFT];
   38.66 +            pfn = phys_to_machine_mapping(value >> PAGE_SHIFT);
   38.67              vmx_shadow_clear_state(d->domain);
   38.68              d->arch.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
   38.69              shadow_mk_pagetable(d);
   38.70 @@ -927,6 +929,8 @@ asmlinkage void vmx_vmexit_handler(struc
   38.71      default:
   38.72          __vmx_bug(&regs);       /* should not happen */
   38.73      }
   38.74 +
   38.75 +    vmx_intr_assist(d);
   38.76      return;
   38.77  }
   38.78  
   38.79 @@ -937,3 +941,5 @@ asmlinkage void load_cr2(void)
   38.80      local_irq_disable();        
   38.81      asm volatile("movl %0,%%cr2": :"r" (d->arch.arch_vmx.cpu_cr2));
   38.82  }
   38.83 +
   38.84 +#endif /* CONFIG_VMX */
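
Throughout the VMX files the guest-physical to machine translation changes from an array dereference, phys_to_machine_mapping[gpfn], to a function-style accessor, phys_to_machine_mapping(gpfn). The sketch below shows what such an accessor might wrap; the table pointer, size parameter and bounds check are hypothetical, not the actual Xen definition.

    /* p2m_table and p2m_entries are hypothetical; the real accessor is
     * defined in the Xen headers and indexes a per-domain table. */
    static inline unsigned long gpfn_to_mfn_example(
        const unsigned long *p2m_table, unsigned long p2m_entries,
        unsigned long gpfn)
    {
        if ( gpfn >= p2m_entries )
            return 0;   /* out of range: caller must treat 0 as invalid */
        return p2m_table[gpfn];
    }
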
    39.1 --- a/xen/arch/x86/vmx_io.c	Mon Feb 07 08:19:24 2005 +0000
    39.2 +++ b/xen/arch/x86/vmx_io.c	Tue Feb 08 16:44:16 2005 +0000
    39.3 @@ -32,6 +32,8 @@
    39.4  #include <public/io/ioreq.h>
    39.5  #include <asm/vmx_platform.h>
    39.6  
    39.7 +#ifdef CONFIG_VMX
    39.8 +
    39.9  extern long do_block();
   39.10    
   39.11  #if defined (__i386__)
   39.12 @@ -386,3 +388,5 @@ void vmx_do_resume(struct exec_domain *d
   39.13      if (!test_bit(ARCH_VMX_IO_WAIT, &d->arch.arch_vmx.flags))
   39.14          vmx_intr_assist(d);
   39.15  }
   39.16 +
   39.17 +#endif /* CONFIG_VMX */
    40.1 --- a/xen/arch/x86/vmx_platform.c	Mon Feb 07 08:19:24 2005 +0000
    40.2 +++ b/xen/arch/x86/vmx_platform.c	Tue Feb 08 16:44:16 2005 +0000
    40.3 @@ -34,6 +34,8 @@
    40.4  #include <xen/sched.h>
    40.5  #include <asm/current.h>
    40.6  
    40.7 +#ifdef CONFIG_VMX
    40.8 +
    40.9  #define DECODE_success  1
   40.10  #define DECODE_failure  0
   40.11  
   40.12 @@ -369,7 +371,7 @@ static int inst_copy_from_guest(char *bu
   40.13                  printk("inst_copy_from_guest- EXIT: read gpte faulted" );
   40.14                  return 0;
   40.15              }
   40.16 -        mfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT];
   40.17 +        mfn = phys_to_machine_mapping(gpte >> PAGE_SHIFT);
   40.18          ma = (mfn << PAGE_SHIFT) | (guest_eip & (PAGE_SIZE - 1));
   40.19          inst_start = (unsigned char *)map_domain_mem(ma);
   40.20                  
   40.21 @@ -553,3 +555,4 @@ void handle_mmio(unsigned long va, unsig
   40.22      domain_crash();
   40.23  }
   40.24  
   40.25 +#endif /* CONFIG_VMX */
    41.1 --- a/xen/arch/x86/vmx_vmcs.c	Mon Feb 07 08:19:24 2005 +0000
    41.2 +++ b/xen/arch/x86/vmx_vmcs.c	Tue Feb 08 16:44:16 2005 +0000
    41.3 @@ -33,6 +33,8 @@
    41.4  #include <public/io/ioreq.h>
    41.5  #include <asm/domain_page.h>
    41.6  
    41.7 +#ifdef CONFIG_VMX
    41.8 +
    41.9  struct vmcs_struct *alloc_vmcs(void) 
   41.10  {
   41.11      struct vmcs_struct *vmcs;
   41.12 @@ -118,7 +120,7 @@ int vmx_setup_platform(struct exec_domai
   41.13      addr = context->edi;
   41.14      offset = (addr & ~PAGE_MASK);
   41.15      addr = round_pgdown(addr);
   41.16 -    mpfn = phys_to_machine_mapping[addr >> PAGE_SHIFT];
   41.17 +    mpfn = phys_to_machine_mapping(addr >> PAGE_SHIFT);
   41.18      p = map_domain_mem(mpfn << PAGE_SHIFT);
   41.19  
   41.20      e820p = (struct e820entry *) ((unsigned long) p + offset); 
   41.21 @@ -131,52 +133,20 @@ int vmx_setup_platform(struct exec_domai
   41.22      }
   41.23  
   41.24      if (gpfn == 0) {
   41.25 -        VMX_DBG_LOG(DBG_LEVEL_1, "No shared Page ?\n");
   41.26 +        printk("No shared Page ?\n");
   41.27 +        unmap_domain_mem(p);        
   41.28          return -1;
   41.29      }   
   41.30      unmap_domain_mem(p);        
   41.31  
   41.32 -    mpfn = phys_to_machine_mapping[gpfn];
   41.33 +    mpfn = phys_to_machine_mapping(gpfn);
   41.34      p = map_domain_mem(mpfn << PAGE_SHIFT);
   41.35 +    ASSERT(p != NULL);
   41.36      d->arch.arch_vmx.vmx_platform.shared_page_va = (unsigned long) p;
   41.37  
   41.38      return 0;
   41.39  }
   41.40  
   41.41 -
   41.42 -/*
   41.43 - * Add <guest pfn, machine pfn> mapping to per-domain mapping. Full
   41.44 - * virtualization does not need per-domain mapping.
   41.45 - */
   41.46 -static int add_mapping_perdomain(struct exec_domain *d, unsigned long gpfn, 
   41.47 -                                 unsigned long mpfn)
   41.48 -{
   41.49 -    struct pfn_info *page;
   41.50 -    unsigned long pfn = 0;
   41.51 -
   41.52 -    /*
   41.53 -     * We support up to 4GB memory for a guest at this point
   41.54 -     */
   41.55 -    if (gpfn > ENTRIES_PER_L2_PAGETABLE * ENTRIES_PER_L1_PAGETABLE)
   41.56 -        return -1;
   41.57 -
   41.58 -    if (!(l1_pgentry_val(d->domain->arch.mm_perdomain_pt[
   41.59 -            gpfn >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)]) & _PAGE_PRESENT))
   41.60 -    {
   41.61 -        page = (struct pfn_info *) alloc_domheap_page(NULL);
   41.62 -        if (!page) {
   41.63 -            return -1;
   41.64 -        }
   41.65 -
   41.66 -        pfn = (unsigned long) (page - frame_table);
   41.67 -        d->domain->arch.mm_perdomain_pt[gpfn >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)] = 
   41.68 -            mk_l1_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
   41.69 -    }
   41.70 -    phys_to_machine_mapping[gpfn] = mpfn;
   41.71 -
   41.72 -    return 0;
   41.73 -}
   41.74 -
   41.75  void vmx_do_launch(struct exec_domain *ed) 
   41.76  {
   41.77  /* Update CR3, GDT, LDT, TR */
   41.78 @@ -204,7 +174,6 @@ void vmx_do_launch(struct exec_domain *e
   41.79          d->arch.min_pfn = min(d->arch.min_pfn, pfn);
   41.80          d->arch.max_pfn = max(d->arch.max_pfn, pfn);
   41.81          list_ent = frame_table[pfn].list.next;
   41.82 -        add_mapping_perdomain(ed, i, pfn);
   41.83      }
   41.84  
   41.85      spin_unlock(&d->page_alloc_lock);
   41.86 @@ -502,3 +471,4 @@ void vm_resume_fail(unsigned long eflags
   41.87      BUG();
   41.88  }
   41.89  
   41.90 +#endif /* CONFIG_VMX */
    42.1 --- a/xen/arch/x86/x86_32/domain_build.c	Mon Feb 07 08:19:24 2005 +0000
    42.2 +++ b/xen/arch/x86/x86_32/domain_build.c	Tue Feb 08 16:44:16 2005 +0000
    42.3 @@ -20,6 +20,7 @@
    42.4  #include <xen/event.h>
    42.5  #include <xen/elf.h>
    42.6  #include <xen/kernel.h>
    42.7 +#include <asm/shadow.h>
    42.8  
    42.9  /* No ring-3 access in initial page tables. */
   42.10  #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
   42.11 @@ -261,7 +262,7 @@ int construct_dom0(struct domain *d,
   42.12      for ( count = 0; count < nr_pt_pages; count++ ) 
   42.13      {
   42.14          *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
   42.15 -        page = &frame_table[l1_pgentry_to_pagenr(*l1tab)];
   42.16 +        page = &frame_table[l1_pgentry_to_pfn(*l1tab)];
   42.17          if ( count == 0 )
   42.18          {
   42.19              page->u.inuse.type_info &= ~PGT_type_mask;
   42.20 @@ -377,10 +378,13 @@ int construct_dom0(struct domain *d,
   42.21  
   42.22      new_thread(ed, dsi.v_kernentry, vstack_end, vstartinfo_start);
   42.23  
   42.24 -#if 0 /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */
   42.25 -    shadow_lock(&d->mm);
   42.26 -    shadow_mode_enable(d, SHM_test); 
   42.27 -    shadow_unlock(&d->mm);
   42.28 +#ifndef NDEBUG
   42.29 +    if (0) /* XXXXX DO NOT CHECK IN ENABLED !!! (but useful for testing so leave) */
   42.30 +    {
   42.31 +        shadow_lock(d);
   42.32 +        shadow_mode_enable(d, SHM_test); 
   42.33 +        shadow_unlock(d);
   42.34 +    }
   42.35  #endif
   42.36  
   42.37      return 0;
    43.1 --- a/xen/arch/x86/x86_32/entry.S	Mon Feb 07 08:19:24 2005 +0000
    43.2 +++ b/xen/arch/x86/x86_32/entry.S	Tue Feb 08 16:44:16 2005 +0000
    43.3 @@ -596,7 +596,7 @@ ENTRY(nmi)
    43.4          # Okay, its almost a normal NMI tick. We can only process it if:
    43.5          #  A. We are the outermost Xen activation (in which case we have
    43.6          #     the selectors safely saved on our stack)
    43.7 -        #  B. DS-GS all contain sane Xen values.
    43.8 +        #  B. DS and ES contain sane Xen values.
    43.9          # In all other cases we bail without touching DS-GS, as we have
   43.10          # interrupted an enclosing Xen activation in tricky prologue or
   43.11          # epilogue code.
   43.12 @@ -644,11 +644,11 @@ nmi_parity_err:
   43.13          orb  $0x4,%al
   43.14          outb %al,$0x61
   43.15          cmpb $'i',%ss:SYMBOL_NAME(opt_nmi) # nmi=ignore
   43.16 -        je   restore_all_xen
   43.17 +        je   nmi_out
   43.18          bts  $0,%ss:SYMBOL_NAME(nmi_softirq_reason)
   43.19          bts  $NMI_SOFTIRQ,%ss:SYMBOL_NAME(irq_stat)
   43.20          cmpb $'d',%ss:SYMBOL_NAME(opt_nmi) # nmi=dom0
   43.21 -        je   restore_all_xen
   43.22 +        je   nmi_out
   43.23          movl $(__HYPERVISOR_DS),%edx       # nmi=fatal
   43.24          movl %edx,%ds
   43.25          movl %edx,%es
   43.26 @@ -656,7 +656,15 @@ nmi_parity_err:
   43.27          push %edx
   43.28          call SYMBOL_NAME(mem_parity_error)
   43.29          addl $4,%esp
   43.30 -        jmp  ret_from_intr
   43.31 +nmi_out:movl  %ss:XREGS_eflags(%esp),%eax
   43.32 +        movb  %ss:XREGS_cs(%esp),%al
   43.33 +        testl $(3|X86_EFLAGS_VM),%eax
    43.34 +        jz    restore_all_xen
   43.35 +        movl  $(__HYPERVISOR_DS),%edx
   43.36 +        movl  %edx,%ds
   43.37 +        movl  %edx,%es
   43.38 +        GET_CURRENT(%ebx)
   43.39 +        jmp   test_all_events
   43.40                  
   43.41  nmi_io_err: 
   43.42          # Clear and disable the I/O-error line
   43.43 @@ -664,11 +672,11 @@ nmi_io_err:
   43.44          orb  $0x8,%al
   43.45          outb %al,$0x61
   43.46          cmpb $'i',%ss:SYMBOL_NAME(opt_nmi) # nmi=ignore
   43.47 -        je   restore_all_xen
   43.48 +        je   nmi_out
   43.49          bts  $1,%ss:SYMBOL_NAME(nmi_softirq_reason)
   43.50          bts  $NMI_SOFTIRQ,%ss:SYMBOL_NAME(irq_stat)
   43.51          cmpb $'d',%ss:SYMBOL_NAME(opt_nmi) # nmi=dom0
   43.52 -        je   restore_all_xen
   43.53 +        je   nmi_out
   43.54          movl $(__HYPERVISOR_DS),%edx       # nmi=fatal
   43.55          movl %edx,%ds
   43.56          movl %edx,%es
   43.57 @@ -676,7 +684,7 @@ nmi_io_err:
   43.58          push %edx
   43.59          call SYMBOL_NAME(io_check_error)                        
   43.60          addl $4,%esp
   43.61 -        jmp  ret_from_intr
   43.62 +        jmp  nmi_out
   43.63  
   43.64  
   43.65  ENTRY(setup_vm86_frame)
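
The new nmi_out label above decides whether the NMI interrupted guest code or Xen itself by folding the saved CS selector into the low byte of the saved EFLAGS and testing the RPL bits together with the VM86 flag. Expressed in C, with an illustrative structure standing in for the saved register frame, the same predicate would be:

    #define X86_EFLAGS_VM 0x00020000UL

    /* Illustrative stand-in for the saved frame fields used by the test. */
    struct saved_frame { unsigned long eflags; unsigned long cs; };

    /* Non-zero RPL in the saved CS, or VM86 mode, means the NMI interrupted
     * guest code, so the normal event-delivery exit path may run. */
    static int interrupted_guest_context(const struct saved_frame *f)
    {
        return ((f->cs & 3) | (f->eflags & X86_EFLAGS_VM)) != 0;
    }
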
    44.1 --- a/xen/arch/x86/x86_32/mm.c	Mon Feb 07 08:19:24 2005 +0000
    44.2 +++ b/xen/arch/x86/x86_32/mm.c	Tue Feb 08 16:44:16 2005 +0000
    44.3 @@ -164,7 +164,7 @@ void subarch_init_memory(struct domain *
    44.4      }
    44.5  
    44.6      /* M2P table is mappable read-only by privileged domains. */
    44.7 -    m2p_start_mfn = l2_pgentry_to_pagenr(
    44.8 +    m2p_start_mfn = l2_pgentry_to_pfn(
    44.9          idle_pg_table[l2_table_offset(RDWR_MPT_VIRT_START)]);
   44.10      for ( i = 0; i < 1024; i++ )
   44.11      {
   44.12 @@ -212,9 +212,10 @@ long do_stack_switch(unsigned long ss, u
   44.13  
   44.14  
   44.15  /* Returns TRUE if given descriptor is valid for GDT or LDT. */
   44.16 -int check_descriptor(unsigned long *d)
   44.17 +int check_descriptor(struct desc_struct *d)
   44.18  {
   44.19 -    unsigned long base, limit, a = d[0], b = d[1];
   44.20 +    unsigned long base, limit;
   44.21 +    u32 a = d->a, b = d->b;
   44.22  
   44.23      /* A not-present descriptor will always fault, so is safe. */
   44.24      if ( !(b & _SEGMENT_P) ) 
   44.25 @@ -298,8 +299,8 @@ int check_descriptor(unsigned long *d)
   44.26              if ( !(b & _SEGMENT_G) )
   44.27                  goto bad; /* too dangerous; too hard to work out... */
   44.28              limit = (limit >> 12) - 1;
   44.29 -            d[0] &= ~0x0ffff; d[0] |= limit & 0x0ffff;
   44.30 -            d[1] &= ~0xf0000; d[1] |= limit & 0xf0000;
   44.31 +            d->a &= ~0x0ffff; d->a |= limit & 0x0ffff;
   44.32 +            d->b &= ~0xf0000; d->b |= limit & 0xf0000;
   44.33          }
   44.34      }
   44.35  
   44.36 @@ -310,175 +311,6 @@ int check_descriptor(unsigned long *d)
   44.37  }
   44.38  
   44.39  
   44.40 -void destroy_gdt(struct exec_domain *ed)
   44.41 -{
   44.42 -    int i;
   44.43 -    unsigned long pfn;
   44.44 -
   44.45 -    for ( i = 0; i < 16; i++ )
   44.46 -    {
   44.47 -        if ( (pfn = l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[i])) != 0 )
   44.48 -            put_page_and_type(&frame_table[pfn]);
   44.49 -        ed->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
   44.50 -    }
   44.51 -}
   44.52 -
   44.53 -
   44.54 -long set_gdt(struct exec_domain *ed, 
   44.55 -             unsigned long *frames,
   44.56 -             unsigned int entries)
   44.57 -{
   44.58 -    struct domain *d = ed->domain;
   44.59 -    /* NB. There are 512 8-byte entries per GDT page. */
   44.60 -    int i = 0, nr_pages = (entries + 511) / 512;
   44.61 -    struct desc_struct *vgdt;
   44.62 -    unsigned long pfn;
   44.63 -
   44.64 -    /* Check the first page in the new GDT. */
   44.65 -    if ( (pfn = frames[0]) >= max_page )
   44.66 -        goto fail;
   44.67 -
   44.68 -    /* The first page is special because Xen owns a range of entries in it. */
   44.69 -    if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
   44.70 -    {
   44.71 -        /* GDT checks failed: try zapping the Xen reserved entries. */
   44.72 -        if ( !get_page_and_type(&frame_table[pfn], d, PGT_writable_page) )
   44.73 -            goto fail;
   44.74 -        vgdt = map_domain_mem(pfn << PAGE_SHIFT);
   44.75 -        memset(vgdt + FIRST_RESERVED_GDT_ENTRY, 0,
   44.76 -               NR_RESERVED_GDT_ENTRIES*8);
   44.77 -        unmap_domain_mem(vgdt);
   44.78 -        put_page_and_type(&frame_table[pfn]);
   44.79 -
   44.80 -        /* Okay, we zapped the entries. Now try the GDT checks again. */
   44.81 -        if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
   44.82 -            goto fail;
   44.83 -    }
   44.84 -
   44.85 -    /* Check the remaining pages in the new GDT. */
   44.86 -    for ( i = 1; i < nr_pages; i++ )
   44.87 -        if ( ((pfn = frames[i]) >= max_page) ||
   44.88 -             !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
   44.89 -            goto fail;
   44.90 -
   44.91 -    /* Copy reserved GDT entries to the new GDT. */
   44.92 -    vgdt = map_domain_mem(frames[0] << PAGE_SHIFT);
   44.93 -    memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY, 
   44.94 -           gdt_table + FIRST_RESERVED_GDT_ENTRY, 
   44.95 -           NR_RESERVED_GDT_ENTRIES*8);
   44.96 -    unmap_domain_mem(vgdt);
   44.97 -
   44.98 -    /* Tear down the old GDT. */
   44.99 -    destroy_gdt(ed);
  44.100 -
  44.101 -    /* Install the new GDT. */
  44.102 -    for ( i = 0; i < nr_pages; i++ )
  44.103 -        ed->arch.perdomain_ptes[i] =
  44.104 -            mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  44.105 -
  44.106 -    SET_GDT_ADDRESS(ed, GDT_VIRT_START(ed));
  44.107 -    SET_GDT_ENTRIES(ed, entries);
  44.108 -
  44.109 -    return 0;
  44.110 -
  44.111 - fail:
  44.112 -    while ( i-- > 0 )
  44.113 -        put_page_and_type(&frame_table[frames[i]]);
  44.114 -    return -EINVAL;
  44.115 -}
  44.116 -
  44.117 -
  44.118 -long do_set_gdt(unsigned long *frame_list, unsigned int entries)
  44.119 -{
  44.120 -    int nr_pages = (entries + 511) / 512;
  44.121 -    unsigned long frames[16];
  44.122 -    long ret;
  44.123 -
  44.124 -    if ( (entries <= LAST_RESERVED_GDT_ENTRY) || (entries > 8192) ) 
  44.125 -        return -EINVAL;
  44.126 -    
  44.127 -    if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
  44.128 -        return -EFAULT;
  44.129 -
  44.130 -    LOCK_BIGLOCK(current->domain);
  44.131 -
  44.132 -    if ( (ret = set_gdt(current, frames, entries)) == 0 )
  44.133 -    {
  44.134 -        local_flush_tlb();
  44.135 -        __asm__ __volatile__ ("lgdt %0" : "=m" (*current->arch.gdt));
  44.136 -    }
  44.137 -
  44.138 -    UNLOCK_BIGLOCK(current->domain);
  44.139 -
  44.140 -    return ret;
  44.141 -}
  44.142 -
  44.143 -
  44.144 -long do_update_descriptor(
  44.145 -    unsigned long pa, unsigned long word1, unsigned long word2)
  44.146 -{
  44.147 -    unsigned long *gdt_pent, pfn = pa >> PAGE_SHIFT, d[2];
  44.148 -    struct pfn_info *page;
  44.149 -    struct exec_domain *ed;
  44.150 -    long ret = -EINVAL;
  44.151 -
  44.152 -    d[0] = word1;
  44.153 -    d[1] = word2;
  44.154 -
  44.155 -    LOCK_BIGLOCK(current->domain);
  44.156 -
  44.157 -    if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(d) ) {
  44.158 -        UNLOCK_BIGLOCK(current->domain);
  44.159 -        return -EINVAL;
  44.160 -    }
  44.161 -
  44.162 -    page = &frame_table[pfn];
  44.163 -    if ( unlikely(!get_page(page, current->domain)) ) {
  44.164 -        UNLOCK_BIGLOCK(current->domain);
  44.165 -        return -EINVAL;
  44.166 -    }
  44.167 -
  44.168 -    /* Check if the given frame is in use in an unsafe context. */
  44.169 -    switch ( page->u.inuse.type_info & PGT_type_mask )
  44.170 -    {
  44.171 -    case PGT_gdt_page:
  44.172 -        /* Disallow updates of Xen-reserved descriptors in the current GDT. */
  44.173 -        for_each_exec_domain(current->domain, ed) {
  44.174 -            if ( (l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[0]) == pfn) &&
  44.175 -                 (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) &&
  44.176 -                 (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) )
  44.177 -                goto out;
  44.178 -        }
  44.179 -        if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
  44.180 -            goto out;
  44.181 -        break;
  44.182 -    case PGT_ldt_page:
  44.183 -        if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
  44.184 -            goto out;
  44.185 -        break;
  44.186 -    default:
  44.187 -        if ( unlikely(!get_page_type(page, PGT_writable_page)) )
  44.188 -            goto out;
  44.189 -        break;
  44.190 -    }
  44.191 -
  44.192 -    /* All is good so make the update. */
  44.193 -    gdt_pent = map_domain_mem(pa);
  44.194 -    memcpy(gdt_pent, d, 8);
  44.195 -    unmap_domain_mem(gdt_pent);
  44.196 -
  44.197 -    put_page_type(page);
  44.198 -
  44.199 -    ret = 0; /* success */
  44.200 -
  44.201 - out:
  44.202 -    put_page(page);
  44.203 -
  44.204 -    UNLOCK_BIGLOCK(current->domain);
  44.205 -
  44.206 -    return ret;
  44.207 -}
  44.208 -
  44.209  #ifdef MEMORY_GUARD
  44.210  
  44.211  void *memguard_init(void *heap_start)
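
check_descriptor() above now takes a struct desc_struct holding the two 32-bit words of a segment descriptor and, when it truncates a limit, patches those words in place. For reference, here is a stand-alone sketch of how the base and limit fields are scattered across the two words, matching the bit masks used in the removed code; the struct and function names are illustrative only.

    #include <stdint.h>

    struct seg_desc { uint32_t a, b; };   /* a = low word, b = high word */

    /* Base is split across b[31:24], b[7:0] and a[31:16]; the 20-bit limit
     * across b[19:16] and a[15:0], scaled by 4kB when the G bit (b[23]) is set. */
    static void decode_descriptor(const struct seg_desc *d,
                                  uint32_t *base, uint32_t *limit)
    {
        *base  = (d->b & 0xff000000u)
               | ((d->b & 0x000000ffu) << 16)
               | (d->a >> 16);
        *limit = (d->b & 0x000f0000u) | (d->a & 0x0000ffffu);
        if ( d->b & (1u << 23) )
            *limit = (*limit << 12) | 0xfffu;
    }
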
    45.1 --- a/xen/arch/x86/x86_32/traps.c	Mon Feb 07 08:19:24 2005 +0000
    45.2 +++ b/xen/arch/x86/x86_32/traps.c	Tue Feb 08 16:44:16 2005 +0000
    45.3 @@ -7,6 +7,7 @@
    45.4  #include <xen/console.h>
    45.5  #include <xen/mm.h>
    45.6  #include <xen/irq.h>
    45.7 +#include <asm/flushtlb.h>
    45.8  
    45.9  static int kstack_depth_to_print = 8*20;
   45.10  
   45.11 @@ -114,6 +115,7 @@ void show_registers(struct xen_regs *reg
   45.12             regs->esi, regs->edi, regs->ebp, esp);
   45.13      printk("ds: %04x   es: %04x   fs: %04x   gs: %04x   ss: %04x\n",
   45.14             ds, es, fs, gs, ss);
   45.15 +    printk("cr3: %08lx\n", read_cr3());
   45.16  
   45.17      show_stack((unsigned long *)&regs->esp);
   45.18  } 
   45.19 @@ -175,34 +177,33 @@ asmlinkage void do_double_fault(void)
   45.20          __asm__ __volatile__ ( "hlt" );
   45.21  }
   45.22  
   45.23 -void __init doublefault_init(void)
   45.24 +void __init percpu_traps_init(void)
   45.25  {
   45.26 -    /*
   45.27 -     * Make a separate task for double faults. This will get us debug output if
   45.28 -     * we blow the kernel stack.
   45.29 -     */
   45.30 -    struct tss_struct *tss = &doublefault_tss;
   45.31 -    memset(tss, 0, sizeof(*tss));
   45.32 -    tss->ds     = __HYPERVISOR_DS;
   45.33 -    tss->es     = __HYPERVISOR_DS;
   45.34 -    tss->ss     = __HYPERVISOR_DS;
   45.35 -    tss->esp    = (unsigned long)
   45.36 -        &doublefault_stack[DOUBLEFAULT_STACK_SIZE];
   45.37 -    tss->__cr3  = __pa(idle_pg_table);
   45.38 -    tss->cs     = __HYPERVISOR_CS;
   45.39 -    tss->eip    = (unsigned long)do_double_fault;
   45.40 -    tss->eflags = 2;
   45.41 -    tss->bitmap = IOBMP_INVALID_OFFSET;
   45.42 -    _set_tssldt_desc(gdt_table+__DOUBLEFAULT_TSS_ENTRY,
   45.43 -                     (unsigned long)tss, 235, 9);
   45.44 +    if ( smp_processor_id() == 0 )
   45.45 +    {
   45.46 +        /*
   45.47 +         * Make a separate task for double faults. This will get us debug
   45.48 +         * output if we blow the kernel stack.
   45.49 +         */
   45.50 +        struct tss_struct *tss = &doublefault_tss;
   45.51 +        memset(tss, 0, sizeof(*tss));
   45.52 +        tss->ds     = __HYPERVISOR_DS;
   45.53 +        tss->es     = __HYPERVISOR_DS;
   45.54 +        tss->ss     = __HYPERVISOR_DS;
   45.55 +        tss->esp    = (unsigned long)
   45.56 +            &doublefault_stack[DOUBLEFAULT_STACK_SIZE];
   45.57 +        tss->__cr3  = __pa(idle_pg_table);
   45.58 +        tss->cs     = __HYPERVISOR_CS;
   45.59 +        tss->eip    = (unsigned long)do_double_fault;
   45.60 +        tss->eflags = 2;
   45.61 +        tss->bitmap = IOBMP_INVALID_OFFSET;
   45.62 +        _set_tssldt_desc(gdt_table+__DOUBLEFAULT_TSS_ENTRY,
   45.63 +                         (unsigned long)tss, 235, 9);
   45.64 +    }
   45.65  
   45.66      set_task_gate(TRAP_double_fault, __DOUBLEFAULT_TSS_ENTRY<<3);
   45.67  }
   45.68  
   45.69 -void __init percpu_traps_init(void)
   45.70 -{
   45.71 -}
   45.72 -
   45.73  long set_fast_trap(struct exec_domain *p, int idx)
   45.74  {
   45.75      trap_info_t *ti;
    46.1 --- a/xen/arch/x86/x86_64/domain_build.c	Mon Feb 07 08:19:24 2005 +0000
    46.2 +++ b/xen/arch/x86/x86_64/domain_build.c	Tue Feb 08 16:44:16 2005 +0000
    46.3 @@ -294,7 +294,7 @@ int construct_dom0(struct domain *d,
    46.4      for ( count = 0; count < nr_pt_pages; count++ ) 
    46.5      {
    46.6          *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
    46.7 -        page = &frame_table[l1_pgentry_to_pagenr(*l1tab)];
    46.8 +        page = &frame_table[l1_pgentry_to_pfn(*l1tab)];
    46.9  
   46.10          /* Read-only mapping + PGC_allocated + page-table page. */
   46.11          page->count_info         = PGC_allocated | 3;
    47.1 --- a/xen/arch/x86/x86_64/entry.S	Mon Feb 07 08:19:24 2005 +0000
    47.2 +++ b/xen/arch/x86/x86_64/entry.S	Tue Feb 08 16:44:16 2005 +0000
    47.3 @@ -11,16 +11,25 @@
    47.4  #include <asm/apicdef.h>
    47.5  #include <public/xen.h>
    47.6  
    47.7 +
    47.8 +/*
    47.9 + * %rax                            = hypercall vector
    47.10 + * %rdi, %rsi, %rdx, %r10, %r8, %r9 = hypercall arguments
   47.11 + * %r11, %rcx                      = SYSCALL-saved %rflags and %rip
   47.12 + * NB. We must move %r10 to %rcx for C function-calling ABI.
   47.13 + */
   47.14  ENTRY(hypercall)
   47.15 -        movl  $0x0833,8(%rsp)
   47.16 +        sti
   47.17 +        movl  $__GUEST_SS,8(%rsp)
   47.18          pushq %r11
   47.19 -        pushq $0x082b
   47.20 +        pushq $__GUEST_CS
   47.21          pushq %rcx
   47.22          pushq $0
   47.23          SAVE_ALL
   47.24 -	andq  $(NR_hypercalls-1),%rax
   47.25 -        leaq  SYMBOL_NAME(exception_table)(%rip),%rcx
   47.26 -        callq *(%rcx,%rax,8)
   47.27 +        movq  %r10,%rcx
   47.28 +        andq  $(NR_hypercalls-1),%rax
   47.29 +        leaq  SYMBOL_NAME(hypercall_table)(%rip),%rbx
   47.30 +        callq *(%rbx,%rax,8)
   47.31          RESTORE_ALL
   47.32          addq  $8,%rsp
   47.33          popq  %rcx
   47.34 @@ -38,11 +47,12 @@ restore_all_xen:
   47.35  
   47.36  error_code:
   47.37          SAVE_ALL
   47.38 +        sti
   47.39          movq  %rsp,%rdi
   47.40          movl  XREGS_entry_vector(%rsp),%eax
   47.41          leaq  SYMBOL_NAME(exception_table)(%rip),%rdx
   47.42          callq *(%rdx,%rax,8)
   47.43 -	jmp   restore_all_xen
   47.44 +        jmp   restore_all_xen
   47.45  
   47.46  ENTRY(divide_error)
   47.47          pushq $0
   47.48 @@ -133,7 +143,13 @@ ENTRY(double_fault)
   47.49          jmp   error_code
   47.50  
   47.51  ENTRY(nmi)
   47.52 -        iretq
   47.53 +        pushq $0
   47.54 +        SAVE_ALL
   47.55 +        inb   $0x61,%al
   47.56 +        movl  %eax,%esi # reason
   47.57 +        movq  %rsp,%rdi # regs
   47.58 +        call  SYMBOL_NAME(do_nmi)
    47.59 +        jmp   restore_all_xen
   47.60  
   47.61  .data
   47.62  
    48.1 --- a/xen/arch/x86/x86_64/mm.c	Mon Feb 07 08:19:24 2005 +0000
    48.2 +++ b/xen/arch/x86/x86_64/mm.c	Tue Feb 08 16:44:16 2005 +0000
    48.3 @@ -199,7 +199,7 @@ void subarch_init_memory(struct domain *
    48.4          l2e = l3_pgentry_to_l2(l3e)[l2_table_offset(v)];
    48.5          if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
    48.6              continue;
    48.7 -        m2p_start_mfn = l2_pgentry_to_pagenr(l2e);
    48.8 +        m2p_start_mfn = l2_pgentry_to_pfn(l2e);
    48.9  
   48.10          for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
   48.11          {
   48.12 @@ -240,99 +240,38 @@ long do_stack_switch(unsigned long ss, u
   48.13  
   48.14  
   48.15  /* Returns TRUE if given descriptor is valid for GDT or LDT. */
   48.16 -int check_descriptor(unsigned long *d)
   48.17 +int check_descriptor(struct desc_struct *d)
   48.18  {
   48.19 -    unsigned long base, limit, a = d[0], b = d[1];
   48.20 +    u32 a = d->a, b = d->b;
   48.21  
   48.22      /* A not-present descriptor will always fault, so is safe. */
   48.23      if ( !(b & _SEGMENT_P) ) 
   48.24          goto good;
   48.25  
   48.26 -    /*
   48.27 -     * We don't allow a DPL of zero. There is no legitimate reason for 
   48.28 -     * specifying DPL==0, and it gets rather dangerous if we also accept call 
   48.29 -     * gates (consider a call gate pointing at another guestos descriptor with 
   48.30 -     * DPL 0 -- this would get the OS ring-0 privileges).
   48.31 -     */
   48.32 -    if ( (b & _SEGMENT_DPL) == 0 )
   48.33 +    /* The guest can only safely be executed in ring 3. */
   48.34 +    if ( (b & _SEGMENT_DPL) != 3 )
   48.35          goto bad;
   48.36  
   48.37 -    if ( !(b & _SEGMENT_S) )
   48.38 -    {
   48.39 -        /*
   48.40 -         * System segment:
   48.41 -         *  1. Don't allow interrupt or trap gates as they belong in the IDT.
   48.42 -         *  2. Don't allow TSS descriptors or task gates as we don't
   48.43 -         *     virtualise x86 tasks.
   48.44 -         *  3. Don't allow LDT descriptors because they're unnecessary and
   48.45 -         *     I'm uneasy about allowing an LDT page to contain LDT
   48.46 -         *     descriptors. In any case, Xen automatically creates the
   48.47 -         *     required descriptor when reloading the LDT register.
   48.48 -         *  4. We allow call gates but they must not jump to a private segment.
   48.49 -         */
   48.50 +    /* Any code or data segment is okay. No base/limit checking. */
   48.51 +    if ( (b & _SEGMENT_S) )
   48.52 +        goto good;
   48.53  
   48.54 -        /* Disallow everything but call gates. */
   48.55 -        if ( (b & _SEGMENT_TYPE) != 0xc00 )
   48.56 -            goto bad;
   48.57 -
   48.58 -#if 0
   48.59 -        /* Can't allow far jump to a Xen-private segment. */
   48.60 -        if ( !VALID_CODESEL(a>>16) )
   48.61 -            goto bad;
   48.62 -#endif
   48.63 +    /* Invalid type 0 is harmless. It is used for 2nd half of a call gate. */
   48.64 +    if ( (b & _SEGMENT_TYPE) == 0x000 )
   48.65 +        goto good;
   48.66  
   48.67 -        /* Reserved bits must be zero. */
   48.68 -        if ( (b & 0xe0) != 0 )
   48.69 -            goto bad;
   48.70 -        
   48.71 -        /* No base/limit check is needed for a call gate. */
   48.72 -        goto good;
   48.73 -    }
   48.74 -    
   48.75 -    /* Check that base is at least a page away from Xen-private area. */
   48.76 -    base  = (b&(0xff<<24)) | ((b&0xff)<<16) | (a>>16);
   48.77 -    if ( base >= (PAGE_OFFSET - PAGE_SIZE) )
   48.78 +    /* Everything but a call gate is discarded here. */
   48.79 +    if ( (b & _SEGMENT_TYPE) != 0xc00 )
   48.80          goto bad;
   48.81  
   48.82 -    /* Check and truncate the limit if necessary. */
   48.83 -    limit = (b&0xf0000) | (a&0xffff);
   48.84 -    limit++; /* We add one because limit is inclusive. */
   48.85 -    if ( (b & _SEGMENT_G) )
   48.86 -        limit <<= 12;
   48.87 +    /* Can't allow far jump to a Xen-private segment. */
   48.88 +    if ( !VALID_CODESEL(a>>16) )
   48.89 +        goto bad;
   48.90  
   48.91 -    if ( (b & (_SEGMENT_CODE | _SEGMENT_EC)) == _SEGMENT_EC )
   48.92 -    {
   48.93 -        /*
   48.94 -         * Grows-down limit check. 
   48.95 -         * NB. limit == 0xFFFFF provides no access      (if G=1).
   48.96 -         *     limit == 0x00000 provides 4GB-4kB access (if G=1).
   48.97 -         */
   48.98 -        if ( (base + limit) > base )
   48.99 -        {
  48.100 -            limit = -(base & PAGE_MASK);
  48.101 -            goto truncate;
  48.102 -        }
  48.103 -    }
  48.104 -    else
  48.105 -    {
  48.106 -        /*
  48.107 -         * Grows-up limit check.
  48.108 -         * NB. limit == 0xFFFFF provides 4GB access (if G=1).
  48.109 -         *     limit == 0x00000 provides 4kB access (if G=1).
  48.110 -         */
  48.111 -        if ( ((base + limit) <= base) || 
  48.112 -             ((base + limit) > PAGE_OFFSET) )
  48.113 -        {
  48.114 -            limit = PAGE_OFFSET - base;
  48.115 -        truncate:
  48.116 -            if ( !(b & _SEGMENT_G) )
  48.117 -                goto bad; /* too dangerous; too hard to work out... */
  48.118 -            limit = (limit >> 12) - 1;
  48.119 -            d[0] &= ~0x0ffff; d[0] |= limit & 0x0ffff;
  48.120 -            d[1] &= ~0xf0000; d[1] |= limit & 0xf0000;
  48.121 -        }
  48.122 -    }
  48.123 -
  48.124 +    /* Reserved bits must be zero. */
  48.125 +    if ( (b & 0xe0) != 0 )
  48.126 +        goto bad;
  48.127 +        
  48.128   good:
  48.129      return 1;
  48.130   bad:
  48.131 @@ -340,159 +279,6 @@ int check_descriptor(unsigned long *d)
  48.132  }
  48.133  
  48.134  
  48.135 -void destroy_gdt(struct exec_domain *ed)
  48.136 -{
  48.137 -    int i;
  48.138 -    unsigned long pfn;
  48.139 -
  48.140 -    for ( i = 0; i < 16; i++ )
  48.141 -    {
  48.142 -        if ( (pfn = l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[i])) != 0 )
  48.143 -            put_page_and_type(&frame_table[pfn]);
  48.144 -        ed->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
  48.145 -    }
  48.146 -}
  48.147 -
  48.148 -
  48.149 -long set_gdt(struct exec_domain *ed, 
  48.150 -             unsigned long *frames,
  48.151 -             unsigned int entries)
  48.152 -{
  48.153 -    struct domain *d = ed->domain;
  48.154 -    /* NB. There are 512 8-byte entries per GDT page. */
  48.155 -    int i = 0, nr_pages = (entries + 511) / 512;
  48.156 -    struct desc_struct *vgdt;
  48.157 -    unsigned long pfn;
  48.158 -
  48.159 -    /* Check the first page in the new GDT. */
  48.160 -    if ( (pfn = frames[0]) >= max_page )
  48.161 -        goto fail;
  48.162 -
  48.163 -    /* The first page is special because Xen owns a range of entries in it. */
  48.164 -    if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
  48.165 -    {
  48.166 -        /* GDT checks failed: try zapping the Xen reserved entries. */
  48.167 -        if ( !get_page_and_type(&frame_table[pfn], d, PGT_writable_page) )
  48.168 -            goto fail;
  48.169 -        vgdt = map_domain_mem(pfn << PAGE_SHIFT);
  48.170 -        memset(vgdt + FIRST_RESERVED_GDT_ENTRY, 0,
  48.171 -               NR_RESERVED_GDT_ENTRIES*8);
  48.172 -        unmap_domain_mem(vgdt);
  48.173 -        put_page_and_type(&frame_table[pfn]);
  48.174 -
  48.175 -        /* Okay, we zapped the entries. Now try the GDT checks again. */
  48.176 -        if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
  48.177 -            goto fail;
  48.178 -    }
  48.179 -
  48.180 -    /* Check the remaining pages in the new GDT. */
  48.181 -    for ( i = 1; i < nr_pages; i++ )
  48.182 -        if ( ((pfn = frames[i]) >= max_page) ||
  48.183 -             !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
  48.184 -            goto fail;
  48.185 -
  48.186 -    /* Copy reserved GDT entries to the new GDT. */
  48.187 -    vgdt = map_domain_mem(frames[0] << PAGE_SHIFT);
  48.188 -    memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY, 
  48.189 -           gdt_table + FIRST_RESERVED_GDT_ENTRY, 
  48.190 -           NR_RESERVED_GDT_ENTRIES*8);
  48.191 -    unmap_domain_mem(vgdt);
  48.192 -
  48.193 -    /* Tear down the old GDT. */
  48.194 -    destroy_gdt(ed);
  48.195 -
  48.196 -    /* Install the new GDT. */
  48.197 -    for ( i = 0; i < nr_pages; i++ )
  48.198 -        ed->arch.perdomain_ptes[i] =
  48.199 -            mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  48.200 -
  48.201 -    SET_GDT_ADDRESS(ed, GDT_VIRT_START(ed));
  48.202 -    SET_GDT_ENTRIES(ed, entries);
  48.203 -
  48.204 -    return 0;
  48.205 -
  48.206 - fail:
  48.207 -    while ( i-- > 0 )
  48.208 -        put_page_and_type(&frame_table[frames[i]]);
  48.209 -    return -EINVAL;
  48.210 -}
  48.211 -
  48.212 -
  48.213 -long do_set_gdt(unsigned long *frame_list, unsigned int entries)
  48.214 -{
  48.215 -    int nr_pages = (entries + 511) / 512;
  48.216 -    unsigned long frames[16];
  48.217 -    long ret;
  48.218 -
  48.219 -    if ( (entries <= LAST_RESERVED_GDT_ENTRY) || (entries > 8192) ) 
  48.220 -        return -EINVAL;
  48.221 -    
  48.222 -    if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
  48.223 -        return -EFAULT;
  48.224 -
  48.225 -    if ( (ret = set_gdt(current, frames, entries)) == 0 )
  48.226 -    {
  48.227 -        local_flush_tlb();
  48.228 -        __asm__ __volatile__ ("lgdt %0" : "=m" (*current->arch.gdt));
  48.229 -    }
  48.230 -
  48.231 -    return ret;
  48.232 -}
  48.233 -
  48.234 -
  48.235 -long do_update_descriptor(
  48.236 -    unsigned long pa, unsigned long word1, unsigned long word2)
  48.237 -{
  48.238 -    unsigned long *gdt_pent, pfn = pa >> PAGE_SHIFT, d[2];
  48.239 -    struct pfn_info *page;
  48.240 -    long ret = -EINVAL;
  48.241 -
  48.242 -    d[0] = word1;
  48.243 -    d[1] = word2;
  48.244 -
  48.245 -    if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(d) )
  48.246 -        return -EINVAL;
  48.247 -
  48.248 -    page = &frame_table[pfn];
  48.249 -    if ( unlikely(!get_page(page, current->domain)) )
  48.250 -        return -EINVAL;
  48.251 -
  48.252 -    /* Check if the given frame is in use in an unsafe context. */
  48.253 -    switch ( page->u.inuse.type_info & PGT_type_mask )
  48.254 -    {
  48.255 -    case PGT_gdt_page:
  48.256 -        /* Disallow updates of Xen-reserved descriptors in the current GDT. */
  48.257 -        if ( (l1_pgentry_to_pagenr(current->arch.perdomain_ptes[0]) == pfn) &&
  48.258 -             (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) &&
  48.259 -             (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) )
  48.260 -            goto out;
  48.261 -        if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
  48.262 -            goto out;
  48.263 -        break;
  48.264 -    case PGT_ldt_page:
  48.265 -        if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
  48.266 -            goto out;
  48.267 -        break;
  48.268 -    default:
  48.269 -        if ( unlikely(!get_page_type(page, PGT_writable_page)) )
  48.270 -            goto out;
  48.271 -        break;
  48.272 -    }
  48.273 -
  48.274 -    /* All is good so make the update. */
  48.275 -    gdt_pent = map_domain_mem(pa);
  48.276 -    memcpy(gdt_pent, d, 8);
  48.277 -    unmap_domain_mem(gdt_pent);
  48.278 -
  48.279 -    put_page_type(page);
  48.280 -
  48.281 -    ret = 0; /* success */
  48.282 -
  48.283 - out:
  48.284 -    put_page(page);
  48.285 -    return ret;
  48.286 -}
  48.287 -
  48.288  #ifdef MEMORY_GUARD
  48.289  
  48.290  #define ALLOC_PT(_level) \
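
The check_descriptor() rewrite above reduces to four rules: accept any code or data segment outright, accept the harmless type-0 second half of a call gate, accept 32-bit call gates whose reserved bits are clear (and whose target selector passes VALID_CODESEL), and reject everything else. The stand-alone sketch below restates those rules with explicit bit masks; the mask values and the descriptor_ok() helper are illustrative stand-ins for the real _SEGMENT_* constants, and the target-selector check is omitted.

    #include <stdio.h>

    /* Illustrative masks for the high word (b) of an x86 descriptor. */
    #define SEG_S      (1u << 12)    /* 0 = system/gate, 1 = code or data segment */
    #define SEG_TYPE   (0xfu << 8)   /* 4-bit descriptor type                     */
    #define GATE_RSVD  0xe0u         /* reserved bits in a call gate's low byte   */

    static int descriptor_ok(unsigned int a, unsigned int b)
    {
        (void)a;                      /* base/limit are no longer inspected   */
        if (b & SEG_S)
            return 1;                 /* any code or data segment is fine     */
        if ((b & SEG_TYPE) == 0x000)
            return 1;                 /* 2nd half of a call gate: harmless    */
        if ((b & SEG_TYPE) != 0xc00)
            return 0;                 /* only 32-bit call gates survive       */
        if (b & GATE_RSVD)
            return 0;                 /* reserved bits must be zero           */
        return 1;                     /* (target selector check omitted)      */
    }

    int main(void)
    {
        printf("flat data segment: %d\n", descriptor_ok(0x0000ffff, 0x00cf9200));
        printf("call gate:         %d\n", descriptor_ok(0x08190000, 0x0000ec00));
        printf("task gate:         %d\n", descriptor_ok(0x00000000, 0x00008500));
        return 0;
    }
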
    49.1 --- a/xen/arch/x86/x86_64/traps.c	Mon Feb 07 08:19:24 2005 +0000
    49.2 +++ b/xen/arch/x86/x86_64/traps.c	Tue Feb 08 16:44:16 2005 +0000
    49.3 @@ -129,10 +129,7 @@ void show_page_walk(unsigned long addr)
    49.4      printk("    L1 = %p\n", page);
    49.5  }
    49.6  
    49.7 -#define DOUBLEFAULT_STACK_SIZE 1024
    49.8 -static unsigned char doublefault_stack[DOUBLEFAULT_STACK_SIZE];
    49.9  asmlinkage void double_fault(void);
   49.10 -
   49.11  asmlinkage void do_double_fault(struct xen_regs *regs)
   49.12  {
   49.13      /* Disable the NMI watchdog. It's useless now. */
   49.14 @@ -142,19 +139,9 @@ asmlinkage void do_double_fault(struct x
   49.15  
   49.16      /* Find information saved during fault and dump it to the console. */
   49.17      printk("************************************\n");
   49.18 -    printk("EIP:    %04lx:[<%p>]      \nEFLAGS: %p\n",
   49.19 -           0xffff & regs->cs, regs->rip, regs->eflags);
   49.20 -    printk("rax: %p   rbx: %p   rcx: %p   rdx: %p\n",
   49.21 -           regs->rax, regs->rbx, regs->rcx, regs->rdx);
   49.22 -    printk("rsi: %p   rdi: %p   rbp: %p   rsp: %p\n",
   49.23 -           regs->rsi, regs->rdi, regs->rbp, regs->rsp);
   49.24 -    printk("r8:  %p   r9:  %p   r10: %p   r11: %p\n",
   49.25 -           regs->r8,  regs->r9,  regs->r10, regs->r11);
   49.26 -    printk("r12: %p   r13: %p   r14: %p   r15: %p\n",
   49.27 -           regs->r12, regs->r13, regs->r14, regs->r15);
   49.28 +    show_registers(regs);
   49.29      printk("************************************\n");
   49.30 -    printk("CPU%d DOUBLE FAULT -- system shutdown\n",
   49.31 -           logical_smp_processor_id());
   49.32 +    printk("CPU%d DOUBLE FAULT -- system shutdown\n", smp_processor_id());
   49.33      printk("System needs manual reset.\n");
   49.34      printk("************************************\n");
   49.35  
   49.36 @@ -166,25 +153,29 @@ asmlinkage void do_double_fault(struct x
   49.37          __asm__ __volatile__ ( "hlt" );
   49.38  }
   49.39  
   49.40 -void __init doublefault_init(void)
   49.41 -{
   49.42 -    int i;
   49.43 -
   49.44 -    /* Initialise IST1 for each CPU. Note the handler is non-reentrant. */
   49.45 -    for ( i = 0; i < NR_CPUS; i++ )
   49.46 -        init_tss[i].ist[0] = (unsigned long)
   49.47 -            &doublefault_stack[DOUBLEFAULT_STACK_SIZE];
   49.48 -
   49.49 -    /* Set interrupt gate for double faults, specifying IST1. */
   49.50 -    set_intr_gate(TRAP_double_fault, &double_fault);
   49.51 -    idt_table[TRAP_double_fault].a |= 1UL << 32; /* IST1 */
   49.52 -}
   49.53 -
   49.54  asmlinkage void hypercall(void);
   49.55  void __init percpu_traps_init(void)
   49.56  {
   49.57      char *stack_top = (char *)get_stack_top();
   49.58      char *stack     = (char *)((unsigned long)stack_top & ~(STACK_SIZE - 1));
   49.59 +    int   cpu       = smp_processor_id();
   49.60 +
   49.61 +    /* Double-fault handler has its own per-CPU 1kB stack. */
   49.62 +    init_tss[cpu].ist[0] = (unsigned long)&stack[1024];
   49.63 +    set_intr_gate(TRAP_double_fault, &double_fault);
   49.64 +    idt_tables[cpu][TRAP_double_fault].a |= 1UL << 32; /* IST1 */
   49.65 +
   49.66 +    /* NMI handler has its own per-CPU 1kB stack. */
   49.67 +    init_tss[cpu].ist[1] = (unsigned long)&stack[2048];
   49.68 +    idt_tables[cpu][TRAP_nmi].a          |= 2UL << 32; /* IST2 */
   49.69 +
   49.70 +    /*
   49.71 +     * Trampoline for SYSCALL entry from long mode.
   49.72 +     */
   49.73 +
   49.74 +    /* Skip the NMI and DF stacks. */
   49.75 +    stack = &stack[2048];
   49.76 +    wrmsr(MSR_LSTAR, (unsigned long)stack, ((unsigned long)stack>>32)); 
   49.77  
   49.78      /* movq %rsp, saversp(%rip) */
   49.79      stack[0] = 0x48;
   49.80 @@ -202,9 +193,36 @@ void __init percpu_traps_init(void)
   49.81      stack[14] = 0xe9;
   49.82      *(u32 *)&stack[15] = (char *)hypercall - &stack[19];
   49.83  
   49.84 -    wrmsr(MSR_STAR,  0, (FLAT_RING3_CS64<<16) | __HYPERVISOR_CS); 
   49.85 -    wrmsr(MSR_LSTAR, (unsigned long)stack, ((unsigned long)stack>>32)); 
   49.86 -    wrmsr(MSR_SYSCALL_MASK, 0xFFFFFFFFU, 0U);
   49.87 +    /*
   49.88 +     * Trampoline for SYSCALL entry from compatibility mode.
   49.89 +     */
   49.90 +
   49.91 +    /* Skip the long-mode entry trampoline. */
   49.92 +    stack = &stack[19];
   49.93 +    wrmsr(MSR_CSTAR, (unsigned long)stack, ((unsigned long)stack>>32)); 
   49.94 +
   49.95 +    /* movq %rsp, saversp(%rip) */
   49.96 +    stack[0] = 0x48;
   49.97 +    stack[1] = 0x89;
   49.98 +    stack[2] = 0x25;
   49.99 +    *(u32 *)&stack[3] = (stack_top - &stack[7]) - 16;
  49.100 +
  49.101 +    /* leaq saversp(%rip), %rsp */
  49.102 +    stack[7] = 0x48;
  49.103 +    stack[8] = 0x8d;
  49.104 +    stack[9] = 0x25;
  49.105 +    *(u32 *)&stack[10] = (stack_top - &stack[14]) - 16;
  49.106 +
  49.107 +    /* jmp hypercall */
  49.108 +    stack[14] = 0xe9;
  49.109 +    *(u32 *)&stack[15] = (char *)hypercall - &stack[19];
  49.110 +
  49.111 +    /*
  49.112 +     * Common SYSCALL parameters.
  49.113 +     */
  49.114 +
  49.115 +    wrmsr(MSR_STAR, 0, (FLAT_RING3_CS32<<16) | __HYPERVISOR_CS);
  49.116 +    wrmsr(MSR_SYSCALL_MASK, ~EF_IE, 0U); /* disable interrupts */
  49.117  }
  49.118  
  49.119  void *decode_reg(struct xen_regs *regs, u8 b)
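
The percpu_traps_init() changes above carve two 1kB IST stacks and two SYSCALL trampolines out of each CPU's stack page and then program the entry MSRs. As a reminder of the encodings those wrmsr() calls rely on, the sketch below shows how a 64-bit trampoline address splits into the two 32-bit wrmsr operands and how STAR packs the SYSCALL and SYSRET selector bases (the hunk keeps __HYPERVISOR_CS for SYSCALL entry and switches the SYSRET base to FLAT_RING3_CS32). The helpers and the sample values are invented for illustration.

    #include <stdint.h>
    #include <stdio.h>

    /* wrmsr takes the value as two 32-bit halves, hence (addr, addr >> 32). */
    static void wrmsr_split(uint64_t value, uint32_t *lo, uint32_t *hi)
    {
        *lo = (uint32_t)value;
        *hi = (uint32_t)(value >> 32);
    }

    /* STAR layout: bits 47:32 = SYSCALL CS/SS base, bits 63:48 = SYSRET base. */
    static uint64_t star_value(uint16_t syscall_cs, uint16_t sysret_cs)
    {
        return ((uint64_t)syscall_cs << 32) | ((uint64_t)sysret_cs << 48);
    }

    int main(void)
    {
        uint32_t lo, hi;
        wrmsr_split(0xffff830000104000ULL, &lo, &hi);    /* made-up LSTAR target */
        printf("LSTAR lo=%08x hi=%08x\n", lo, hi);
        printf("STAR  = %016llx\n",
               (unsigned long long)star_value(0x0810, 0x0823)); /* made-up selectors */
        return 0;
    }
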
    50.1 --- a/xen/common/dom_mem_ops.c	Mon Feb 07 08:19:24 2005 +0000
    50.2 +++ b/xen/common/dom_mem_ops.c	Tue Feb 08 16:44:16 2005 +0000
    50.3 @@ -25,8 +25,8 @@
    50.4  
    50.5  #define PREEMPT_CHECK(_op)                          \
    50.6      if ( hypercall_preempt_check() )                \
    50.7 -        return hypercall_create_continuation(       \
    50.8 -            __HYPERVISOR_dom_mem_op, 5,             \
    50.9 +        return hypercall5_create_continuation(      \
   50.10 +            __HYPERVISOR_dom_mem_op,                \
   50.11              (_op) | (i << START_EXTENT_SHIFT),      \
   50.12              extent_list, nr_extents, extent_order,  \
   50.13              (d == current->domain) ? DOMID_SELF : d->id);
   50.14 @@ -122,7 +122,7 @@ free_dom_mem(struct domain *d,
   50.15  long
   50.16  do_dom_mem_op(unsigned long  op, 
   50.17                unsigned long *extent_list, 
   50.18 -              unsigned long  nr_extents,
   50.19 +              unsigned int   nr_extents,
   50.20                unsigned int   extent_order,
   50.21                domid_t        domid)
   50.22  {
   50.23 @@ -133,8 +133,7 @@ do_dom_mem_op(unsigned long  op,
   50.24      start_extent  = op >> START_EXTENT_SHIFT;
   50.25      op           &= (1 << START_EXTENT_SHIFT) - 1;
   50.26  
   50.27 -    if ( unlikely(start_extent > nr_extents) || 
   50.28 -         unlikely(nr_extents > ~0U) ) /* can pack into a uint? */
   50.29 +    if ( unlikely(start_extent > nr_extents) )
   50.30          return -EINVAL;
   50.31  
   50.32      if ( likely(domid == DOMID_SELF) )
   50.33 @@ -150,13 +149,11 @@ do_dom_mem_op(unsigned long  op,
   50.34      {
   50.35      case MEMOP_increase_reservation:
   50.36          rc = alloc_dom_mem(
   50.37 -            d, extent_list, start_extent, 
   50.38 -            (unsigned int)nr_extents, extent_order);
   50.39 +            d, extent_list, start_extent, nr_extents, extent_order);
   50.40          break;
   50.41      case MEMOP_decrease_reservation:
   50.42          rc = free_dom_mem(
   50.43 -            d, extent_list, start_extent, 
   50.44 -            (unsigned int)nr_extents, extent_order);
   50.45 +            d, extent_list, start_extent, nr_extents, extent_order);
   50.46          break;
   50.47      default:
   50.48          rc = -ENOSYS;
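
PREEMPT_CHECK above works because the restart index is folded into the high bits of the op argument, so the recreated hypercall re-enters do_dom_mem_op() with both the original sub-command and the extent at which to resume. The fragment below shows that packing in isolation; the value of START_EXTENT_SHIFT and the helper names are assumptions made for the example.

    #include <assert.h>

    #define START_EXTENT_SHIFT 4   /* assumed: the low bits carry the sub-command */

    static unsigned long pack_op(unsigned long op, unsigned long start_extent)
    {
        return op | (start_extent << START_EXTENT_SHIFT);
    }

    static void unpack_op(unsigned long packed,
                          unsigned long *op, unsigned long *start_extent)
    {
        *start_extent = packed >> START_EXTENT_SHIFT;
        *op           = packed & ((1UL << START_EXTENT_SHIFT) - 1);
    }

    int main(void)
    {
        unsigned long op, start;
        unpack_op(pack_op(1, 4096), &op, &start);   /* resume at extent 4096 */
        assert(op == 1 && start == 4096);
        return 0;
    }
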
    51.1 --- a/xen/common/domain.c	Mon Feb 07 08:19:24 2005 +0000
    51.2 +++ b/xen/common/domain.c	Tue Feb 08 16:44:16 2005 +0000
    51.3 @@ -45,8 +45,6 @@ struct domain *do_createdomain(domid_t d
    51.4      ed->processor  = cpu;
    51.5      d->create_time = NOW();
    51.6   
    51.7 -    memcpy(&ed->arch, &idle0_exec_domain.arch, sizeof(ed->arch));
    51.8 -
    51.9      spin_lock_init(&d->time_lock);
   51.10  
   51.11      spin_lock_init(&d->big_lock);
    52.1 --- a/xen/common/elf.c	Mon Feb 07 08:19:24 2005 +0000
    52.2 +++ b/xen/common/elf.c	Tue Feb 08 16:44:16 2005 +0000
    52.3 @@ -13,10 +13,8 @@
    52.4  
    52.5  #ifdef CONFIG_X86
    52.6  #define FORCE_XENELF_IMAGE 1
    52.7 -#define ELF_ADDR           p_vaddr
    52.8  #elif defined(__ia64__)
    52.9  #define FORCE_XENELF_IMAGE 0
   52.10 -#define ELF_ADDR           p_paddr
   52.11  #endif
   52.12  
   52.13  static inline int is_loadable_phdr(Elf_Phdr *phdr)
   52.14 @@ -100,10 +98,10 @@ int parseelfimage(char *elfbase,
   52.15          phdr = (Elf_Phdr *)(elfbase + ehdr->e_phoff + (h*ehdr->e_phentsize));
   52.16          if ( !is_loadable_phdr(phdr) )
   52.17              continue;
   52.18 -        if ( phdr->ELF_ADDR < kernstart )
   52.19 -            kernstart = phdr->ELF_ADDR;
   52.20 -        if ( (phdr->ELF_ADDR + phdr->p_memsz) > kernend )
   52.21 -            kernend = phdr->ELF_ADDR + phdr->p_memsz;
   52.22 +        if ( phdr->p_paddr < kernstart )
   52.23 +            kernstart = phdr->p_paddr;
   52.24 +        if ( (phdr->p_paddr + phdr->p_memsz) > kernend )
   52.25 +            kernend = phdr->p_paddr + phdr->p_memsz;
   52.26      }
   52.27  
   52.28      if ( (kernstart > kernend) || 
   52.29 @@ -144,10 +142,10 @@ int loadelfimage(char *elfbase)
   52.30          if ( !is_loadable_phdr(phdr) )
   52.31              continue;
   52.32          if ( phdr->p_filesz != 0 )
   52.33 -            memcpy((char *)phdr->ELF_ADDR, elfbase + phdr->p_offset, 
   52.34 +            memcpy((char *)phdr->p_paddr, elfbase + phdr->p_offset, 
   52.35                     phdr->p_filesz);
   52.36          if ( phdr->p_memsz > phdr->p_filesz )
   52.37 -            memset((char *)phdr->ELF_ADDR + phdr->p_filesz, 0, 
   52.38 +            memset((char *)phdr->p_paddr + phdr->p_filesz, 0, 
   52.39                     phdr->p_memsz - phdr->p_filesz);
   52.40      }
   52.41  
    53.1 --- a/xen/common/keyhandler.c	Mon Feb 07 08:19:24 2005 +0000
    53.2 +++ b/xen/common/keyhandler.c	Tue Feb 08 16:44:16 2005 +0000
    53.3 @@ -27,7 +27,7 @@ static struct {
    53.4  
    53.5  static unsigned char keypress_key;
    53.6  
    53.7 -void keypress_softirq(void)
    53.8 +static void keypress_softirq(void)
    53.9  {
   53.10      keyhandler_t *h;
   53.11      unsigned char key = keypress_key;
   53.12 @@ -94,7 +94,7 @@ static void halt_machine(unsigned char k
   53.13      machine_restart(NULL); 
   53.14  }
   53.15  
   53.16 -void do_task_queues(unsigned char key)
   53.17 +static void do_task_queues(unsigned char key)
   53.18  {
   53.19      struct domain *d;
   53.20      struct exec_domain *ed;
    54.1 --- a/xen/common/multicall.c	Mon Feb 07 08:19:24 2005 +0000
    54.2 +++ b/xen/common/multicall.c	Tue Feb 08 16:44:16 2005 +0000
    54.3 @@ -67,8 +67,8 @@ long do_multicall(multicall_entry_t *cal
    54.4              if ( i < nr_calls )
    54.5              {
    54.6                  mcs->flags = 0;
    54.7 -                return hypercall_create_continuation(
    54.8 -                    __HYPERVISOR_multicall, 2, &call_list[i], nr_calls-i);
    54.9 +                return hypercall2_create_continuation(
   54.10 +                    __HYPERVISOR_multicall, &call_list[i], nr_calls-i);
   54.11              }
   54.12          }
   54.13      }
    55.1 --- a/xen/common/physdev.c	Mon Feb 07 08:19:24 2005 +0000
    55.2 +++ b/xen/common/physdev.c	Tue Feb 08 16:44:16 2005 +0000
    55.3 @@ -720,7 +720,7 @@ string_param("physdev_dom0_hide", opt_ph
    55.4  
    55.5  /* Test if boot params specify this device should NOT be visible to DOM0
    55.6   * (e.g. so that another domain can control it instead) */
    55.7 -int pcidev_dom0_hidden(struct pci_dev *dev)
    55.8 +static int pcidev_dom0_hidden(struct pci_dev *dev)
    55.9  {
   55.10      char cmp[10] = "(.......)";
   55.11      
    56.1 --- a/xen/common/resource.c	Mon Feb 07 08:19:24 2005 +0000
    56.2 +++ b/xen/common/resource.c	Tue Feb 08 16:44:16 2005 +0000
    56.3 @@ -254,19 +254,6 @@ struct resource * __request_region(struc
    56.4  	return res;
    56.5  }
    56.6  
    56.7 -int __check_region(struct resource *parent, unsigned long start, unsigned long n)
    56.8 -{
    56.9 -	struct resource * res;
   56.10 -
   56.11 -	res = __request_region(parent, start, n, "check-region");
   56.12 -	if (!res)
   56.13 -		return -EBUSY;
   56.14 -
   56.15 -	release_resource(res);
   56.16 -	xfree(res);
   56.17 -	return 0;
   56.18 -}
   56.19 -
   56.20  void __release_region(struct resource *parent, unsigned long start, unsigned long n)
   56.21  {
   56.22  	struct resource **p;
    57.1 --- a/xen/common/sched_bvt.c	Mon Feb 07 08:19:24 2005 +0000
    57.2 +++ b/xen/common/sched_bvt.c	Tue Feb 08 16:44:16 2005 +0000
    57.3 @@ -167,7 +167,7 @@ static inline u32 calc_evt(struct exec_d
    57.4   *
    57.5   * Returns non-zero on failure.
    57.6   */
    57.7 -int bvt_alloc_task(struct exec_domain *ed)
    57.8 +static int bvt_alloc_task(struct exec_domain *ed)
    57.9  {
   57.10      struct domain *d = ed->domain;
   57.11      if ( (d->sched_priv == NULL) ) {
   57.12 @@ -184,7 +184,7 @@ int bvt_alloc_task(struct exec_domain *e
   57.13  /*
   57.14   * Add and remove a domain
   57.15   */
   57.16 -void bvt_add_task(struct exec_domain *d) 
   57.17 +static void bvt_add_task(struct exec_domain *d) 
   57.18  {
   57.19      struct bvt_dom_info *inf = BVT_INFO(d->domain);
   57.20      struct bvt_edom_info *einf = EBVT_INFO(d);
   57.21 @@ -225,7 +225,7 @@ void bvt_add_task(struct exec_domain *d)
   57.22      }
   57.23  }
   57.24  
   57.25 -int bvt_init_idle_task(struct exec_domain *p)
   57.26 +static int bvt_init_idle_task(struct exec_domain *p)
   57.27  {
   57.28      if ( bvt_alloc_task(p) < 0 )
   57.29          return -1;
   57.30 @@ -239,7 +239,7 @@ int bvt_init_idle_task(struct exec_domai
   57.31      return 0;
   57.32  }
   57.33  
   57.34 -void bvt_wake(struct exec_domain *d)
   57.35 +static void bvt_wake(struct exec_domain *d)
   57.36  {
   57.37      struct bvt_edom_info *einf = EBVT_INFO(d);
   57.38      struct exec_domain  *curr;
   57.39 @@ -290,14 +290,14 @@ static void bvt_sleep(struct exec_domain
   57.40   * bvt_free_task - free BVT private structures for a task
   57.41   * @d:             task
   57.42   */
   57.43 -void bvt_free_task(struct domain *d)
   57.44 +static void bvt_free_task(struct domain *d)
   57.45  {
   57.46      ASSERT(d->sched_priv != NULL);
   57.47      xfree(d->sched_priv);
   57.48  }
   57.49  
   57.50  /* Control the scheduler. */
   57.51 -int bvt_ctl(struct sched_ctl_cmd *cmd)
   57.52 +static int bvt_ctl(struct sched_ctl_cmd *cmd)
   57.53  {
   57.54      struct bvt_ctl *params = &cmd->u.bvt;
   57.55  
   57.56 @@ -310,7 +310,7 @@ int bvt_ctl(struct sched_ctl_cmd *cmd)
   57.57  }
   57.58  
   57.59  /* Adjust scheduling parameter for a given domain. */
   57.60 -int bvt_adjdom(
   57.61 +static int bvt_adjdom(
   57.62      struct domain *d, struct sched_adjdom_cmd *cmd)
   57.63  {
   57.64      struct bvt_adjdom *params = &cmd->u.bvt;
   57.65 @@ -549,7 +549,7 @@ static void bvt_dump_cpu_state(int i)
   57.66  }
   57.67  
   57.68  /* Initialise the data structures. */
   57.69 -int bvt_init_scheduler()
   57.70 +static int bvt_init_scheduler(void)
   57.71  {
   57.72      int i;
   57.73  
    58.1 --- a/xen/drivers/pci/Makefile	Mon Feb 07 08:19:24 2005 +0000
    58.2 +++ b/xen/drivers/pci/Makefile	Tue Feb 08 16:44:16 2005 +0000
    58.3 @@ -4,7 +4,7 @@
    58.4  
    58.5  include $(BASEDIR)/Rules.mk
    58.6  
    58.7 -OBJS := pci.o quirks.o compat.o names.o setup-res.o
    58.8 +OBJS := pci.o quirks.o names.o setup-res.o
    58.9  
   58.10  #obj-$(CONFIG_PCI) += pci.o quirks.o compat.o names.o
   58.11  #obj-$(CONFIG_PROC_FS) += proc.o
    59.1 --- a/xen/drivers/pci/compat.c	Mon Feb 07 08:19:24 2005 +0000
    59.2 +++ b/xen/drivers/pci/compat.c	Tue Feb 08 16:44:16 2005 +0000
    59.3 @@ -1,65 +0,0 @@
    59.4 -/*
    59.5 - *	$Id: compat.c,v 1.1 1998/02/16 10:35:50 mj Exp $
    59.6 - *
    59.7 - *	PCI Bus Services -- Function For Backward Compatibility
    59.8 - *
    59.9 - *	Copyright 1998--2000 Martin Mares <mj@ucw.cz>
   59.10 - */
   59.11 -
   59.12 -#include <xen/types.h>
   59.13 -//#include <xen/kernel.h>
   59.14 -#include <xen/pci.h>
   59.15 -
   59.16 -int
   59.17 -pcibios_present(void)
   59.18 -{
   59.19 -	return !list_empty(&pci_devices);
   59.20 -}
   59.21 -
   59.22 -int
   59.23 -pcibios_find_class(unsigned int class, unsigned short index, unsigned char *bus, unsigned char *devfn)
   59.24 -{
   59.25 -	const struct pci_dev *dev = NULL;
   59.26 -	int cnt = 0;
   59.27 -
   59.28 -	while ((dev = pci_find_class(class, dev)))
   59.29 -		if (index == cnt++) {
   59.30 -			*bus = dev->bus->number;
   59.31 -			*devfn = dev->devfn;
   59.32 -			return PCIBIOS_SUCCESSFUL;
   59.33 -		}
   59.34 -	return PCIBIOS_DEVICE_NOT_FOUND;
   59.35 -}
   59.36 -
   59.37 -
   59.38 -int
   59.39 -pcibios_find_device(unsigned short vendor, unsigned short device, unsigned short index,
   59.40 -		    unsigned char *bus, unsigned char *devfn)
   59.41 -{
   59.42 -	const struct pci_dev *dev = NULL;
   59.43 -	int cnt = 0;
   59.44 -
   59.45 -	while ((dev = pci_find_device(vendor, device, dev)))
   59.46 -		if (index == cnt++) {
   59.47 -			*bus = dev->bus->number;
   59.48 -			*devfn = dev->devfn;
   59.49 -			return PCIBIOS_SUCCESSFUL;
   59.50 -		}
   59.51 -	return PCIBIOS_DEVICE_NOT_FOUND;
   59.52 -}
   59.53 -
   59.54 -#define PCI_OP(rw,size,type)							\
   59.55 -int pcibios_##rw##_config_##size (unsigned char bus, unsigned char dev_fn,	\
   59.56 -				  unsigned char where, unsigned type val)	\
   59.57 -{										\
   59.58 -	struct pci_dev *dev = pci_find_slot(bus, dev_fn);			\
   59.59 -	if (!dev) return PCIBIOS_DEVICE_NOT_FOUND;				\
   59.60 -	return pci_##rw##_config_##size(dev, where, val);			\
   59.61 -}
   59.62 -
   59.63 -PCI_OP(read, byte, char *)
   59.64 -PCI_OP(read, word, short *)
   59.65 -PCI_OP(read, dword, int *)
   59.66 -PCI_OP(write, byte, char)
   59.67 -PCI_OP(write, word, short)
   59.68 -PCI_OP(write, dword, int)
    60.1 --- a/xen/include/asm-x86/config.h	Mon Feb 07 08:19:24 2005 +0000
    60.2 +++ b/xen/include/asm-x86/config.h	Tue Feb 08 16:44:16 2005 +0000
    60.3 @@ -191,6 +191,10 @@ extern void __out_of_line_bug(int line) 
    60.4  #define __HYPERVISOR_DS32 0x0818
    60.5  #define __HYPERVISOR_DS   __HYPERVISOR_DS64
    60.6  
    60.7 +#define __GUEST_CS        0x0833
    60.8 +#define __GUEST_DS        0x0000
    60.9 +#define __GUEST_SS        0x082b
   60.10 +
   60.11  /* For generic assembly code: use macros to define operation/operand sizes. */
   60.12  #define __OS "q"  /* Operation Suffix */
   60.13  #define __OP "r"  /* Operand Prefix */
    61.1 --- a/xen/include/asm-x86/domain.h	Mon Feb 07 08:19:24 2005 +0000
    61.2 +++ b/xen/include/asm-x86/domain.h	Tue Feb 08 16:44:16 2005 +0000
    61.3 @@ -96,6 +96,7 @@ struct arch_exec_domain
    61.4      pagetable_t  pagetable;
    61.5  
    61.6      pagetable_t  monitor_table;
    61.7 +    pagetable_t  phys_table;            /* 1:1 pagetable */
    61.8      pagetable_t  shadow_table;
    61.9      l2_pgentry_t *vpagetable;	        /* virtual address of pagetable */
   61.10      l2_pgentry_t *shadow_vtable;	/* virtual address of shadow_table */
    62.1 --- a/xen/include/asm-x86/mm.h	Mon Feb 07 08:19:24 2005 +0000
    62.2 +++ b/xen/include/asm-x86/mm.h	Tue Feb 08 16:44:16 2005 +0000
    62.3 @@ -13,6 +13,7 @@
    62.4  #include <asm/desc.h>
    62.5  #include <asm/flushtlb.h>
    62.6  #include <asm/io.h>
    62.7 +#include <asm/uaccess.h>
    62.8  
    62.9  #include <public/xen.h>
   62.10  
   62.11 @@ -218,7 +219,7 @@ static inline int get_page_and_type(stru
   62.12      ASSERT(((_p)->count_info & PGC_count_mask) != 0);          \
   62.13      ASSERT(page_get_owner(_p) == (_d))
   62.14  
   62.15 -int check_descriptor(unsigned long *d);
   62.16 +int check_descriptor(struct desc_struct *d);
   62.17  
   62.18  /*
   62.19   * Use currently-executing domain's pagetables on the specified CPUs.
   62.20 @@ -241,8 +242,20 @@ void synchronise_pagetables(unsigned lon
   62.21  #undef  phys_to_machine_mapping
   62.22  
   62.23  #define machine_to_phys_mapping ((unsigned long *)RDWR_MPT_VIRT_START)
   62.24 -#define phys_to_machine_mapping ((unsigned long *)PERDOMAIN_VIRT_START)
   62.25 +#define __phys_to_machine_mapping ((unsigned long *)PERDOMAIN_VIRT_START)
    62.26 +/* Returns the machine frame number (mfn) for the given guest pfn. */
   62.27 +static inline unsigned long phys_to_machine_mapping(unsigned long pfn) 
   62.28 +{
   62.29 +    unsigned long mfn;
   62.30 +    l1_pgentry_t pte;
   62.31  
   62.32 +   if (__get_user(l1_pgentry_val(pte), (__phys_to_machine_mapping + pfn))) {
   62.33 +       return 0;
   62.34 +   }
   62.35 +               
   62.36 +   mfn = l1_pgentry_to_phys(pte) >> PAGE_SHIFT;
   62.37 +   return mfn; 
   62.38 +}
   62.39  #define set_machinetophys(_mfn, _pfn) machine_to_phys_mapping[(_mfn)] = (_pfn)
   62.40  
   62.41  #define DEFAULT_GDT_ENTRIES     (LAST_RESERVED_GDT_ENTRY+1)
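
The new inline above turns the phys-to-machine table into a guarded lookup: the slot is read with __get_user() so an unmapped entry returns 0 instead of faulting, and the mfn is recovered by stripping the flag bits from the pte. The snippet below mirrors just that extraction step; the PAGE_SHIFT/PAGE_MASK values match x86 but the sample pte is made up.

    #include <assert.h>

    #define PAGE_SHIFT 12
    #define PAGE_MASK  (~((1UL << PAGE_SHIFT) - 1))

    /* A pte carries the frame address in its upper bits, flags in the low 12. */
    static unsigned long pte_to_mfn(unsigned long pte)
    {
        return (pte & PAGE_MASK) >> PAGE_SHIFT;
    }

    int main(void)
    {
        /* Frame 0x12345 with PRESENT/RW/ACCESSED-style low bits set. */
        unsigned long pte = (0x12345UL << PAGE_SHIFT) | 0x63;
        assert(pte_to_mfn(pte) == 0x12345);
        return 0;
    }
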
    63.1 --- a/xen/include/asm-x86/multicall.h	Mon Feb 07 08:19:24 2005 +0000
    63.2 +++ b/xen/include/asm-x86/multicall.h	Tue Feb 08 16:44:16 2005 +0000
    63.3 @@ -9,7 +9,23 @@
    63.4  
    63.5  #ifdef __x86_64__
    63.6  
    63.7 -#define do_multicall_call(_call) BUG()
    63.8 +#define do_multicall_call(_call)                         \
    63.9 +    do {                                                 \
   63.10 +        __asm__ __volatile__ (                           \
   63.11 +            "movq  "STR(MULTICALL_op)"(%0),%%rax; "      \
   63.12 +            "andq  $("STR(NR_hypercalls)"-1),%%rax; "    \
   63.13 +            "leaq  "STR(hypercall_table)"(%%rip),%%rdi; "\
   63.14 +            "leaq  (%%rdi,%%rax,8),%%rax; "              \
   63.15 +            "movq  "STR(MULTICALL_arg0)"(%0),%%rdi; "    \
   63.16 +            "movq  "STR(MULTICALL_arg1)"(%0),%%rsi; "    \
   63.17 +            "movq  "STR(MULTICALL_arg2)"(%0),%%rdx; "    \
   63.18 +            "movq  "STR(MULTICALL_arg3)"(%0),%%rcx; "    \
   63.19 +            "movq  "STR(MULTICALL_arg4)"(%0),%%r8; "     \
   63.20 +            "callq *(%%rax); "                           \
   63.21 +            "movq  %%rax,"STR(MULTICALL_result)"(%0); "  \
   63.22 +            : : "b" (_call)                              \
   63.23 +            : "rax", "rdi", "rsi", "rdx", "rcx", "r8" ); \
   63.24 +    } while ( 0 )
   63.25  
   63.26  #else
   63.27  
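
The x86_64 do_multicall_call() above is the assembly form of "mask the op, index hypercall_table, pass five arguments, store the result"; it is written in assembly so the guest-supplied arguments are loaded straight into the x86_64 argument registers. A plain-C rendering of the same dispatch is sketched below; the entry layout and table size are assumptions for illustration only.

    typedef unsigned long (*hypercall_fn_t)(unsigned long, unsigned long,
                                            unsigned long, unsigned long,
                                            unsigned long);

    #define NR_HYPERCALLS 32               /* assumed power-of-two table size */

    struct multicall_entry {
        unsigned long op;
        unsigned long args[5];
        unsigned long result;
    };

    /* C equivalent of the assembly dispatch: mask, index, call, store. */
    void multicall_dispatch(hypercall_fn_t table[NR_HYPERCALLS],
                            struct multicall_entry *call)
    {
        hypercall_fn_t fn = table[call->op & (NR_HYPERCALLS - 1)];
        call->result = fn(call->args[0], call->args[1], call->args[2],
                          call->args[3], call->args[4]);
    }
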
    64.1 --- a/xen/include/asm-x86/page.h	Mon Feb 07 08:19:24 2005 +0000
    64.2 +++ b/xen/include/asm-x86/page.h	Tue Feb 08 16:44:16 2005 +0000
    64.3 @@ -1,39 +1,14 @@
    64.4 -/******************************************************************************
    64.5 - * asm-x86/page.h
    64.6 - * 
    64.7 - * Definitions relating to page tables.
    64.8 - */
    64.9 +/* -*-  Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
   64.10  
   64.11  #ifndef __X86_PAGE_H__
   64.12  #define __X86_PAGE_H__
   64.13  
   64.14 -#if defined(__x86_64__)
   64.15 -
   64.16 -#define L1_PAGETABLE_SHIFT       12
   64.17 -#define L2_PAGETABLE_SHIFT       21
   64.18 -#define L3_PAGETABLE_SHIFT       30
   64.19 -#define L4_PAGETABLE_SHIFT       39
   64.20 -
   64.21 -#define ENTRIES_PER_L1_PAGETABLE 512
   64.22 -#define ENTRIES_PER_L2_PAGETABLE 512
   64.23 -#define ENTRIES_PER_L3_PAGETABLE 512
   64.24 -#define ENTRIES_PER_L4_PAGETABLE 512
   64.25 -
   64.26 -#define __PAGE_OFFSET		(0xFFFF830000000000)
   64.27 -
   64.28 -#elif defined(__i386__)
   64.29 -
   64.30 -#define L1_PAGETABLE_SHIFT       12
   64.31 -#define L2_PAGETABLE_SHIFT       22
   64.32 -
   64.33 -#define ENTRIES_PER_L1_PAGETABLE 1024
   64.34 -#define ENTRIES_PER_L2_PAGETABLE 1024
   64.35 -
   64.36 -#define __PAGE_OFFSET		(0xFC400000)
   64.37 -
   64.38 +#if defined(__i386__)
   64.39 +#include <asm/x86_32/page.h>
   64.40 +#elif defined(__x86_64__)
   64.41 +#include <asm/x86_64/page.h>
   64.42  #endif
   64.43  
   64.44 -#define PAGE_SHIFT               L1_PAGETABLE_SHIFT
   64.45  #ifndef __ASSEMBLY__
   64.46  #define PAGE_SIZE	         (1UL << PAGE_SHIFT)
   64.47  #else
   64.48 @@ -44,77 +19,9 @@
   64.49  #define clear_page(_p)           memset((void *)(_p), 0, PAGE_SIZE)
   64.50  #define copy_page(_t,_f)         memcpy((void *)(_t), (void *)(_f), PAGE_SIZE)
   64.51  
   64.52 -#ifndef __ASSEMBLY__
   64.53 -#include <xen/config.h>
   64.54 -typedef struct { unsigned long l1_lo; } l1_pgentry_t;
   64.55 -typedef struct { unsigned long l2_lo; } l2_pgentry_t;
   64.56 -typedef struct { unsigned long l3_lo; } l3_pgentry_t;
   64.57 -typedef struct { unsigned long l4_lo; } l4_pgentry_t;
   64.58 -#endif /* !__ASSEMBLY__ */
   64.59 -
   64.60 -/* Strip type from a table entry. */
   64.61 -#define l1_pgentry_val(_x) ((_x).l1_lo)
   64.62 -#define l2_pgentry_val(_x) ((_x).l2_lo)
   64.63 -#define l3_pgentry_val(_x) ((_x).l3_lo)
   64.64 -#define l4_pgentry_val(_x) ((_x).l4_lo)
   64.65 -
   64.66 -/* Add type to a table entry. */
   64.67 -#define mk_l1_pgentry(_x)  ( (l1_pgentry_t) { (_x) } )
   64.68 -#define mk_l2_pgentry(_x)  ( (l2_pgentry_t) { (_x) } )
   64.69 -#define mk_l3_pgentry(_x)  ( (l3_pgentry_t) { (_x) } )
   64.70 -#define mk_l4_pgentry(_x)  ( (l4_pgentry_t) { (_x) } )
   64.71 -
   64.72 -/* Turn a typed table entry into a page index. */
   64.73 -#define l1_pgentry_to_pagenr(_x) (l1_pgentry_val(_x) >> PAGE_SHIFT) 
   64.74 -#define l2_pgentry_to_pagenr(_x) (l2_pgentry_val(_x) >> PAGE_SHIFT)
   64.75 -#define l3_pgentry_to_pagenr(_x) (l3_pgentry_val(_x) >> PAGE_SHIFT)
   64.76 -#define l4_pgentry_to_pagenr(_x) (l4_pgentry_val(_x) >> PAGE_SHIFT)
   64.77 -
   64.78 -/* Turn a typed table entry into a physical address. */
   64.79 -#define l1_pgentry_to_phys(_x) (l1_pgentry_val(_x) & PAGE_MASK)
   64.80 -#define l2_pgentry_to_phys(_x) (l2_pgentry_val(_x) & PAGE_MASK)
   64.81 -#define l3_pgentry_to_phys(_x) (l3_pgentry_val(_x) & PAGE_MASK)
   64.82 -#define l4_pgentry_to_phys(_x) (l4_pgentry_val(_x) & PAGE_MASK)
   64.83 -
   64.84 -/* Pagetable walking. */
   64.85 -#define l2_pgentry_to_l1(_x) \
   64.86 -  ((l1_pgentry_t *)__va(l2_pgentry_val(_x) & PAGE_MASK))
   64.87 -#define l3_pgentry_to_l2(_x) \
   64.88 -  ((l2_pgentry_t *)__va(l3_pgentry_val(_x) & PAGE_MASK))
   64.89 -#define l4_pgentry_to_l3(_x) \
   64.90 -  ((l3_pgentry_t *)__va(l4_pgentry_val(_x) & PAGE_MASK))
   64.91 -
   64.92 -/* Given a virtual address, get an entry offset into a page table. */
   64.93 -#define l1_table_offset(_a) \
   64.94 -  (((_a) >> L1_PAGETABLE_SHIFT) & (ENTRIES_PER_L1_PAGETABLE - 1))
   64.95 -#if defined(__i386__)
   64.96 -#define l2_table_offset(_a) \
   64.97 -  ((_a) >> L2_PAGETABLE_SHIFT)
   64.98 -#elif defined(__x86_64__)
   64.99 -#define l2_table_offset(_a) \
  64.100 -  (((_a) >> L2_PAGETABLE_SHIFT) & (ENTRIES_PER_L2_PAGETABLE - 1))
  64.101 -#define l3_table_offset(_a) \
  64.102 -  (((_a) >> L3_PAGETABLE_SHIFT) & (ENTRIES_PER_L3_PAGETABLE - 1))
  64.103 -#define l4_table_offset(_a) \
  64.104 -  (((_a) >> L4_PAGETABLE_SHIFT) & (ENTRIES_PER_L4_PAGETABLE - 1))
  64.105 -#endif
  64.106 -
  64.107 -#if defined(__i386__)
  64.108 -#define pagetable_t l2_pgentry_t
  64.109 -#define pagetable_val(_x)  ((_x).l2_lo)
  64.110 -#define mk_pagetable(_x)   ( (l2_pgentry_t) { (_x) } )
  64.111 -#define ENTRIES_PER_PAGETABLE ENTRIES_PER_L2_PAGETABLE
  64.112 -#elif defined(__x86_64__)
  64.113 -#define pagetable_t l4_pgentry_t
  64.114 -#define pagetable_val(_x)  ((_x).l4_lo)
  64.115 -#define mk_pagetable(_x)   ( (l4_pgentry_t) { (_x) } )
  64.116 -#define ENTRIES_PER_PAGETABLE ENTRIES_PER_L4_PAGETABLE
  64.117 -#endif
  64.118 -
  64.119  #define PAGE_OFFSET		((unsigned long)__PAGE_OFFSET)
  64.120  #define __pa(x)			((unsigned long)(x)-PAGE_OFFSET)
  64.121  #define __va(x)			((void *)((unsigned long)(x)+PAGE_OFFSET))
  64.122 -#define page_address(_p)        (__va(((_p) - frame_table) << PAGE_SHIFT))
  64.123  #define pfn_to_page(_pfn)       (frame_table + (_pfn))
  64.124  #define phys_to_page(kaddr)     (frame_table + ((kaddr) >> PAGE_SHIFT))
  64.125  #define virt_to_page(kaddr)	(frame_table + (__pa(kaddr) >> PAGE_SHIFT))
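
The page.h hunk above leaves only the arch-independent pieces here and moves the level-specific definitions into asm/x86_32/page.h and asm/x86_64/page.h. The snippet below illustrates why the entries those headers define are wrapped in one-member structs rather than kept as raw unsigned longs: with distinct types, storing an L2 entry into an L1 slot becomes a compile error. It is a generic sketch of the idiom, not the headers' exact contents.

    typedef struct { unsigned long l1_lo; } l1_pgentry_t;
    typedef struct { unsigned long l2_lo; } l2_pgentry_t;

    #define mk_l1_pgentry(_x)  ((l1_pgentry_t){ (_x) })
    #define mk_l2_pgentry(_x)  ((l2_pgentry_t){ (_x) })

    void example(l1_pgentry_t *l1_table)
    {
        l1_table[0] = mk_l1_pgentry(0x1000 | 0x63);    /* fine                     */
        /* l1_table[1] = mk_l2_pgentry(0x2000 | 0x63);    rejected by the compiler:
         *     assigning l2_pgentry_t to l1_pgentry_t is a type error              */
    }
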
    65.1 --- a/xen/include/asm-x86/shadow.h	Mon Feb 07 08:19:24 2005 +0000
    65.2 +++ b/xen/include/asm-x86/shadow.h	Tue Feb 08 16:44:16 2005 +0000
    65.3 @@ -8,6 +8,10 @@
    65.4  #include <xen/perfc.h>
    65.5  #include <asm/processor.h>
    65.6  
    65.7 +#ifdef CONFIG_VMX
    65.8 +#include <asm/domain_page.h>
    65.9 +#endif
   65.10 +
   65.11  /* Shadow PT flag bits in pfn_info */
   65.12  #define PSH_shadowed    (1<<31) /* page has a shadow. PFN points to shadow */
   65.13  #define PSH_pfn_mask    ((1<<21)-1)
   65.14 @@ -34,7 +38,7 @@ extern int shadow_fault(unsigned long va
   65.15  extern void shadow_l1_normal_pt_update(
   65.16      unsigned long pa, unsigned long gpte, 
   65.17      unsigned long *prev_spfn_ptr, l1_pgentry_t **prev_spl1e_ptr);
   65.18 -extern void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpte);
   65.19 +extern void shadow_l2_normal_pt_update(unsigned long pa, unsigned long gpde);
   65.20  extern void unshadow_table(unsigned long gpfn, unsigned int type);
   65.21  extern int shadow_mode_enable(struct domain *p, unsigned int mode);
   65.22  
   65.23 @@ -43,17 +47,15 @@ extern void vmx_shadow_clear_state(struc
   65.24  extern void vmx_shadow_invlpg(struct domain *, unsigned long);
   65.25  #endif
   65.26  
   65.27 -#define  __get_machine_to_phys(_d, guest_gpfn, gpfn)    \
   65.28 -    if ((_d)->arch.shadow_mode == SHM_full_32)          \
   65.29 -        (guest_gpfn) = machine_to_phys_mapping[(gpfn)]; \
   65.30 -    else                                                \
   65.31 -        (guest_gpfn) = (gpfn);
   65.32 +#define __mfn_to_gpfn(_d, mfn)                         \
   65.33 +    ( (shadow_mode(_d) == SHM_full_32)                 \
   65.34 +      ? machine_to_phys_mapping[(mfn)]                 \
   65.35 +      : (mfn) )
   65.36  
   65.37 -#define  __get_phys_to_machine(_d, host_gpfn, gpfn)    \
   65.38 -    if ((_d)->arch.shadow_mode == SHM_full_32)         \
   65.39 -        (host_gpfn) = phys_to_machine_mapping[(gpfn)]; \
   65.40 -    else                                               \
   65.41 -        (host_gpfn) = (gpfn);
   65.42 +#define __gpfn_to_mfn(_d, gpfn)                        \
   65.43 +    ( (shadow_mode(_d) == SHM_full_32)                 \
   65.44 +      ? phys_to_machine_mapping(gpfn)                  \
   65.45 +      : (gpfn) )
   65.46  
   65.47  extern void __shadow_mode_disable(struct domain *d);
   65.48  static inline void shadow_mode_disable(struct domain *d)
   65.49 @@ -66,17 +68,18 @@ extern unsigned long shadow_l2_table(
   65.50      struct domain *d, unsigned long gpfn);
   65.51    
   65.52  static inline void shadow_invalidate(struct exec_domain *ed) {
   65.53 -    if ( ed->domain->arch.shadow_mode != SHM_full_32 )
   65.54 +    if ( shadow_mode(ed->domain) != SHM_full_32 )
   65.55          BUG();
   65.56      memset(ed->arch.shadow_vtable, 0, PAGE_SIZE);
   65.57  }
   65.58  
   65.59  #define SHADOW_DEBUG 1
   65.60 +#define SHADOW_VERBOSE_DEBUG 0
   65.61  #define SHADOW_HASH_DEBUG 1
   65.62  
   65.63  struct shadow_status {
   65.64      unsigned long pfn;            /* Guest pfn.             */
   65.65 -    unsigned long spfn_and_flags; /* Shadow pfn plus flags. */
   65.66 +    unsigned long smfn_and_flags; /* Shadow mfn plus flags. */
   65.67      struct shadow_status *next;   /* Pull-to-front list.    */
   65.68  };
   65.69  
   65.70 @@ -84,62 +87,72 @@ struct shadow_status {
   65.71  #define shadow_ht_buckets    256
   65.72  
   65.73  #ifdef VERBOSE
   65.74 -#define SH_LOG(_f, _a...)                             \
   65.75 -printk("DOM%u: (file=shadow.c, line=%d) " _f "\n",    \
   65.76 -       current->domain->id , __LINE__ , ## _a )
   65.77 +#define SH_LOG(_f, _a...)                                               \
   65.78 +printk("DOM%uP%u: (file=shadow.c, line=%d) " _f "\n",                   \
   65.79 +       current->domain->id , current->processor, __LINE__ , ## _a )
   65.80  #else
   65.81  #define SH_LOG(_f, _a...) 
   65.82  #endif
   65.83  
   65.84  #if SHADOW_DEBUG
   65.85 -#define SH_VLOG(_f, _a...)                             \
   65.86 -    printk("DOM%u: (file=shadow.c, line=%d) " _f "\n", \
   65.87 -           current->domain->id , __LINE__ , ## _a )
   65.88 +#define SH_VLOG(_f, _a...)                                              \
   65.89 +    printk("DOM%uP%u: (file=shadow.c, line=%d) " _f "\n",               \
   65.90 +           current->domain->id, current->processor, __LINE__ , ## _a )
   65.91  #else
   65.92  #define SH_VLOG(_f, _a...) 
   65.93  #endif
   65.94  
   65.95 -#if 0
   65.96 -#define SH_VVLOG(_f, _a...)                             \
   65.97 -    printk("DOM%u: (file=shadow.c, line=%d) " _f "\n",  \
   65.98 -           current->domain->id , __LINE__ , ## _a )
   65.99 +#if SHADOW_VERBOSE_DEBUG
  65.100 +#define SH_VVLOG(_f, _a...)                                             \
  65.101 +    printk("DOM%uP%u: (file=shadow.c, line=%d) " _f "\n",               \
  65.102 +           current->domain->id, current->processor, __LINE__ , ## _a )
  65.103  #else
  65.104  #define SH_VVLOG(_f, _a...)
  65.105  #endif
  65.106  
  65.107 -static inline void __shadow_get_pl2e(
  65.108 +// BUG: mafetter: this assumes ed == current, so why pass ed?
  65.109 +static inline void __shadow_get_l2e(
  65.110      struct exec_domain *ed, unsigned long va, unsigned long *sl2e)
  65.111  {
  65.112 -    *sl2e = (ed->domain->arch.shadow_mode == SHM_full_32) ?
  65.113 -        l2_pgentry_val(ed->arch.shadow_vtable[l2_table_offset(va)]) :
  65.114 -        l2_pgentry_val(linear_l2_table[l2_table_offset(va)]);
  65.115 +    if ( shadow_mode(ed->domain) == SHM_full_32 ) {
  65.116 +        *sl2e = l2_pgentry_val(ed->arch.shadow_vtable[l2_table_offset(va)]);
  65.117 +    }
  65.118 +    else if ( shadow_mode(ed->domain) ) {
  65.119 +        *sl2e = l2_pgentry_val(shadow_linear_l2_table[l2_table_offset(va)]);
  65.120 +    }
  65.121 +    else
  65.122 +        *sl2e = l2_pgentry_val(linear_l2_table[l2_table_offset(va)]);
  65.123  }
  65.124  
  65.125 -static inline void __shadow_set_pl2e(
  65.126 +static inline void __shadow_set_l2e(
  65.127      struct exec_domain *ed, unsigned long va, unsigned long value)
  65.128  {
  65.129 -    if ( ed->domain->arch.shadow_mode == SHM_full_32 )
  65.130 +    if ( shadow_mode(ed->domain) == SHM_full_32 ) {
  65.131          ed->arch.shadow_vtable[l2_table_offset(va)] = mk_l2_pgentry(value);
  65.132 +    }
  65.133 +    else if ( shadow_mode(ed->domain) ) {
  65.134 +        shadow_linear_l2_table[l2_table_offset(va)] = mk_l2_pgentry(value);
  65.135 +    }
  65.136      else
  65.137          linear_l2_table[l2_table_offset(va)] = mk_l2_pgentry(value);
  65.138  }
  65.139  
  65.140 -static inline void __guest_get_pl2e(
  65.141 +static inline void __guest_get_l2e(
  65.142      struct exec_domain *ed, unsigned long va, unsigned long *l2e)
  65.143  {
  65.144 -    *l2e = (ed->domain->arch.shadow_mode == SHM_full_32) ?
  65.145 +    *l2e = ( shadow_mode(ed->domain) == SHM_full_32) ?
  65.146          l2_pgentry_val(ed->arch.vpagetable[l2_table_offset(va)]) :
  65.147          l2_pgentry_val(linear_l2_table[l2_table_offset(va)]);
  65.148  }
  65.149  
  65.150 -static inline void __guest_set_pl2e(
  65.151 +static inline void __guest_set_l2e(
  65.152      struct exec_domain *ed, unsigned long va, unsigned long value)
  65.153  {
  65.154 -    if ( ed->domain->arch.shadow_mode == SHM_full_32 )
  65.155 +    if ( shadow_mode(ed->domain) == SHM_full_32 )
  65.156      {
  65.157          unsigned long pfn;
  65.158  
  65.159 -        pfn = phys_to_machine_mapping[value >> PAGE_SHIFT];
  65.160 +        pfn = phys_to_machine_mapping(value >> PAGE_SHIFT);
  65.161          ed->arch.guest_pl2e_cache[l2_table_offset(va)] =
  65.162              mk_l2_pgentry((pfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  65.163  
  65.164 @@ -213,32 +226,18 @@ static inline void l1pte_write_fault(
  65.165  { 
  65.166      unsigned long gpte = *gpte_p;
  65.167      unsigned long spte = *spte_p;
  65.168 +    unsigned long pfn = gpte >> PAGE_SHIFT;
  65.169 +    unsigned long mfn = __gpfn_to_mfn(d, pfn);
  65.170  
  65.171      ASSERT(gpte & _PAGE_RW);
  65.172      gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
  65.173  
  65.174 -    switch ( d->arch.shadow_mode )
  65.175 -    {
  65.176 -    case SHM_test:
  65.177 -        spte = gpte | _PAGE_RW;
  65.178 -        break;
  65.179 -
  65.180 -    case SHM_logdirty:
  65.181 -        spte = gpte | _PAGE_RW;
  65.182 -        __mark_dirty(d, gpte >> PAGE_SHIFT);
  65.183 +    if ( shadow_mode(d) == SHM_logdirty )
  65.184 +        __mark_dirty(d, pfn);
  65.185  
  65.186 -    case SHM_full_32:
  65.187 -    {
  65.188 -        unsigned long host_pfn, host_gpte;
  65.189 -        
  65.190 -        host_pfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT];
  65.191 -        host_gpte = (host_pfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
  65.192 -        spte = host_gpte | _PAGE_RW;
  65.193 -    }
  65.194 -        break;
  65.195 -    }
  65.196 +    spte = (mfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
  65.197  
  65.198 -    SH_VVLOG("updating spte=%lx gpte=%lx", spte, gpte);
  65.199 +    SH_VVLOG("l1pte_write_fault: updating spte=0x%08lx gpte=0x%08lx", spte, gpte);
  65.200      *gpte_p = gpte;
  65.201      *spte_p = spte;
  65.202  }
  65.203 @@ -248,31 +247,16 @@ static inline void l1pte_read_fault(
  65.204  { 
  65.205      unsigned long gpte = *gpte_p;
  65.206      unsigned long spte = *spte_p;
  65.207 +    unsigned long pfn = gpte >> PAGE_SHIFT;
  65.208 +    unsigned long mfn = __gpfn_to_mfn(d, pfn);
  65.209  
  65.210      gpte |= _PAGE_ACCESSED;
  65.211 -
  65.212 -    switch ( d->arch.shadow_mode )
  65.213 -    {
  65.214 -    case SHM_test:
  65.215 -        spte = (gpte & _PAGE_DIRTY) ? gpte : (gpte & ~_PAGE_RW);
  65.216 -        break;
  65.217 -
  65.218 -    case SHM_logdirty:
  65.219 -        spte = gpte & ~_PAGE_RW;
  65.220 -        break;
  65.221 +    spte = (mfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
  65.222  
  65.223 -    case SHM_full_32:
  65.224 -    {
  65.225 -        unsigned long host_pfn, host_gpte;
  65.226 -        
  65.227 -        host_pfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT];
  65.228 -        host_gpte = (host_pfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
  65.229 -        spte = (host_gpte & _PAGE_DIRTY) ? host_gpte : (host_gpte & ~_PAGE_RW);
  65.230 -    }
  65.231 -        break;
  65.232 +    if ( (shadow_mode(d) == SHM_logdirty) || ! (gpte & _PAGE_DIRTY) )
  65.233 +        spte &= ~_PAGE_RW;
  65.234  
  65.235 -    }
  65.236 -
  65.237 +    SH_VVLOG("l1pte_read_fault: updating spte=0x%08lx gpte=0x%08lx", spte, gpte);
  65.238      *gpte_p = gpte;
  65.239      *spte_p = spte;
  65.240  }
  65.241 @@ -283,8 +267,11 @@ static inline void l1pte_propagate_from_
  65.242      unsigned long gpte = *gpte_p;
  65.243      unsigned long spte = *spte_p;
  65.244      unsigned long host_pfn, host_gpte;
  65.245 +#if SHADOW_VERBOSE_DEBUG
  65.246 +    unsigned long old_spte = spte;
  65.247 +#endif
  65.248  
  65.249 -    switch ( d->arch.shadow_mode )
  65.250 +    switch ( shadow_mode(d) )
  65.251      {
  65.252      case SHM_test:
  65.253          spte = 0;
  65.254 @@ -309,7 +296,7 @@ static inline void l1pte_propagate_from_
  65.255              return;
  65.256          }
  65.257          
  65.258 -        host_pfn = phys_to_machine_mapping[gpte >> PAGE_SHIFT];
  65.259 +        host_pfn = phys_to_machine_mapping(gpte >> PAGE_SHIFT);
  65.260          host_gpte = (host_pfn << PAGE_SHIFT) | (gpte & ~PAGE_MASK);
  65.261  
  65.262          if ( (host_gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == 
  65.263 @@ -320,6 +307,11 @@ static inline void l1pte_propagate_from_
  65.264          break;
  65.265      }
  65.266  
  65.267 +#if SHADOW_VERBOSE_DEBUG
  65.268 +    if ( old_spte || spte || gpte )
  65.269 +        SH_VVLOG("l1pte_propagate_from_guest: gpte=0x%08lx, old spte=0x%08lx, new spte=0x%08lx ", gpte, old_spte, spte);
  65.270 +#endif
  65.271 +
  65.272      *gpte_p = gpte;
  65.273      *spte_p = spte;
  65.274  }
  65.275 @@ -328,24 +320,24 @@ static inline void l2pde_general(
  65.276      struct domain *d,
  65.277      unsigned long *gpde_p,
  65.278      unsigned long *spde_p,
  65.279 -    unsigned long sl1pfn)
  65.280 +    unsigned long sl1mfn)
  65.281  {
  65.282      unsigned long gpde = *gpde_p;
  65.283      unsigned long spde = *spde_p;
  65.284  
  65.285      spde = 0;
  65.286  
  65.287 -    if ( sl1pfn != 0 )
  65.288 +    if ( sl1mfn != 0 )
  65.289      {
  65.290 -        spde = (gpde & ~PAGE_MASK) | (sl1pfn << PAGE_SHIFT) | 
  65.291 +        spde = (gpde & ~PAGE_MASK) | (sl1mfn << PAGE_SHIFT) | 
  65.292              _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY;
  65.293          gpde |= _PAGE_ACCESSED | _PAGE_DIRTY;
  65.294  
  65.295          /* Detect linear p.t. mappings and write-protect them. */
  65.296 -        if ( (frame_table[sl1pfn].u.inuse.type_info & PGT_type_mask) ==
  65.297 +        if ( (frame_table[sl1mfn].u.inuse.type_info & PGT_type_mask) ==
  65.298               PGT_l2_page_table ) 
  65.299          {
  65.300 -            if ( d->arch.shadow_mode != SHM_full_32 )
  65.301 +            if ( shadow_mode(d) != SHM_full_32 )
  65.302                  spde = gpde & ~_PAGE_RW;
  65.303  
  65.304          }
  65.305 @@ -366,20 +358,20 @@ static void shadow_audit(struct domain *
  65.306      for ( j = 0; j < shadow_ht_buckets; j++ )
  65.307      {
  65.308          a = &d->arch.shadow_ht[j];        
  65.309 -        if ( a->pfn ) { live++; ASSERT(a->spfn_and_flags & PSH_pfn_mask); }
  65.310 +        if ( a->pfn ) { live++; ASSERT(a->smfn_and_flags & PSH_pfn_mask); }
  65.311          ASSERT(a->pfn < 0x00100000UL);
  65.312          a = a->next;
  65.313          while ( a && (live < 9999) )
  65.314          { 
  65.315              live++; 
  65.316 -            if ( (a->pfn == 0) || (a->spfn_and_flags == 0) )
  65.317 +            if ( (a->pfn == 0) || (a->smfn_and_flags == 0) )
  65.318              {
  65.319                  printk("XXX live=%d pfn=%08lx sp=%08lx next=%p\n",
  65.320 -                       live, a->pfn, a->spfn_and_flags, a->next);
  65.321 +                       live, a->pfn, a->smfn_and_flags, a->next);
  65.322                  BUG();
  65.323              }
  65.324              ASSERT(a->pfn < 0x00100000UL);
  65.325 -            ASSERT(a->spfn_and_flags & PSH_pfn_mask);
  65.326 +            ASSERT(a->smfn_and_flags & PSH_pfn_mask);
  65.327              a = a->next; 
  65.328          }
  65.329          ASSERT(live < 9999);
  65.330 @@ -411,6 +403,12 @@ static inline struct shadow_status *hash
  65.331  }
  65.332  
  65.333  
  65.334 +/*
  65.335 + * N.B. This takes a guest pfn (i.e. a pfn in the guest's namespace,
  65.336 + *      which, depending on full shadow mode, may or may not equal
  65.337 + *      its mfn).
  65.338 + *      The shadow status it returns is a mfn.
  65.339 + */
  65.340  static inline unsigned long __shadow_status(
  65.341      struct domain *d, unsigned int gpfn)
  65.342  {
  65.343 @@ -419,7 +417,7 @@ static inline unsigned long __shadow_sta
  65.344      x = head = hash_bucket(d, gpfn);
  65.345      p = NULL;
  65.346  
  65.347 -    SH_VVLOG("lookup gpfn=%08x bucket=%p", gpfn, x);
  65.348 +    //SH_VVLOG("lookup gpfn=%08x bucket=%p", gpfn, x);
  65.349      shadow_audit(d, 0);
  65.350  
  65.351      do
  65.352 @@ -438,10 +436,12 @@ static inline unsigned long __shadow_sta
  65.353  
  65.354                  /* Swap 'x' contents with head contents. */
  65.355                  SWAP(head->pfn, x->pfn);
  65.356 -                SWAP(head->spfn_and_flags, x->spfn_and_flags);
  65.357 +                SWAP(head->smfn_and_flags, x->smfn_and_flags);
  65.358              }
  65.359  
  65.360 -            return head->spfn_and_flags;
  65.361 +            SH_VVLOG("lookup gpfn=%08lx => status=%08lx",
  65.362 +                     gpfn, head->smfn_and_flags);
  65.363 +            return head->smfn_and_flags;
  65.364          }
  65.365  
  65.366          p = x;
  65.367 @@ -449,6 +449,7 @@ static inline unsigned long __shadow_sta
  65.368      }
  65.369      while ( x != NULL );
  65.370  
  65.371 +    SH_VVLOG("lookup gpfn=%08lx => status=0", gpfn);
  65.372      return 0;
  65.373  }
  65.374  
  65.375 @@ -462,7 +463,7 @@ static inline unsigned long get_shadow_s
  65.376  {
  65.377      unsigned long res;
  65.378  
  65.379 -    ASSERT(d->arch.shadow_mode);
  65.380 +    ASSERT(shadow_mode(d));
  65.381  
  65.382      /*
  65.383       * If we get here we know that some sort of update has happened to the
  65.384 @@ -474,7 +475,7 @@ static inline unsigned long get_shadow_s
  65.385  
  65.386      shadow_lock(d);
  65.387  
  65.388 -    if ( d->arch.shadow_mode == SHM_logdirty )
  65.389 +    if ( shadow_mode(d) == SHM_logdirty )
  65.390          __mark_dirty(d, gpfn);
  65.391  
  65.392      if ( !(res = __shadow_status(d, gpfn)) )
  65.393 @@ -511,14 +512,14 @@ static inline void delete_shadow_status(
  65.394          {
  65.395              /* Overwrite head with contents of following node. */
  65.396              head->pfn            = n->pfn;
  65.397 -            head->spfn_and_flags = n->spfn_and_flags;
  65.398 +            head->smfn_and_flags = n->smfn_and_flags;
  65.399  
  65.400              /* Delete following node. */
  65.401              head->next           = n->next;
  65.402  
  65.403              /* Add deleted node to the free list. */
  65.404              n->pfn            = 0;
  65.405 -            n->spfn_and_flags = 0;
  65.406 +            n->smfn_and_flags = 0;
  65.407              n->next           = d->arch.shadow_ht_free;
  65.408              d->arch.shadow_ht_free = n;
  65.409          }
  65.410 @@ -526,7 +527,7 @@ static inline void delete_shadow_status(
  65.411          {
  65.412              /* This bucket is now empty. Initialise the head node. */
  65.413              head->pfn            = 0;
  65.414 -            head->spfn_and_flags = 0;
  65.415 +            head->smfn_and_flags = 0;
  65.416          }
  65.417  
  65.418          goto found;
  65.419 @@ -544,7 +545,7 @@ static inline void delete_shadow_status(
  65.420  
  65.421              /* Add deleted node to the free list. */
  65.422              x->pfn            = 0;
  65.423 -            x->spfn_and_flags = 0;
  65.424 +            x->smfn_and_flags = 0;
  65.425              x->next           = d->arch.shadow_ht_free;
  65.426              d->arch.shadow_ht_free = x;
  65.427  
  65.428 @@ -587,7 +588,7 @@ static inline void set_shadow_status(
  65.429      {
  65.430          if ( x->pfn == gpfn )
  65.431          {
  65.432 -            x->spfn_and_flags = s;
  65.433 +            x->smfn_and_flags = s;
  65.434              goto done;
  65.435          }
  65.436  
  65.437 @@ -603,7 +604,7 @@ static inline void set_shadow_status(
  65.438      if ( head->pfn == 0 )
  65.439      {
  65.440          head->pfn            = gpfn;
  65.441 -        head->spfn_and_flags = s;
  65.442 +        head->smfn_and_flags = s;
  65.443          ASSERT(head->next == NULL);
  65.444          goto done;
  65.445      }
  65.446 @@ -643,7 +644,7 @@ static inline void set_shadow_status(
  65.447  
  65.448      /* Initialise the new node and insert directly after the head item. */
  65.449      x->pfn            = gpfn;
  65.450 -    x->spfn_and_flags = s;
  65.451 +    x->smfn_and_flags = s;
  65.452      x->next           = head->next;
  65.453      head->next        = x;
  65.454  
  65.455 @@ -652,10 +653,9 @@ static inline void set_shadow_status(
  65.456  }
  65.457    
  65.458  #ifdef CONFIG_VMX
  65.459 -#include <asm/domain_page.h>
  65.460  
  65.461  static inline void vmx_update_shadow_state(
  65.462 -    struct exec_domain *ed, unsigned long gpfn, unsigned long spfn)
  65.463 +    struct exec_domain *ed, unsigned long gpfn, unsigned long smfn)
  65.464  {
  65.465  
  65.466      l2_pgentry_t *mpl2e = 0;
  65.467 @@ -672,70 +672,46 @@ static inline void vmx_update_shadow_sta
  65.468          map_domain_mem(pagetable_val(ed->arch.monitor_table));
  65.469  
  65.470      mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] =
  65.471 -        mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  65.472 +        mk_l2_pgentry((smfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  65.473      __flush_tlb_one(SH_LINEAR_PT_VIRT_START);
  65.474  
  65.475 -    spl2e = (l2_pgentry_t *)map_domain_mem(spfn << PAGE_SHIFT);
  65.476 +    spl2e = (l2_pgentry_t *)map_domain_mem(smfn << PAGE_SHIFT);
  65.477      gpl2e = (l2_pgentry_t *)map_domain_mem(gpfn << PAGE_SHIFT);
  65.478      memset(spl2e, 0, ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
  65.479  
  65.480 -    ed->arch.shadow_table = mk_pagetable(spfn<<PAGE_SHIFT);
  65.481      ed->arch.shadow_vtable = spl2e;
  65.482      ed->arch.vpagetable = gpl2e; /* expect the guest did clean this up */
  65.483      unmap_domain_mem(mpl2e);
  65.484  }
  65.485  
  65.486 +#endif /* CONFIG_VMX */
  65.487 +
  65.488  static inline void __shadow_mk_pagetable(struct exec_domain *ed)
  65.489  {
  65.490      struct domain *d = ed->domain;
  65.491      unsigned long gpfn = pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT;
  65.492 -    unsigned long spfn;
  65.493 -    SH_VLOG("0: __shadow_mk_pagetable(gpfn=%08lx\n", gpfn);
  65.494 -
  65.495 -    if (d->arch.shadow_mode == SHM_full_32) 
  65.496 -    {
  65.497 -        unsigned long guest_gpfn;
  65.498 -        guest_gpfn = machine_to_phys_mapping[gpfn];
  65.499 -
  65.500 -        SH_VVLOG("__shadow_mk_pagetable(guest_gpfn=%08lx, gpfn=%08lx\n", 
  65.501 -                 guest_gpfn, gpfn);
  65.502 +    unsigned long smfn = __shadow_status(d, gpfn) & PSH_pfn_mask;
  65.503  
  65.504 -        spfn = __shadow_status(d, guest_gpfn) & PSH_pfn_mask;
  65.505 -        if ( unlikely(spfn == 0) ) {
  65.506 -            spfn = shadow_l2_table(d, gpfn);
  65.507 -            ed->arch.shadow_table = mk_pagetable(spfn<<PAGE_SHIFT);
  65.508 -        } else {
  65.509 -            vmx_update_shadow_state(ed, gpfn, spfn);
  65.510 -        }
  65.511 -    } else {
  65.512 -        spfn = __shadow_status(d, gpfn) & PSH_pfn_mask;
  65.513 +    SH_VVLOG("0: __shadow_mk_pagetable(gpfn=%08lx, smfn=%08lx)", gpfn, smfn);
  65.514  
  65.515 -        if ( unlikely(spfn == 0) ) {
  65.516 -            spfn = shadow_l2_table(d, gpfn);
  65.517 -        }
  65.518 -        ed->arch.shadow_table = mk_pagetable(spfn<<PAGE_SHIFT);
  65.519 -    }
  65.520 +    if ( unlikely(smfn == 0) )
  65.521 +        smfn = shadow_l2_table(d, gpfn);
  65.522 +#ifdef CONFIG_VMX
  65.523 +    else
  65.524 +        if (d->arch.shadow_mode == SHM_full_32)
  65.525 +            vmx_update_shadow_state(ed, gpfn, smfn);
  65.526 +#endif
  65.527 +
  65.528 +    ed->arch.shadow_table = mk_pagetable(smfn<<PAGE_SHIFT);
  65.529  }
  65.530 -#else
  65.531 -static inline void __shadow_mk_pagetable(struct exec_domain *ed)
  65.532 -{
  65.533 -    unsigned long gpfn = pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT;
  65.534 -    unsigned long spfn = __shadow_status(ed->domain, gpfn);
  65.535 -
  65.536 -    if ( unlikely(spfn == 0) )
  65.537 -        spfn = shadow_l2_table(ed->domain, gpfn);
  65.538 -
  65.539 -    ed->arch.shadow_table = mk_pagetable(spfn << PAGE_SHIFT);
  65.540 -}
  65.541 -#endif /* CONFIG_VMX */
  65.542  
  65.543  static inline void shadow_mk_pagetable(struct exec_domain *ed)
  65.544  {
  65.545 -     if ( unlikely(ed->domain->arch.shadow_mode) )
  65.546 +     if ( unlikely(shadow_mode(ed->domain)) )
  65.547       {
  65.548           SH_VVLOG("shadow_mk_pagetable( gptbase=%08lx, mode=%d )",
  65.549               pagetable_val(ed->arch.pagetable),
  65.550 -                  ed->domain->arch.shadow_mode); 
  65.551 +                  shadow_mode(ed->domain)); 
  65.552  
  65.553           shadow_lock(ed->domain);
  65.554           __shadow_mk_pagetable(ed);
  65.555 @@ -744,13 +720,13 @@ static inline void shadow_mk_pagetable(s
  65.556       SH_VVLOG("leaving shadow_mk_pagetable:\n"
  65.557                "( gptbase=%08lx, mode=%d ) sh=%08lx",
  65.558                pagetable_val(ed->arch.pagetable),
  65.559 -              ed->domain->arch.shadow_mode, 
  65.560 +              shadow_mode(ed->domain), 
  65.561                pagetable_val(ed->arch.shadow_table) );
  65.562       }
  65.563  }
  65.564  
  65.565  #if SHADOW_DEBUG
  65.566 -extern int check_pagetable(struct domain *d, pagetable_t pt, char *s);
  65.567 +extern void check_pagetable(struct domain *d, pagetable_t pt, char *s);
  65.568  #else
  65.569  #define check_pagetable(d, pt, s) ((void)0)
  65.570  #endif
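
The hunks above rename spfn_and_flags to smfn_and_flags throughout the shadow
hash table: each entry maps a guest pfn to the machine frame number (MFN) of
its shadow page, with status flags packed into the upper bits of the same
word, and the two CONFIG_VMX variants of __shadow_mk_pagetable are folded
into a single path that only diverges (for SHM_full_32) when updating VMX
shadow state. A minimal standalone model of the hash chain, assuming a struct
name and PSH_pfn_mask width that are not spelled out in this changeset:

    #include <stdio.h>
    #include <stddef.h>

    #define PSH_pfn_mask ((1UL << 21) - 1)   /* assumed mask width */

    struct shadow_status {
        unsigned long pfn;              /* guest pfn (hash key)        */
        unsigned long smfn_and_flags;   /* shadow MFN plus flag bits   */
        struct shadow_status *next;     /* hash-chain link             */
    };

    /* __shadow_status()-style lookup: walk the chain, strip the flags. */
    static unsigned long lookup(struct shadow_status *head, unsigned long gpfn)
    {
        for ( ; head != NULL; head = head->next )
            if ( head->pfn == gpfn )
                return head->smfn_and_flags & PSH_pfn_mask;
        return 0;   /* pfn not shadowed */
    }

    int main(void)
    {
        struct shadow_status e = { 0x10UL, 0x1a2bUL & PSH_pfn_mask, NULL };
        printf("smfn=%#lx\n", lookup(&e, 0x10UL));
        return 0;
    }
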
    66.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    66.2 +++ b/xen/include/asm-x86/x86_32/page.h	Tue Feb 08 16:44:16 2005 +0000
    66.3 @@ -0,0 +1,56 @@
    66.4 +/* -*-  Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
    66.5 +
    66.6 +#ifndef __X86_32_PAGE_H__
    66.7 +#define __X86_32_PAGE_H__
    66.8 +
    66.9 +#define L1_PAGETABLE_SHIFT       12
   66.10 +#define L2_PAGETABLE_SHIFT       22
   66.11 +#define PAGE_SHIFT               L1_PAGETABLE_SHIFT
   66.12 +
   66.13 +#define ENTRIES_PER_L1_PAGETABLE 1024
   66.14 +#define ENTRIES_PER_L2_PAGETABLE 1024
   66.15 +
   66.16 +#define __PAGE_OFFSET		(0xFC400000)
   66.17 +
   66.18 +#ifndef __ASSEMBLY__
   66.19 +#include <xen/config.h>
   66.20 +typedef struct { unsigned long l1_lo; } l1_pgentry_t;
   66.21 +typedef struct { unsigned long l2_lo; } l2_pgentry_t;
   66.22 +#endif /* !__ASSEMBLY__ */
   66.23 +
   66.24 +/* Strip type from a table entry. */
   66.25 +#define l1_pgentry_val(_x) ((_x).l1_lo)
   66.26 +#define l2_pgentry_val(_x) ((_x).l2_lo)
   66.27 +
   66.28 +/* Add type to a table entry. */
   66.29 +#define mk_l1_pgentry(_x)  ( (l1_pgentry_t) { (_x) } )
   66.30 +#define mk_l2_pgentry(_x)  ( (l2_pgentry_t) { (_x) } )
   66.31 +
   66.32 +/* Turn a typed table entry into a physical address. */
   66.33 +#define l1_pgentry_to_phys(_x) (l1_pgentry_val(_x) & PAGE_MASK)
   66.34 +#define l2_pgentry_to_phys(_x) (l2_pgentry_val(_x) & PAGE_MASK)
   66.35 +
   66.36 +/* Turn a typed table entry into a page index. */
   66.37 +#define l1_pgentry_to_pfn(_x) (l1_pgentry_val(_x) >> PAGE_SHIFT) 
   66.38 +#define l2_pgentry_to_pfn(_x) (l2_pgentry_val(_x) >> PAGE_SHIFT)
   66.39 +
   66.40 +/* Pagetable walking. */
   66.41 +#define l2_pgentry_to_l1(_x) \
   66.42 +  ((l1_pgentry_t *)__va(l2_pgentry_to_phys(_x)))
   66.43 +
   66.44 +/* Given a virtual address, get an entry offset into a page table. */
   66.45 +#define l1_table_offset(_a) \
   66.46 +  (((_a) >> L1_PAGETABLE_SHIFT) & (ENTRIES_PER_L1_PAGETABLE - 1))
   66.47 +#define l2_table_offset(_a) \
   66.48 +  ((_a) >> L2_PAGETABLE_SHIFT)
   66.49 +
   66.50 +/* Given a virtual address, get an entry offset into a linear page table. */
   66.51 +#define l1_linear_offset(_a) ((_a) >> PAGE_SHIFT)
   66.52 +
   66.53 +/* Root page-table definitions. */
   66.54 +#define pagetable_t l2_pgentry_t
   66.55 +#define pagetable_val(_x)  ((_x).l2_lo)
   66.56 +#define mk_pagetable(_x)   ( (l2_pgentry_t) { (_x) } )
   66.57 +#define ENTRIES_PER_PAGETABLE ENTRIES_PER_L2_PAGETABLE
   66.58 +
   66.59 +#endif /* __X86_32_PAGE_H__ */
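
The new x86_32 header keeps the classic two-level layout: bits 31-22 of a
virtual address index the L2 directory, bits 21-12 index the L1 table, and
the low 12 bits are the byte offset within the page. l2_table_offset needs
no mask because a 32-bit address shifted right by 22 can never exceed 1023.
A small standalone check of that arithmetic (the sample address is
arbitrary):

    #include <stdio.h>

    #define L1_PAGETABLE_SHIFT       12
    #define L2_PAGETABLE_SHIFT       22
    #define ENTRIES_PER_L1_PAGETABLE 1024

    #define l1_table_offset(_a) \
        (((_a) >> L1_PAGETABLE_SHIFT) & (ENTRIES_PER_L1_PAGETABLE - 1))
    #define l2_table_offset(_a) \
        ((_a) >> L2_PAGETABLE_SHIFT)   /* a 32-bit VA >> 22 is already < 1024 */

    int main(void)
    {
        unsigned long va = 0xC0123456UL;
        /* expect l2=0x300, l1=0x123, off=0x456 */
        printf("l2=%#lx l1=%#lx off=%#lx\n",
               l2_table_offset(va), l1_table_offset(va), va & 0xFFFUL);
        return 0;
    }
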
    67.1 --- a/xen/include/asm-x86/x86_32/regs.h	Mon Feb 07 08:19:24 2005 +0000
    67.2 +++ b/xen/include/asm-x86/x86_32/regs.h	Tue Feb 08 16:44:16 2005 +0000
    67.3 @@ -39,4 +39,6 @@ struct xen_regs
    67.4  #define RING_2(_r)    (((_r)->cs & 3) == 2)
    67.5  #define RING_3(_r)    (((_r)->cs & 3) == 3)
    67.6  
    67.7 +#define GUESTOS_FAULT(_r) (!VM86_MODE(_r) && RING_1(_r))
    67.8 +
    67.9  #endif
    68.1 --- a/xen/include/asm-x86/x86_32/uaccess.h	Mon Feb 07 08:19:24 2005 +0000
    68.2 +++ b/xen/include/asm-x86/x86_32/uaccess.h	Tue Feb 08 16:44:16 2005 +0000
    68.3 @@ -8,7 +8,6 @@
    68.4  #include <xen/errno.h>
    68.5  #include <xen/prefetch.h>
    68.6  #include <xen/string.h>
    68.7 -#include <xen/sched.h>
    68.8  
    68.9  #define __user
   68.10  
    69.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    69.2 +++ b/xen/include/asm-x86/x86_64/page.h	Tue Feb 08 16:44:16 2005 +0000
    69.3 @@ -0,0 +1,84 @@
    69.4 +/* -*-  Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
    69.5 +
    69.6 +#ifndef __X86_64_PAGE_H__
    69.7 +#define __X86_64_PAGE_H__
    69.8 +
    69.9 +#define L1_PAGETABLE_SHIFT       12
   69.10 +#define L2_PAGETABLE_SHIFT       21
   69.11 +#define L3_PAGETABLE_SHIFT       30
   69.12 +#define L4_PAGETABLE_SHIFT       39
   69.13 +#define PAGE_SHIFT               L1_PAGETABLE_SHIFT
   69.14 +
   69.15 +#define ENTRIES_PER_L1_PAGETABLE 512
   69.16 +#define ENTRIES_PER_L2_PAGETABLE 512
   69.17 +#define ENTRIES_PER_L3_PAGETABLE 512
   69.18 +#define ENTRIES_PER_L4_PAGETABLE 512
   69.19 +
   69.20 +#define __PAGE_OFFSET		(0xFFFF830000000000)
   69.21 +
   69.22 +/* These may increase in future (phys. bits in particular). */
   69.23 +#define PADDR_BITS              40
   69.24 +#define VADDR_BITS              48
   69.25 +#define PADDR_MASK              ((1UL << PADDR_BITS)-1)
   69.26 +#define VADDR_MASK              ((1UL << VADDR_BITS)-1)
   69.27 +
   69.28 +#ifndef __ASSEMBLY__
   69.29 +#include <xen/config.h>
   69.30 +typedef struct { unsigned long l1_lo; } l1_pgentry_t;
   69.31 +typedef struct { unsigned long l2_lo; } l2_pgentry_t;
   69.32 +typedef struct { unsigned long l3_lo; } l3_pgentry_t;
   69.33 +typedef struct { unsigned long l4_lo; } l4_pgentry_t;
   69.34 +#endif /* !__ASSEMBLY__ */
   69.35 +
   69.36 +/* Strip type from a table entry. */
   69.37 +#define l1_pgentry_val(_x) ((_x).l1_lo)
   69.38 +#define l2_pgentry_val(_x) ((_x).l2_lo)
   69.39 +#define l3_pgentry_val(_x) ((_x).l3_lo)
   69.40 +#define l4_pgentry_val(_x) ((_x).l4_lo)
   69.41 +
   69.42 +/* Add type to a table entry. */
   69.43 +#define mk_l1_pgentry(_x)  ( (l1_pgentry_t) { (_x) } )
   69.44 +#define mk_l2_pgentry(_x)  ( (l2_pgentry_t) { (_x) } )
   69.45 +#define mk_l3_pgentry(_x)  ( (l3_pgentry_t) { (_x) } )
   69.46 +#define mk_l4_pgentry(_x)  ( (l4_pgentry_t) { (_x) } )
   69.47 +
   69.48 +/* Turn a typed table entry into a physical address. */
   69.49 +#define l1_pgentry_to_phys(_x) (l1_pgentry_val(_x) & (PADDR_MASK & PAGE_MASK))
   69.50 +#define l2_pgentry_to_phys(_x) (l2_pgentry_val(_x) & (PADDR_MASK & PAGE_MASK))
   69.51 +#define l3_pgentry_to_phys(_x) (l3_pgentry_val(_x) & (PADDR_MASK & PAGE_MASK))
   69.52 +#define l4_pgentry_to_phys(_x) (l4_pgentry_val(_x) & (PADDR_MASK & PAGE_MASK))
   69.53 +
   69.54 +/* Turn a typed table entry into a page index. */
   69.55 +#define l1_pgentry_to_pfn(_x) (l1_pgentry_val(_x) >> PAGE_SHIFT) 
   69.56 +#define l2_pgentry_to_pfn(_x) (l2_pgentry_val(_x) >> PAGE_SHIFT)
   69.57 +#define l3_pgentry_to_pfn(_x) (l3_pgentry_val(_x) >> PAGE_SHIFT)
   69.58 +#define l4_pgentry_to_pfn(_x) (l4_pgentry_val(_x) >> PAGE_SHIFT)
   69.59 +
   69.60 +/* Pagetable walking. */
   69.61 +#define l2_pgentry_to_l1(_x) \
   69.62 +  ((l1_pgentry_t *)__va(l2_pgentry_to_phys(_x)))
   69.63 +#define l3_pgentry_to_l2(_x) \
   69.64 +  ((l2_pgentry_t *)__va(l3_pgentry_to_phys(_x)))
   69.65 +#define l4_pgentry_to_l3(_x) \
   69.66 +  ((l3_pgentry_t *)__va(l4_pgentry_to_phys(_x)))
   69.67 +
   69.68 +/* Given a virtual address, get an entry offset into a page table. */
   69.69 +#define l1_table_offset(_a) \
   69.70 +  (((_a) >> L1_PAGETABLE_SHIFT) & (ENTRIES_PER_L1_PAGETABLE - 1))
   69.71 +#define l2_table_offset(_a) \
   69.72 +  (((_a) >> L2_PAGETABLE_SHIFT) & (ENTRIES_PER_L2_PAGETABLE - 1))
   69.73 +#define l3_table_offset(_a) \
   69.74 +  (((_a) >> L3_PAGETABLE_SHIFT) & (ENTRIES_PER_L3_PAGETABLE - 1))
   69.75 +#define l4_table_offset(_a) \
   69.76 +  (((_a) >> L4_PAGETABLE_SHIFT) & (ENTRIES_PER_L4_PAGETABLE - 1))
   69.77 +
   69.78 +/* Given a virtual address, get an entry offset into a linear page table. */
   69.79 +#define l1_linear_offset(_a) (((_a) & VADDR_MASK) >> PAGE_SHIFT)
   69.80 +
   69.81 +/* Root page-table definitions. */
   69.82 +#define pagetable_t l4_pgentry_t
   69.83 +#define pagetable_val(_x)  ((_x).l4_lo)
   69.84 +#define mk_pagetable(_x)   ( (l4_pgentry_t) { (_x) } )
   69.85 +#define ENTRIES_PER_PAGETABLE ENTRIES_PER_L4_PAGETABLE
   69.86 +
   69.87 +#endif /* __X86_64_PAGE_H__ */
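
The x86_64 variant extends the same scheme to four levels of 512 entries
(9 index bits per level) above a 12-bit page offset, and additionally masks
physical addresses with PADDR_MASK since only 40 physical-address bits are
assumed for now. A standalone sketch of how a virtual address decomposes
under these offsets (the sample address is arbitrary, chosen inside
__PAGE_OFFSET):

    #include <stdio.h>

    #define L1_PAGETABLE_SHIFT 12
    #define L2_PAGETABLE_SHIFT 21
    #define L3_PAGETABLE_SHIFT 30
    #define L4_PAGETABLE_SHIFT 39
    #define ENTRIES_PER_TABLE  512   /* same for all four levels */

    #define table_offset(_a, _shift) (((_a) >> (_shift)) & (ENTRIES_PER_TABLE - 1))

    int main(void)
    {
        unsigned long va = 0xFFFF830012345678UL;
        printf("l4=%lu l3=%lu l2=%lu l1=%lu off=%#lx\n",
               table_offset(va, L4_PAGETABLE_SHIFT),
               table_offset(va, L3_PAGETABLE_SHIFT),
               table_offset(va, L2_PAGETABLE_SHIFT),
               table_offset(va, L1_PAGETABLE_SHIFT),
               va & 0xFFFUL);
        return 0;
    }
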
    70.1 --- a/xen/include/asm-x86/x86_64/regs.h	Mon Feb 07 08:19:24 2005 +0000
    70.2 +++ b/xen/include/asm-x86/x86_64/regs.h	Tue Feb 08 16:44:16 2005 +0000
    70.3 @@ -36,4 +36,6 @@ struct xen_regs
    70.4  #define RING_2(_r)    (((_r)->cs & 3) == 2)
    70.5  #define RING_3(_r)    (((_r)->cs & 3) == 3)
    70.6  
    70.7 +#define GUESTOS_FAULT(_r) (!VM86_MODE(_r) && RING_3(_r))
    70.8 +
    70.9  #endif
    71.1 --- a/xen/include/asm-x86/x86_64/uaccess.h	Mon Feb 07 08:19:24 2005 +0000
    71.2 +++ b/xen/include/asm-x86/x86_64/uaccess.h	Tue Feb 08 16:44:16 2005 +0000
    71.3 @@ -7,7 +7,6 @@
    71.4  #include <xen/config.h>
    71.5  #include <xen/compiler.h>
    71.6  #include <xen/errno.h>
    71.7 -#include <xen/sched.h>
    71.8  #include <xen/prefetch.h>
    71.9  #include <asm/page.h>
   71.10  
   71.11 @@ -16,34 +15,19 @@
   71.12  #define VERIFY_READ 0
   71.13  #define VERIFY_WRITE 1
   71.14  
   71.15 -#define __addr_ok(addr) ((unsigned long)(addr) < HYPERVISOR_VIRT_START)
   71.16 -
   71.17  /*
   71.18 - * Test whether a block of memory is a valid user space address.
   71.19 - * Returns 0 if the range is valid, nonzero otherwise.
   71.20 - *
   71.21 - * This is equivalent to the following test:
   71.22 - * ((u65)addr >= (u65)HYPERVISOR_VIRT_END) ?
   71.23 - * (((u65)addr + (u65)size) >= ((u65)1 << 64)) :
   71.24 - * (((u65)addr + (u65)size) >= ((u65)HYPERVISOR_VIRT_START))
   71.25 + * Valid if in +ve half of 48-bit address space, or above Xen-reserved area.
   71.26 + * This is also valid for range checks (addr, addr+size). As long as the
   71.27 + * start address is outside the Xen-reserved area then we will access a
   71.28 + * non-canonical address (and thus fault) before ever reaching VIRT_START.
   71.29   */
   71.30 -#define __range_not_ok(addr,size) ({ \
   71.31 -    unsigned long flag,sum; \
   71.32 -    if ((unsigned long)addr >= HYPERVISOR_VIRT_END) \
   71.33 -        asm("addq %3,%1 ; sbbq %0,%0" \
   71.34 -            :"=&r" (flag), "=r" (sum) \
   71.35 -            :"1" (addr),"g" ((long)(size))); \
   71.36 -    else \
   71.37 -        asm("addq %3,%1 ; sbbq %0,%0 ; cmpq %1,%4 ; sbbq $0,%0"  \
   71.38 -            :"=&r" (flag), "=r" (sum) \
   71.39 -            :"1" (addr),"g" ((long)(size)),"r" (HYPERVISOR_VIRT_START)); \
   71.40 -    flag; })
   71.41 +#define __addr_ok(addr) \
   71.42 +    (((unsigned long)(addr) < (1UL<<48)) || \
   71.43 +     ((unsigned long)(addr) >= HYPERVISOR_VIRT_END))
   71.44  
   71.45 -#define access_ok(type, addr, size) (__range_not_ok(addr,size) == 0)
   71.46 +#define access_ok(type, addr, size) (__addr_ok(addr))
   71.47  
   71.48 -#define array_access_ok(type,addr,count,size)                    \
   71.49 -    (likely(sizeof(count) <= 4) /* disallow 64-bit counts */ &&  \
   71.50 -     access_ok(type,addr,(unsigned long)count*(unsigned long)size))
   71.51 +#define array_access_ok(type,addr,count,size) (__addr_ok(addr))
   71.52  
   71.53  extern long __get_user_bad(void);
   71.54  extern void __put_user_bad(void);
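
The rewritten x86_64 access checks lean on the MMU: any address below 2^48
is either a legal guest/user address or non-canonical (and will fault on
first use), so a single comparison against HYPERVISOR_VIRT_END replaces the
old add-with-carry range arithmetic, and range/array checks only need to
validate the start address. A standalone sketch with an assumed value for
the Xen-reserved area (the real constants live elsewhere in the tree):

    #include <stdio.h>

    /* Assumed placement of the Xen-reserved virtual area; illustrative only. */
    #define HYPERVISOR_VIRT_START 0xFFFF800000000000UL
    #define HYPERVISOR_VIRT_END   0xFFFF880000000000UL

    #define __addr_ok(addr) \
        (((unsigned long)(addr) < (1UL << 48)) || \
         ((unsigned long)(addr) >= HYPERVISOR_VIRT_END))

    int main(void)
    {
        printf("%d\n", __addr_ok(0x00007FFFFFFFF000UL));  /* guest VA: accepted  */
        printf("%d\n", __addr_ok(HYPERVISOR_VIRT_START)); /* Xen area: rejected  */
        printf("%d\n", __addr_ok(HYPERVISOR_VIRT_END));   /* above Xen: accepted */
        return 0;
    }
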
    72.1 --- a/xen/include/public/arch-x86_64.h	Mon Feb 07 08:19:24 2005 +0000
    72.2 +++ b/xen/include/public/arch-x86_64.h	Tue Feb 08 16:44:16 2005 +0000
    72.3 @@ -43,11 +43,11 @@
    72.4   */
    72.5  
    72.6  #define FLAT_RING3_CS32 0x0823  /* GDT index 260 */
    72.7 -#define FLAT_RING3_CS64 0x082b  /* GDT index 261 */
    72.8 -#define FLAT_RING3_DS32 0x0833  /* GDT index 262 */
    72.9 +#define FLAT_RING3_CS64 0x0833  /* GDT index 262 */
   72.10 +#define FLAT_RING3_DS32 0x082b  /* GDT index 261 */
   72.11  #define FLAT_RING3_DS64 0x0000  /* NULL selector */
   72.12 -#define FLAT_RING3_SS32 0x0833  /* GDT index 262 */
   72.13 -#define FLAT_RING3_SS64 0x0833  /* GDT index 262 */
   72.14 +#define FLAT_RING3_SS32 0x082b  /* GDT index 261 */
   72.15 +#define FLAT_RING3_SS64 0x082b  /* GDT index 261 */
   72.16  
   72.17  #define FLAT_GUESTOS_DS64 FLAT_RING3_DS64
   72.18  #define FLAT_GUESTOS_DS32 FLAT_RING3_DS32
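
An x86 segment selector packs the descriptor-table index into its top 13
bits, with bit 2 choosing LDT versus GDT and the low two bits giving the
requested privilege level, so each constant above decodes as
index = selector >> 3 with RPL 3. A quick standalone decode of the ring-3
selectors:

    #include <stdio.h>

    #define FLAT_RING3_CS32 0x0823
    #define FLAT_RING3_CS64 0x0833
    #define FLAT_RING3_DS32 0x082b

    /* index = sel >> 3, TI = bit 2 (0 selects the GDT), RPL = low two bits */
    static void decode(const char *name, unsigned int sel)
    {
        printf("%s: index=%u ti=%u rpl=%u\n",
               name, sel >> 3, (sel >> 2) & 1, sel & 3);
    }

    int main(void)
    {
        decode("FLAT_RING3_CS32", FLAT_RING3_CS32);  /* index 260 */
        decode("FLAT_RING3_CS64", FLAT_RING3_CS64);  /* index 262 */
        decode("FLAT_RING3_DS32", FLAT_RING3_DS32);  /* index 261 */
        return 0;
    }
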
    73.1 --- a/xen/include/public/xen.h	Mon Feb 07 08:19:24 2005 +0000
    73.2 +++ b/xen/include/public/xen.h	Tue Feb 08 16:44:16 2005 +0000
    73.3 @@ -23,7 +23,14 @@
    73.4   * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS).
    73.5   */
    73.6  
    73.7 -/* EAX = vector; EBX, ECX, EDX, ESI, EDI = args 1, 2, 3, 4, 5. */
    73.8 +/*
    73.9 + * x86_32: EAX = vector; EBX, ECX, EDX, ESI, EDI = args 1, 2, 3, 4, 5.
   73.10 + *         EAX = return value
   73.11 + *         (argument registers may be clobbered on return)
   73.12 + * x86_64: RAX = vector; RDI, RSI, RDX, R10, R8, R9 = args 1, 2, 3, 4, 5, 6. 
   73.13 + *         RAX = return value
   73.14 + *         (argument registers not clobbered on return; RCX, R11 are)
   73.15 + */
   73.16  #define __HYPERVISOR_set_trap_table        0
   73.17  #define __HYPERVISOR_mmu_update            1
   73.18  #define __HYPERVISOR_set_gdt               2
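
The expanded comment documents the register ABI for hypercalls on both
architectures. A hedged sketch of a 32-bit guest-side stub following that
convention; the trap vector 0x82 is an assumption about the contemporary
guest ABI and is not stated in this changeset:

    /* Two-argument hypercall from a 32-bit guest: EAX = hypercall number,
     * EBX/ECX = arguments, result returned in EAX. The argument registers
     * may be clobbered, so they appear as outputs with matching inputs. */
    static inline long hypercall2(unsigned int op,
                                  unsigned long a1, unsigned long a2)
    {
        long ret, clob1, clob2;

        __asm__ __volatile__ (
            "int $0x82"                              /* assumed hypercall vector */
            : "=a" (ret), "=b" (clob1), "=c" (clob2)
            : "0" (op), "1" (a1), "2" (a2)
            : "memory" );

        return ret;
    }
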
    74.1 --- a/xen/include/xen/ioport.h	Mon Feb 07 08:19:24 2005 +0000
    74.2 +++ b/xen/include/xen/ioport.h	Tue Feb 08 16:44:16 2005 +0000
    74.3 @@ -100,17 +100,13 @@ extern int allocate_resource(struct reso
    74.4  #define request_region(start,n,name)	__request_region(&ioport_resource, (start), (n), (name))
    74.5  #define request_mem_region(start,n,name) __request_region(&iomem_resource, (start), (n), (name))
    74.6  
    74.7 -extern struct resource * __request_region(struct resource *, unsigned long start, unsigned long n, const char *name);
    74.8 -
    74.9 -/* Compatibility cruft */
   74.10 -#define check_region(start,n)	__check_region(&ioport_resource, (start), (n))
   74.11  #define release_region(start,n)	__release_region(&ioport_resource, (start), (n))
   74.12 -#define check_mem_region(start,n)	__check_region(&iomem_resource, (start), (n))
   74.13  #define release_mem_region(start,n)	__release_region(&iomem_resource, (start), (n))
   74.14  
   74.15 -extern int __check_region(struct resource *, unsigned long, unsigned long);
   74.16  extern void __release_region(struct resource *, unsigned long, unsigned long);
   74.17  
   74.18 +extern struct resource * __request_region(struct resource *, unsigned long start, unsigned long n, const char *name);
   74.19 +
   74.20  #define get_ioport_list(buf)	get_resource_list(&ioport_resource, buf, PAGE_SIZE)
   74.21  #define get_mem_list(buf)	get_resource_list(&iomem_resource, buf, PAGE_SIZE)
   74.22  
    75.1 --- a/xen/include/xen/sched.h	Mon Feb 07 08:19:24 2005 +0000
    75.2 +++ b/xen/include/xen/sched.h	Tue Feb 08 16:44:16 2005 +0000
    75.3 @@ -262,8 +262,32 @@ int idle_cpu(int cpu); /* Is CPU 'cpu' i
    75.4  
    75.5  void startup_cpu_idle_loop(void);
    75.6  
    75.7 -unsigned long hypercall_create_continuation(
    75.8 +unsigned long __hypercall_create_continuation(
    75.9      unsigned int op, unsigned int nr_args, ...);
   75.10 +#define hypercall0_create_continuation(_op)                               \
   75.11 +    __hypercall_create_continuation((_op), 0)
   75.12 +#define hypercall1_create_continuation(_op, _a1)                          \
   75.13 +    __hypercall_create_continuation((_op), 1,                             \
   75.14 +        (unsigned long)(_a1))
   75.15 +#define hypercall2_create_continuation(_op, _a1, _a2)                     \
   75.16 +    __hypercall_create_continuation((_op), 2,                             \
   75.17 +        (unsigned long)(_a1), (unsigned long)(_a2))
   75.18 +#define hypercall3_create_continuation(_op, _a1, _a2, _a3)                \
   75.19 +    __hypercall_create_continuation((_op), 3,                             \
   75.20 +        (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3))
   75.21 +#define hypercall4_create_continuation(_op, _a1, _a2, _a3, _a4)           \
   75.22 +    __hypercall_create_continuation((_op), 4,                             \
   75.23 +        (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3), \
   75.24 +        (unsigned long)(_a4))
   75.25 +#define hypercall5_create_continuation(_op, _a1, _a2, _a3, _a4, _a5)      \
   75.26 +    __hypercall_create_continuation((_op), 5,                             \
   75.27 +        (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3), \
   75.28 +        (unsigned long)(_a4), (unsigned long)(_a5))
   75.29 +#define hypercall6_create_continuation(_op, _a1, _a2, _a3, _a4, _a5, _a6) \
   75.30 +    __hypercall_create_continuation((_op), 6,                             \
   75.31 +        (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3), \
   75.32 +        (unsigned long)(_a4), (unsigned long)(_a5), (unsigned long)(_a6))
   75.33 +
   75.34  #define hypercall_preempt_check() \
   75.35      (unlikely(softirq_pending(smp_processor_id())))
   75.36