ia64/xen-unstable

changeset 3720:9f7935ea4606

bitkeeper revision 1.1159.212.128 (4208d72fZEHIE9NOZZbr91V7R-3gUg)

Merge scramble.cl.cam.ac.uk:/auto/groups/xeno/BK/xeno.bk
into scramble.cl.cam.ac.uk:/local/scratch/kaf24/xen-unstable.bk
author kaf24@scramble.cl.cam.ac.uk
date Tue Feb 08 15:13:51 2005 +0000 (2005-02-08)
parents ea98f0bb6510 f504382b179f
children 4f427b731288 5612c06cde33
files .rootkeys linux-2.4.29-xen-sparse/mm/memory.c linux-2.6.10-xen-sparse/drivers/xen/blkback/blkback.c linux-2.6.10-xen-sparse/drivers/xen/netback/netback.c linux-2.6.10-xen-sparse/drivers/xen/netfront/netfront.c linux-2.6.10-xen-sparse/drivers/xen/usbback/usbback.c linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable.h linux-2.6.10-xen-sparse/include/asm-xen/hypervisor.h netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c xen/arch/x86/memory.c xen/arch/x86/mm.c xen/arch/x86/traps.c xen/common/dom_mem_ops.c xen/include/asm-x86/page.h xen/include/asm-x86/x86_32/regs.h xen/include/asm-x86/x86_64/regs.h xen/include/asm-x86/x86_64/uaccess.h
line diff
     1.1 --- a/.rootkeys	Tue Feb 08 12:27:23 2005 +0000
     1.2 +++ b/.rootkeys	Tue Feb 08 15:13:51 2005 +0000
     1.3 @@ -867,8 +867,8 @@ 3ddb79bcCAq6IpdkHueChoVTfXqEQQ xen/arch/
     1.4  3ddb79bcBit4xJXbwtX0kb1hh2uO1Q xen/arch/x86/idle0_task.c
     1.5  3ddb79bcKIkRR0kqWaJhe5VUDkMdxg xen/arch/x86/io_apic.c
     1.6  3ddb79bdqfIcjkz_h9Hvtp8Tk_19Zw xen/arch/x86/irq.c
     1.7 -40ec29ffuOa1ZvmJHzFKyZn4k_RcXg xen/arch/x86/memory.c
     1.8  41d54a76qfpO0VnbL2tYs0Jgt3W3XA xen/arch/x86/microcode.c
     1.9 +40ec29ffuOa1ZvmJHzFKyZn4k_RcXg xen/arch/x86/mm.c
    1.10  3ddb79bdS4UeWWXDH-FaBKqcpMFcnw xen/arch/x86/mpparse.c
    1.11  41aaf566Z4sTDgJ77eEg0TzzQ1ka6Q xen/arch/x86/mtrr/amd.c
    1.12  41aaf566TOpOBXT00wwQGUh20f1rlA xen/arch/x86/mtrr/centaur.c
     2.1 --- a/linux-2.4.29-xen-sparse/mm/memory.c	Tue Feb 08 12:27:23 2005 +0000
     2.2 +++ b/linux-2.4.29-xen-sparse/mm/memory.c	Tue Feb 08 15:13:51 2005 +0000
     2.3 @@ -915,7 +915,7 @@ static inline void establish_pte(struct 
     2.4  #ifdef CONFIG_XEN
     2.5  	if ( likely(vma->vm_mm == current->mm) ) {
     2.6  		XEN_flush_page_update_queue();
     2.7 -		HYPERVISOR_update_va_mapping(address>>PAGE_SHIFT, entry, UVMF_INVLPG);
     2.8 +		HYPERVISOR_update_va_mapping(address, entry, UVMF_INVLPG);
     2.9  	} else {
    2.10  		set_pte(page_table, entry);
    2.11  		flush_tlb_page(vma, address);
    2.12 @@ -1191,7 +1191,7 @@ static int do_swap_page(struct mm_struct
    2.13  #ifdef CONFIG_XEN
    2.14  	if ( likely(vma->vm_mm == current->mm) ) {
    2.15  		XEN_flush_page_update_queue();
    2.16 -		HYPERVISOR_update_va_mapping(address>>PAGE_SHIFT, pte, 0);
    2.17 +		HYPERVISOR_update_va_mapping(address, pte, 0);
    2.18  	} else {
    2.19  		set_pte(page_table, pte);
    2.20  		XEN_flush_page_update_queue();
    2.21 @@ -1247,7 +1247,7 @@ static int do_anonymous_page(struct mm_s
    2.22  #ifdef CONFIG_XEN
    2.23  	if ( likely(vma->vm_mm == current->mm) ) {
    2.24  		XEN_flush_page_update_queue();
    2.25 -		HYPERVISOR_update_va_mapping(addr>>PAGE_SHIFT, entry, 0);
    2.26 +		HYPERVISOR_update_va_mapping(addr, entry, 0);
    2.27  	} else {
    2.28  		set_pte(page_table, entry);
    2.29  		XEN_flush_page_update_queue();
    2.30 @@ -1333,7 +1333,7 @@ static int do_no_page(struct mm_struct *
    2.31  #ifdef CONFIG_XEN
    2.32  		if ( likely(vma->vm_mm == current->mm) ) {
    2.33  			XEN_flush_page_update_queue();
    2.34 -			HYPERVISOR_update_va_mapping(address>>PAGE_SHIFT, entry, 0);
    2.35 +			HYPERVISOR_update_va_mapping(address, entry, 0);
    2.36  		} else {
    2.37  			set_pte(page_table, entry);
    2.38  			XEN_flush_page_update_queue();
     3.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/blkback/blkback.c	Tue Feb 08 12:27:23 2005 +0000
     3.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/blkback/blkback.c	Tue Feb 08 15:13:51 2005 +0000
     3.3 @@ -95,7 +95,7 @@ static void fast_flush_area(int idx, int
     3.4      for ( i = 0; i < nr_pages; i++ )
     3.5      {
     3.6          mcl[i].op = __HYPERVISOR_update_va_mapping;
     3.7 -        mcl[i].args[0] = MMAP_VADDR(idx, i) >> PAGE_SHIFT;
     3.8 +        mcl[i].args[0] = MMAP_VADDR(idx, i);
     3.9          mcl[i].args[1] = 0;
    3.10          mcl[i].args[2] = 0;
    3.11      }
    3.12 @@ -343,14 +343,14 @@ static void dispatch_probe(blkif_t *blki
    3.13  
    3.14  #ifdef CONFIG_XEN_BLKDEV_TAP_BE
    3.15      if ( HYPERVISOR_update_va_mapping_otherdomain(
    3.16 -        MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT,
    3.17 +        MMAP_VADDR(pending_idx, 0),
    3.18          (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
    3.19          0, (blkif->is_blktap ? ID_TO_DOM(req->id) : blkif->domid) ) )
    3.20          
    3.21          goto out;
    3.22  #else
    3.23      if ( HYPERVISOR_update_va_mapping_otherdomain(
    3.24 -        MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT,
    3.25 +        MMAP_VADDR(pending_idx, 0),
    3.26          (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
    3.27          0, blkif->domid) ) 
    3.28          
    3.29 @@ -436,7 +436,7 @@ static void dispatch_rw_block_io(blkif_t
    3.30      for ( i = 0; i < nr_psegs; i++ )
    3.31      {
    3.32          mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain;
    3.33 -        mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT;
    3.34 +        mcl[i].args[0] = MMAP_VADDR(pending_idx, i);
    3.35          mcl[i].args[1] = (phys_seg[i].buffer & PAGE_MASK) | remap_prot;
    3.36          mcl[i].args[2] = 0;
    3.37  #ifdef CONFIG_XEN_BLKDEV_TAP_BE
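The fast_flush_area() hunk above, and the netback, usbback and if_xennet hunks that follow, all make the same mechanical change to their batched update_va_mapping multicalls: args[0] now carries the mapping's virtual address itself rather than its frame number. A minimal sketch of the resulting pattern (not verbatim driver source), assuming the multicall_entry_t layout and HYPERVISOR_multicall wrapper from asm-xen/hypervisor.h and the UVMF_FLUSH_TLB flag seen in the if_xennet.c hunk:

/* Sketch: unmap nr_pages backend mappings with a single multicall. */
static void flush_area_sketch(multicall_entry_t *mcl, int idx, int nr_pages)
{
    int i;
    for ( i = 0; i < nr_pages; i++ )
    {
        mcl[i].op      = __HYPERVISOR_update_va_mapping;
        mcl[i].args[0] = MMAP_VADDR(idx, i);  /* the VA, no longer >> PAGE_SHIFT */
        mcl[i].args[1] = 0;                   /* zero PTE tears the mapping down */
        mcl[i].args[2] = 0;                   /* per-entry UVMF flags */
    }
    mcl[nr_pages-1].args[2] = UVMF_FLUSH_TLB; /* one TLB flush after the batch */
    if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) )
        BUG();
}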
     4.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/netback/netback.c	Tue Feb 08 12:27:23 2005 +0000
     4.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/netback/netback.c	Tue Feb 08 15:13:51 2005 +0000
     4.3 @@ -234,7 +234,7 @@ static void net_rx_action(unsigned long 
     4.4          mmu[2].val  = MMUEXT_REASSIGN_PAGE;
     4.5  
     4.6          mcl[0].op = __HYPERVISOR_update_va_mapping;
     4.7 -        mcl[0].args[0] = vdata >> PAGE_SHIFT;
     4.8 +        mcl[0].args[0] = vdata;
     4.9          mcl[0].args[1] = (new_mfn << PAGE_SHIFT) | __PAGE_KERNEL;
    4.10          mcl[0].args[2] = 0;
    4.11          mcl[1].op = __HYPERVISOR_mmu_update;
    4.12 @@ -409,7 +409,7 @@ static void net_tx_action(unsigned long 
    4.13      {
    4.14          pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)];
    4.15          mcl[0].op = __HYPERVISOR_update_va_mapping;
    4.16 -        mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT;
    4.17 +        mcl[0].args[0] = MMAP_VADDR(pending_idx);
    4.18          mcl[0].args[1] = 0;
    4.19          mcl[0].args[2] = 0;
    4.20          mcl++;     
    4.21 @@ -546,7 +546,7 @@ static void net_tx_action(unsigned long 
    4.22          skb_reserve(skb, 16);
    4.23  
    4.24          mcl[0].op = __HYPERVISOR_update_va_mapping_otherdomain;
    4.25 -        mcl[0].args[0] = MMAP_VADDR(pending_idx) >> PAGE_SHIFT;
    4.26 +        mcl[0].args[0] = MMAP_VADDR(pending_idx);
    4.27          mcl[0].args[1] = (txreq.addr & PAGE_MASK) | __PAGE_KERNEL;
    4.28          mcl[0].args[2] = 0;
    4.29          mcl[0].args[3] = netif->domid;
     5.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/netfront/netfront.c	Tue Feb 08 12:27:23 2005 +0000
     5.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/netfront/netfront.c	Tue Feb 08 15:13:51 2005 +0000
     5.3 @@ -392,7 +392,7 @@ static void network_alloc_rx_buffers(str
     5.4  	    = INVALID_P2M_ENTRY;
     5.5  
     5.6          rx_mcl[i].op = __HYPERVISOR_update_va_mapping;
     5.7 -        rx_mcl[i].args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
     5.8 +        rx_mcl[i].args[0] = (unsigned long)skb->head;
     5.9          rx_mcl[i].args[1] = 0;
    5.10          rx_mcl[i].args[2] = 0;
    5.11      }
    5.12 @@ -593,7 +593,7 @@ static int netif_poll(struct net_device 
    5.13          mmu->val  = __pa(skb->head) >> PAGE_SHIFT;
    5.14          mmu++;
    5.15          mcl->op = __HYPERVISOR_update_va_mapping;
    5.16 -        mcl->args[0] = (unsigned long)skb->head >> PAGE_SHIFT;
    5.17 +        mcl->args[0] = (unsigned long)skb->head;
    5.18          mcl->args[1] = (rx->addr & PAGE_MASK) | __PAGE_KERNEL;
    5.19          mcl->args[2] = 0;
    5.20          mcl++;
     6.1 --- a/linux-2.6.10-xen-sparse/drivers/xen/usbback/usbback.c	Tue Feb 08 12:27:23 2005 +0000
     6.2 +++ b/linux-2.6.10-xen-sparse/drivers/xen/usbback/usbback.c	Tue Feb 08 15:13:51 2005 +0000
     6.3 @@ -191,7 +191,7 @@ static void fast_flush_area(int idx, int
     6.4      for ( i = 0; i < nr_pages; i++ )
     6.5      {
     6.6          mcl[i].op = __HYPERVISOR_update_va_mapping;
     6.7 -        mcl[i].args[0] = MMAP_VADDR(idx, i) >> PAGE_SHIFT;
     6.8 +        mcl[i].args[0] = MMAP_VADDR(idx, i);
     6.9          mcl[i].args[1] = 0;
    6.10          mcl[i].args[2] = 0;
    6.11      }
    6.12 @@ -630,7 +630,7 @@ static void dispatch_usb_io(usbif_priv_t
    6.13            i++, offset += PAGE_SIZE )
    6.14      {
    6.15  	mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain;
    6.16 -	mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT;
    6.17 +	mcl[i].args[0] = MMAP_VADDR(pending_idx, i);
    6.18          mcl[i].args[1] = ((buffer_mach & PAGE_MASK) + offset) | remap_prot;
    6.19          mcl[i].args[2] = 0;
    6.20          mcl[i].args[3] = up->domid;
    6.21 @@ -646,7 +646,7 @@ static void dispatch_usb_io(usbif_priv_t
    6.22      {
    6.23          /* Map in ISO schedule, if necessary. */
    6.24          mcl[i].op = __HYPERVISOR_update_va_mapping_otherdomain;
    6.25 -        mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT;
    6.26 +        mcl[i].args[0] = MMAP_VADDR(pending_idx, i);
    6.27          mcl[i].args[1] = (req->iso_schedule & PAGE_MASK) | remap_prot;
    6.28          mcl[i].args[2] = 0;
    6.29          mcl[i].args[3] = up->domid;
     7.1 --- a/linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Tue Feb 08 12:27:23 2005 +0000
     7.2 +++ b/linux-2.6.10-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Tue Feb 08 15:13:51 2005 +0000
     7.3 @@ -426,7 +426,7 @@ extern pte_t *lookup_address(unsigned lo
     7.4  		if (__dirty) {						  \
     7.5  		        if ( likely((__vma)->vm_mm == current->mm) ) {    \
     7.6  			    xen_flush_page_update_queue();                \
     7.7 -			    HYPERVISOR_update_va_mapping((__address)>>PAGE_SHIFT, (__entry), UVMF_INVLPG); \
     7.8 +			    HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG); \
     7.9  			} else {                                          \
    7.10                              xen_l1_entry_update((__ptep), (__entry).pte_low); \
    7.11  			    flush_tlb_page((__vma), (__address));         \
    7.12 @@ -445,7 +445,7 @@ do {				  					\
    7.13  do {				  					\
    7.14  	if (likely((__vma)->vm_mm == current->mm)) {			\
    7.15  		xen_flush_page_update_queue();				\
    7.16 -		HYPERVISOR_update_va_mapping((__address)>>PAGE_SHIFT,	\
    7.17 +		HYPERVISOR_update_va_mapping((__address),		\
    7.18  					     __entry, 0);		\
    7.19  	} else {							\
    7.20  		xen_l1_entry_update((__ptep), (__entry).pte_low);	\
     8.1 --- a/linux-2.6.10-xen-sparse/include/asm-xen/hypervisor.h	Tue Feb 08 12:27:23 2005 +0000
     8.2 +++ b/linux-2.6.10-xen-sparse/include/asm-xen/hypervisor.h	Tue Feb 08 15:13:51 2005 +0000
     8.3 @@ -438,7 +438,7 @@ HYPERVISOR_multicall(
     8.4  
     8.5  static inline int
     8.6  HYPERVISOR_update_va_mapping(
     8.7 -    unsigned long page_nr, pte_t new_val, unsigned long flags)
     8.8 +    unsigned long va, pte_t new_val, unsigned long flags)
     8.9  {
    8.10      int ret;
    8.11      unsigned long ign1, ign2, ign3;
    8.12 @@ -447,13 +447,13 @@ HYPERVISOR_update_va_mapping(
    8.13          TRAP_INSTR
    8.14          : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3)
    8.15  	: "0" (__HYPERVISOR_update_va_mapping), 
    8.16 -          "1" (page_nr), "2" ((new_val).pte_low), "3" (flags)
    8.17 +          "1" (va), "2" ((new_val).pte_low), "3" (flags)
    8.18  	: "memory" );
    8.19  
    8.20      if ( unlikely(ret < 0) )
    8.21      {
    8.22          printk(KERN_ALERT "Failed update VA mapping: %08lx, %08lx, %08lx\n",
    8.23 -               page_nr, (new_val).pte_low, flags);
    8.24 +               va, (new_val).pte_low, flags);
    8.25          BUG();
    8.26      }
    8.27  
    8.28 @@ -540,7 +540,7 @@ HYPERVISOR_grant_table_op(
    8.29  
    8.30  static inline int
    8.31  HYPERVISOR_update_va_mapping_otherdomain(
    8.32 -    unsigned long page_nr, pte_t new_val, unsigned long flags, domid_t domid)
    8.33 +    unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
    8.34  {
    8.35      int ret;
    8.36      unsigned long ign1, ign2, ign3, ign4;
    8.37 @@ -549,7 +549,7 @@ HYPERVISOR_update_va_mapping_otherdomain
    8.38          TRAP_INSTR
    8.39          : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4)
    8.40  	: "0" (__HYPERVISOR_update_va_mapping_otherdomain),
    8.41 -          "1" (page_nr), "2" ((new_val).pte_low), "3" (flags), "4" (domid) :
    8.42 +          "1" (va), "2" ((new_val).pte_low), "3" (flags), "4" (domid) :
    8.43          "memory" );
    8.44      
    8.45      return ret;
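With the wrapper above now taking the virtual address directly, callers drop the >>PAGE_SHIFT they used to apply, as the driver and pgtable.h hunks show. A minimal sketch of the revised calling convention (an illustration, not source from this tree), assuming the xen_flush_page_update_queue() helper and UVMF_INVLPG flag used in the pgtable.h hunk; failure is handled inside the wrapper, which BUG()s:

/* Sketch: install 'entry' as the PTE mapping 'address' and invalidate that
 * one TLB entry.  The first argument is the VA itself, not address >> PAGE_SHIFT. */
static inline void establish_pte_sketch(unsigned long address, pte_t entry)
{
    xen_flush_page_update_queue();   /* drain any queued page-table updates first */
    HYPERVISOR_update_va_mapping(address, entry, UVMF_INVLPG);
}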
     9.1 --- a/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h	Tue Feb 08 12:27:23 2005 +0000
     9.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/include/hypervisor.h	Tue Feb 08 15:13:51 2005 +0000
     9.3 @@ -398,7 +398,7 @@ HYPERVISOR_multicall(void *call_list, in
     9.4  }
     9.5  
     9.6  static inline int
     9.7 -HYPERVISOR_update_va_mapping(unsigned long page_nr, unsigned long new_val,
     9.8 +HYPERVISOR_update_va_mapping(unsigned long va, unsigned long new_val,
     9.9      unsigned long flags)
    9.10  {
    9.11      int ret;
    9.12 @@ -408,12 +408,12 @@ HYPERVISOR_update_va_mapping(unsigned lo
    9.13          TRAP_INSTR
    9.14          : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3)
    9.15  	: "0" (__HYPERVISOR_update_va_mapping), 
    9.16 -          "1" (page_nr), "2" (new_val), "3" (flags)
    9.17 +          "1" (va), "2" (new_val), "3" (flags)
    9.18  	: "memory" );
    9.19  
    9.20      if (__predict_false(ret < 0))
    9.21          panic("Failed update VA mapping: %08lx, %08lx, %08lx",
    9.22 -              page_nr, new_val, flags);
    9.23 +              va, new_val, flags);
    9.24  
    9.25      return ret;
    9.26  }
    9.27 @@ -494,7 +494,7 @@ HYPERVISOR_grant_table_op(unsigned int c
    9.28  }
    9.29  
    9.30  static inline int
    9.31 -HYPERVISOR_update_va_mapping_otherdomain(unsigned long page_nr,
    9.32 +HYPERVISOR_update_va_mapping_otherdomain(unsigned long va,
    9.33      unsigned long new_val, unsigned long flags, domid_t domid)
    9.34  {
    9.35      int ret;
    9.36 @@ -504,7 +504,7 @@ HYPERVISOR_update_va_mapping_otherdomain
    9.37          TRAP_INSTR
    9.38          : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4)
    9.39  	: "0" (__HYPERVISOR_update_va_mapping_otherdomain),
    9.40 -          "1" (page_nr), "2" (new_val), "3" (flags), "4" (domid) :
    9.41 +          "1" (va), "2" (new_val), "3" (flags), "4" (domid) :
    9.42          "memory" );
    9.43      
    9.44      return ret;
    10.1 --- a/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c	Tue Feb 08 12:27:23 2005 +0000
    10.2 +++ b/netbsd-2.0-xen-sparse/sys/arch/xen/xen/if_xennet.c	Tue Feb 08 15:13:51 2005 +0000
    10.3 @@ -580,7 +580,7 @@ xennet_rx_push_buffer(struct xennet_soft
    10.4  		INVALID_P2M_ENTRY;
    10.5  
    10.6  	rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping;
    10.7 -	rx_mcl[nr_pfns].args[0] = sc->sc_rx_bufa[id].xb_rx.xbrx_va >> PAGE_SHIFT;
    10.8 +	rx_mcl[nr_pfns].args[0] = sc->sc_rx_bufa[id].xb_rx.xbrx_va;
    10.9  	rx_mcl[nr_pfns].args[1] = 0;
   10.10  	rx_mcl[nr_pfns].args[2] = 0;
   10.11  
   10.12 @@ -679,7 +679,7 @@ xen_network_handler(void *arg)
   10.13  		mmu->val  = (pa - XPMAP_OFFSET) >> PAGE_SHIFT;
   10.14  		mmu++;
   10.15  		mcl->op = __HYPERVISOR_update_va_mapping;
   10.16 -		mcl->args[0] = sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va >> PAGE_SHIFT;
   10.17 +		mcl->args[0] = sc->sc_rx_bufa[rx->id].xb_rx.xbrx_va;
   10.18  		mcl->args[1] = (rx->addr & PG_FRAME) | PG_V|PG_KW;
   10.19  		mcl->args[2] = UVMF_FLUSH_TLB; // 0;
   10.20  		mcl++;
   10.21 @@ -872,7 +872,7 @@ network_alloc_rx_buffers(struct xennet_s
   10.22  			INVALID_P2M_ENTRY;
   10.23  
   10.24  		rx_mcl[nr_pfns].op = __HYPERVISOR_update_va_mapping;
   10.25 -		rx_mcl[nr_pfns].args[0] = va >> PAGE_SHIFT;
   10.26 +		rx_mcl[nr_pfns].args[0] = va;
   10.27  		rx_mcl[nr_pfns].args[1] = 0;
   10.28  		rx_mcl[nr_pfns].args[2] = 0;
   10.29  
    11.1 --- a/xen/arch/x86/memory.c	Tue Feb 08 12:27:23 2005 +0000
    11.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.3 @@ -1,2594 +0,0 @@
    11.4 -/* -*-  Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
    11.5 -/******************************************************************************
    11.6 - * arch/x86/memory.c
    11.7 - * 
    11.8 - * Copyright (c) 2002-2004 K A Fraser
    11.9 - * Copyright (c) 2004 Christian Limpach
   11.10 - * 
   11.11 - * This program is free software; you can redistribute it and/or modify
   11.12 - * it under the terms of the GNU General Public License as published by
   11.13 - * the Free Software Foundation; either version 2 of the License, or
   11.14 - * (at your option) any later version.
   11.15 - * 
   11.16 - * This program is distributed in the hope that it will be useful,
   11.17 - * but WITHOUT ANY WARRANTY; without even the implied warranty of
   11.18 - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   11.19 - * GNU General Public License for more details.
   11.20 - * 
   11.21 - * You should have received a copy of the GNU General Public License
   11.22 - * along with this program; if not, write to the Free Software
   11.23 - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   11.24 - */
   11.25 -
   11.26 -/*
   11.27 - * A description of the x86 page table API:
   11.28 - * 
   11.29 - * Domains trap to do_mmu_update with a list of update requests.
   11.30 - * This is a list of (ptr, val) pairs, where the requested operation
   11.31 - * is *ptr = val.
   11.32 - * 
   11.33 - * Reference counting of pages:
   11.34 - * ----------------------------
   11.35 - * Each page has two refcounts: tot_count and type_count.
   11.36 - * 
   11.37 - * TOT_COUNT is the obvious reference count. It counts all uses of a
   11.38 - * physical page frame by a domain, including uses as a page directory,
   11.39 - * a page table, or simple mappings via a PTE. This count prevents a
   11.40 - * domain from releasing a frame back to the free pool when it still holds
   11.41 - * a reference to it.
   11.42 - * 
   11.43 - * TYPE_COUNT is more subtle. A frame can be put to one of three
   11.44 - * mutually-exclusive uses: it might be used as a page directory, or a
   11.45 - * page table, or it may be mapped writable by the domain [of course, a
   11.46 - * frame may not be used in any of these three ways!].
   11.47 - * So, type_count is a count of the number of times a frame is being 
   11.48 - * referred to in its current incarnation. Therefore, a page can only
   11.49 - * change its type when its type count is zero.
   11.50 - * 
   11.51 - * Pinning the page type:
   11.52 - * ----------------------
   11.53 - * The type of a page can be pinned/unpinned with the commands
   11.54 - * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
   11.55 - * pinning is not reference counted, so it can't be nested).
   11.56 - * This is useful to prevent a page's type count falling to zero, at which
   11.57 - * point safety checks would need to be carried out next time the count
   11.58 - * is increased again.
   11.59 - * 
   11.60 - * A further note on writable page mappings:
   11.61 - * -----------------------------------------
   11.62 - * For simplicity, the count of writable mappings for a page may not
   11.63 - * correspond to reality. The 'writable count' is incremented for every
   11.64 - * PTE which maps the page with the _PAGE_RW flag set. However, for
   11.65 - * write access to be possible the page directory entry must also have
   11.66 - * its _PAGE_RW bit set. We do not check this as it complicates the 
   11.67 - * reference counting considerably [consider the case of multiple
   11.68 - * directory entries referencing a single page table, some with the RW
   11.69 - * bit set, others not -- it starts getting a bit messy].
   11.70 - * In normal use, this simplification shouldn't be a problem.
   11.71 - * However, the logic can be added if required.
   11.72 - * 
   11.73 - * One more note on read-only page mappings:
   11.74 - * -----------------------------------------
   11.75 - * We want domains to be able to map pages for read-only access. The
   11.76 - * main reason is that page tables and directories should be readable
   11.77 - * by a domain, but it would not be safe for them to be writable.
   11.78 - * However, domains have free access to rings 1 & 2 of the Intel
   11.79 - * privilege model. In terms of page protection, these are considered
   11.80 - * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
   11.81 - * read-only restrictions are respected in supervisor mode -- if the 
   11.82 - * bit is clear then any mapped page is writable.
   11.83 - * 
   11.84 - * We get round this by always setting the WP bit and disallowing 
   11.85 - * updates to it. This is very unlikely to cause a problem for guest
   11.86 - * OS's, which will generally use the WP bit to simplify copy-on-write
   11.87 - * implementation (in that case, OS wants a fault when it writes to
   11.88 - * an application-supplied buffer).
   11.89 - */
   11.90 -
   11.91 -#include <xen/config.h>
   11.92 -#include <xen/init.h>
   11.93 -#include <xen/kernel.h>
   11.94 -#include <xen/lib.h>
   11.95 -#include <xen/mm.h>
   11.96 -#include <xen/sched.h>
   11.97 -#include <xen/errno.h>
   11.98 -#include <xen/perfc.h>
   11.99 -#include <xen/irq.h>
  11.100 -#include <xen/softirq.h>
  11.101 -#include <asm/shadow.h>
  11.102 -#include <asm/page.h>
  11.103 -#include <asm/flushtlb.h>
  11.104 -#include <asm/io.h>
  11.105 -#include <asm/uaccess.h>
  11.106 -#include <asm/domain_page.h>
  11.107 -#include <asm/ldt.h>
  11.108 -
  11.109 -#ifdef VERBOSE
  11.110 -#define MEM_LOG(_f, _a...)                           \
  11.111 -  printk("DOM%u: (file=memory.c, line=%d) " _f "\n", \
  11.112 -         current->domain->id , __LINE__ , ## _a )
  11.113 -#else
  11.114 -#define MEM_LOG(_f, _a...) ((void)0)
  11.115 -#endif
  11.116 -
  11.117 -static int alloc_l2_table(struct pfn_info *page);
  11.118 -static int alloc_l1_table(struct pfn_info *page);
  11.119 -static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
  11.120 -static int get_page_and_type_from_pagenr(unsigned long page_nr, 
  11.121 -                                         u32 type,
  11.122 -                                         struct domain *d);
  11.123 -
  11.124 -static void free_l2_table(struct pfn_info *page);
  11.125 -static void free_l1_table(struct pfn_info *page);
  11.126 -
  11.127 -static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
  11.128 -static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
  11.129 -
  11.130 -/* Used to defer flushing of memory structures. */
  11.131 -static struct {
  11.132 -#define DOP_FLUSH_TLB   (1<<0) /* Flush the TLB.                 */
  11.133 -#define DOP_RELOAD_LDT  (1<<1) /* Reload the LDT shadow mapping. */
  11.134 -    unsigned long  deferred_ops;
  11.135 -    /* If non-NULL, specifies a foreign subject domain for some operations. */
  11.136 -    struct domain *foreign;
  11.137 -} __cacheline_aligned percpu_info[NR_CPUS];
  11.138 -
  11.139 -/*
  11.140 - * Returns the current foreign domain; defaults to the currently-executing
  11.141 - * domain if a foreign override hasn't been specified.
  11.142 - */
  11.143 -#define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
  11.144 -
  11.145 -/* Private domain structs for DOMID_XEN and DOMID_IO. */
  11.146 -static struct domain *dom_xen, *dom_io;
  11.147 -
  11.148 -/* Frame table and its size in pages. */
  11.149 -struct pfn_info *frame_table;
  11.150 -unsigned long frame_table_size;
  11.151 -unsigned long max_page;
  11.152 -
  11.153 -void __init init_frametable(void)
  11.154 -{
  11.155 -    unsigned long i, p;
  11.156 -
  11.157 -    frame_table      = (struct pfn_info *)FRAMETABLE_VIRT_START;
  11.158 -    frame_table_size = max_page * sizeof(struct pfn_info);
  11.159 -    frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
  11.160 -
  11.161 -    for ( i = 0; i < frame_table_size; i += (4UL << 20) )
  11.162 -    {
  11.163 -        p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20);
  11.164 -        if ( p == 0 )
  11.165 -            panic("Not enough memory for frame table\n");
  11.166 -        map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p, 
  11.167 -                  4UL << 20, PAGE_HYPERVISOR);
  11.168 -    }
  11.169 -
  11.170 -    memset(frame_table, 0, frame_table_size);
  11.171 -}
  11.172 -
  11.173 -void arch_init_memory(void)
  11.174 -{
  11.175 -    extern void subarch_init_memory(struct domain *);
  11.176 -
  11.177 -    memset(percpu_info, 0, sizeof(percpu_info));
  11.178 -
  11.179 -    /*
  11.180 -     * Initialise our DOMID_XEN domain.
  11.181 -     * Any Xen-heap pages that we will allow to be mapped will have
  11.182 -     * their domain field set to dom_xen.
  11.183 -     */
  11.184 -    dom_xen = alloc_domain_struct();
  11.185 -    atomic_set(&dom_xen->refcnt, 1);
  11.186 -    dom_xen->id = DOMID_XEN;
  11.187 -
  11.188 -    /*
  11.189 -     * Initialise our DOMID_IO domain.
  11.190 -     * This domain owns no pages but is considered a special case when
  11.191 -     * mapping I/O pages, as the mappings occur at the priv of the caller.
  11.192 -     */
  11.193 -    dom_io = alloc_domain_struct();
  11.194 -    atomic_set(&dom_io->refcnt, 1);
  11.195 -    dom_io->id = DOMID_IO;
  11.196 -
  11.197 -    subarch_init_memory(dom_xen);
  11.198 -}
  11.199 -
  11.200 -void write_ptbase(struct exec_domain *ed)
  11.201 -{
  11.202 -    struct domain *d = ed->domain;
  11.203 -    unsigned long pa;
  11.204 -
  11.205 -#ifdef CONFIG_VMX
  11.206 -    if ( unlikely(shadow_mode(d)) )
  11.207 -        pa = ((shadow_mode(d) == SHM_full_32) ?
  11.208 -              pagetable_val(ed->arch.monitor_table) :
  11.209 -              pagetable_val(ed->arch.shadow_table));
  11.210 -    else
  11.211 -        pa = pagetable_val(ed->arch.pagetable);
  11.212 -#else
  11.213 -    if ( unlikely(shadow_mode(d)) )
  11.214 -        pa = pagetable_val(ed->arch.shadow_table);    
  11.215 -    else
  11.216 -        pa = pagetable_val(ed->arch.pagetable);
  11.217 -#endif
  11.218 -
  11.219 -    write_cr3(pa);
  11.220 -}
  11.221 -
  11.222 -static void __invalidate_shadow_ldt(struct exec_domain *d)
  11.223 -{
  11.224 -    int i;
  11.225 -    unsigned long pfn;
  11.226 -    struct pfn_info *page;
  11.227 -    
  11.228 -    d->arch.shadow_ldt_mapcnt = 0;
  11.229 -
  11.230 -    for ( i = 16; i < 32; i++ )
  11.231 -    {
  11.232 -        pfn = l1_pgentry_to_pagenr(d->arch.perdomain_ptes[i]);
  11.233 -        if ( pfn == 0 ) continue;
  11.234 -        d->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
  11.235 -        page = &frame_table[pfn];
  11.236 -        ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
  11.237 -        ASSERT_PAGE_IS_DOMAIN(page, d->domain);
  11.238 -        put_page_and_type(page);
  11.239 -    }
  11.240 -
  11.241 -    /* Dispose of the (now possibly invalid) mappings from the TLB.  */
  11.242 -    percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
  11.243 -}
  11.244 -
  11.245 -
  11.246 -static inline void invalidate_shadow_ldt(struct exec_domain *d)
  11.247 -{
  11.248 -    if ( d->arch.shadow_ldt_mapcnt != 0 )
  11.249 -        __invalidate_shadow_ldt(d);
  11.250 -}
  11.251 -
  11.252 -
  11.253 -static int alloc_segdesc_page(struct pfn_info *page)
  11.254 -{
  11.255 -    struct desc_struct *descs;
  11.256 -    int i;
  11.257 -
  11.258 -    descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
  11.259 -
  11.260 -    for ( i = 0; i < 512; i++ )
  11.261 -        if ( unlikely(!check_descriptor(&descs[i])) )
  11.262 -            goto fail;
  11.263 -
  11.264 -    unmap_domain_mem(descs);
  11.265 -    return 1;
  11.266 -
  11.267 - fail:
  11.268 -    unmap_domain_mem(descs);
  11.269 -    return 0;
  11.270 -}
  11.271 -
  11.272 -
  11.273 -/* Map shadow page at offset @off. */
  11.274 -int map_ldt_shadow_page(unsigned int off)
  11.275 -{
  11.276 -    struct exec_domain *ed = current;
  11.277 -    struct domain *d = ed->domain;
  11.278 -    unsigned long l1e;
  11.279 -
  11.280 -    if ( unlikely(in_irq()) )
  11.281 -        BUG();
  11.282 -
  11.283 -    __get_user(l1e, (unsigned long *)&linear_pg_table[(ed->arch.ldt_base >> 
  11.284 -                                                       PAGE_SHIFT) + off]);
  11.285 -
  11.286 -    if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
  11.287 -         unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT], 
  11.288 -                                     d, PGT_ldt_page)) )
  11.289 -        return 0;
  11.290 -
  11.291 -    ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
  11.292 -    ed->arch.shadow_ldt_mapcnt++;
  11.293 -
  11.294 -    return 1;
  11.295 -}
  11.296 -
  11.297 -
  11.298 -static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
  11.299 -{
  11.300 -    struct pfn_info *page = &frame_table[page_nr];
  11.301 -
  11.302 -    if ( unlikely(!pfn_is_ram(page_nr)) )
  11.303 -    {
  11.304 -        MEM_LOG("Pfn %08lx is not RAM", page_nr);
  11.305 -        return 0;
  11.306 -    }
  11.307 -
  11.308 -    if ( unlikely(!get_page(page, d)) )
  11.309 -    {
  11.310 -        MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
  11.311 -        return 0;
  11.312 -    }
  11.313 -
  11.314 -    return 1;
  11.315 -}
  11.316 -
  11.317 -
  11.318 -static int get_page_and_type_from_pagenr(unsigned long page_nr, 
  11.319 -                                         u32 type,
  11.320 -                                         struct domain *d)
  11.321 -{
  11.322 -    struct pfn_info *page = &frame_table[page_nr];
  11.323 -
  11.324 -    if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
  11.325 -        return 0;
  11.326 -
  11.327 -    if ( unlikely(!get_page_type(page, type)) )
  11.328 -    {
  11.329 -#ifdef VERBOSE
  11.330 -        if ( (type & PGT_type_mask) != PGT_l1_page_table )
  11.331 -            MEM_LOG("Bad page type for pfn %08lx (%08x)", 
  11.332 -                    page_nr, page->u.inuse.type_info);
  11.333 -#endif
  11.334 -        put_page(page);
  11.335 -        return 0;
  11.336 -    }
  11.337 -
  11.338 -    return 1;
  11.339 -}
  11.340 -
  11.341 -
  11.342 -/*
   11.343 - * We allow L2 tables to map each other (a.k.a. linear page tables). It
   11.344 - * needs some special care with reference counts and access permissions:
  11.345 - *  1. The mapping entry must be read-only, or the guest may get write access
  11.346 - *     to its own PTEs.
  11.347 - *  2. We must only bump the reference counts for an *already validated*
  11.348 - *     L2 table, or we can end up in a deadlock in get_page_type() by waiting
  11.349 - *     on a validation that is required to complete that validation.
  11.350 - *  3. We only need to increment the reference counts for the mapped page
  11.351 - *     frame if it is mapped by a different L2 table. This is sufficient and
  11.352 - *     also necessary to allow validation of an L2 table mapping itself.
  11.353 - */
  11.354 -static int 
  11.355 -get_linear_pagetable(
  11.356 -    l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
  11.357 -{
  11.358 -    u32 x, y;
  11.359 -    struct pfn_info *page;
  11.360 -
  11.361 -    if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
  11.362 -    {
  11.363 -        MEM_LOG("Attempt to create linear p.t. with write perms");
  11.364 -        return 0;
  11.365 -    }
  11.366 -
  11.367 -    if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
  11.368 -    {
  11.369 -        /* Make sure the mapped frame belongs to the correct domain. */
  11.370 -        if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
  11.371 -            return 0;
  11.372 -
  11.373 -        /*
  11.374 -         * Make sure that the mapped frame is an already-validated L2 table. 
  11.375 -         * If so, atomically increment the count (checking for overflow).
  11.376 -         */
  11.377 -        page = &frame_table[l2_pgentry_to_pagenr(l2e)];
  11.378 -        y = page->u.inuse.type_info;
  11.379 -        do {
  11.380 -            x = y;
  11.381 -            if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
  11.382 -                 unlikely((x & (PGT_type_mask|PGT_validated)) != 
  11.383 -                          (PGT_l2_page_table|PGT_validated)) )
  11.384 -            {
  11.385 -                put_page(page);
  11.386 -                return 0;
  11.387 -            }
  11.388 -        }
  11.389 -        while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
  11.390 -    }
  11.391 -
  11.392 -    return 1;
  11.393 -}
  11.394 -
  11.395 -
  11.396 -static int
  11.397 -get_page_from_l1e(
  11.398 -    l1_pgentry_t l1e, struct domain *d)
  11.399 -{
  11.400 -    unsigned long l1v = l1_pgentry_val(l1e);
  11.401 -    unsigned long pfn = l1_pgentry_to_pagenr(l1e);
  11.402 -    struct pfn_info *page = &frame_table[pfn];
  11.403 -    extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
  11.404 -
  11.405 -    if ( !(l1v & _PAGE_PRESENT) )
  11.406 -        return 1;
  11.407 -
  11.408 -    if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
  11.409 -    {
  11.410 -        MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
  11.411 -        return 0;
  11.412 -    }
  11.413 -
  11.414 -    if ( unlikely(!pfn_is_ram(pfn)) )
  11.415 -    {
  11.416 -        /* Revert to caller privileges if FD == DOMID_IO. */
  11.417 -        if ( d == dom_io )
  11.418 -            d = current->domain;
  11.419 -
  11.420 -        if ( IS_PRIV(d) )
  11.421 -            return 1;
  11.422 -
  11.423 -        if ( IS_CAPABLE_PHYSDEV(d) )
  11.424 -            return domain_iomem_in_pfn(d, pfn);
  11.425 -
  11.426 -        MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
  11.427 -        return 0;
  11.428 -    }
  11.429 -
  11.430 -    return ((l1v & _PAGE_RW) ?
  11.431 -            get_page_and_type(page, d, PGT_writable_page) :
  11.432 -            get_page(page, d));
  11.433 -}
  11.434 -
  11.435 -
  11.436 -/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
  11.437 -static int 
  11.438 -get_page_from_l2e(
  11.439 -    l2_pgentry_t l2e, unsigned long pfn,
  11.440 -    struct domain *d, unsigned long va_idx)
  11.441 -{
  11.442 -    int rc;
  11.443 -
  11.444 -    if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
  11.445 -        return 1;
  11.446 -
  11.447 -    if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
  11.448 -    {
  11.449 -        MEM_LOG("Bad L2 page type settings %04lx",
  11.450 -                l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
  11.451 -        return 0;
  11.452 -    }
  11.453 -
  11.454 -    rc = get_page_and_type_from_pagenr(
  11.455 -        l2_pgentry_to_pagenr(l2e), 
  11.456 -        PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
  11.457 -
  11.458 -    if ( unlikely(!rc) )
  11.459 -        return get_linear_pagetable(l2e, pfn, d);
  11.460 -
  11.461 -    return 1;
  11.462 -}
  11.463 -
  11.464 -
  11.465 -static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
  11.466 -{
  11.467 -    unsigned long    l1v  = l1_pgentry_val(l1e);
  11.468 -    unsigned long    pfn  = l1_pgentry_to_pagenr(l1e);
  11.469 -    struct pfn_info *page = &frame_table[pfn];
  11.470 -    struct domain   *e;
  11.471 -
  11.472 -    if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
  11.473 -        return;
  11.474 -
  11.475 -    e = page_get_owner(page);
  11.476 -    if ( unlikely(e != d) )
  11.477 -    {
  11.478 -        /*
  11.479 -         * Unmap a foreign page that may have been mapped via a grant table.
  11.480 -         * Note that this can fail for a privileged domain that can map foreign
  11.481 -         * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
  11.482 -         * counted via a grant entry and some counted directly in the page
  11.483 -         * structure's reference count. Note that reference counts won't get
  11.484 -         * dangerously confused as long as we always try to decrement the
  11.485 -         * grant entry first. We may end up with a mismatch between which
  11.486 -         * mappings and which unmappings are counted via the grant entry, but
  11.487 -         * really it doesn't matter as privileged domains have carte blanche.
  11.488 -         */
  11.489 -        if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
  11.490 -            return;
  11.491 -        /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
  11.492 -    }
  11.493 -
  11.494 -    if ( l1v & _PAGE_RW )
  11.495 -    {
  11.496 -        put_page_and_type(page);
  11.497 -    }
  11.498 -    else
  11.499 -    {
  11.500 -        /* We expect this is rare so we blow the entire shadow LDT. */
  11.501 -        if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) == 
  11.502 -                       PGT_ldt_page)) &&
  11.503 -             unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
  11.504 -            invalidate_shadow_ldt(e->exec_domain[0]);
  11.505 -        put_page(page);
  11.506 -    }
  11.507 -}
  11.508 -
  11.509 -
  11.510 -/*
  11.511 - * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
  11.512 - * Note also that this automatically deals correctly with linear p.t.'s.
  11.513 - */
  11.514 -static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
  11.515 -{
  11.516 -    if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) && 
  11.517 -         ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
  11.518 -        put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
  11.519 -}
  11.520 -
  11.521 -
  11.522 -static int alloc_l2_table(struct pfn_info *page)
  11.523 -{
  11.524 -    struct domain *d = page_get_owner(page);
  11.525 -    unsigned long  page_nr = page_to_pfn(page);
  11.526 -    l2_pgentry_t  *pl2e;
  11.527 -    int            i;
  11.528 -   
  11.529 -    pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
  11.530 -
  11.531 -    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  11.532 -        if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
  11.533 -            goto fail;
  11.534 -
  11.535 -#if defined(__i386__)
  11.536 -    /* Now we add our private high mappings. */
  11.537 -    memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
  11.538 -           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
  11.539 -           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
  11.540 -    pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
  11.541 -        mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  11.542 -    pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
  11.543 -        mk_l2_pgentry(__pa(page_get_owner(page)->arch.mm_perdomain_pt) | 
  11.544 -                      __PAGE_HYPERVISOR);
  11.545 -#endif
  11.546 -
  11.547 -    unmap_domain_mem(pl2e);
  11.548 -    return 1;
  11.549 -
  11.550 - fail:
  11.551 -    while ( i-- > 0 )
  11.552 -        put_page_from_l2e(pl2e[i], page_nr);
  11.553 -
  11.554 -    unmap_domain_mem(pl2e);
  11.555 -    return 0;
  11.556 -}
  11.557 -
  11.558 -
  11.559 -static int alloc_l1_table(struct pfn_info *page)
  11.560 -{
  11.561 -    struct domain *d = page_get_owner(page);
  11.562 -    unsigned long  page_nr = page_to_pfn(page);
  11.563 -    l1_pgentry_t  *pl1e;
  11.564 -    int            i;
  11.565 -
  11.566 -    pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
  11.567 -
  11.568 -    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
  11.569 -        if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
  11.570 -            goto fail;
  11.571 -
  11.572 -    unmap_domain_mem(pl1e);
  11.573 -    return 1;
  11.574 -
  11.575 - fail:
  11.576 -    while ( i-- > 0 )
  11.577 -        put_page_from_l1e(pl1e[i], d);
  11.578 -
  11.579 -    unmap_domain_mem(pl1e);
  11.580 -    return 0;
  11.581 -}
  11.582 -
  11.583 -
  11.584 -static void free_l2_table(struct pfn_info *page)
  11.585 -{
  11.586 -    unsigned long page_nr = page - frame_table;
  11.587 -    l2_pgentry_t *pl2e;
  11.588 -    int i;
  11.589 -
  11.590 -    pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
  11.591 -
  11.592 -    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  11.593 -        put_page_from_l2e(pl2e[i], page_nr);
  11.594 -
  11.595 -    unmap_domain_mem(pl2e);
  11.596 -}
  11.597 -
  11.598 -
  11.599 -static void free_l1_table(struct pfn_info *page)
  11.600 -{
  11.601 -    struct domain *d = page_get_owner(page);
  11.602 -    unsigned long page_nr = page - frame_table;
  11.603 -    l1_pgentry_t *pl1e;
  11.604 -    int i;
  11.605 -
  11.606 -    pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
  11.607 -
  11.608 -    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
  11.609 -        put_page_from_l1e(pl1e[i], d);
  11.610 -
  11.611 -    unmap_domain_mem(pl1e);
  11.612 -}
  11.613 -
  11.614 -
  11.615 -static inline int update_l2e(l2_pgentry_t *pl2e, 
  11.616 -                             l2_pgentry_t  ol2e, 
  11.617 -                             l2_pgentry_t  nl2e)
  11.618 -{
  11.619 -    unsigned long o = cmpxchg((unsigned long *)pl2e, 
  11.620 -                              l2_pgentry_val(ol2e), 
  11.621 -                              l2_pgentry_val(nl2e));
  11.622 -    if ( o != l2_pgentry_val(ol2e) )
  11.623 -        MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
  11.624 -                l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
  11.625 -    return (o == l2_pgentry_val(ol2e));
  11.626 -}
  11.627 -
  11.628 -
  11.629 -/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
  11.630 -static int mod_l2_entry(l2_pgentry_t *pl2e, 
  11.631 -                        l2_pgentry_t nl2e, 
  11.632 -                        unsigned long pfn)
  11.633 -{
  11.634 -    l2_pgentry_t ol2e;
  11.635 -    unsigned long _ol2e;
  11.636 -
  11.637 -    if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
  11.638 -                  DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
  11.639 -    {
  11.640 -        MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
  11.641 -        return 0;
  11.642 -    }
  11.643 -
  11.644 -    if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
  11.645 -        return 0;
  11.646 -    ol2e = mk_l2_pgentry(_ol2e);
  11.647 -
  11.648 -    if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
  11.649 -    {
  11.650 -        /* Differ in mapping (bits 12-31) or presence (bit 0)? */
  11.651 -        if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
  11.652 -            return update_l2e(pl2e, ol2e, nl2e);
  11.653 -
  11.654 -        if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
  11.655 -                                        ((unsigned long)pl2e & 
  11.656 -                                         ~PAGE_MASK) >> 2)) )
  11.657 -            return 0;
  11.658 -
  11.659 -        if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
  11.660 -        {
  11.661 -            put_page_from_l2e(nl2e, pfn);
  11.662 -            return 0;
  11.663 -        }
  11.664 -        
  11.665 -        put_page_from_l2e(ol2e, pfn);
  11.666 -        return 1;
  11.667 -    }
  11.668 -
  11.669 -    if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
  11.670 -        return 0;
  11.671 -
  11.672 -    put_page_from_l2e(ol2e, pfn);
  11.673 -    return 1;
  11.674 -}
  11.675 -
  11.676 -
  11.677 -static inline int update_l1e(l1_pgentry_t *pl1e, 
  11.678 -                             l1_pgentry_t  ol1e, 
  11.679 -                             l1_pgentry_t  nl1e)
  11.680 -{
  11.681 -    unsigned long o = l1_pgentry_val(ol1e);
  11.682 -    unsigned long n = l1_pgentry_val(nl1e);
  11.683 -
  11.684 -    if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
  11.685 -         unlikely(o != l1_pgentry_val(ol1e)) )
  11.686 -    {
  11.687 -        MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
  11.688 -                l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
  11.689 -        return 0;
  11.690 -    }
  11.691 -
  11.692 -    return 1;
  11.693 -}
  11.694 -
  11.695 -
  11.696 -/* Update the L1 entry at pl1e to new value nl1e. */
  11.697 -static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
  11.698 -{
  11.699 -    l1_pgentry_t ol1e;
  11.700 -    unsigned long _ol1e;
  11.701 -    struct domain *d = current->domain;
  11.702 -
  11.703 -    if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
  11.704 -    {
  11.705 -        MEM_LOG("Bad get_user\n");
  11.706 -        return 0;
  11.707 -    }
  11.708 -    
  11.709 -    ol1e = mk_l1_pgentry(_ol1e);
  11.710 -
  11.711 -    if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
  11.712 -    {
  11.713 -        /* Same mapping (bits 12-31), r/w (bit 1), and presence (bit 0)? */
  11.714 -        if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
  11.715 -            return update_l1e(pl1e, ol1e, nl1e);
  11.716 -
  11.717 -        if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
  11.718 -            return 0;
  11.719 -        
  11.720 -        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
  11.721 -        {
  11.722 -            put_page_from_l1e(nl1e, d);
  11.723 -            return 0;
  11.724 -        }
  11.725 -        
  11.726 -        put_page_from_l1e(ol1e, d);
  11.727 -        return 1;
  11.728 -    }
  11.729 -
  11.730 -    if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
  11.731 -        return 0;
  11.732 -    
  11.733 -    put_page_from_l1e(ol1e, d);
  11.734 -    return 1;
  11.735 -}
  11.736 -
  11.737 -
  11.738 -int alloc_page_type(struct pfn_info *page, unsigned int type)
  11.739 -{
  11.740 -    switch ( type )
  11.741 -    {
  11.742 -    case PGT_l1_page_table:
  11.743 -        return alloc_l1_table(page);
  11.744 -    case PGT_l2_page_table:
  11.745 -        return alloc_l2_table(page);
  11.746 -    case PGT_gdt_page:
  11.747 -    case PGT_ldt_page:
  11.748 -        return alloc_segdesc_page(page);
  11.749 -    default:
  11.750 -        printk("Bad type in alloc_page_type %x t=%x c=%x\n", 
  11.751 -               type, page->u.inuse.type_info,
  11.752 -               page->count_info);
  11.753 -        BUG();
  11.754 -    }
  11.755 -
  11.756 -    return 0;
  11.757 -}
  11.758 -
  11.759 -
  11.760 -void free_page_type(struct pfn_info *page, unsigned int type)
  11.761 -{
  11.762 -    struct domain *d = page_get_owner(page);
  11.763 -
  11.764 -    switch ( type )
  11.765 -    {
  11.766 -    case PGT_l1_page_table:
  11.767 -        free_l1_table(page);
  11.768 -        break;
  11.769 -
  11.770 -    case PGT_l2_page_table:
  11.771 -        free_l2_table(page);
  11.772 -        break;
  11.773 -
  11.774 -    default:
  11.775 -        BUG();
  11.776 -    }
  11.777 -
  11.778 -    if ( unlikely(shadow_mode(d)) && 
  11.779 -         (get_shadow_status(d, page_to_pfn(page)) & PSH_shadowed) )
  11.780 -    {
  11.781 -        unshadow_table(page_to_pfn(page), type);
  11.782 -        put_shadow_status(d);
  11.783 -    }
  11.784 -}
  11.785 -
  11.786 -
  11.787 -void put_page_type(struct pfn_info *page)
  11.788 -{
  11.789 -    u32 nx, x, y = page->u.inuse.type_info;
  11.790 -
  11.791 - again:
  11.792 -    do {
  11.793 -        x  = y;
  11.794 -        nx = x - 1;
  11.795 -
  11.796 -        ASSERT((x & PGT_count_mask) != 0);
  11.797 -
  11.798 -        /*
  11.799 -         * The page should always be validated while a reference is held. The 
  11.800 -         * exception is during domain destruction, when we forcibly invalidate 
  11.801 -         * page-table pages if we detect a referential loop.
  11.802 -         * See domain.c:relinquish_list().
  11.803 -         */
  11.804 -        ASSERT((x & PGT_validated) || 
  11.805 -               test_bit(DF_DYING, &page_get_owner(page)->d_flags));
  11.806 -
  11.807 -        if ( unlikely((nx & PGT_count_mask) == 0) )
  11.808 -        {
  11.809 -            /* Record TLB information for flush later. Races are harmless. */
  11.810 -            page->tlbflush_timestamp = tlbflush_current_time();
  11.811 -            
  11.812 -            if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
  11.813 -                 likely(nx & PGT_validated) )
  11.814 -            {
  11.815 -                /*
  11.816 -                 * Page-table pages must be unvalidated when count is zero. The
  11.817 -                 * 'free' is safe because the refcnt is non-zero and validated
  11.818 -                 * bit is clear => other ops will spin or fail.
  11.819 -                 */
  11.820 -                if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 
  11.821 -                                           x & ~PGT_validated)) != x) )
  11.822 -                    goto again;
  11.823 -                /* We cleared the 'valid bit' so we do the clean up. */
  11.824 -                free_page_type(page, x & PGT_type_mask);
  11.825 -                /* Carry on, but with the 'valid bit' now clear. */
  11.826 -                x  &= ~PGT_validated;
  11.827 -                nx &= ~PGT_validated;
  11.828 -            }
  11.829 -        }
  11.830 -        else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == 
  11.831 -                           (PGT_pinned | 1)) )
  11.832 -        {
  11.833 -            /* Page is now only pinned. Make the back pointer mutable again. */
  11.834 -            nx |= PGT_va_mutable;
  11.835 -        }
  11.836 -    }
  11.837 -    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
  11.838 -}
  11.839 -
  11.840 -
  11.841 -int get_page_type(struct pfn_info *page, u32 type)
  11.842 -{
  11.843 -    u32 nx, x, y = page->u.inuse.type_info;
  11.844 -
  11.845 - again:
  11.846 -    do {
  11.847 -        x  = y;
  11.848 -        nx = x + 1;
  11.849 -        if ( unlikely((nx & PGT_count_mask) == 0) )
  11.850 -        {
  11.851 -            MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
  11.852 -            return 0;
  11.853 -        }
  11.854 -        else if ( unlikely((x & PGT_count_mask) == 0) )
  11.855 -        {
  11.856 -            if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
  11.857 -            {
  11.858 -                /*
  11.859 -                 * On type change we check to flush stale TLB entries. This 
  11.860 -                 * may be unnecessary (e.g., page was GDT/LDT) but those
  11.861 -                 * circumstances should be very rare.
  11.862 -                 */
  11.863 -                struct domain *d = page_get_owner(page);
  11.864 -                if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor],
  11.865 -                                         page->tlbflush_timestamp)) )
  11.866 -                {
  11.867 -                    perfc_incr(need_flush_tlb_flush);
  11.868 -                    flush_tlb_cpu(d->exec_domain[0]->processor);
  11.869 -                }
  11.870 -
  11.871 -                /* We lose existing type, back pointer, and validity. */
  11.872 -                nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
  11.873 -                nx |= type;
  11.874 -
  11.875 -                /* No special validation needed for writable pages. */
  11.876 -                /* Page tables and GDT/LDT need to be scanned for validity. */
  11.877 -                if ( type == PGT_writable_page )
  11.878 -                    nx |= PGT_validated;
  11.879 -            }
  11.880 -        }
  11.881 -        else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
  11.882 -        {
  11.883 -            if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
  11.884 -            {
  11.885 -                if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
  11.886 -                     ((type & PGT_type_mask) != PGT_l1_page_table) )
  11.887 -                    MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
  11.888 -                            x & PGT_type_mask, type, page_to_pfn(page));
  11.889 -                return 0;
  11.890 -            }
  11.891 -            else if ( (x & PGT_va_mask) == PGT_va_mutable )
  11.892 -            {
  11.893 -                /* The va backpointer is mutable, hence we update it. */
  11.894 -                nx &= ~PGT_va_mask;
  11.895 -                nx |= type; /* we know the actual type is correct */
  11.896 -            }
  11.897 -            else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
  11.898 -            {
  11.899 -                /* This table is potentially mapped at multiple locations. */
  11.900 -                nx &= ~PGT_va_mask;
  11.901 -                nx |= PGT_va_unknown;
  11.902 -            }
  11.903 -        }
  11.904 -        else if ( unlikely(!(x & PGT_validated)) )
  11.905 -        {
  11.906 -            /* Someone else is updating validation of this page. Wait... */
  11.907 -            while ( (y = page->u.inuse.type_info) == x )
  11.908 -            {
  11.909 -                rep_nop();
  11.910 -                barrier();
  11.911 -            }
  11.912 -            goto again;
  11.913 -        }
  11.914 -    }
  11.915 -    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
  11.916 -
  11.917 -    if ( unlikely(!(nx & PGT_validated)) )
  11.918 -    {
  11.919 -        /* Try to validate page type; drop the new reference on failure. */
  11.920 -        if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
  11.921 -        {
  11.922 -            MEM_LOG("Error while validating pfn %08lx for type %08x."
  11.923 -                    " caf=%08x taf=%08x\n",
  11.924 -                    page_to_pfn(page), type,
  11.925 -                    page->count_info,
  11.926 -                    page->u.inuse.type_info);
  11.927 -            /* No one else can get a reference. We hold the only ref. */
  11.928 -            page->u.inuse.type_info = 0;
  11.929 -            return 0;
  11.930 -        }
  11.931 -
  11.932 -        /* No one else is updating simultaneously. */
  11.933 -        __set_bit(_PGT_validated, &page->u.inuse.type_info);
  11.934 -    }
  11.935 -
  11.936 -    return 1;
  11.937 -}
  11.938 -
  11.939 -
  11.940 -int new_guest_cr3(unsigned long pfn)
  11.941 -{
  11.942 -    struct exec_domain *ed = current;
  11.943 -    struct domain *d = ed->domain;
  11.944 -    int okay, cpu = smp_processor_id();
  11.945 -    unsigned long old_base_pfn;
  11.946 -    
  11.947 -    okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
  11.948 -    if ( likely(okay) )
  11.949 -    {
  11.950 -        invalidate_shadow_ldt(ed);
  11.951 -
  11.952 -        percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
  11.953 -        old_base_pfn = pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT;
  11.954 -        ed->arch.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
  11.955 -
  11.956 -        shadow_mk_pagetable(ed);
  11.957 -
  11.958 -        write_ptbase(ed);
  11.959 -
  11.960 -        put_page_and_type(&frame_table[old_base_pfn]);
  11.961 -    }
  11.962 -    else
  11.963 -    {
  11.964 -        MEM_LOG("Error while installing new baseptr %08lx", pfn);
  11.965 -    }
  11.966 -
  11.967 -    return okay;
  11.968 -}
  11.969 -
  11.970 -static int do_extended_command(unsigned long ptr, unsigned long val)
  11.971 -{
  11.972 -    int okay = 1, cpu = smp_processor_id();
  11.973 -    unsigned int cmd = val & MMUEXT_CMD_MASK;
  11.974 -    unsigned long pfn = ptr >> PAGE_SHIFT;
  11.975 -    struct pfn_info *page = &frame_table[pfn];
  11.976 -    struct exec_domain *ed = current;
  11.977 -    struct domain *d = ed->domain, *nd, *e;
  11.978 -    u32 x, y;
  11.979 -    domid_t domid;
  11.980 -    grant_ref_t gntref;
  11.981 -
  11.982 -    switch ( cmd )
  11.983 -    {
  11.984 -    case MMUEXT_PIN_L1_TABLE:
  11.985 -    case MMUEXT_PIN_L2_TABLE:
  11.986 -        /*
  11.987 -         * We insist that, if you pin an L1 page, it's the first thing that
  11.988 -         * you do to it. This is because we require the backptr to still be
  11.989 -         * mutable. This assumption seems safe.
  11.990 -         */
  11.991 -        okay = get_page_and_type_from_pagenr(
  11.992 -            pfn, 
  11.993 -            ((cmd==MMUEXT_PIN_L2_TABLE) ? 
  11.994 -             PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
  11.995 -            FOREIGNDOM);
  11.996 -
  11.997 -        if ( unlikely(!okay) )
  11.998 -        {
  11.999 -            MEM_LOG("Error while pinning pfn %08lx", pfn);
 11.1000 -            break;
 11.1001 -        }
 11.1002 -
 11.1003 -        if ( unlikely(test_and_set_bit(_PGT_pinned,
 11.1004 -                                       &page->u.inuse.type_info)) )
 11.1005 -        {
 11.1006 -            MEM_LOG("Pfn %08lx already pinned", pfn);
 11.1007 -            put_page_and_type(page);
 11.1008 -            okay = 0;
 11.1009 -            break;
 11.1010 -        }
 11.1011 -
 11.1012 -        break;
 11.1013 -
 11.1014 -    case MMUEXT_UNPIN_TABLE:
 11.1015 -        if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
 11.1016 -        {
 11.1017 -            MEM_LOG("Page %08lx bad domain (dom=%p)",
 11.1018 -                    ptr, page_get_owner(page));
 11.1019 -        }
 11.1020 -        else if ( likely(test_and_clear_bit(_PGT_pinned, 
 11.1021 -                                            &page->u.inuse.type_info)) )
 11.1022 -        {
 11.1023 -            put_page_and_type(page);
 11.1024 -            put_page(page);
 11.1025 -        }
 11.1026 -        else
 11.1027 -        {
 11.1028 -            okay = 0;
 11.1029 -            put_page(page);
 11.1030 -            MEM_LOG("Pfn %08lx not pinned", pfn);
 11.1031 -        }
 11.1032 -        break;
 11.1033 -
 11.1034 -    case MMUEXT_NEW_BASEPTR:
 11.1035 -        okay = new_guest_cr3(pfn);
 11.1036 -        break;
 11.1037 -        
 11.1038 -    case MMUEXT_TLB_FLUSH:
 11.1039 -        percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
 11.1040 -        break;
 11.1041 -    
 11.1042 -    case MMUEXT_INVLPG:
 11.1043 -        __flush_tlb_one(ptr);
 11.1044 -        break;
 11.1045 -
 11.1046 -    case MMUEXT_FLUSH_CACHE:
 11.1047 -        if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
 11.1048 -        {
 11.1049 -            MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
 11.1050 -            okay = 0;
 11.1051 -        }
 11.1052 -        else
 11.1053 -        {
 11.1054 -            wbinvd();
 11.1055 -        }
 11.1056 -        break;
 11.1057 -
 11.1058 -    case MMUEXT_SET_LDT:
 11.1059 -    {
 11.1060 -        unsigned long ents = val >> MMUEXT_CMD_SHIFT;
 11.1061 -        if ( ((ptr & (PAGE_SIZE-1)) != 0) || 
 11.1062 -             (ents > 8192) ||
 11.1063 -             ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
 11.1064 -             ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
 11.1065 -        {
 11.1066 -            okay = 0;
 11.1067 -            MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
 11.1068 -        }
 11.1069 -        else if ( (ed->arch.ldt_ents != ents) || 
 11.1070 -                  (ed->arch.ldt_base != ptr) )
 11.1071 -        {
 11.1072 -            invalidate_shadow_ldt(ed);
 11.1073 -            ed->arch.ldt_base = ptr;
 11.1074 -            ed->arch.ldt_ents = ents;
 11.1075 -            load_LDT(ed);
 11.1076 -            percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
 11.1077 -            if ( ents != 0 )
 11.1078 -                percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
 11.1079 -        }
 11.1080 -        break;
 11.1081 -    }
 11.1082 -
 11.1083 -    case MMUEXT_SET_FOREIGNDOM:
 11.1084 -        domid = (domid_t)(val >> 16);
 11.1085 -
 11.1086 -        if ( (e = percpu_info[cpu].foreign) != NULL )
 11.1087 -            put_domain(e);
 11.1088 -        percpu_info[cpu].foreign = NULL;
 11.1089 -
 11.1090 -        if ( !IS_PRIV(d) )
 11.1091 -        {
 11.1092 -            switch ( domid )
 11.1093 -            {
 11.1094 -            case DOMID_IO:
 11.1095 -                get_knownalive_domain(dom_io);
 11.1096 -                percpu_info[cpu].foreign = dom_io;
 11.1097 -                break;
 11.1098 -            default:
 11.1099 -                MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
 11.1100 -                okay = 0;
 11.1101 -                break;
 11.1102 -            }
 11.1103 -        }
 11.1104 -        else
 11.1105 -        {
 11.1106 -            percpu_info[cpu].foreign = e = find_domain_by_id(domid);
 11.1107 -            if ( e == NULL )
 11.1108 -            {
 11.1109 -                switch ( domid )
 11.1110 -                {
 11.1111 -                case DOMID_XEN:
 11.1112 -                    get_knownalive_domain(dom_xen);
 11.1113 -                    percpu_info[cpu].foreign = dom_xen;
 11.1114 -                    break;
 11.1115 -                case DOMID_IO:
 11.1116 -                    get_knownalive_domain(dom_io);
 11.1117 -                    percpu_info[cpu].foreign = dom_io;
 11.1118 -                    break;
 11.1119 -                default:
 11.1120 -                    MEM_LOG("Unknown domain '%u'", domid);
 11.1121 -                    okay = 0;
 11.1122 -                    break;
 11.1123 -                }
 11.1124 -            }
 11.1125 -        }
 11.1126 -        break;
 11.1127 -
 11.1128 -    case MMUEXT_TRANSFER_PAGE:
 11.1129 -        domid  = (domid_t)(val >> 16);
 11.1130 -        gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
 11.1131 -        
 11.1132 -        if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
 11.1133 -             unlikely(!pfn_is_ram(pfn)) ||
 11.1134 -             unlikely((e = find_domain_by_id(domid)) == NULL) )
 11.1135 -        {
 11.1136 -            MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
 11.1137 -            okay = 0;
 11.1138 -            break;
 11.1139 -        }
 11.1140 -
 11.1141 -        spin_lock(&d->page_alloc_lock);
 11.1142 -
 11.1143 -        /*
 11.1144 -         * The tricky bit: atomically release ownership while there is just one
 11.1145 -         * benign reference to the page (PGC_allocated). If that reference
 11.1146 -         * disappears then the deallocation routine will safely spin.
 11.1147 -         */
 11.1148 -        nd = page_get_owner(page);
 11.1149 -        y  = page->count_info;
 11.1150 -        do {
 11.1151 -            x = y;
 11.1152 -            if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != 
 11.1153 -                          (1|PGC_allocated)) ||
 11.1154 -                 unlikely(nd != d) )
 11.1155 -            {
 11.1156 -                MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
 11.1157 -                        " caf=%08x, taf=%08x\n", page_to_pfn(page),
 11.1158 -                        d, d->id, nd, x, page->u.inuse.type_info);
 11.1159 -                spin_unlock(&d->page_alloc_lock);
 11.1160 -                put_domain(e);
 11.1161 -                return 0;
 11.1162 -            }
 11.1163 -            __asm__ __volatile__(
 11.1164 -                LOCK_PREFIX "cmpxchg8b %2"
 11.1165 -                : "=d" (nd), "=a" (y),
 11.1166 -                "=m" (*(volatile u64 *)(&page->count_info))
 11.1167 -                : "0" (d), "1" (x), "c" (NULL), "b" (x) );
 11.1168 -        } 
 11.1169 -        while ( unlikely(nd != d) || unlikely(y != x) );
 11.1170 -
 11.1171 -        /*
 11.1172 -         * Unlink from 'd'. At least one reference remains (now anonymous), so
  11.1173 -         * no one else is spinning to try to delete this page from 'd'.
 11.1174 -         */
 11.1175 -        d->tot_pages--;
 11.1176 -        list_del(&page->list);
 11.1177 -        
 11.1178 -        spin_unlock(&d->page_alloc_lock);
 11.1179 -
 11.1180 -        spin_lock(&e->page_alloc_lock);
 11.1181 -
 11.1182 -        /*
 11.1183 -         * Check that 'e' will accept the page and has reservation headroom.
 11.1184 -         * Also, a domain mustn't have PGC_allocated pages when it is dying.
 11.1185 -         */
 11.1186 -        ASSERT(e->tot_pages <= e->max_pages);
 11.1187 -        if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
 11.1188 -             unlikely(e->tot_pages == e->max_pages) ||
 11.1189 -             unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
 11.1190 -        {
 11.1191 -            MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
 11.1192 -                    "provided a bad grant ref, or is dying (%08lx).\n",
 11.1193 -                    e->tot_pages, e->max_pages, e->d_flags);
 11.1194 -            spin_unlock(&e->page_alloc_lock);
 11.1195 -            put_domain(e);
 11.1196 -            okay = 0;
 11.1197 -            break;
 11.1198 -        }
 11.1199 -
 11.1200 -        /* Okay, add the page to 'e'. */
 11.1201 -        if ( unlikely(e->tot_pages++ == 0) )
 11.1202 -            get_knownalive_domain(e);
 11.1203 -        list_add_tail(&page->list, &e->page_list);
 11.1204 -        page_set_owner(page, e);
 11.1205 -
 11.1206 -        spin_unlock(&e->page_alloc_lock);
 11.1207 -
 11.1208 -        /* Transfer is all done: tell the guest about its new page frame. */
 11.1209 -        gnttab_notify_transfer(e, gntref, pfn);
 11.1210 -        
 11.1211 -        put_domain(e);
 11.1212 -        break;
 11.1213 -
 11.1214 -    case MMUEXT_REASSIGN_PAGE:
 11.1215 -        if ( unlikely(!IS_PRIV(d)) )
 11.1216 -        {
 11.1217 -            MEM_LOG("Dom %u has no reassignment priv", d->id);
 11.1218 -            okay = 0;
 11.1219 -            break;
 11.1220 -        }
 11.1221 -
 11.1222 -        e = percpu_info[cpu].foreign;
 11.1223 -        if ( unlikely(e == NULL) )
 11.1224 -        {
 11.1225 -            MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
 11.1226 -            okay = 0;
 11.1227 -            break;
 11.1228 -        }
 11.1229 -
 11.1230 -        /*
 11.1231 -         * Grab both page_list locks, in order. This prevents the page from
 11.1232 -         * disappearing elsewhere while we modify the owner, and we'll need
 11.1233 -         * both locks if we're successful so that we can change lists.
 11.1234 -         */
 11.1235 -        if ( d < e )
 11.1236 -        {
 11.1237 -            spin_lock(&d->page_alloc_lock);
 11.1238 -            spin_lock(&e->page_alloc_lock);
 11.1239 -        }
 11.1240 -        else
 11.1241 -        {
 11.1242 -            spin_lock(&e->page_alloc_lock);
 11.1243 -            spin_lock(&d->page_alloc_lock);
 11.1244 -        }
 11.1245 -
 11.1246 -        /* A domain shouldn't have PGC_allocated pages when it is dying. */
 11.1247 -        if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
 11.1248 -             unlikely(IS_XEN_HEAP_FRAME(page)) )
 11.1249 -        {
 11.1250 -            MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
 11.1251 -            okay = 0;
 11.1252 -            goto reassign_fail;
 11.1253 -        }
 11.1254 -
 11.1255 -        /*
 11.1256 -         * The tricky bit: atomically change owner while there is just one
 11.1257 -         * benign reference to the page (PGC_allocated). If that reference
 11.1258 -         * disappears then the deallocation routine will safely spin.
 11.1259 -         */
 11.1260 -        nd = page_get_owner(page);
 11.1261 -        y  = page->count_info;
 11.1262 -        do {
 11.1263 -            x = y;
 11.1264 -            if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != 
 11.1265 -                          (1|PGC_allocated)) ||
 11.1266 -                 unlikely(nd != d) )
 11.1267 -            {
 11.1268 -                MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
 11.1269 -                        " caf=%08x, taf=%08x\n", page_to_pfn(page),
 11.1270 -                        d, d->id, nd, x, page->u.inuse.type_info);
 11.1271 -                okay = 0;
 11.1272 -                goto reassign_fail;
 11.1273 -            }
 11.1274 -            __asm__ __volatile__(
 11.1275 -                LOCK_PREFIX "cmpxchg8b %3"
 11.1276 -                : "=d" (nd), "=a" (y), "=c" (e),
 11.1277 -                "=m" (*(volatile u64 *)(&page->count_info))
 11.1278 -                : "0" (d), "1" (x), "c" (e), "b" (x) );
 11.1279 -        } 
 11.1280 -        while ( unlikely(nd != d) || unlikely(y != x) );
 11.1281 -        
 11.1282 -        /*
 11.1283 -         * Unlink from 'd'. We transferred at least one reference to 'e', so
  11.1284 -         * no one else is spinning to try to delete this page from 'd'.
 11.1285 -         */
 11.1286 -        d->tot_pages--;
 11.1287 -        list_del(&page->list);
 11.1288 -        
 11.1289 -        /*
 11.1290 -         * Add the page to 'e'. Someone may already have removed the last
 11.1291 -         * reference and want to remove the page from 'e'. However, we have
 11.1292 -         * the lock so they'll spin waiting for us.
 11.1293 -         */
 11.1294 -        if ( unlikely(e->tot_pages++ == 0) )
 11.1295 -            get_knownalive_domain(e);
 11.1296 -        list_add_tail(&page->list, &e->page_list);
 11.1297 -
 11.1298 -    reassign_fail:        
 11.1299 -        spin_unlock(&d->page_alloc_lock);
 11.1300 -        spin_unlock(&e->page_alloc_lock);
 11.1301 -        break;
 11.1302 -
 11.1303 -    case MMUEXT_CLEAR_FOREIGNDOM:
 11.1304 -        if ( (e = percpu_info[cpu].foreign) != NULL )
 11.1305 -            put_domain(e);
 11.1306 -        percpu_info[cpu].foreign = NULL;
 11.1307 -        break;
 11.1308 -
 11.1309 -    default:
 11.1310 -        MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
 11.1311 -        okay = 0;
 11.1312 -        break;
 11.1313 -    }
 11.1314 -
 11.1315 -    return okay;
 11.1316 -}
 11.1317 -
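A rough, hedged sketch of how a single request selecting an extended command might be encoded for do_mmu_update() below: the low bits of 'ptr' pick the update class and the remaining bits name a machine address, while the low bits of 'val' (MMUEXT_CMD_MASK in the code) carry the extended command index. All DEMO_* constants and the struct layout are stand-ins, not the real interface values.

    #include <stdio.h>

    #define DEMO_PAGE_SHIFT        12
    #define DEMO_MMU_EXTENDED_CMD  3   /* stand-in for MMU_EXTENDED_COMMAND */
    #define DEMO_MMUEXT_PIN_L2     1   /* stand-in for MMUEXT_PIN_L2_TABLE  */

    typedef struct { unsigned long ptr, val; } demo_mmu_update_t;

    int main(void)
    {
        unsigned long mfn = 0x1234;
        demo_mmu_update_t req;

        /* Low bits of 'ptr' select the update class; the rest is the
         * machine address of the page being operated on. */
        req.ptr = (mfn << DEMO_PAGE_SHIFT) | DEMO_MMU_EXTENDED_CMD;
        /* Low bits of 'val' name the extended command; higher bits carry
         * per-command arguments (e.g. a domid or an LDT entry count). */
        req.val = DEMO_MMUEXT_PIN_L2;

        printf("req = { ptr=%#lx, val=%#lx }\n", req.ptr, req.val);
        return 0;
    }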
 11.1318 -int do_mmu_update(
 11.1319 -    mmu_update_t *ureqs, unsigned int count, unsigned int *pdone)
 11.1320 -{
 11.1321 -/*
 11.1322 - * We steal the m.s.b. of the @count parameter to indicate whether this
 11.1323 - * invocation of do_mmu_update() is resuming a previously preempted call.
 11.1324 - * We steal the next 15 bits to remember the current FOREIGNDOM.
 11.1325 - */
 11.1326 -#define MMU_UPDATE_PREEMPTED          (~(~0U>>1))
 11.1327 -#define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16)
 11.1328 -#define MMU_UPDATE_PREEMPT_FDOM_MASK  (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
 11.1329 -
 11.1330 -    mmu_update_t req;
 11.1331 -    unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
 11.1332 -    struct pfn_info *page;
 11.1333 -    int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
 11.1334 -    unsigned int cmd, done = 0;
 11.1335 -    unsigned long prev_smfn = 0;
 11.1336 -    l1_pgentry_t *prev_spl1e = 0;
 11.1337 -    struct exec_domain *ed = current;
 11.1338 -    struct domain *d = ed->domain;
 11.1339 -    u32 type_info;
 11.1340 -    domid_t domid;
 11.1341 -
 11.1342 -    LOCK_BIGLOCK(d);
 11.1343 -
 11.1344 -    cleanup_writable_pagetable(d);
 11.1345 -
 11.1346 -    if ( unlikely(shadow_mode(d)) )
 11.1347 -        check_pagetable(d, ed->arch.pagetable, "pre-mmu"); /* debug */
 11.1348 -
 11.1349 -    /*
 11.1350 -     * If we are resuming after preemption, read how much work we have already
 11.1351 -     * done. This allows us to set the @done output parameter correctly.
 11.1352 -     * We also reset FOREIGNDOM here.
 11.1353 -     */
 11.1354 -    if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
 11.1355 -    {
 11.1356 -        if ( !(count & MMU_UPDATE_PREEMPTED) )
 11.1357 -        {
 11.1358 -            /* Count overflow into private FOREIGNDOM field. */
 11.1359 -            MEM_LOG("do_mmu_update count is too large");
 11.1360 -            rc = -EINVAL;
 11.1361 -            goto out;
 11.1362 -        }
 11.1363 -        count &= ~MMU_UPDATE_PREEMPTED;
 11.1364 -        domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
 11.1365 -        count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
 11.1366 -        if ( unlikely(pdone != NULL) )
 11.1367 -            (void)get_user(done, pdone);
 11.1368 -        if ( (domid != current->domain->id) &&
 11.1369 -             !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
 11.1370 -        {
 11.1371 -            rc = -EINVAL;
 11.1372 -            goto out;
 11.1373 -        }
 11.1374 -    }
 11.1375 -
 11.1376 -    perfc_incrc(calls_to_mmu_update); 
 11.1377 -    perfc_addc(num_page_updates, count);
 11.1378 -
 11.1379 -    if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
 11.1380 -    {
 11.1381 -        rc = -EFAULT;
 11.1382 -        goto out;
 11.1383 -    }
 11.1384 -
 11.1385 -    for ( i = 0; i < count; i++ )
 11.1386 -    {
 11.1387 -        if ( hypercall_preempt_check() )
 11.1388 -        {
 11.1389 -            rc = hypercall3_create_continuation(
 11.1390 -                __HYPERVISOR_mmu_update, ureqs, 
 11.1391 -                (count - i) |
 11.1392 -                (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) | 
 11.1393 -                MMU_UPDATE_PREEMPTED, pdone);
 11.1394 -            break;
 11.1395 -        }
 11.1396 -
 11.1397 -        if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
 11.1398 -        {
 11.1399 -            MEM_LOG("Bad __copy_from_user");
 11.1400 -            rc = -EFAULT;
 11.1401 -            break;
 11.1402 -        }
 11.1403 -
 11.1404 -        cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
 11.1405 -        pfn = req.ptr >> PAGE_SHIFT;
 11.1406 -
 11.1407 -        okay = 0;
 11.1408 -
 11.1409 -        switch ( cmd )
 11.1410 -        {
 11.1411 -            /*
 11.1412 -             * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
 11.1413 -             */
 11.1414 -        case MMU_NORMAL_PT_UPDATE:
 11.1415 -            if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
 11.1416 -            {
 11.1417 -                MEM_LOG("Could not get page for normal update");
 11.1418 -                break;
 11.1419 -            }
 11.1420 -
 11.1421 -            if ( likely(prev_pfn == pfn) )
 11.1422 -            {
 11.1423 -                va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
 11.1424 -            }
 11.1425 -            else
 11.1426 -            {
 11.1427 -                if ( prev_pfn != 0 )
 11.1428 -                    unmap_domain_mem((void *)va);
 11.1429 -                va = (unsigned long)map_domain_mem(req.ptr);
 11.1430 -                prev_pfn = pfn;
 11.1431 -            }
 11.1432 -
 11.1433 -            page = &frame_table[pfn];
 11.1434 -            switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
 11.1435 -            {
 11.1436 -            case PGT_l1_page_table: 
 11.1437 -                if ( likely(get_page_type(
 11.1438 -                    page, type_info & (PGT_type_mask|PGT_va_mask))) )
 11.1439 -                {
 11.1440 -                    okay = mod_l1_entry((l1_pgentry_t *)va, 
 11.1441 -                                        mk_l1_pgentry(req.val)); 
 11.1442 -
 11.1443 -                    if ( unlikely(shadow_mode(d)) && okay &&
 11.1444 -                         (get_shadow_status(d, page-frame_table) &
 11.1445 -                          PSH_shadowed) )
 11.1446 -                    {
 11.1447 -                        shadow_l1_normal_pt_update(
 11.1448 -                            req.ptr, req.val, &prev_smfn, &prev_spl1e);
 11.1449 -                        put_shadow_status(d);
 11.1450 -                    }
 11.1451 -
 11.1452 -                    put_page_type(page);
 11.1453 -                }
 11.1454 -                break;
 11.1455 -            case PGT_l2_page_table:
 11.1456 -                if ( likely(get_page_type(page, PGT_l2_page_table)) )
 11.1457 -                {
 11.1458 -                    okay = mod_l2_entry((l2_pgentry_t *)va, 
 11.1459 -                                        mk_l2_pgentry(req.val),
 11.1460 -                                        pfn); 
 11.1461 -
 11.1462 -                    if ( unlikely(shadow_mode(d)) && okay &&
 11.1463 -                         (get_shadow_status(d, page-frame_table) & 
 11.1464 -                          PSH_shadowed) )
 11.1465 -                    {
 11.1466 -                        shadow_l2_normal_pt_update(req.ptr, req.val);
 11.1467 -                        put_shadow_status(d);
 11.1468 -                    }
 11.1469 -
 11.1470 -                    put_page_type(page);
 11.1471 -                }
 11.1472 -                break;
 11.1473 -            default:
 11.1474 -                if ( likely(get_page_type(page, PGT_writable_page)) )
 11.1475 -                {
 11.1476 -                    *(unsigned long *)va = req.val;
 11.1477 -                    okay = 1;
 11.1478 -                    put_page_type(page);
 11.1479 -                }
 11.1480 -                break;
 11.1481 -            }
 11.1482 -
 11.1483 -            put_page(page);
 11.1484 -            break;
 11.1485 -
 11.1486 -        case MMU_MACHPHYS_UPDATE:
 11.1487 -            if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
 11.1488 -            {
 11.1489 -                MEM_LOG("Could not get page for mach->phys update");
 11.1490 -                break;
 11.1491 -            }
 11.1492 -
 11.1493 -            machine_to_phys_mapping[pfn] = req.val;
 11.1494 -            okay = 1;
 11.1495 -
 11.1496 -            /*
 11.1497 -             * If in log-dirty mode, mark the corresponding pseudo-physical
 11.1498 -             * page as dirty.
 11.1499 -             */
 11.1500 -            if ( unlikely(shadow_mode(d) == SHM_logdirty) && 
 11.1501 -                 mark_dirty(d, pfn) )
 11.1502 -                d->arch.shadow_dirty_block_count++;
 11.1503 -
 11.1504 -            put_page(&frame_table[pfn]);
 11.1505 -            break;
 11.1506 -
 11.1507 -            /*
 11.1508 -             * MMU_EXTENDED_COMMAND: Extended command is specified
  11.1509 -             * in the least-significant bits of the 'value' field.
 11.1510 -             */
 11.1511 -        case MMU_EXTENDED_COMMAND:
 11.1512 -            req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
 11.1513 -            okay = do_extended_command(req.ptr, req.val);
 11.1514 -            break;
 11.1515 -
 11.1516 -        default:
 11.1517 -            MEM_LOG("Invalid page update command %08lx", req.ptr);
 11.1518 -            break;
 11.1519 -        }
 11.1520 -
 11.1521 -        if ( unlikely(!okay) )
 11.1522 -        {
 11.1523 -            rc = -EINVAL;
 11.1524 -            break;
 11.1525 -        }
 11.1526 -
 11.1527 -        ureqs++;
 11.1528 -    }
 11.1529 -
 11.1530 - out:
 11.1531 -    if ( prev_pfn != 0 )
 11.1532 -        unmap_domain_mem((void *)va);
 11.1533 -
 11.1534 -    if ( unlikely(prev_spl1e != 0) ) 
 11.1535 -        unmap_domain_mem((void *)prev_spl1e);
 11.1536 -
 11.1537 -    deferred_ops = percpu_info[cpu].deferred_ops;
 11.1538 -    percpu_info[cpu].deferred_ops = 0;
 11.1539 -
 11.1540 -    if ( deferred_ops & DOP_FLUSH_TLB )
 11.1541 -        local_flush_tlb();
 11.1542 -        
 11.1543 -    if ( deferred_ops & DOP_RELOAD_LDT )
 11.1544 -        (void)map_ldt_shadow_page(0);
 11.1545 -
 11.1546 -    if ( unlikely(percpu_info[cpu].foreign != NULL) )
 11.1547 -    {
 11.1548 -        put_domain(percpu_info[cpu].foreign);
 11.1549 -        percpu_info[cpu].foreign = NULL;
 11.1550 -    }
 11.1551 -
 11.1552 -    /* Add incremental work we have done to the @done output parameter. */
 11.1553 -    if ( unlikely(pdone != NULL) )
 11.1554 -        __put_user(done + i, pdone);
 11.1555 -
 11.1556 -    if ( unlikely(shadow_mode(d)) )
 11.1557 -        check_pagetable(d, ed->arch.pagetable, "post-mmu"); /* debug */
 11.1558 -
 11.1559 -    UNLOCK_BIGLOCK(d);
 11.1560 -    return rc;
 11.1561 -}
 11.1562 -
 11.1563 -
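The continuation encoding used by do_mmu_update() above can be illustrated with a small worked example. The three DEMO_* macros mirror the ones defined inside the function; everything else here is illustrative only.

    #include <stdio.h>

    #define DEMO_PREEMPTED   (~(~0U >> 1))            /* m.s.b. of count  */
    #define DEMO_FDOM_SHIFT  ((sizeof(int) * 8) - 16) /* next 15 bits     */
    #define DEMO_FDOM_MASK   (0x7FFFU << DEMO_FDOM_SHIFT)

    int main(void)
    {
        unsigned int remaining = 42, domid = 7;

        /* Pack, as handed to hypercall3_create_continuation() above. */
        unsigned int count = remaining | (domid << DEMO_FDOM_SHIFT) | DEMO_PREEMPTED;

        /* Unpack, as the top of do_mmu_update() does on resumption. */
        unsigned int dom  = (count & DEMO_FDOM_MASK) >> DEMO_FDOM_SHIFT;
        unsigned int left = count & ~(DEMO_PREEMPTED | DEMO_FDOM_MASK);

        printf("left=%u dom=%u\n", left, dom);   /* prints: left=42 dom=7 */
        return 0;
    }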
 11.1564 -int do_update_va_mapping(unsigned long page_nr, 
 11.1565 -                         unsigned long val, 
 11.1566 -                         unsigned long flags)
 11.1567 -{
 11.1568 -    struct exec_domain *ed = current;
 11.1569 -    struct domain *d = ed->domain;
 11.1570 -    int err = 0;
 11.1571 -    unsigned int cpu = ed->processor;
 11.1572 -    unsigned long deferred_ops;
 11.1573 -
 11.1574 -    perfc_incrc(calls_to_update_va);
 11.1575 -
 11.1576 -    if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
 11.1577 -        return -EINVAL;
 11.1578 -
 11.1579 -    LOCK_BIGLOCK(d);
 11.1580 -
 11.1581 -    cleanup_writable_pagetable(d);
 11.1582 -
 11.1583 -    /*
 11.1584 -     * XXX When we make this support 4MB superpages we should also deal with 
 11.1585 -     * the case of updating L2 entries.
 11.1586 -     */
 11.1587 -
 11.1588 -    if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr], 
 11.1589 -                                mk_l1_pgentry(val))) )
 11.1590 -        err = -EINVAL;
 11.1591 -
 11.1592 -    if ( unlikely(shadow_mode(d)) )
 11.1593 -    {
 11.1594 -        unsigned long sval = 0;
 11.1595 -
 11.1596 -        l1pte_propagate_from_guest(d, &val, &sval);
 11.1597 -
 11.1598 -        if ( unlikely(__put_user(sval, ((unsigned long *)(
 11.1599 -            &shadow_linear_pg_table[page_nr])))) )
 11.1600 -        {
 11.1601 -            /*
  11.1602 -             * Since L2's are guaranteed RW, failure indicates the page was not
 11.1603 -             * shadowed, so ignore.
 11.1604 -             */
 11.1605 -            perfc_incrc(shadow_update_va_fail);
 11.1606 -        }
 11.1607 -
 11.1608 -        /*
 11.1609 -         * If we're in log-dirty mode then we need to note that we've updated
 11.1610 -         * the PTE in the PT-holding page. We need the machine frame number
 11.1611 -         * for this.
 11.1612 -         */
 11.1613 -        if ( shadow_mode(d) == SHM_logdirty )
 11.1614 -            mark_dirty(d, va_to_l1mfn(page_nr << PAGE_SHIFT));  
 11.1615 -  
 11.1616 -        check_pagetable(d, ed->arch.pagetable, "va"); /* debug */
 11.1617 -    }
 11.1618 -
 11.1619 -    deferred_ops = percpu_info[cpu].deferred_ops;
 11.1620 -    percpu_info[cpu].deferred_ops = 0;
 11.1621 -
 11.1622 -    if ( unlikely(deferred_ops & DOP_FLUSH_TLB) || 
 11.1623 -         unlikely(flags & UVMF_FLUSH_TLB) )
 11.1624 -        local_flush_tlb();
 11.1625 -    else if ( unlikely(flags & UVMF_INVLPG) )
 11.1626 -        __flush_tlb_one(page_nr << PAGE_SHIFT);
 11.1627 -
 11.1628 -    if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
 11.1629 -        (void)map_ldt_shadow_page(0);
 11.1630 -    
 11.1631 -    UNLOCK_BIGLOCK(d);
 11.1632 -
 11.1633 -    return err;
 11.1634 -}
 11.1635 -
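A small sketch of the TLB maintenance decision at the end of do_update_va_mapping() above: a deferred full flush or UVMF_FLUSH_TLB wins over UVMF_INVLPG, which in turn beats doing nothing. The DEMO_* flag values are stand-ins, not the real UVMF_*/DOP_* definitions.

    #include <stdio.h>

    #define DEMO_UVMF_FLUSH_TLB  1u
    #define DEMO_UVMF_INVLPG     2u
    #define DEMO_DOP_FLUSH_TLB   4u

    static const char *demo_tlb_action(unsigned long flags, unsigned long deferred)
    {
        if ( (deferred & DEMO_DOP_FLUSH_TLB) || (flags & DEMO_UVMF_FLUSH_TLB) )
            return "local_flush_tlb";        /* full flush */
        if ( flags & DEMO_UVMF_INVLPG )
            return "__flush_tlb_one(va)";    /* single-entry flush */
        return "no flush";
    }

    int main(void)
    {
        printf("%s\n", demo_tlb_action(DEMO_UVMF_INVLPG, 0));
        printf("%s\n", demo_tlb_action(DEMO_UVMF_INVLPG, DEMO_DOP_FLUSH_TLB));
        return 0;
    }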
 11.1636 -int do_update_va_mapping_otherdomain(unsigned long page_nr, 
 11.1637 -                                     unsigned long val, 
 11.1638 -                                     unsigned long flags,
 11.1639 -                                     domid_t domid)
 11.1640 -{
 11.1641 -    unsigned int cpu = smp_processor_id();
 11.1642 -    struct domain *d;
 11.1643 -    int rc;
 11.1644 -
 11.1645 -    if ( unlikely(!IS_PRIV(current->domain)) )
 11.1646 -        return -EPERM;
 11.1647 -
 11.1648 -    percpu_info[cpu].foreign = d = find_domain_by_id(domid);
 11.1649 -    if ( unlikely(d == NULL) )
 11.1650 -    {
 11.1651 -        MEM_LOG("Unknown domain '%u'", domid);
 11.1652 -        return -ESRCH;
 11.1653 -    }
 11.1654 -
 11.1655 -    rc = do_update_va_mapping(page_nr, val, flags);
 11.1656 -
 11.1657 -    put_domain(d);
 11.1658 -    percpu_info[cpu].foreign = NULL;
 11.1659 -
 11.1660 -    return rc;
 11.1661 -}
 11.1662 -
 11.1663 -
 11.1664 -
 11.1665 -/*************************
 11.1666 - * Descriptor Tables
 11.1667 - */
 11.1668 -
 11.1669 -void destroy_gdt(struct exec_domain *ed)
 11.1670 -{
 11.1671 -    int i;
 11.1672 -    unsigned long pfn;
 11.1673 -
 11.1674 -    for ( i = 0; i < 16; i++ )
 11.1675 -    {
 11.1676 -        if ( (pfn = l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[i])) != 0 )
 11.1677 -            put_page_and_type(&frame_table[pfn]);
 11.1678 -        ed->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
 11.1679 -    }
 11.1680 -}
 11.1681 -
 11.1682 -
 11.1683 -long set_gdt(struct exec_domain *ed, 
 11.1684 -             unsigned long *frames,
 11.1685 -             unsigned int entries)
 11.1686 -{
 11.1687 -    struct domain *d = ed->domain;
 11.1688 -    /* NB. There are 512 8-byte entries per GDT page. */
 11.1689 -    int i = 0, nr_pages = (entries + 511) / 512;
 11.1690 -    struct desc_struct *vgdt;
 11.1691 -    unsigned long pfn;
 11.1692 -
 11.1693 -    /* Check the first page in the new GDT. */
 11.1694 -    if ( (pfn = frames[0]) >= max_page )
 11.1695 -        goto fail;
 11.1696 -
 11.1697 -    /* The first page is special because Xen owns a range of entries in it. */
 11.1698 -    if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
 11.1699 -    {
 11.1700 -        /* GDT checks failed: try zapping the Xen reserved entries. */
 11.1701 -        if ( !get_page_and_type(&frame_table[pfn], d, PGT_writable_page) )
 11.1702 -            goto fail;
 11.1703 -        vgdt = map_domain_mem(pfn << PAGE_SHIFT);
 11.1704 -        memset(vgdt + FIRST_RESERVED_GDT_ENTRY, 0,
 11.1705 -               NR_RESERVED_GDT_ENTRIES*8);
 11.1706 -        unmap_domain_mem(vgdt);
 11.1707 -        put_page_and_type(&frame_table[pfn]);
 11.1708 -
 11.1709 -        /* Okay, we zapped the entries. Now try the GDT checks again. */
 11.1710 -        if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
 11.1711 -            goto fail;
 11.1712 -    }
 11.1713 -
 11.1714 -    /* Check the remaining pages in the new GDT. */
 11.1715 -    for ( i = 1; i < nr_pages; i++ )
 11.1716 -        if ( ((pfn = frames[i]) >= max_page) ||
 11.1717 -             !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
 11.1718 -            goto fail;
 11.1719 -
 11.1720 -    /* Copy reserved GDT entries to the new GDT. */
 11.1721 -    vgdt = map_domain_mem(frames[0] << PAGE_SHIFT);
 11.1722 -    memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY, 
 11.1723 -           gdt_table + FIRST_RESERVED_GDT_ENTRY, 
 11.1724 -           NR_RESERVED_GDT_ENTRIES*8);
 11.1725 -    unmap_domain_mem(vgdt);
 11.1726 -
 11.1727 -    /* Tear down the old GDT. */
 11.1728 -    destroy_gdt(ed);
 11.1729 -
 11.1730 -    /* Install the new GDT. */
 11.1731 -    for ( i = 0; i < nr_pages; i++ )
 11.1732 -        ed->arch.perdomain_ptes[i] =
 11.1733 -            mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR);
 11.1734 -
 11.1735 -    SET_GDT_ADDRESS(ed, GDT_VIRT_START(ed));
 11.1736 -    SET_GDT_ENTRIES(ed, entries);
 11.1737 -
 11.1738 -    return 0;
 11.1739 -
 11.1740 - fail:
 11.1741 -    while ( i-- > 0 )
 11.1742 -        put_page_and_type(&frame_table[frames[i]]);
 11.1743 -    return -EINVAL;
 11.1744 -}
 11.1745 -
 11.1746 -
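A worked example of the GDT sizing arithmetic shared by set_gdt() above and do_set_gdt() below (512 eight-byte descriptors per 4kB frame, rounded up); purely illustrative.

    #include <stdio.h>

    int main(void)
    {
        unsigned int entries[] = { 1, 512, 513, 8192 };
        for ( unsigned int i = 0; i < 4; i++ )
            printf("entries=%4u -> nr_pages=%u\n",
                   entries[i], (entries[i] + 511) / 512);
        /* prints nr_pages = 1, 1, 2, 16 respectively */
        return 0;
    }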
 11.1747 -long do_set_gdt(unsigned long *frame_list, unsigned int entries)
 11.1748 -{
 11.1749 -    int nr_pages = (entries + 511) / 512;
 11.1750 -    unsigned long frames[16];
 11.1751 -    long ret;
 11.1752 -
 11.1753 -    if ( (entries <= LAST_RESERVED_GDT_ENTRY) || (entries > 8192) ) 
 11.1754 -        return -EINVAL;
 11.1755 -    
 11.1756 -    if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
 11.1757 -        return -EFAULT;
 11.1758 -
 11.1759 -    LOCK_BIGLOCK(current->domain);
 11.1760 -
 11.1761 -    if ( (ret = set_gdt(current, frames, entries)) == 0 )
 11.1762 -    {
 11.1763 -        local_flush_tlb();
 11.1764 -        __asm__ __volatile__ ("lgdt %0" : "=m" (*current->arch.gdt));
 11.1765 -    }
 11.1766 -
 11.1767 -    UNLOCK_BIGLOCK(current->domain);
 11.1768 -
 11.1769 -    return ret;
 11.1770 -}
 11.1771 -
 11.1772 -
 11.1773 -long do_update_descriptor(
 11.1774 -    unsigned long pa, unsigned long word1, unsigned long word2)
 11.1775 -{
 11.1776 -    unsigned long pfn = pa >> PAGE_SHIFT;
 11.1777 -    struct desc_struct *gdt_pent, d;
 11.1778 -    struct pfn_info *page;
 11.1779 -    struct exec_domain *ed;
 11.1780 -    long ret = -EINVAL;
 11.1781 -
 11.1782 -    d.a = (u32)word1;
 11.1783 -    d.b = (u32)word2;
 11.1784 -
 11.1785 -    LOCK_BIGLOCK(current->domain);
 11.1786 -
 11.1787 -    if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(&d) ) {
 11.1788 -        UNLOCK_BIGLOCK(current->domain);
 11.1789 -        return -EINVAL;
 11.1790 -    }
 11.1791 -
 11.1792 -    page = &frame_table[pfn];
 11.1793 -    if ( unlikely(!get_page(page, current->domain)) ) {
 11.1794 -        UNLOCK_BIGLOCK(current->domain);
 11.1795 -        return -EINVAL;
 11.1796 -    }
 11.1797 -
 11.1798 -    /* Check if the given frame is in use in an unsafe context. */
 11.1799 -    switch ( page->u.inuse.type_info & PGT_type_mask )
 11.1800 -    {
 11.1801 -    case PGT_gdt_page:
 11.1802 -        /* Disallow updates of Xen-reserved descriptors in the current GDT. */
 11.1803 -        for_each_exec_domain(current->domain, ed) {
 11.1804 -            if ( (l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[0]) == pfn) &&
 11.1805 -                 (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) &&
 11.1806 -                 (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) )
 11.1807 -                goto out;
 11.1808 -        }
 11.1809 -        if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
 11.1810 -            goto out;
 11.1811 -        break;
 11.1812 -    case PGT_ldt_page:
 11.1813 -        if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
 11.1814 -            goto out;
 11.1815 -        break;
 11.1816 -    default:
 11.1817 -        if ( unlikely(!get_page_type(page, PGT_writable_page)) )
 11.1818 -            goto out;
 11.1819 -        break;
 11.1820 -    }
 11.1821 -
 11.1822 -    /* All is good so make the update. */
 11.1823 -    gdt_pent = map_domain_mem(pa);
 11.1824 -    memcpy(gdt_pent, &d, 8);
 11.1825 -    unmap_domain_mem(gdt_pent);
 11.1826 -
 11.1827 -    put_page_type(page);
 11.1828 -
 11.1829 -    ret = 0; /* success */
 11.1830 -
 11.1831 - out:
 11.1832 -    put_page(page);
 11.1833 -
 11.1834 -    UNLOCK_BIGLOCK(current->domain);
 11.1835 -
 11.1836 -    return ret;
 11.1837 -}
 11.1838 -
 11.1839 -
 11.1840 -
 11.1841 -/*************************
 11.1842 - * Writable Pagetables
 11.1843 - */
 11.1844 -
 11.1845 -ptwr_info_t ptwr_info[NR_CPUS];
 11.1846 -
 11.1847 -#ifdef VERBOSE
 11.1848 -int ptwr_debug = 0x0;
 11.1849 -#define PTWR_PRINTK(_f, _a...) \
 11.1850 - do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
 11.1851 -#define PTWR_PRINT_WHICH (which ? 'I' : 'A')
 11.1852 -#else
 11.1853 -#define PTWR_PRINTK(_f, _a...) ((void)0)
 11.1854 -#endif
 11.1855 -
 11.1856 -/* Flush the given writable p.t. page and write-protect it again. */
 11.1857 -void ptwr_flush(const int which)
 11.1858 -{
 11.1859 -    unsigned long  sstat, spte, pte, *ptep, l1va;
 11.1860 -    l1_pgentry_t  *sl1e = NULL, *pl1e, ol1e, nl1e;
 11.1861 -    l2_pgentry_t  *pl2e;
 11.1862 -    int            i, cpu = smp_processor_id();
 11.1863 -    struct exec_domain *ed = current;
 11.1864 -    struct domain *d = ed->domain;
 11.1865 -
 11.1866 -    l1va = ptwr_info[cpu].ptinfo[which].l1va;
 11.1867 -    ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
 11.1868 -
 11.1869 -    /*
 11.1870 -     * STEP 1. Write-protect the p.t. page so no more updates can occur.
 11.1871 -     */
 11.1872 -
 11.1873 -    if ( unlikely(__get_user(pte, ptep)) )
 11.1874 -    {
 11.1875 -        MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
 11.1876 -        /*
 11.1877 -         * Really a bug. We could read this PTE during the initial fault,
 11.1878 -         * and pagetables can't have changed meantime. XXX Multi-CPU guests?
 11.1879 -         */
 11.1880 -        BUG();
 11.1881 -    }
 11.1882 -    PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
 11.1883 -                PTWR_PRINT_WHICH, ptep, pte);
 11.1884 -    pte &= ~_PAGE_RW;
 11.1885 -
 11.1886 -    if ( unlikely(shadow_mode(d)) )
 11.1887 -    {
 11.1888 -        /* Write-protect the p.t. page in the shadow page table. */
 11.1889 -        l1pte_propagate_from_guest(d, &pte, &spte);
 11.1890 -        __put_user(
 11.1891 -            spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
 11.1892 -
 11.1893 -        /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
 11.1894 -        sstat = get_shadow_status(d, pte >> PAGE_SHIFT);
 11.1895 -        if ( sstat & PSH_shadowed )
 11.1896 -            sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
 11.1897 -    }
 11.1898 -
 11.1899 -    /* Write-protect the p.t. page in the guest page table. */
 11.1900 -    if ( unlikely(__put_user(pte, ptep)) )
 11.1901 -    {
 11.1902 -        MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
 11.1903 -        /*
 11.1904 -         * Really a bug. We could write this PTE during the initial fault,
 11.1905 -         * and pagetables can't have changed meantime. XXX Multi-CPU guests?
 11.1906 -         */
 11.1907 -        BUG();
 11.1908 -    }
 11.1909 -
 11.1910 -    /* Ensure that there are no stale writable mappings in any TLB. */
 11.1911 -    /* NB. INVLPG is a serialising instruction: flushes pending updates. */
 11.1912 -#if 1
 11.1913 -    __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
 11.1914 -#else
 11.1915 -    flush_tlb_all();
 11.1916 -#endif
 11.1917 -    PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
 11.1918 -                PTWR_PRINT_WHICH, ptep, pte);
 11.1919 -
 11.1920 -    /*
 11.1921 -     * STEP 2. Validate any modified PTEs.
 11.1922 -     */
 11.1923 -
 11.1924 -    pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
 11.1925 -    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
 11.1926 -    {
 11.1927 -        ol1e = ptwr_info[cpu].ptinfo[which].page[i];
 11.1928 -        nl1e = pl1e[i];
 11.1929 -
 11.1930 -        if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
 11.1931 -            continue;
 11.1932 -
 11.1933 -        /*
 11.1934 -         * Fast path for PTEs that have merely been write-protected
 11.1935 -         * (e.g., during a Unix fork()). A strict reduction in privilege.
 11.1936 -         */
 11.1937 -        if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
 11.1938 -        {
 11.1939 -            if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
 11.1940 -            {
 11.1941 -                if ( unlikely(sl1e != NULL) )
 11.1942 -                    l1pte_propagate_from_guest(
 11.1943 -                        d, &l1_pgentry_val(nl1e), 
 11.1944 -                        &l1_pgentry_val(sl1e[i]));
 11.1945 -                put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]);
 11.1946 -            }
 11.1947 -            continue;
 11.1948 -        }
 11.1949 -
 11.1950 -        if ( unlikely(!get_page_from_l1e(nl1e, d)) )
 11.1951 -        {
 11.1952 -            MEM_LOG("ptwr: Could not re-validate l1 page\n");
 11.1953 -            /*
 11.1954 -             * Make the remaining p.t's consistent before crashing, so the
 11.1955 -             * reference counts are correct.
 11.1956 -             */
 11.1957 -            memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
 11.1958 -                   (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
 11.1959 -            unmap_domain_mem(pl1e);
 11.1960 -            ptwr_info[cpu].ptinfo[which].l1va = 0;
 11.1961 -            UNLOCK_BIGLOCK(d);
 11.1962 -            domain_crash();
 11.1963 -        }
 11.1964 -        
 11.1965 -        if ( unlikely(sl1e != NULL) )
 11.1966 -            l1pte_propagate_from_guest(
 11.1967 -                d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
 11.1968 -
 11.1969 -        if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
 11.1970 -            put_page_from_l1e(ol1e, d);
 11.1971 -    }
 11.1972 -    unmap_domain_mem(pl1e);
 11.1973 -
 11.1974 -    /*
 11.1975 -     * STEP 3. Reattach the L1 p.t. page into the current address space.
 11.1976 -     */
 11.1977 -
 11.1978 -    if ( (which == PTWR_PT_ACTIVE) && likely(!shadow_mode(d)) )
 11.1979 -    {
 11.1980 -        pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
 11.1981 -        *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT); 
 11.1982 -    }
 11.1983 -
 11.1984 -    /*
 11.1985 -     * STEP 4. Final tidy-up.
 11.1986 -     */
 11.1987 -
 11.1988 -    ptwr_info[cpu].ptinfo[which].l1va = 0;
 11.1989 -
 11.1990 -    if ( unlikely(sl1e != NULL) )
 11.1991 -    {
 11.1992 -        unmap_domain_mem(sl1e);
 11.1993 -        put_shadow_status(d);
 11.1994 -    }
 11.1995 -}
 11.1996 -
 11.1997 -/* Write page fault handler: check if guest is trying to modify a PTE. */
 11.1998 -int ptwr_do_page_fault(unsigned long addr)
 11.1999 -{
 11.2000 -    unsigned long    pte, pfn, l2e;
 11.2001 -    struct pfn_info *page;
 11.2002 -    l2_pgentry_t    *pl2e;
 11.2003 -    int              which, cpu = smp_processor_id();
 11.2004 -    u32              l2_idx;
 11.2005 -
 11.2006 -    /*
 11.2007 -     * Attempt to read the PTE that maps the VA being accessed. By checking for
 11.2008 -     * PDE validity in the L2 we avoid many expensive fixups in __get_user().
 11.2009 -     */
 11.2010 -    if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
 11.2011 -           _PAGE_PRESENT) ||
 11.2012 -         __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
 11.2013 -    {
 11.2014 -        return 0;
 11.2015 -    }
 11.2016 -
 11.2017 -    pfn  = pte >> PAGE_SHIFT;
 11.2018 -    page = &frame_table[pfn];
 11.2019 -
 11.2020 -    /* We are looking only for read-only mappings of p.t. pages. */
 11.2021 -    if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
 11.2022 -         ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
 11.2023 -    {
 11.2024 -        return 0;
 11.2025 -    }
 11.2026 -    
 11.2027 -    /* Get the L2 index at which this L1 p.t. is always mapped. */
 11.2028 -    l2_idx = page->u.inuse.type_info & PGT_va_mask;
 11.2029 -    if ( unlikely(l2_idx >= PGT_va_unknown) )
 11.2030 -    {
 11.2031 -        domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
 11.2032 -    }
 11.2033 -    l2_idx >>= PGT_va_shift;
 11.2034 -
 11.2035 -    if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
 11.2036 -    {
 11.2037 -        MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr);
 11.2038 -        domain_crash();
 11.2039 -    }
 11.2040 -
 11.2041 -    /*
 11.2042 -     * Is the L1 p.t. mapped into the current address space? If so we call it
 11.2043 -     * an ACTIVE p.t., otherwise it is INACTIVE.
 11.2044 -     */
 11.2045 -    pl2e = &linear_l2_table[l2_idx];
 11.2046 -    l2e  = l2_pgentry_val(*pl2e);
 11.2047 -    which = PTWR_PT_INACTIVE;
 11.2048 -    if ( (l2e >> PAGE_SHIFT) == pfn )
 11.2049 -    {
 11.2050 -        /* Check the PRESENT bit to set ACTIVE. */
 11.2051 -        if ( likely(l2e & _PAGE_PRESENT) )
 11.2052 -            which = PTWR_PT_ACTIVE;
 11.2053 -        else {
 11.2054 -            /*
 11.2055 -             * If the PRESENT bit is clear, we may be conflicting with
 11.2056 -             * the current ACTIVE p.t. (it may be the same p.t. mapped
 11.2057 -             * at another virt addr).
 11.2058 -             * The ptwr_flush call below will restore the PRESENT bit.
 11.2059 -             */
 11.2060 -            if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
 11.2061 -                 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
 11.2062 -                which = PTWR_PT_ACTIVE;
 11.2063 -        }
 11.2064 -    }
 11.2065 -    
 11.2066 -    PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
 11.2067 -                "pfn %08lx\n", PTWR_PRINT_WHICH,
 11.2068 -                addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
 11.2069 -    
 11.2070 -    /*
  11.2071 -     * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
 11.2072 -     * time. If there is already one, we must flush it out.
 11.2073 -     */
 11.2074 -    if ( ptwr_info[cpu].ptinfo[which].l1va )
 11.2075 -        ptwr_flush(which);
 11.2076 -
 11.2077 -    ptwr_info[cpu].ptinfo[which].l1va   = addr | 1;
 11.2078 -    ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
 11.2079 -    
 11.2080 -    /* For safety, disconnect the L1 p.t. page from current space. */
 11.2081 -    if ( (which == PTWR_PT_ACTIVE) && 
 11.2082 -         likely(!shadow_mode(current->domain)) )
 11.2083 -    {
 11.2084 -        *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
 11.2085 -#if 1
 11.2086 -        flush_tlb(); /* XXX Multi-CPU guests? */
 11.2087 -#else
 11.2088 -        flush_tlb_all();
 11.2089 -#endif
 11.2090 -    }
 11.2091 -    
 11.2092 -    /* Temporarily map the L1 page, and make a copy of it. */
 11.2093 -    ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
 11.2094 -    memcpy(ptwr_info[cpu].ptinfo[which].page,
 11.2095 -           ptwr_info[cpu].ptinfo[which].pl1e,
 11.2096 -           ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
 11.2097 -    
 11.2098 -    /* Finally, make the p.t. page writable by the guest OS. */
 11.2099 -    pte |= _PAGE_RW;
 11.2100 -    PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
 11.2101 -                &linear_pg_table[addr>>PAGE_SHIFT], pte);
 11.2102 -    if ( unlikely(__put_user(pte, (unsigned long *)
 11.2103 -                             &linear_pg_table[addr>>PAGE_SHIFT])) )
 11.2104 -    {
 11.2105 -        MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
 11.2106 -                &linear_pg_table[addr>>PAGE_SHIFT]);
 11.2107 -        /* Toss the writable pagetable state and crash. */
 11.2108 -        unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
 11.2109 -        ptwr_info[cpu].ptinfo[which].l1va = 0;
 11.2110 -        domain_crash();
 11.2111 -    }
 11.2112 -    
 11.2113 -    return EXCRET_fault_fixed;
 11.2114 -}
 11.2115 -
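A condensed sketch of the ACTIVE/INACTIVE classification performed by ptwr_do_page_fault() above: the faulting L1 page is ACTIVE when the L2 slot recorded in its type backpointer currently maps it (the real code also treats an already-disconnected ACTIVE slot as ACTIVE), otherwise INACTIVE. Types, names and values here are illustrative.

    #include <stdio.h>

    enum { DEMO_PT_INACTIVE = 0, DEMO_PT_ACTIVE = 1 };

    static int demo_classify(unsigned long l2e, unsigned long pfn,
                             unsigned long page_shift, int l2e_present)
    {
        if ( (l2e >> page_shift) != pfn )
            return DEMO_PT_INACTIVE;          /* mapped elsewhere, or not at all */
        return l2e_present ? DEMO_PT_ACTIVE
                           : DEMO_PT_INACTIVE; /* real code also checks the
                                                  already-disconnected slot */
    }

    int main(void)
    {
        printf("%d\n", demo_classify(0x12345007, 0x12345, 12, 1)); /* ACTIVE   */
        printf("%d\n", demo_classify(0x99999007, 0x12345, 12, 1)); /* INACTIVE */
        return 0;
    }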
 11.2116 -static __init int ptwr_init(void)
 11.2117 -{
 11.2118 -    int i;
 11.2119 -
 11.2120 -    for ( i = 0; i < smp_num_cpus; i++ )
 11.2121 -    {
 11.2122 -        ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
 11.2123 -            (void *)alloc_xenheap_page();
 11.2124 -        ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
 11.2125 -            (void *)alloc_xenheap_page();
 11.2126 -    }
 11.2127 -
 11.2128 -    return 0;
 11.2129 -}
 11.2130 -__initcall(ptwr_init);
 11.2131 -
 11.2132 -
 11.2133 -
 11.2134 -
 11.2135 -/************************************************************************/
 11.2136 -/************************************************************************/
 11.2137 -/************************************************************************/
 11.2138 -
 11.2139 -#ifndef NDEBUG
 11.2140 -
 11.2141 -void ptwr_status(void)
 11.2142 -{
 11.2143 -    unsigned long pte, *ptep, pfn;
 11.2144 -    struct pfn_info *page;
 11.2145 -    int cpu = smp_processor_id();
 11.2146 -
 11.2147 -    ptep = (unsigned long *)&linear_pg_table
 11.2148 -        [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
 11.2149 -
 11.2150 -    if ( __get_user(pte, ptep) ) {
 11.2151 -        MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
 11.2152 -        domain_crash();
 11.2153 -    }
 11.2154 -
 11.2155 -    pfn = pte >> PAGE_SHIFT;
 11.2156 -    page = &frame_table[pfn];
 11.2157 -    printk("need to alloc l1 page %p\n", page);
 11.2158 -    /* make pt page writable */
 11.2159 -    printk("need to make read-only l1-page at %p is %08lx\n",
 11.2160 -           ptep, pte);
 11.2161 -
 11.2162 -    if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
 11.2163 -        return;
 11.2164 -
 11.2165 -    if ( __get_user(pte, (unsigned long *)
 11.2166 -                    ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
 11.2167 -        MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
 11.2168 -                ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
 11.2169 -        domain_crash();
 11.2170 -    }
 11.2171 -    pfn = pte >> PAGE_SHIFT;
 11.2172 -    page = &frame_table[pfn];
 11.2173 -}
 11.2174 -
 11.2175 -void audit_domain(struct domain *d)
 11.2176 -{
 11.2177 -    int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
 11.2178 -
 11.2179 -    void adjust (struct pfn_info *page, int dir, int adjtype)
 11.2180 -    {
 11.2181 -        int count = page->count_info & PGC_count_mask;
 11.2182 -
 11.2183 -        if ( adjtype )
 11.2184 -        {
 11.2185 -            int tcount = page->u.inuse.type_info & PGT_count_mask;
 11.2186 -            
 11.2187 -            ttot++;
 11.2188 -
 11.2189 -            tcount += dir;
 11.2190 -
 11.2191 -            if ( tcount < 0 )
 11.2192 -            {
 11.2193 -                /* This will only come out once. */
  11.2194 -                printk("Audit %d: type count went below zero pfn=%x "
 11.2195 -                       "taf=%x otaf=%x\n",
 11.2196 -                       d->id, page-frame_table,
 11.2197 -                       page->u.inuse.type_info,
 11.2198 -                       page->tlbflush_timestamp);
 11.2199 -            }
 11.2200 -            
 11.2201 -            page->u.inuse.type_info =
 11.2202 -                (page->u.inuse.type_info & ~PGT_count_mask) | 
 11.2203 -                (tcount & PGT_count_mask);
 11.2204 -        }
 11.2205 -
 11.2206 -        ctot++;
 11.2207 -        count += dir;
 11.2208 -        if ( count < 0 )
 11.2209 -        {
 11.2210 -            /* This will only come out once. */
  11.2211 -            printk("Audit %d: general count went below zero pfn=%x "
 11.2212 -                   "taf=%x otaf=%x\n",
 11.2213 -                   d->id, page-frame_table,
 11.2214 -                   page->u.inuse.type_info,
 11.2215 -                   page->tlbflush_timestamp);
 11.2216 -        }
 11.2217 -            
 11.2218 -        page->count_info =
 11.2219 -            (page->count_info & ~PGC_count_mask) | 
 11.2220 -            (count & PGC_count_mask);            
 11.2221 -
 11.2222 -    }
 11.2223 -
 11.2224 -    void scan_for_pfn(struct domain *d, unsigned long xpfn)
 11.2225 -    {
 11.2226 -        unsigned long pfn, *pt;
 11.2227 -        struct list_head *list_ent;
 11.2228 -        struct pfn_info *page;
 11.2229 -        int i;
 11.2230 -
 11.2231 -        list_ent = d->page_list.next;
 11.2232 -        for ( i = 0; (list_ent != &d->page_list); i++ )
 11.2233 -        {
 11.2234 -            pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
 11.2235 -            page = &frame_table[pfn];
 11.2236 -            
 11.2237 -            switch ( page->u.inuse.type_info & PGT_type_mask )
 11.2238 -            {
 11.2239 -            case PGT_l1_page_table:
 11.2240 -            case PGT_l2_page_table:
 11.2241 -                pt = map_domain_mem(pfn<<PAGE_SHIFT);
 11.2242 -                for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
 11.2243 -                    if ( (pt[i] & _PAGE_PRESENT) &&
 11.2244 -                         ((pt[i] >> PAGE_SHIFT) == xpfn) )
 11.2245 -                        printk("     found dom=%d i=%x pfn=%lx t=%x c=%x\n",
 11.2246 -                               d->id, i, pfn, page->u.inuse.type_info,
 11.2247 -                               page->count_info);
 11.2248 -                unmap_domain_mem(pt);           
 11.2249 -            }
 11.2250 -
 11.2251 -            list_ent = frame_table[pfn].list.next;
 11.2252 -        }
 11.2253 -
 11.2254 -    }
 11.2255 -
 11.2256 -    void scan_for_pfn_remote(unsigned long xpfn)
 11.2257 -    {
 11.2258 -        struct domain *e;
 11.2259 -        for_each_domain ( e )
 11.2260 -            scan_for_pfn( e, xpfn );            
 11.2261 -    }   
 11.2262 -
 11.2263 -    int i;
 11.2264 -    unsigned long pfn;
 11.2265 -    struct list_head *list_ent;
 11.2266 -    struct pfn_info *page;
 11.2267 -
 11.2268 -    if ( d != current->domain )
 11.2269 -        domain_pause(d);
 11.2270 -    synchronise_pagetables(~0UL);
 11.2271 -
 11.2272 -    printk("pt base=%lx sh_info=%x\n",
 11.2273 -           pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT,
 11.2274 -           virt_to_page(d->shared_info)-frame_table);
 11.2275 -           
 11.2276 -    spin_lock(&d->page_alloc_lock);
 11.2277 -
 11.2278 -    /* PHASE 0 */
 11.2279 -
 11.2280 -    list_ent = d->page_list.next;
 11.2281 -    for ( i = 0; (list_ent != &d->page_list); i++ )
 11.2282 -    {
 11.2283 -        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
 11.2284 -        page = &frame_table[pfn];
 11.2285 -
 11.2286 -        if ( page_get_owner(page) != d )
 11.2287 -            BUG();
 11.2288 -
 11.2289 -        if ( (page->u.inuse.type_info & PGT_count_mask) >
 11.2290 -             (page->count_info & PGC_count_mask) )
 11.2291 -            printk("taf > caf %x %x pfn=%lx\n",
 11.2292 -                   page->u.inuse.type_info, page->count_info, pfn );
 11.2293 - 
 11.2294 -#if 0   /* SYSV shared memory pages plus writeable files. */
 11.2295 -        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page && 
 11.2296 -             (page->u.inuse.type_info & PGT_count_mask) > 1 )
 11.2297 -        {
 11.2298 -            printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
 11.2299 -                  pfn,
 11.2300 -                  page->u.inuse.type_info,
 11.2301 -                  page->count_info );
 11.2302 -            scan_for_pfn_remote(pfn);
 11.2303 -        }
 11.2304 -#endif
 11.2305 -        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none && 
 11.2306 -             (page->u.inuse.type_info & PGT_count_mask) > 1 )
 11.2307 -        {
 11.2308 -            printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
 11.2309 -                  pfn,
 11.2310 -                  page->u.inuse.type_info,
 11.2311 -                  page->count_info );
 11.2312 -        }
 11.2313 -
 11.2314 -        /* Use tlbflush_timestamp to store original type_info. */
 11.2315 -        page->tlbflush_timestamp = page->u.inuse.type_info;
 11.2316 -
 11.2317 -        list_ent = frame_table[pfn].list.next;
 11.2318 -    }
 11.2319 -
 11.2320 -
 11.2321 -    /* PHASE 1 */
 11.2322 -
 11.2323 -    adjust(&frame_table[pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], -1, 1);
 11.2324 -
 11.2325 -    list_ent = d->page_list.next;
 11.2326 -    for ( i = 0; (list_ent != &d->page_list); i++ )
 11.2327 -    {
 11.2328 -        unsigned long *pt;
 11.2329 -        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
 11.2330 -        page = &frame_table[pfn];
 11.2331 -
 11.2332 -        if ( page_get_owner(page) != d )
 11.2333 -            BUG();
 11.2334 -
 11.2335 -        switch ( page->u.inuse.type_info & PGT_type_mask )
 11.2336 -        {
 11.2337 -        case PGT_l2_page_table:
 11.2338 -
 11.2339 -            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
 11.2340 -                printk("Audit %d: L2 not validated %x\n",
 11.2341 -                       d->id, page->u.inuse.type_info);
 11.2342 -
 11.2343 -            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
 11.2344 -                printk("Audit %d: L2 not pinned %x\n",
 11.2345 -                       d->id, page->u.inuse.type_info);
 11.2346 -            else
 11.2347 -                adjust( page, -1, 1 );
 11.2348 -           
 11.2349 -            pt = map_domain_mem( pfn<<PAGE_SHIFT );
 11.2350 -
 11.2351 -            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
 11.2352 -            {
 11.2353 -                if ( pt[i] & _PAGE_PRESENT )
 11.2354 -                {
 11.2355 -                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
 11.2356 -                    struct pfn_info *l1page = &frame_table[l1pfn];
 11.2357 -
 11.2358 -                    if ( page_get_owner(l1page) != d )
 11.2359 -                    {
 11.2360 -                        printk("L2: Skip bizarre page belonging to other "
 11.2361 -                               "dom %p\n", page_get_owner(l1page));
 11.2362 -                        continue;
 11.2363 -                    }
 11.2364 -                    
 11.2365 -                    if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
 11.2366 -                         PGT_l2_page_table )
 11.2367 -                        printk("Audit %d: [%x] Found %s Linear PT "
 11.2368 -                               "t=%x pfn=%lx\n", d->id, i, 
 11.2369 -                               (l1pfn==pfn) ? "Self" : "Other",
 11.2370 -                               l1page->u.inuse.type_info,
 11.2371 -                               l1pfn);
 11.2372 -                    else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
 11.2373 -                              PGT_l1_page_table )
 11.2374 -                        printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
 11.2375 -                               d->id, i,
 11.2376 -                               l1page->u.inuse.type_info,
 11.2377 -                               l1pfn);
 11.2378 -
 11.2379 -                    adjust(l1page, -1, 1);
 11.2380 -                }
 11.2381 -            }
 11.2382 -
 11.2383 -            unmap_domain_mem(pt);
 11.2384 -
 11.2385 -            break;
 11.2386 -
 11.2387 -
 11.2388 -        case PGT_l1_page_table:
 11.2389 -            
 11.2390 -            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
 11.2391 -                adjust( page, -1, 1 );
 11.2392 -
 11.2393 -            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
 11.2394 -                printk("Audit %d: L1 not validated %x\n",
 11.2395 -                       d->id, page->u.inuse.type_info);
 11.2396 -#if 0
 11.2397 -            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
 11.2398 -                printk("Audit %d: L1 not pinned %x\n",
 11.2399 -                       d->id, page->u.inuse.type_info);
 11.2400 -#endif
 11.2401 -            pt = map_domain_mem( pfn<<PAGE_SHIFT );
 11.2402 -
 11.2403 -            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
 11.2404 -            {
 11.2405 -                if ( pt[i] & _PAGE_PRESENT )
 11.2406 -                {
 11.2407 -                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
 11.2408 -                    struct pfn_info *l1page = &frame_table[l1pfn];
 11.2409 -
 11.2410 -                    if ( l1pfn < 0x100 )
 11.2411 -                    {
 11.2412 -                        lowmem_mappings++;
 11.2413 -                        continue;
 11.2414 -                    }
 11.2415 -
 11.2416 -                    if ( l1pfn > max_page )
 11.2417 -                    {
 11.2418 -                        io_mappings++;
 11.2419 -                        continue;
 11.2420 -                    }
 11.2421 -
 11.2422 -                    if ( pt[i] & _PAGE_RW )
 11.2423 -                    {
 11.2424 -
 11.2425 -                        if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
 11.2426 -                             PGT_l1_page_table ||
 11.2427 -                             (l1page->u.inuse.type_info & PGT_type_mask) ==
 11.2428 -                             PGT_l2_page_table )
 11.2429 -                            printk("Audit %d: [%x] Ilegal RW t=%x pfn=%lx\n",
 11.2430 -                                   d->id, i,
 11.2431 -                                   l1page->u.inuse.type_info,
 11.2432 -                                   l1pfn);
 11.2433 -
 11.2434 -                    }
 11.2435 -
 11.2436 -                    if ( page_get_owner(l1page) != d )
 11.2437 -                    {
 11.2438 -                        printk("Audit %d: [%lx,%x] Skip foreign page dom=%p "
 11.2439 -                               "pfn=%lx c=%08x t=%08x m2p=%lx\n",
 11.2440 -                               d->id, pfn, i,
 11.2441 -                               page_get_owner(l1page),
 11.2442 -                               l1pfn,
 11.2443 -                               l1page->count_info,
 11.2444 -                               l1page->u.inuse.type_info,
 11.2445 -                               machine_to_phys_mapping[l1pfn]);    
 11.2446 -                        continue;
 11.2447 -                    }
 11.2448 -
 11.2449 -                    adjust(l1page, -1, 0);
 11.2450 -                }
 11.2451 -            }
 11.2452 -
 11.2453 -            unmap_domain_mem(pt);
 11.2454 -
 11.2455 -            break;
 11.2456 -        }       
 11.2457 -
 11.2458 -        list_ent = frame_table[pfn].list.next;
 11.2459 -    }
 11.2460 -
 11.2461 -    if ( (io_mappings > 0) || (lowmem_mappings > 0) )
 11.2462 -        printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
 11.2463 -               d->id, lowmem_mappings, io_mappings);
 11.2464 -
 11.2465 -    /* PHASE 2 */
 11.2466 -
 11.2467 -    ctot = ttot = 0;
 11.2468 -    list_ent = d->page_list.next;
 11.2469 -    for ( i = 0; (list_ent != &d->page_list); i++ )
 11.2470 -    {
 11.2471 -        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
 11.2472 -        page = &frame_table[pfn];
 11.2473 -
 11.2474 -        switch ( page->u.inuse.type_info & PGT_type_mask)
 11.2475 -        {
 11.2476 -        case PGT_l1_page_table:
 11.2477 -        case PGT_l2_page_table:
 11.2478 -            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
 11.2479 -            {
 11.2480 -                printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
 11.2481 -                       d->id, page->u.inuse.type_info, 
 11.2482 -                       page->tlbflush_timestamp,
 11.2483 -                       page->count_info, pfn );
 11.2484 -                scan_for_pfn_remote(pfn);
 11.2485 -            }
 11.2486 -        default:
 11.2487 -            if ( (page->count_info & PGC_count_mask) != 1 )
 11.2488 -            {
 11.2489 -                printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
 11.2490 -                       d->id, 
 11.2491 -                       page->count_info,
 11.2492 -                       page->u.inuse.type_info, 
 11.2493 -                       page->tlbflush_timestamp, pfn );
 11.2494 -                scan_for_pfn_remote(pfn);
 11.2495 -            }
 11.2496 -            break;
 11.2497 -        }
 11.2498 -
 11.2499 -        list_ent = frame_table[pfn].list.next;
 11.2500 -    }
 11.2501 -
 11.2502 -    /* PHASE 3 */
 11.2503 -    list_ent = d->page_list.next;
 11.2504 -    for ( i = 0; (list_ent != &d->page_list); i++ )
 11.2505 -    {
 11.2506 -        unsigned long *pt;
 11.2507 -        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
 11.2508 -        page = &frame_table[pfn];
 11.2509 -
 11.2510 -        switch ( page->u.inuse.type_info & PGT_type_mask )
 11.2511 -        {
 11.2512 -        case PGT_l2_page_table:
 11.2513 -            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
 11.2514 -                adjust( page, 1, 1 );          
 11.2515 -
 11.2516 -            pt = map_domain_mem( pfn<<PAGE_SHIFT );
 11.2517 -
 11.2518 -            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
 11.2519 -            {
 11.2520 -                if ( pt[i] & _PAGE_PRESENT )
 11.2521 -                {
 11.2522 -                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
 11.2523 -                    struct pfn_info *l1page;
 11.2524 -
 11.2525 -                    if (l1pfn>max_page)
 11.2526 -                        continue;
 11.2527 -
 11.2528 -                    l1page = &frame_table[l1pfn];
 11.2529 -
 11.2530 -                    if ( page_get_owner(l1page) == d )
 11.2531 -                        adjust(l1page, 1, 1);
 11.2532 -                }
 11.2533 -            }
 11.2534 -
 11.2535 -            unmap_domain_mem(pt);
 11.2536 -            break;
 11.2537 -
 11.2538 -        case PGT_l1_page_table:
 11.2539 -            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
 11.2540 -                adjust( page, 1, 1 );
 11.2541 -
 11.2542 -            pt = map_domain_mem( pfn<<PAGE_SHIFT );
 11.2543 -
 11.2544 -            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
 11.2545 -            {
 11.2546 -                if ( pt[i] & _PAGE_PRESENT )
 11.2547 -                {
 11.2548 -                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
 11.2549 -                    struct pfn_info *l1page;
 11.2550 -
 11.2551 -                    if (l1pfn>max_page)
 11.2552 -                        continue;
 11.2553 -
 11.2554 -                    l1page = &frame_table[l1pfn];
 11.2555 -
 11.2556 -                    if ( (page_get_owner(l1page) != d) ||
 11.2557 -                         (l1pfn < 0x100) || (l1pfn > max_page) )
 11.2558 -                        continue;
 11.2559 -
 11.2560 -                    adjust(l1page, 1, 0);
 11.2561 -                }
 11.2562 -            }
 11.2563 -
 11.2564 -            unmap_domain_mem(pt);
 11.2565 -            break;
 11.2566 -        }
 11.2567 -
 11.2568 -
 11.2569 -        page->tlbflush_timestamp = 0;
 11.2570 -
 11.2571 -        list_ent = frame_table[pfn].list.next;
 11.2572 -    }
 11.2573 -
 11.2574 -    spin_unlock(&d->page_alloc_lock);
 11.2575 -
 11.2576 -    adjust(&frame_table[pagetable_val(
 11.2577 -        d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], 1, 1);
 11.2578 -
 11.2579 -    printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot );
 11.2580 -
 11.2581 -    if ( d != current->domain )
 11.2582 -        domain_unpause(d);
 11.2583 -}
 11.2584 -
 11.2585 -void audit_domains(void)
 11.2586 -{
 11.2587 -    struct domain *d;
 11.2588 -    for_each_domain ( d )
 11.2589 -        audit_domain(d);
 11.2590 -}
 11.2591 -
 11.2592 -void audit_domains_key(unsigned char key)
 11.2593 -{
 11.2594 -    audit_domains();
 11.2595 -}
 11.2596 -
 11.2597 -#endif
    12.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    12.2 +++ b/xen/arch/x86/mm.c	Tue Feb 08 15:13:51 2005 +0000
    12.3 @@ -0,0 +1,2598 @@
    12.4 +/* -*-  Mode:C; c-basic-offset:4; tab-width:4; indent-tabs-mode:nil -*- */
    12.5 +/******************************************************************************
    12.6 + * arch/x86/mm.c
    12.7 + * 
    12.8 + * Copyright (c) 2002-2005 K A Fraser
    12.9 + * Copyright (c) 2004 Christian Limpach
   12.10 + * 
   12.11 + * This program is free software; you can redistribute it and/or modify
   12.12 + * it under the terms of the GNU General Public License as published by
   12.13 + * the Free Software Foundation; either version 2 of the License, or
   12.14 + * (at your option) any later version.
   12.15 + * 
   12.16 + * This program is distributed in the hope that it will be useful,
   12.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   12.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   12.19 + * GNU General Public License for more details.
   12.20 + * 
   12.21 + * You should have received a copy of the GNU General Public License
   12.22 + * along with this program; if not, write to the Free Software
   12.23 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   12.24 + */
   12.25 +
   12.26 +/*
   12.27 + * A description of the x86 page table API:
   12.28 + * 
   12.29 + * Domains trap to do_mmu_update with a list of update requests.
   12.30 + * This is a list of (ptr, val) pairs, where the requested operation
   12.31 + * is *ptr = val.
   12.32 + * 
   12.33 + * Reference counting of pages:
   12.34 + * ----------------------------
   12.35 + * Each page has two refcounts: tot_count and type_count.
   12.36 + * 
   12.37 + * TOT_COUNT is the obvious reference count. It counts all uses of a
   12.38 + * physical page frame by a domain, including uses as a page directory,
   12.39 + * a page table, or simple mappings via a PTE. This count prevents a
   12.40 + * domain from releasing a frame back to the free pool when it still holds
   12.41 + * a reference to it.
   12.42 + * 
   12.43 + * TYPE_COUNT is more subtle. A frame can be put to one of three
   12.44 + * mutually-exclusive uses: it might be used as a page directory, or a
    12.45 + * page table, or it may be mapped writable by the domain [of course, a
    12.46 + * frame may also be used in none of these three ways!].
   12.47 + * So, type_count is a count of the number of times a frame is being 
   12.48 + * referred to in its current incarnation. Therefore, a page can only
   12.49 + * change its type when its type count is zero.
   12.50 + * 
   12.51 + * Pinning the page type:
   12.52 + * ----------------------
   12.53 + * The type of a page can be pinned/unpinned with the commands
   12.54 + * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
   12.55 + * pinning is not reference counted, so it can't be nested).
   12.56 + * This is useful to prevent a page's type count falling to zero, at which
   12.57 + * point safety checks would need to be carried out next time the count
   12.58 + * is increased again.
   12.59 + * 
   12.60 + * A further note on writable page mappings:
   12.61 + * -----------------------------------------
   12.62 + * For simplicity, the count of writable mappings for a page may not
   12.63 + * correspond to reality. The 'writable count' is incremented for every
   12.64 + * PTE which maps the page with the _PAGE_RW flag set. However, for
   12.65 + * write access to be possible the page directory entry must also have
   12.66 + * its _PAGE_RW bit set. We do not check this as it complicates the 
   12.67 + * reference counting considerably [consider the case of multiple
   12.68 + * directory entries referencing a single page table, some with the RW
   12.69 + * bit set, others not -- it starts getting a bit messy].
   12.70 + * In normal use, this simplification shouldn't be a problem.
   12.71 + * However, the logic can be added if required.
   12.72 + * 
   12.73 + * One more note on read-only page mappings:
   12.74 + * -----------------------------------------
   12.75 + * We want domains to be able to map pages for read-only access. The
   12.76 + * main reason is that page tables and directories should be readable
   12.77 + * by a domain, but it would not be safe for them to be writable.
   12.78 + * However, domains have free access to rings 1 & 2 of the Intel
   12.79 + * privilege model. In terms of page protection, these are considered
   12.80 + * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
   12.81 + * read-only restrictions are respected in supervisor mode -- if the 
   12.82 + * bit is clear then any mapped page is writable.
   12.83 + * 
   12.84 + * We get round this by always setting the WP bit and disallowing 
   12.85 + * updates to it. This is very unlikely to cause a problem for guest
   12.86 + * OS's, which will generally use the WP bit to simplify copy-on-write
   12.87 + * implementation (in that case, OS wants a fault when it writes to
   12.88 + * an application-supplied buffer).
   12.89 + */
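For orientation, the (ptr, val) request interface described above can be pictured with a small guest-side sketch. The structure layout and the exact hypercall signature are assumptions for illustration only and are not taken from this changeset:

/* Sketch only: one (ptr, val) update request of the kind handled by
 * do_mmu_update.  Names prefixed 'example_' are hypothetical. */
typedef struct {
    unsigned long ptr;  /* machine address of the PTE/PDE to write  */
    unsigned long val;  /* requested operation is simply *ptr = val */
} example_mmu_update_t;

static inline void example_queue_update(example_mmu_update_t *req,
                                        unsigned long pte_maddr,
                                        unsigned long new_val)
{
    req->ptr = pte_maddr;
    req->val = new_val;
    /* A guest would pass an array of such requests to the hypervisor's
     * mmu_update hypercall (exact signature assumed; see the public
     * interface headers). */
}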
   12.90 +
   12.91 +#include <xen/config.h>
   12.92 +#include <xen/init.h>
   12.93 +#include <xen/kernel.h>
   12.94 +#include <xen/lib.h>
   12.95 +#include <xen/mm.h>
   12.96 +#include <xen/sched.h>
   12.97 +#include <xen/errno.h>
   12.98 +#include <xen/perfc.h>
   12.99 +#include <xen/irq.h>
  12.100 +#include <xen/softirq.h>
  12.101 +#include <asm/shadow.h>
  12.102 +#include <asm/page.h>
  12.103 +#include <asm/flushtlb.h>
  12.104 +#include <asm/io.h>
  12.105 +#include <asm/uaccess.h>
  12.106 +#include <asm/domain_page.h>
  12.107 +#include <asm/ldt.h>
  12.108 +
  12.109 +#ifdef VERBOSE
  12.110 +#define MEM_LOG(_f, _a...)                           \
  12.111 +  printk("DOM%u: (file=mm.c, line=%d) " _f "\n", \
  12.112 +         current->domain->id , __LINE__ , ## _a )
  12.113 +#else
  12.114 +#define MEM_LOG(_f, _a...) ((void)0)
  12.115 +#endif
  12.116 +
  12.117 +static int alloc_l2_table(struct pfn_info *page);
  12.118 +static int alloc_l1_table(struct pfn_info *page);
  12.119 +static int get_page_from_pagenr(unsigned long page_nr, struct domain *d);
  12.120 +static int get_page_and_type_from_pagenr(unsigned long page_nr, 
  12.121 +                                         u32 type,
  12.122 +                                         struct domain *d);
  12.123 +
  12.124 +static void free_l2_table(struct pfn_info *page);
  12.125 +static void free_l1_table(struct pfn_info *page);
  12.126 +
  12.127 +static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
  12.128 +static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
  12.129 +
  12.130 +/* Used to defer flushing of memory structures. */
  12.131 +static struct {
  12.132 +#define DOP_FLUSH_TLB   (1<<0) /* Flush the TLB.                 */
  12.133 +#define DOP_RELOAD_LDT  (1<<1) /* Reload the LDT shadow mapping. */
  12.134 +    unsigned long  deferred_ops;
  12.135 +    /* If non-NULL, specifies a foreign subject domain for some operations. */
  12.136 +    struct domain *foreign;
  12.137 +} __cacheline_aligned percpu_info[NR_CPUS];
  12.138 +
  12.139 +/*
  12.140 + * Returns the current foreign domain; defaults to the currently-executing
  12.141 + * domain if a foreign override hasn't been specified.
  12.142 + */
  12.143 +#define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain)
  12.144 +
  12.145 +/* Private domain structs for DOMID_XEN and DOMID_IO. */
  12.146 +static struct domain *dom_xen, *dom_io;
  12.147 +
  12.148 +/* Frame table and its size in pages. */
  12.149 +struct pfn_info *frame_table;
  12.150 +unsigned long frame_table_size;
  12.151 +unsigned long max_page;
  12.152 +
  12.153 +void __init init_frametable(void)
  12.154 +{
  12.155 +    unsigned long i, p;
  12.156 +
  12.157 +    frame_table      = (struct pfn_info *)FRAMETABLE_VIRT_START;
  12.158 +    frame_table_size = max_page * sizeof(struct pfn_info);
  12.159 +    frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
  12.160 +
  12.161 +    for ( i = 0; i < frame_table_size; i += (4UL << 20) )
  12.162 +    {
  12.163 +        p = alloc_boot_pages(min(frame_table_size - i, 4UL << 20), 4UL << 20);
  12.164 +        if ( p == 0 )
  12.165 +            panic("Not enough memory for frame table\n");
  12.166 +        map_pages(idle_pg_table, FRAMETABLE_VIRT_START + i, p, 
  12.167 +                  4UL << 20, PAGE_HYPERVISOR);
  12.168 +    }
  12.169 +
  12.170 +    memset(frame_table, 0, frame_table_size);
  12.171 +}
  12.172 +
  12.173 +void arch_init_memory(void)
  12.174 +{
  12.175 +    extern void subarch_init_memory(struct domain *);
  12.176 +
  12.177 +    memset(percpu_info, 0, sizeof(percpu_info));
  12.178 +
  12.179 +    /*
  12.180 +     * Initialise our DOMID_XEN domain.
  12.181 +     * Any Xen-heap pages that we will allow to be mapped will have
  12.182 +     * their domain field set to dom_xen.
  12.183 +     */
  12.184 +    dom_xen = alloc_domain_struct();
  12.185 +    atomic_set(&dom_xen->refcnt, 1);
  12.186 +    dom_xen->id = DOMID_XEN;
  12.187 +
  12.188 +    /*
  12.189 +     * Initialise our DOMID_IO domain.
  12.190 +     * This domain owns no pages but is considered a special case when
  12.191 + * mapping I/O pages, as the mappings are made with the caller's privilege.
  12.192 +     */
  12.193 +    dom_io = alloc_domain_struct();
  12.194 +    atomic_set(&dom_io->refcnt, 1);
  12.195 +    dom_io->id = DOMID_IO;
  12.196 +
  12.197 +    subarch_init_memory(dom_xen);
  12.198 +}
  12.199 +
  12.200 +void write_ptbase(struct exec_domain *ed)
  12.201 +{
  12.202 +    struct domain *d = ed->domain;
  12.203 +    unsigned long pa;
  12.204 +
  12.205 +#ifdef CONFIG_VMX
  12.206 +    if ( unlikely(shadow_mode(d)) )
  12.207 +        pa = ((shadow_mode(d) == SHM_full_32) ?
  12.208 +              pagetable_val(ed->arch.monitor_table) :
  12.209 +              pagetable_val(ed->arch.shadow_table));
  12.210 +    else
  12.211 +        pa = pagetable_val(ed->arch.pagetable);
  12.212 +#else
  12.213 +    if ( unlikely(shadow_mode(d)) )
  12.214 +        pa = pagetable_val(ed->arch.shadow_table);    
  12.215 +    else
  12.216 +        pa = pagetable_val(ed->arch.pagetable);
  12.217 +#endif
  12.218 +
  12.219 +    write_cr3(pa);
  12.220 +}
  12.221 +
  12.222 +static void __invalidate_shadow_ldt(struct exec_domain *d)
  12.223 +{
  12.224 +    int i;
  12.225 +    unsigned long pfn;
  12.226 +    struct pfn_info *page;
  12.227 +    
  12.228 +    d->arch.shadow_ldt_mapcnt = 0;
  12.229 +
  12.230 +    for ( i = 16; i < 32; i++ )
  12.231 +    {
  12.232 +        pfn = l1_pgentry_to_pagenr(d->arch.perdomain_ptes[i]);
  12.233 +        if ( pfn == 0 ) continue;
  12.234 +        d->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
  12.235 +        page = &frame_table[pfn];
  12.236 +        ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
  12.237 +        ASSERT_PAGE_IS_DOMAIN(page, d->domain);
  12.238 +        put_page_and_type(page);
  12.239 +    }
  12.240 +
  12.241 +    /* Dispose of the (now possibly invalid) mappings from the TLB.  */
  12.242 +    percpu_info[d->processor].deferred_ops |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
  12.243 +}
  12.244 +
  12.245 +
  12.246 +static inline void invalidate_shadow_ldt(struct exec_domain *d)
  12.247 +{
  12.248 +    if ( d->arch.shadow_ldt_mapcnt != 0 )
  12.249 +        __invalidate_shadow_ldt(d);
  12.250 +}
  12.251 +
  12.252 +
  12.253 +static int alloc_segdesc_page(struct pfn_info *page)
  12.254 +{
  12.255 +    struct desc_struct *descs;
  12.256 +    int i;
  12.257 +
  12.258 +    descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
  12.259 +
  12.260 +    for ( i = 0; i < 512; i++ )
  12.261 +        if ( unlikely(!check_descriptor(&descs[i])) )
  12.262 +            goto fail;
  12.263 +
  12.264 +    unmap_domain_mem(descs);
  12.265 +    return 1;
  12.266 +
  12.267 + fail:
  12.268 +    unmap_domain_mem(descs);
  12.269 +    return 0;
  12.270 +}
  12.271 +
  12.272 +
  12.273 +/* Map shadow page at offset @off. */
  12.274 +int map_ldt_shadow_page(unsigned int off)
  12.275 +{
  12.276 +    struct exec_domain *ed = current;
  12.277 +    struct domain *d = ed->domain;
  12.278 +    unsigned long l1e;
  12.279 +
  12.280 +    if ( unlikely(in_irq()) )
  12.281 +        BUG();
  12.282 +
  12.283 +    __get_user(l1e, (unsigned long *)
  12.284 +               &linear_pg_table[l1_linear_offset(ed->arch.ldt_base) + off]);
  12.285 +
  12.286 +    if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
  12.287 +         unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT], 
  12.288 +                                     d, PGT_ldt_page)) )
  12.289 +        return 0;
  12.290 +
  12.291 +    ed->arch.perdomain_ptes[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
  12.292 +    ed->arch.shadow_ldt_mapcnt++;
  12.293 +
  12.294 +    return 1;
  12.295 +}
  12.296 +
  12.297 +
  12.298 +static int get_page_from_pagenr(unsigned long page_nr, struct domain *d)
  12.299 +{
  12.300 +    struct pfn_info *page = &frame_table[page_nr];
  12.301 +
  12.302 +    if ( unlikely(!pfn_is_ram(page_nr)) )
  12.303 +    {
  12.304 +        MEM_LOG("Pfn %08lx is not RAM", page_nr);
  12.305 +        return 0;
  12.306 +    }
  12.307 +
  12.308 +    if ( unlikely(!get_page(page, d)) )
  12.309 +    {
  12.310 +        MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
  12.311 +        return 0;
  12.312 +    }
  12.313 +
  12.314 +    return 1;
  12.315 +}
  12.316 +
  12.317 +
  12.318 +static int get_page_and_type_from_pagenr(unsigned long page_nr, 
  12.319 +                                         u32 type,
  12.320 +                                         struct domain *d)
  12.321 +{
  12.322 +    struct pfn_info *page = &frame_table[page_nr];
  12.323 +
  12.324 +    if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
  12.325 +        return 0;
  12.326 +
  12.327 +    if ( unlikely(!get_page_type(page, type)) )
  12.328 +    {
  12.329 +#ifdef VERBOSE
  12.330 +        if ( (type & PGT_type_mask) != PGT_l1_page_table )
  12.331 +            MEM_LOG("Bad page type for pfn %08lx (%08x)", 
  12.332 +                    page_nr, page->u.inuse.type_info);
  12.333 +#endif
  12.334 +        put_page(page);
  12.335 +        return 0;
  12.336 +    }
  12.337 +
  12.338 +    return 1;
  12.339 +}
  12.340 +
  12.341 +
  12.342 +/*
  12.343 + * We allow L2 tables to map each other (a.k.a. linear page tables). This
  12.344 + * needs some special care with reference counts and access permissions:
  12.345 + *  1. The mapping entry must be read-only, or the guest may get write access
  12.346 + *     to its own PTEs.
  12.347 + *  2. We must only bump the reference counts for an *already validated*
  12.348 + *     L2 table, or we can end up in a deadlock in get_page_type() by waiting
  12.349 + *     on a validation that is required to complete that validation.
  12.350 + *  3. We only need to increment the reference counts for the mapped page
  12.351 + *     frame if it is mapped by a different L2 table. This is sufficient and
  12.352 + *     also necessary to allow validation of an L2 table mapping itself.
  12.353 + */
  12.354 +static int 
  12.355 +get_linear_pagetable(
  12.356 +    l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
  12.357 +{
  12.358 +    u32 x, y;
  12.359 +    struct pfn_info *page;
  12.360 +
  12.361 +    if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
  12.362 +    {
  12.363 +        MEM_LOG("Attempt to create linear p.t. with write perms");
  12.364 +        return 0;
  12.365 +    }
  12.366 +
  12.367 +    if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
  12.368 +    {
  12.369 +        /* Make sure the mapped frame belongs to the correct domain. */
  12.370 +        if ( unlikely(!get_page_from_pagenr(l2_pgentry_to_pagenr(l2e), d)) )
  12.371 +            return 0;
  12.372 +
  12.373 +        /*
  12.374 +         * Make sure that the mapped frame is an already-validated L2 table. 
  12.375 +         * If so, atomically increment the count (checking for overflow).
  12.376 +         */
  12.377 +        page = &frame_table[l2_pgentry_to_pagenr(l2e)];
  12.378 +        y = page->u.inuse.type_info;
  12.379 +        do {
  12.380 +            x = y;
  12.381 +            if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||
  12.382 +                 unlikely((x & (PGT_type_mask|PGT_validated)) != 
  12.383 +                          (PGT_l2_page_table|PGT_validated)) )
  12.384 +            {
  12.385 +                put_page(page);
  12.386 +                return 0;
  12.387 +            }
  12.388 +        }
  12.389 +        while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );
  12.390 +    }
  12.391 +
  12.392 +    return 1;
  12.393 +}
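To make rule 1 above concrete: a guest that wants a linear (self-referencing) page table installs an L2 entry pointing back at the L2 frame with the _PAGE_RW bit clear. A hypothetical helper, shown only for illustration and not part of this file:

/* Illustration only: build a read-only linear-mapping L2 entry for frame
 * 'l2_pfn'.  get_linear_pagetable() above rejects such an entry if it
 * carries write permission. */
static inline unsigned long example_linear_l2e(unsigned long l2_pfn)
{
    return (l2_pfn << PAGE_SHIFT) | _PAGE_PRESENT;   /* no _PAGE_RW */
}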
  12.394 +
  12.395 +
  12.396 +static int
  12.397 +get_page_from_l1e(
  12.398 +    l1_pgentry_t l1e, struct domain *d)
  12.399 +{
  12.400 +    unsigned long l1v = l1_pgentry_val(l1e);
  12.401 +    unsigned long pfn = l1_pgentry_to_pagenr(l1e);
  12.402 +    struct pfn_info *page = &frame_table[pfn];
  12.403 +    extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn);
  12.404 +
  12.405 +    if ( !(l1v & _PAGE_PRESENT) )
  12.406 +        return 1;
  12.407 +
  12.408 +    if ( unlikely(l1v & (_PAGE_GLOBAL|_PAGE_PAT)) )
  12.409 +    {
  12.410 +        MEM_LOG("Bad L1 type settings %04lx", l1v & (_PAGE_GLOBAL|_PAGE_PAT));
  12.411 +        return 0;
  12.412 +    }
  12.413 +
  12.414 +    if ( unlikely(!pfn_is_ram(pfn)) )
  12.415 +    {
  12.416 +        /* Revert to caller privileges if FD == DOMID_IO. */
  12.417 +        if ( d == dom_io )
  12.418 +            d = current->domain;
  12.419 +
  12.420 +        if ( IS_PRIV(d) )
  12.421 +            return 1;
  12.422 +
  12.423 +        if ( IS_CAPABLE_PHYSDEV(d) )
  12.424 +            return domain_iomem_in_pfn(d, pfn);
  12.425 +
  12.426 +        MEM_LOG("Non-privileged attempt to map I/O space %08lx", pfn);
  12.427 +        return 0;
  12.428 +    }
  12.429 +
  12.430 +    return ((l1v & _PAGE_RW) ?
  12.431 +            get_page_and_type(page, d, PGT_writable_page) :
  12.432 +            get_page(page, d));
  12.433 +}
  12.434 +
  12.435 +
  12.436 +/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
  12.437 +static int 
  12.438 +get_page_from_l2e(
  12.439 +    l2_pgentry_t l2e, unsigned long pfn,
  12.440 +    struct domain *d, unsigned long va_idx)
  12.441 +{
  12.442 +    int rc;
  12.443 +
  12.444 +    if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) )
  12.445 +        return 1;
  12.446 +
  12.447 +    if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
  12.448 +    {
  12.449 +        MEM_LOG("Bad L2 page type settings %04lx",
  12.450 +                l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
  12.451 +        return 0;
  12.452 +    }
  12.453 +
  12.454 +    rc = get_page_and_type_from_pagenr(
  12.455 +        l2_pgentry_to_pagenr(l2e), 
  12.456 +        PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
  12.457 +
  12.458 +    if ( unlikely(!rc) )
  12.459 +        return get_linear_pagetable(l2e, pfn, d);
  12.460 +
  12.461 +    return 1;
  12.462 +}
  12.463 +
  12.464 +
  12.465 +static void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
  12.466 +{
  12.467 +    unsigned long    l1v  = l1_pgentry_val(l1e);
  12.468 +    unsigned long    pfn  = l1_pgentry_to_pagenr(l1e);
  12.469 +    struct pfn_info *page = &frame_table[pfn];
  12.470 +    struct domain   *e;
  12.471 +
  12.472 +    if ( !(l1v & _PAGE_PRESENT) || !pfn_is_ram(pfn) )
  12.473 +        return;
  12.474 +
  12.475 +    e = page_get_owner(page);
  12.476 +    if ( unlikely(e != d) )
  12.477 +    {
  12.478 +        /*
  12.479 +         * Unmap a foreign page that may have been mapped via a grant table.
  12.480 +         * Note that this can fail for a privileged domain that can map foreign
  12.481 +         * pages via MMUEXT_SET_FOREIGNDOM. Such domains can have some mappings
  12.482 +         * counted via a grant entry and some counted directly in the page
  12.483 +         * structure's reference count. Note that reference counts won't get
  12.484 +         * dangerously confused as long as we always try to decrement the
  12.485 +         * grant entry first. We may end up with a mismatch between which
  12.486 +         * mappings and which unmappings are counted via the grant entry, but
  12.487 +         * really it doesn't matter as privileged domains have carte blanche.
  12.488 +         */
  12.489 +        if ( likely(gnttab_check_unmap(e, d, pfn, !(l1v & _PAGE_RW))) )
  12.490 +            return;
  12.491 +        /* Assume this mapping was made via MMUEXT_SET_FOREIGNDOM... */
  12.492 +    }
  12.493 +
  12.494 +    if ( l1v & _PAGE_RW )
  12.495 +    {
  12.496 +        put_page_and_type(page);
  12.497 +    }
  12.498 +    else
  12.499 +    {
  12.500 +        /* We expect this to be rare, so we blow away the entire shadow LDT. */
  12.501 +        if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) == 
  12.502 +                       PGT_ldt_page)) &&
  12.503 +             unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) )
  12.504 +            invalidate_shadow_ldt(e->exec_domain[0]);
  12.505 +        put_page(page);
  12.506 +    }
  12.507 +}
  12.508 +
  12.509 +
  12.510 +/*
  12.511 + * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
  12.512 + * Note also that this automatically deals correctly with linear p.t.'s.
  12.513 + */
  12.514 +static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
  12.515 +{
  12.516 +    if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) && 
  12.517 +         ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
  12.518 +        put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
  12.519 +}
  12.520 +
  12.521 +
  12.522 +static int alloc_l2_table(struct pfn_info *page)
  12.523 +{
  12.524 +    struct domain *d = page_get_owner(page);
  12.525 +    unsigned long  page_nr = page_to_pfn(page);
  12.526 +    l2_pgentry_t  *pl2e;
  12.527 +    int            i;
  12.528 +   
  12.529 +    pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
  12.530 +
  12.531 +    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  12.532 +        if ( unlikely(!get_page_from_l2e(pl2e[i], page_nr, d, i)) )
  12.533 +            goto fail;
  12.534 +
  12.535 +#if defined(__i386__)
  12.536 +    /* Now we add our private high mappings. */
  12.537 +    memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
  12.538 +           &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
  12.539 +           HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
  12.540 +    pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
  12.541 +        mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
  12.542 +    pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
  12.543 +        mk_l2_pgentry(__pa(page_get_owner(page)->arch.mm_perdomain_pt) | 
  12.544 +                      __PAGE_HYPERVISOR);
  12.545 +#endif
  12.546 +
  12.547 +    unmap_domain_mem(pl2e);
  12.548 +    return 1;
  12.549 +
  12.550 + fail:
  12.551 +    while ( i-- > 0 )
  12.552 +        put_page_from_l2e(pl2e[i], page_nr);
  12.553 +
  12.554 +    unmap_domain_mem(pl2e);
  12.555 +    return 0;
  12.556 +}
  12.557 +
  12.558 +
  12.559 +static int alloc_l1_table(struct pfn_info *page)
  12.560 +{
  12.561 +    struct domain *d = page_get_owner(page);
  12.562 +    unsigned long  page_nr = page_to_pfn(page);
  12.563 +    l1_pgentry_t  *pl1e;
  12.564 +    int            i;
  12.565 +
  12.566 +    pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
  12.567 +
  12.568 +    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
  12.569 +        if ( unlikely(!get_page_from_l1e(pl1e[i], d)) )
  12.570 +            goto fail;
  12.571 +
  12.572 +    unmap_domain_mem(pl1e);
  12.573 +    return 1;
  12.574 +
  12.575 + fail:
  12.576 +    while ( i-- > 0 )
  12.577 +        put_page_from_l1e(pl1e[i], d);
  12.578 +
  12.579 +    unmap_domain_mem(pl1e);
  12.580 +    return 0;
  12.581 +}
  12.582 +
  12.583 +
  12.584 +static void free_l2_table(struct pfn_info *page)
  12.585 +{
  12.586 +    unsigned long page_nr = page - frame_table;
  12.587 +    l2_pgentry_t *pl2e;
  12.588 +    int i;
  12.589 +
  12.590 +    pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
  12.591 +
  12.592 +    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
  12.593 +        put_page_from_l2e(pl2e[i], page_nr);
  12.594 +
  12.595 +    unmap_domain_mem(pl2e);
  12.596 +}
  12.597 +
  12.598 +
  12.599 +static void free_l1_table(struct pfn_info *page)
  12.600 +{
  12.601 +    struct domain *d = page_get_owner(page);
  12.602 +    unsigned long page_nr = page - frame_table;
  12.603 +    l1_pgentry_t *pl1e;
  12.604 +    int i;
  12.605 +
  12.606 +    pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
  12.607 +
  12.608 +    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
  12.609 +        put_page_from_l1e(pl1e[i], d);
  12.610 +
  12.611 +    unmap_domain_mem(pl1e);
  12.612 +}
  12.613 +
  12.614 +
  12.615 +static inline int update_l2e(l2_pgentry_t *pl2e, 
  12.616 +                             l2_pgentry_t  ol2e, 
  12.617 +                             l2_pgentry_t  nl2e)
  12.618 +{
  12.619 +    unsigned long o = cmpxchg((unsigned long *)pl2e, 
  12.620 +                              l2_pgentry_val(ol2e), 
  12.621 +                              l2_pgentry_val(nl2e));
  12.622 +    if ( o != l2_pgentry_val(ol2e) )
  12.623 +        MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
  12.624 +                l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
  12.625 +    return (o == l2_pgentry_val(ol2e));
  12.626 +}
  12.627 +
  12.628 +
  12.629 +/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
  12.630 +static int mod_l2_entry(l2_pgentry_t *pl2e, 
  12.631 +                        l2_pgentry_t nl2e, 
  12.632 +                        unsigned long pfn)
  12.633 +{
  12.634 +    l2_pgentry_t ol2e;
  12.635 +    unsigned long _ol2e;
  12.636 +
  12.637 +    if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
  12.638 +                  DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
  12.639 +    {
  12.640 +        MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
  12.641 +        return 0;
  12.642 +    }
  12.643 +
  12.644 +    if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
  12.645 +        return 0;
  12.646 +    ol2e = mk_l2_pgentry(_ol2e);
  12.647 +
  12.648 +    if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
  12.649 +    {
  12.650 +        /* Same mapping (bits 12-31) and presence (bit 0)? */
  12.651 +        if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) == 0 )
  12.652 +            return update_l2e(pl2e, ol2e, nl2e);
  12.653 +
  12.654 +        if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
  12.655 +                                        ((unsigned long)pl2e & 
  12.656 +                                         ~PAGE_MASK) >> 2)) )
  12.657 +            return 0;
  12.658 +
  12.659 +        if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
  12.660 +        {
  12.661 +            put_page_from_l2e(nl2e, pfn);
  12.662 +            return 0;
  12.663 +        }
  12.664 +        
  12.665 +        put_page_from_l2e(ol2e, pfn);
  12.666 +        return 1;
  12.667 +    }
  12.668 +
  12.669 +    if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
  12.670 +        return 0;
  12.671 +
  12.672 +    put_page_from_l2e(ol2e, pfn);
  12.673 +    return 1;
  12.674 +}
  12.675 +
  12.676 +
  12.677 +static inline int update_l1e(l1_pgentry_t *pl1e, 
  12.678 +                             l1_pgentry_t  ol1e, 
  12.679 +                             l1_pgentry_t  nl1e)
  12.680 +{
  12.681 +    unsigned long o = l1_pgentry_val(ol1e);
  12.682 +    unsigned long n = l1_pgentry_val(nl1e);
  12.683 +
  12.684 +    if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
  12.685 +         unlikely(o != l1_pgentry_val(ol1e)) )
  12.686 +    {
  12.687 +        MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
  12.688 +                l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
  12.689 +        return 0;
  12.690 +    }
  12.691 +
  12.692 +    return 1;
  12.693 +}
  12.694 +
  12.695 +
  12.696 +/* Update the L1 entry at pl1e to new value nl1e. */
  12.697 +static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
  12.698 +{
  12.699 +    l1_pgentry_t ol1e;
  12.700 +    unsigned long _ol1e;
  12.701 +    struct domain *d = current->domain;
  12.702 +
  12.703 +    if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
  12.704 +    {
  12.705 +        MEM_LOG("Bad get_user\n");
  12.706 +        return 0;
  12.707 +    }
  12.708 +    
  12.709 +    ol1e = mk_l1_pgentry(_ol1e);
  12.710 +
  12.711 +    if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
  12.712 +    {
  12.713 +        /* Same mapping (bits 12-31), r/w (bit 1), and presence (bit 0)? */
  12.714 +        if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) == 0 )
  12.715 +            return update_l1e(pl1e, ol1e, nl1e);
  12.716 +
  12.717 +        if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
  12.718 +            return 0;
  12.719 +        
  12.720 +        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
  12.721 +        {
  12.722 +            put_page_from_l1e(nl1e, d);
  12.723 +            return 0;
  12.724 +        }
  12.725 +        
  12.726 +        put_page_from_l1e(ol1e, d);
  12.727 +        return 1;
  12.728 +    }
  12.729 +
  12.730 +    if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
  12.731 +        return 0;
  12.732 +    
  12.733 +    put_page_from_l1e(ol1e, d);
  12.734 +    return 1;
  12.735 +}
  12.736 +
  12.737 +
  12.738 +int alloc_page_type(struct pfn_info *page, unsigned int type)
  12.739 +{
  12.740 +    switch ( type )
  12.741 +    {
  12.742 +    case PGT_l1_page_table:
  12.743 +        return alloc_l1_table(page);
  12.744 +    case PGT_l2_page_table:
  12.745 +        return alloc_l2_table(page);
  12.746 +    case PGT_gdt_page:
  12.747 +    case PGT_ldt_page:
  12.748 +        return alloc_segdesc_page(page);
  12.749 +    default:
  12.750 +        printk("Bad type in alloc_page_type %x t=%x c=%x\n", 
  12.751 +               type, page->u.inuse.type_info,
  12.752 +               page->count_info);
  12.753 +        BUG();
  12.754 +    }
  12.755 +
  12.756 +    return 0;
  12.757 +}
  12.758 +
  12.759 +
  12.760 +void free_page_type(struct pfn_info *page, unsigned int type)
  12.761 +{
  12.762 +    struct domain *d = page_get_owner(page);
  12.763 +
  12.764 +    switch ( type )
  12.765 +    {
  12.766 +    case PGT_l1_page_table:
  12.767 +        free_l1_table(page);
  12.768 +        break;
  12.769 +
  12.770 +    case PGT_l2_page_table:
  12.771 +        free_l2_table(page);
  12.772 +        break;
  12.773 +
  12.774 +    default:
  12.775 +        BUG();
  12.776 +    }
  12.777 +
  12.778 +    if ( unlikely(shadow_mode(d)) && 
  12.779 +         (get_shadow_status(d, page_to_pfn(page)) & PSH_shadowed) )
  12.780 +    {
  12.781 +        unshadow_table(page_to_pfn(page), type);
  12.782 +        put_shadow_status(d);
  12.783 +    }
  12.784 +}
  12.785 +
  12.786 +
  12.787 +void put_page_type(struct pfn_info *page)
  12.788 +{
  12.789 +    u32 nx, x, y = page->u.inuse.type_info;
  12.790 +
  12.791 + again:
  12.792 +    do {
  12.793 +        x  = y;
  12.794 +        nx = x - 1;
  12.795 +
  12.796 +        ASSERT((x & PGT_count_mask) != 0);
  12.797 +
  12.798 +        /*
  12.799 +         * The page should always be validated while a reference is held. The 
  12.800 +         * exception is during domain destruction, when we forcibly invalidate 
  12.801 +         * page-table pages if we detect a referential loop.
  12.802 +         * See domain.c:relinquish_list().
  12.803 +         */
  12.804 +        ASSERT((x & PGT_validated) || 
  12.805 +               test_bit(DF_DYING, &page_get_owner(page)->d_flags));
  12.806 +
  12.807 +        if ( unlikely((nx & PGT_count_mask) == 0) )
  12.808 +        {
  12.809 +            /* Record TLB information for flush later. Races are harmless. */
  12.810 +            page->tlbflush_timestamp = tlbflush_current_time();
  12.811 +            
  12.812 +            if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
  12.813 +                 likely(nx & PGT_validated) )
  12.814 +            {
  12.815 +                /*
  12.816 +                 * Page-table pages must be unvalidated when count is zero. The
  12.817 +                 * 'free' is safe because the refcnt is non-zero and validated
  12.818 +                 * bit is clear => other ops will spin or fail.
  12.819 +                 */
  12.820 +                if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 
  12.821 +                                           x & ~PGT_validated)) != x) )
  12.822 +                    goto again;
  12.823 +                /* We cleared the 'valid bit' so we do the clear up. */
  12.824 +                free_page_type(page, x & PGT_type_mask);
  12.825 +                /* Carry on, but with the 'valid bit' now clear. */
  12.826 +                x  &= ~PGT_validated;
  12.827 +                nx &= ~PGT_validated;
  12.828 +            }
  12.829 +        }
  12.830 +        else if ( unlikely((nx & (PGT_pinned | PGT_count_mask)) == 
  12.831 +                           (PGT_pinned | 1)) )
  12.832 +        {
  12.833 +            /* Page is now only pinned. Make the back pointer mutable again. */
  12.834 +            nx |= PGT_va_mutable;
  12.835 +        }
  12.836 +    }
  12.837 +    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
  12.838 +}
  12.839 +
  12.840 +
  12.841 +int get_page_type(struct pfn_info *page, u32 type)
  12.842 +{
  12.843 +    u32 nx, x, y = page->u.inuse.type_info;
  12.844 +
  12.845 + again:
  12.846 +    do {
  12.847 +        x  = y;
  12.848 +        nx = x + 1;
  12.849 +        if ( unlikely((nx & PGT_count_mask) == 0) )
  12.850 +        {
  12.851 +            MEM_LOG("Type count overflow on pfn %08lx\n", page_to_pfn(page));
  12.852 +            return 0;
  12.853 +        }
  12.854 +        else if ( unlikely((x & PGT_count_mask) == 0) )
  12.855 +        {
  12.856 +            if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
  12.857 +            {
  12.858 +                /*
  12.859 +                 * On a type change we check whether stale TLB entries need to be
  12.860 +                 * flushed. This may be unnecessary (e.g., page was GDT/LDT) but those
  12.861 +                 * circumstances should be very rare.
  12.862 +                 */
  12.863 +                struct domain *d = page_get_owner(page);
  12.864 +                if ( unlikely(NEED_FLUSH(tlbflush_time[d->exec_domain[0]->processor],
  12.865 +                                         page->tlbflush_timestamp)) )
  12.866 +                {
  12.867 +                    perfc_incr(need_flush_tlb_flush);
  12.868 +                    flush_tlb_cpu(d->exec_domain[0]->processor);
  12.869 +                }
  12.870 +
  12.871 +                /* We lose existing type, back pointer, and validity. */
  12.872 +                nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
  12.873 +                nx |= type;
  12.874 +
  12.875 +                /* No special validation needed for writable pages. */
  12.876 +                /* Page tables and GDT/LDT need to be scanned for validity. */
  12.877 +                if ( type == PGT_writable_page )
  12.878 +                    nx |= PGT_validated;
  12.879 +            }
  12.880 +        }
  12.881 +        else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
  12.882 +        {
  12.883 +            if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
  12.884 +            {
  12.885 +                if ( ((x & PGT_type_mask) != PGT_l2_page_table) ||
  12.886 +                     ((type & PGT_type_mask) != PGT_l1_page_table) )
  12.887 +                    MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %08lx\n",
  12.888 +                            x & PGT_type_mask, type, page_to_pfn(page));
  12.889 +                return 0;
  12.890 +            }
  12.891 +            else if ( (x & PGT_va_mask) == PGT_va_mutable )
  12.892 +            {
  12.893 +                /* The va backpointer is mutable, hence we update it. */
  12.894 +                nx &= ~PGT_va_mask;
  12.895 +                nx |= type; /* we know the actual type is correct */
  12.896 +            }
  12.897 +            else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask)) )
  12.898 +            {
  12.899 +                /* This table is potentially mapped at multiple locations. */
  12.900 +                nx &= ~PGT_va_mask;
  12.901 +                nx |= PGT_va_unknown;
  12.902 +            }
  12.903 +        }
  12.904 +        else if ( unlikely(!(x & PGT_validated)) )
  12.905 +        {
  12.906 +            /* Someone else is updating validation of this page. Wait... */
  12.907 +            while ( (y = page->u.inuse.type_info) == x )
  12.908 +            {
  12.909 +                rep_nop();
  12.910 +                barrier();
  12.911 +            }
  12.912 +            goto again;
  12.913 +        }
  12.914 +    }
  12.915 +    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
  12.916 +
  12.917 +    if ( unlikely(!(nx & PGT_validated)) )
  12.918 +    {
  12.919 +        /* Try to validate page type; drop the new reference on failure. */
  12.920 +        if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
  12.921 +        {
  12.922 +            MEM_LOG("Error while validating pfn %08lx for type %08x."
  12.923 +                    " caf=%08x taf=%08x\n",
  12.924 +                    page_to_pfn(page), type,
  12.925 +                    page->count_info,
  12.926 +                    page->u.inuse.type_info);
  12.927 +            /* No one else can get a reference. We hold the only ref. */
  12.928 +            page->u.inuse.type_info = 0;
  12.929 +            return 0;
  12.930 +        }
  12.931 +
  12.932 +        /* No one else is updating simultaneously. */
  12.933 +        __set_bit(_PGT_validated, &page->u.inuse.type_info);
  12.934 +    }
  12.935 +
  12.936 +    return 1;
  12.937 +}
  12.938 +
  12.939 +
  12.940 +int new_guest_cr3(unsigned long pfn)
  12.941 +{
  12.942 +    struct exec_domain *ed = current;
  12.943 +    struct domain *d = ed->domain;
  12.944 +    int okay, cpu = smp_processor_id();
  12.945 +    unsigned long old_base_pfn;
  12.946 +    
  12.947 +    okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table, d);
  12.948 +    if ( likely(okay) )
  12.949 +    {
  12.950 +        invalidate_shadow_ldt(ed);
  12.951 +
  12.952 +        percpu_info[cpu].deferred_ops &= ~DOP_FLUSH_TLB;
  12.953 +        old_base_pfn = pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT;
  12.954 +        ed->arch.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
  12.955 +
  12.956 +        shadow_mk_pagetable(ed);
  12.957 +
  12.958 +        write_ptbase(ed);
  12.959 +
  12.960 +        put_page_and_type(&frame_table[old_base_pfn]);
  12.961 +    }
  12.962 +    else
  12.963 +    {
  12.964 +        MEM_LOG("Error while installing new baseptr %08lx", pfn);
  12.965 +    }
  12.966 +
  12.967 +    return okay;
  12.968 +}
  12.969 +
  12.970 +static int do_extended_command(unsigned long ptr, unsigned long val)
  12.971 +{
  12.972 +    int okay = 1, cpu = smp_processor_id();
  12.973 +    unsigned int cmd = val & MMUEXT_CMD_MASK;
  12.974 +    unsigned long pfn = ptr >> PAGE_SHIFT;
  12.975 +    struct pfn_info *page = &frame_table[pfn];
  12.976 +    struct exec_domain *ed = current;
  12.977 +    struct domain *d = ed->domain, *nd, *e;
  12.978 +    u32 x, y;
  12.979 +    domid_t domid;
  12.980 +    grant_ref_t gntref;
  12.981 +
  12.982 +    switch ( cmd )
  12.983 +    {
  12.984 +    case MMUEXT_PIN_L1_TABLE:
  12.985 +    case MMUEXT_PIN_L2_TABLE:
  12.986 +        /*
  12.987 +         * We insist that, if you pin an L1 page, it's the first thing that
  12.988 +         * you do to it. This is because we require the backptr to still be
  12.989 +         * mutable. This assumption seems safe.
  12.990 +         */
  12.991 +        okay = get_page_and_type_from_pagenr(
  12.992 +            pfn, 
  12.993 +            ((cmd==MMUEXT_PIN_L2_TABLE) ? 
  12.994 +             PGT_l2_page_table : (PGT_l1_page_table|PGT_va_mutable)),
  12.995 +            FOREIGNDOM);
  12.996 +
  12.997 +        if ( unlikely(!okay) )
  12.998 +        {
  12.999 +            MEM_LOG("Error while pinning pfn %08lx", pfn);
 12.1000 +            break;
 12.1001 +        }
 12.1002 +
 12.1003 +        if ( unlikely(test_and_set_bit(_PGT_pinned,
 12.1004 +                                       &page->u.inuse.type_info)) )
 12.1005 +        {
 12.1006 +            MEM_LOG("Pfn %08lx already pinned", pfn);
 12.1007 +            put_page_and_type(page);
 12.1008 +            okay = 0;
 12.1009 +            break;
 12.1010 +        }
 12.1011 +
 12.1012 +        break;
 12.1013 +
 12.1014 +    case MMUEXT_UNPIN_TABLE:
 12.1015 +        if ( unlikely(!(okay = get_page_from_pagenr(pfn, FOREIGNDOM))) )
 12.1016 +        {
 12.1017 +            MEM_LOG("Page %08lx bad domain (dom=%p)",
 12.1018 +                    ptr, page_get_owner(page));
 12.1019 +        }
 12.1020 +        else if ( likely(test_and_clear_bit(_PGT_pinned, 
 12.1021 +                                            &page->u.inuse.type_info)) )
 12.1022 +        {
 12.1023 +            put_page_and_type(page);
 12.1024 +            put_page(page);
 12.1025 +        }
 12.1026 +        else
 12.1027 +        {
 12.1028 +            okay = 0;
 12.1029 +            put_page(page);
 12.1030 +            MEM_LOG("Pfn %08lx not pinned", pfn);
 12.1031 +        }
 12.1032 +        break;
 12.1033 +
 12.1034 +    case MMUEXT_NEW_BASEPTR:
 12.1035 +        okay = new_guest_cr3(pfn);
 12.1036 +        break;
 12.1037 +        
 12.1038 +    case MMUEXT_TLB_FLUSH:
 12.1039 +        percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
 12.1040 +        break;
 12.1041 +    
 12.1042 +    case MMUEXT_INVLPG:
 12.1043 +        __flush_tlb_one(ptr);
 12.1044 +        break;
 12.1045 +
 12.1046 +    case MMUEXT_FLUSH_CACHE:
 12.1047 +        if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) )
 12.1048 +        {
 12.1049 +            MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n");
 12.1050 +            okay = 0;
 12.1051 +        }
 12.1052 +        else
 12.1053 +        {
 12.1054 +            wbinvd();
 12.1055 +        }
 12.1056 +        break;
 12.1057 +
 12.1058 +    case MMUEXT_SET_LDT:
 12.1059 +    {
 12.1060 +        unsigned long ents = val >> MMUEXT_CMD_SHIFT;
 12.1061 +        if ( ((ptr & (PAGE_SIZE-1)) != 0) || 
 12.1062 +             (ents > 8192) ||
 12.1063 +             ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
 12.1064 +             ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
 12.1065 +        {
 12.1066 +            okay = 0;
 12.1067 +            MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
 12.1068 +        }
 12.1069 +        else if ( (ed->arch.ldt_ents != ents) || 
 12.1070 +                  (ed->arch.ldt_base != ptr) )
 12.1071 +        {
 12.1072 +            invalidate_shadow_ldt(ed);
 12.1073 +            ed->arch.ldt_base = ptr;
 12.1074 +            ed->arch.ldt_ents = ents;
 12.1075 +            load_LDT(ed);
 12.1076 +            percpu_info[cpu].deferred_ops &= ~DOP_RELOAD_LDT;
 12.1077 +            if ( ents != 0 )
 12.1078 +                percpu_info[cpu].deferred_ops |= DOP_RELOAD_LDT;
 12.1079 +        }
 12.1080 +        break;
 12.1081 +    }
 12.1082 +
 12.1083 +    case MMUEXT_SET_FOREIGNDOM:
 12.1084 +        domid = (domid_t)(val >> 16);
 12.1085 +
 12.1086 +        if ( (e = percpu_info[cpu].foreign) != NULL )
 12.1087 +            put_domain(e);
 12.1088 +        percpu_info[cpu].foreign = NULL;
 12.1089 +
 12.1090 +        if ( !IS_PRIV(d) )
 12.1091 +        {
 12.1092 +            switch ( domid )
 12.1093 +            {
 12.1094 +            case DOMID_IO:
 12.1095 +                get_knownalive_domain(dom_io);
 12.1096 +                percpu_info[cpu].foreign = dom_io;
 12.1097 +                break;
 12.1098 +            default:
 12.1099 +                MEM_LOG("Dom %u cannot set foreign dom\n", d->id);
 12.1100 +                okay = 0;
 12.1101 +                break;
 12.1102 +            }
 12.1103 +        }
 12.1104 +        else
 12.1105 +        {
 12.1106 +            percpu_info[cpu].foreign = e = find_domain_by_id(domid);
 12.1107 +            if ( e == NULL )
 12.1108 +            {
 12.1109 +                switch ( domid )
 12.1110 +                {
 12.1111 +                case DOMID_XEN:
 12.1112 +                    get_knownalive_domain(dom_xen);
 12.1113 +                    percpu_info[cpu].foreign = dom_xen;
 12.1114 +                    break;
 12.1115 +                case DOMID_IO:
 12.1116 +                    get_knownalive_domain(dom_io);
 12.1117 +                    percpu_info[cpu].foreign = dom_io;
 12.1118 +                    break;
 12.1119 +                default:
 12.1120 +                    MEM_LOG("Unknown domain '%u'", domid);
 12.1121 +                    okay = 0;
 12.1122 +                    break;
 12.1123 +                }
 12.1124 +            }
 12.1125 +        }
 12.1126 +        break;
 12.1127 +
 12.1128 +    case MMUEXT_TRANSFER_PAGE:
 12.1129 +        domid  = (domid_t)(val >> 16);
 12.1130 +        gntref = (grant_ref_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
 12.1131 +        
 12.1132 +        if ( unlikely(IS_XEN_HEAP_FRAME(page)) ||
 12.1133 +             unlikely(!pfn_is_ram(pfn)) ||
 12.1134 +             unlikely((e = find_domain_by_id(domid)) == NULL) )
 12.1135 +        {
 12.1136 +            MEM_LOG("Bad frame (%08lx) or bad domid (%d).\n", pfn, domid);
 12.1137 +            okay = 0;
 12.1138 +            break;
 12.1139 +        }
 12.1140 +
 12.1141 +        spin_lock(&d->page_alloc_lock);
 12.1142 +
 12.1143 +        /*
 12.1144 +         * The tricky bit: atomically release ownership while there is just one
 12.1145 +         * benign reference to the page (PGC_allocated). If that reference
 12.1146 +         * disappears then the deallocation routine will safely spin.
 12.1147 +         */
 12.1148 +        nd = page_get_owner(page);
 12.1149 +        y  = page->count_info;
 12.1150 +        do {
 12.1151 +            x = y;
 12.1152 +            if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != 
 12.1153 +                          (1|PGC_allocated)) ||
 12.1154 +                 unlikely(nd != d) )
 12.1155 +            {
 12.1156 +                MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
 12.1157 +                        " caf=%08x, taf=%08x\n", page_to_pfn(page),
 12.1158 +                        d, d->id, nd, x, page->u.inuse.type_info);
 12.1159 +                spin_unlock(&d->page_alloc_lock);
 12.1160 +                put_domain(e);
 12.1161 +                return 0;
 12.1162 +            }
 12.1163 +            __asm__ __volatile__(
 12.1164 +                LOCK_PREFIX "cmpxchg8b %2"
 12.1165 +                : "=d" (nd), "=a" (y),
 12.1166 +                "=m" (*(volatile u64 *)(&page->count_info))
 12.1167 +                : "0" (d), "1" (x), "c" (NULL), "b" (x) );
 12.1168 +        } 
 12.1169 +        while ( unlikely(nd != d) || unlikely(y != x) );
 12.1170 +
 12.1171 +        /*
 12.1172 +         * Unlink from 'd'. At least one reference remains (now anonymous), so
12.1173 +         * no one else is spinning to try to delete this page from 'd'.
 12.1174 +         */
 12.1175 +        d->tot_pages--;
 12.1176 +        list_del(&page->list);
 12.1177 +        
 12.1178 +        spin_unlock(&d->page_alloc_lock);
 12.1179 +
 12.1180 +        spin_lock(&e->page_alloc_lock);
 12.1181 +
 12.1182 +        /*
 12.1183 +         * Check that 'e' will accept the page and has reservation headroom.
 12.1184 +         * Also, a domain mustn't have PGC_allocated pages when it is dying.
 12.1185 +         */
 12.1186 +        ASSERT(e->tot_pages <= e->max_pages);
 12.1187 +        if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
 12.1188 +             unlikely(e->tot_pages == e->max_pages) ||
 12.1189 +             unlikely(!gnttab_prepare_for_transfer(e, d, gntref)) )
 12.1190 +        {
 12.1191 +            MEM_LOG("Transferee has no reservation headroom (%d,%d), or "
 12.1192 +                    "provided a bad grant ref, or is dying (%08lx).\n",
 12.1193 +                    e->tot_pages, e->max_pages, e->d_flags);
 12.1194 +            spin_unlock(&e->page_alloc_lock);
 12.1195 +            put_domain(e);
 12.1196 +            okay = 0;
 12.1197 +            break;
 12.1198 +        }
 12.1199 +
 12.1200 +        /* Okay, add the page to 'e'. */
 12.1201 +        if ( unlikely(e->tot_pages++ == 0) )
 12.1202 +            get_knownalive_domain(e);
 12.1203 +        list_add_tail(&page->list, &e->page_list);
 12.1204 +        page_set_owner(page, e);
 12.1205 +
 12.1206 +        spin_unlock(&e->page_alloc_lock);
 12.1207 +
 12.1208 +        /* Transfer is all done: tell the guest about its new page frame. */
 12.1209 +        gnttab_notify_transfer(e, gntref, pfn);
 12.1210 +        
 12.1211 +        put_domain(e);
 12.1212 +        break;
 12.1213 +
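
    The cmpxchg8b loop above, and the near-identical one in MMUEXT_REASSIGN_PAGE
    below, swap the page's 32-bit count_info and the adjacent 32-bit owner field
    in a single 64-bit compare-and-exchange, so ownership only changes hands while
    the count is still exactly (1 | PGC_allocated). The sketch below is a hedged,
    portable restatement (GCC builtins, made-up struct), assuming only that the two
    fields are adjacent as the inline asm requires; it is illustrative, not part of
    the patch. For TRANSFER_PAGE the new owner is NULL (ownership released); for
    REASSIGN_PAGE it is the foreign domain 'e'.

    #include <stdbool.h>
    #include <stdint.h>

    /* Stand-in for the (count_info, owner) pair at the start of pfn_info. */
    struct pg_head { uint32_t count_info; uint32_t owner; };

    /* Atomically hand the page from owner 'd' to 'e', but only while the
     * count is still exactly 'expected' (i.e. 1 | PGC_allocated).        */
    static bool swap_owner(struct pg_head *pg, uint32_t d, uint32_t e,
                           uint32_t expected)
    {
        union { struct pg_head h; uint64_t u; } oldv, newv;
        oldv.h.count_info = expected;  oldv.h.owner = d;
        newv.h.count_info = expected;  newv.h.owner = e;
        return __sync_bool_compare_and_swap((uint64_t *)pg, oldv.u, newv.u);
    }
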
 12.1214 +    case MMUEXT_REASSIGN_PAGE:
 12.1215 +        if ( unlikely(!IS_PRIV(d)) )
 12.1216 +        {
 12.1217 +            MEM_LOG("Dom %u has no reassignment priv", d->id);
 12.1218 +            okay = 0;
 12.1219 +            break;
 12.1220 +        }
 12.1221 +
 12.1222 +        e = percpu_info[cpu].foreign;
 12.1223 +        if ( unlikely(e == NULL) )
 12.1224 +        {
 12.1225 +            MEM_LOG("No FOREIGNDOM to reassign pfn %08lx to", pfn);
 12.1226 +            okay = 0;
 12.1227 +            break;
 12.1228 +        }
 12.1229 +
 12.1230 +        /*
 12.1231 +         * Grab both page_list locks, in order. This prevents the page from
 12.1232 +         * disappearing elsewhere while we modify the owner, and we'll need
 12.1233 +         * both locks if we're successful so that we can change lists.
 12.1234 +         */
 12.1235 +        if ( d < e )
 12.1236 +        {
 12.1237 +            spin_lock(&d->page_alloc_lock);
 12.1238 +            spin_lock(&e->page_alloc_lock);
 12.1239 +        }
 12.1240 +        else
 12.1241 +        {
 12.1242 +            spin_lock(&e->page_alloc_lock);
 12.1243 +            spin_lock(&d->page_alloc_lock);
 12.1244 +        }
 12.1245 +
 12.1246 +        /* A domain shouldn't have PGC_allocated pages when it is dying. */
 12.1247 +        if ( unlikely(test_bit(DF_DYING, &e->d_flags)) ||
 12.1248 +             unlikely(IS_XEN_HEAP_FRAME(page)) )
 12.1249 +        {
 12.1250 +            MEM_LOG("Reassignment page is Xen heap, or dest dom is dying.");
 12.1251 +            okay = 0;
 12.1252 +            goto reassign_fail;
 12.1253 +        }
 12.1254 +
 12.1255 +        /*
 12.1256 +         * The tricky bit: atomically change owner while there is just one
 12.1257 +         * benign reference to the page (PGC_allocated). If that reference
 12.1258 +         * disappears then the deallocation routine will safely spin.
 12.1259 +         */
 12.1260 +        nd = page_get_owner(page);
 12.1261 +        y  = page->count_info;
 12.1262 +        do {
 12.1263 +            x = y;
 12.1264 +            if ( unlikely((x & (PGC_count_mask|PGC_allocated)) != 
 12.1265 +                          (1|PGC_allocated)) ||
 12.1266 +                 unlikely(nd != d) )
 12.1267 +            {
 12.1268 +                MEM_LOG("Bad page values %08lx: ed=%p(%u), sd=%p,"
 12.1269 +                        " caf=%08x, taf=%08x\n", page_to_pfn(page),
 12.1270 +                        d, d->id, nd, x, page->u.inuse.type_info);
 12.1271 +                okay = 0;
 12.1272 +                goto reassign_fail;
 12.1273 +            }
 12.1274 +            __asm__ __volatile__(
 12.1275 +                LOCK_PREFIX "cmpxchg8b %3"
 12.1276 +                : "=d" (nd), "=a" (y), "=c" (e),
 12.1277 +                "=m" (*(volatile u64 *)(&page->count_info))
 12.1278 +                : "0" (d), "1" (x), "c" (e), "b" (x) );
 12.1279 +        } 
 12.1280 +        while ( unlikely(nd != d) || unlikely(y != x) );
 12.1281 +        
 12.1282 +        /*
 12.1283 +         * Unlink from 'd'. We transferred at least one reference to 'e', so
12.1284 +         * no one else is spinning to try to delete this page from 'd'.
 12.1285 +         */
 12.1286 +        d->tot_pages--;
 12.1287 +        list_del(&page->list);
 12.1288 +        
 12.1289 +        /*
 12.1290 +         * Add the page to 'e'. Someone may already have removed the last
 12.1291 +         * reference and want to remove the page from 'e'. However, we have
 12.1292 +         * the lock so they'll spin waiting for us.
 12.1293 +         */
 12.1294 +        if ( unlikely(e->tot_pages++ == 0) )
 12.1295 +            get_knownalive_domain(e);
 12.1296 +        list_add_tail(&page->list, &e->page_list);
 12.1297 +
 12.1298 +    reassign_fail:        
 12.1299 +        spin_unlock(&d->page_alloc_lock);
 12.1300 +        spin_unlock(&e->page_alloc_lock);
 12.1301 +        break;
 12.1302 +
 12.1303 +    case MMUEXT_CLEAR_FOREIGNDOM:
 12.1304 +        if ( (e = percpu_info[cpu].foreign) != NULL )
 12.1305 +            put_domain(e);
 12.1306 +        percpu_info[cpu].foreign = NULL;
 12.1307 +        break;
 12.1308 +
 12.1309 +    default:
 12.1310 +        MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
 12.1311 +        okay = 0;
 12.1312 +        break;
 12.1313 +    }
 12.1314 +
 12.1315 +    return okay;
 12.1316 +}
 12.1317 +
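
    In MMUEXT_TRANSFER_PAGE the 16-bit grant reference is split across the two
    hypercall words: its high byte travels in bits 15:8 of 'val', its low byte in
    bits 9:2 of 'ptr', with the target domid in bits 31:16 of 'val', exactly as
    decoded above. A standalone sketch of the packing and unpacking, using
    hypothetical values and no Xen headers (illustrative only):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint16_t domid = 3, gntref = 0x1234;      /* hypothetical inputs     */
        uint32_t val = 0, ptr = 0;

        val |= (uint32_t)domid << 16;             /* domid in bits 31:16     */
        val |= gntref & 0xFF00;                   /* gntref high byte        */
        ptr |= (uint32_t)(gntref & 0x00FF) << 2;  /* gntref low byte, 9:2    */

        /* Decode exactly as the MMUEXT_TRANSFER_PAGE case does. */
        uint16_t d2 = (uint16_t)(val >> 16);
        uint16_t g2 = (uint16_t)((val & 0xFF00) | ((ptr >> 2) & 0x00FF));
        printf("domid=%u gntref=%#x\n", d2, g2);  /* domid=3 gntref=0x1234   */
        return 0;
    }
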
 12.1318 +int do_mmu_update(
 12.1319 +    mmu_update_t *ureqs, unsigned int count, unsigned int *pdone)
 12.1320 +{
 12.1321 +/*
 12.1322 + * We steal the m.s.b. of the @count parameter to indicate whether this
 12.1323 + * invocation of do_mmu_update() is resuming a previously preempted call.
 12.1324 + * We steal the next 15 bits to remember the current FOREIGNDOM.
 12.1325 + */
 12.1326 +#define MMU_UPDATE_PREEMPTED          (~(~0U>>1))
 12.1327 +#define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int)*8)-16)
 12.1328 +#define MMU_UPDATE_PREEMPT_FDOM_MASK  (0x7FFFU<<MMU_UPDATE_PREEMPT_FDOM_SHIFT)
 12.1329 +
 12.1330 +    mmu_update_t req;
 12.1331 +    unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
 12.1332 +    struct pfn_info *page;
 12.1333 +    int rc = 0, okay = 1, i = 0, cpu = smp_processor_id();
 12.1334 +    unsigned int cmd, done = 0;
 12.1335 +    unsigned long prev_smfn = 0;
 12.1336 +    l1_pgentry_t *prev_spl1e = 0;
 12.1337 +    struct exec_domain *ed = current;
 12.1338 +    struct domain *d = ed->domain;
 12.1339 +    u32 type_info;
 12.1340 +    domid_t domid;
 12.1341 +
 12.1342 +    LOCK_BIGLOCK(d);
 12.1343 +
 12.1344 +    cleanup_writable_pagetable(d);
 12.1345 +
 12.1346 +    if ( unlikely(shadow_mode(d)) )
 12.1347 +        check_pagetable(d, ed->arch.pagetable, "pre-mmu"); /* debug */
 12.1348 +
 12.1349 +    /*
 12.1350 +     * If we are resuming after preemption, read how much work we have already
 12.1351 +     * done. This allows us to set the @done output parameter correctly.
 12.1352 +     * We also reset FOREIGNDOM here.
 12.1353 +     */
 12.1354 +    if ( unlikely(count&(MMU_UPDATE_PREEMPTED|MMU_UPDATE_PREEMPT_FDOM_MASK)) )
 12.1355 +    {
 12.1356 +        if ( !(count & MMU_UPDATE_PREEMPTED) )
 12.1357 +        {
 12.1358 +            /* Count overflow into private FOREIGNDOM field. */
 12.1359 +            MEM_LOG("do_mmu_update count is too large");
 12.1360 +            rc = -EINVAL;
 12.1361 +            goto out;
 12.1362 +        }
 12.1363 +        count &= ~MMU_UPDATE_PREEMPTED;
 12.1364 +        domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
 12.1365 +        count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;
 12.1366 +        if ( unlikely(pdone != NULL) )
 12.1367 +            (void)get_user(done, pdone);
 12.1368 +        if ( (domid != current->domain->id) &&
 12.1369 +             !do_extended_command(0, MMUEXT_SET_FOREIGNDOM | (domid << 16)) )
 12.1370 +        {
 12.1371 +            rc = -EINVAL;
 12.1372 +            goto out;
 12.1373 +        }
 12.1374 +    }
 12.1375 +
 12.1376 +    perfc_incrc(calls_to_mmu_update); 
 12.1377 +    perfc_addc(num_page_updates, count);
 12.1378 +
 12.1379 +    if ( unlikely(!array_access_ok(VERIFY_READ, ureqs, count, sizeof(req))) )
 12.1380 +    {
 12.1381 +        rc = -EFAULT;
 12.1382 +        goto out;
 12.1383 +    }
 12.1384 +
 12.1385 +    for ( i = 0; i < count; i++ )
 12.1386 +    {
 12.1387 +        if ( hypercall_preempt_check() )
 12.1388 +        {
 12.1389 +            rc = hypercall3_create_continuation(
 12.1390 +                __HYPERVISOR_mmu_update, ureqs, 
 12.1391 +                (count - i) |
 12.1392 +                (FOREIGNDOM->id << MMU_UPDATE_PREEMPT_FDOM_SHIFT) | 
 12.1393 +                MMU_UPDATE_PREEMPTED, pdone);
 12.1394 +            break;
 12.1395 +        }
 12.1396 +
 12.1397 +        if ( unlikely(__copy_from_user(&req, ureqs, sizeof(req)) != 0) )
 12.1398 +        {
 12.1399 +            MEM_LOG("Bad __copy_from_user");
 12.1400 +            rc = -EFAULT;
 12.1401 +            break;
 12.1402 +        }
 12.1403 +
 12.1404 +        cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
 12.1405 +        pfn = req.ptr >> PAGE_SHIFT;
 12.1406 +
 12.1407 +        okay = 0;
 12.1408 +
 12.1409 +        switch ( cmd )
 12.1410 +        {
 12.1411 +            /*
 12.1412 +             * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
 12.1413 +             */
 12.1414 +        case MMU_NORMAL_PT_UPDATE:
 12.1415 +            if ( unlikely(!get_page_from_pagenr(pfn, current->domain)) )
 12.1416 +            {
 12.1417 +                MEM_LOG("Could not get page for normal update");
 12.1418 +                break;
 12.1419 +            }
 12.1420 +
 12.1421 +            if ( likely(prev_pfn == pfn) )
 12.1422 +            {
 12.1423 +                va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
 12.1424 +            }
 12.1425 +            else
 12.1426 +            {
 12.1427 +                if ( prev_pfn != 0 )
 12.1428 +                    unmap_domain_mem((void *)va);
 12.1429 +                va = (unsigned long)map_domain_mem(req.ptr);
 12.1430 +                prev_pfn = pfn;
 12.1431 +            }
 12.1432 +
 12.1433 +            page = &frame_table[pfn];
 12.1434 +            switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
 12.1435 +            {
 12.1436 +            case PGT_l1_page_table: 
 12.1437 +                if ( likely(get_page_type(
 12.1438 +                    page, type_info & (PGT_type_mask|PGT_va_mask))) )
 12.1439 +                {
 12.1440 +                    okay = mod_l1_entry((l1_pgentry_t *)va, 
 12.1441 +                                        mk_l1_pgentry(req.val)); 
 12.1442 +
 12.1443 +                    if ( unlikely(shadow_mode(d)) && okay &&
 12.1444 +                         (get_shadow_status(d, page-frame_table) &
 12.1445 +                          PSH_shadowed) )
 12.1446 +                    {
 12.1447 +                        shadow_l1_normal_pt_update(
 12.1448 +                            req.ptr, req.val, &prev_smfn, &prev_spl1e);
 12.1449 +                        put_shadow_status(d);
 12.1450 +                    }
 12.1451 +
 12.1452 +                    put_page_type(page);
 12.1453 +                }
 12.1454 +                break;
 12.1455 +            case PGT_l2_page_table:
 12.1456 +                if ( likely(get_page_type(page, PGT_l2_page_table)) )
 12.1457 +                {
 12.1458 +                    okay = mod_l2_entry((l2_pgentry_t *)va, 
 12.1459 +                                        mk_l2_pgentry(req.val),
 12.1460 +                                        pfn); 
 12.1461 +
 12.1462 +                    if ( unlikely(shadow_mode(d)) && okay &&
 12.1463 +                         (get_shadow_status(d, page-frame_table) & 
 12.1464 +                          PSH_shadowed) )
 12.1465 +                    {
 12.1466 +                        shadow_l2_normal_pt_update(req.ptr, req.val);
 12.1467 +                        put_shadow_status(d);
 12.1468 +                    }
 12.1469 +
 12.1470 +                    put_page_type(page);
 12.1471 +                }
 12.1472 +                break;
 12.1473 +            default:
 12.1474 +                if ( likely(get_page_type(page, PGT_writable_page)) )
 12.1475 +                {
 12.1476 +                    *(unsigned long *)va = req.val;
 12.1477 +                    okay = 1;
 12.1478 +                    put_page_type(page);
 12.1479 +                }
 12.1480 +                break;
 12.1481 +            }
 12.1482 +
 12.1483 +            put_page(page);
 12.1484 +            break;
 12.1485 +
 12.1486 +        case MMU_MACHPHYS_UPDATE:
 12.1487 +            if ( unlikely(!get_page_from_pagenr(pfn, FOREIGNDOM)) )
 12.1488 +            {
 12.1489 +                MEM_LOG("Could not get page for mach->phys update");
 12.1490 +                break;
 12.1491 +            }
 12.1492 +
 12.1493 +            machine_to_phys_mapping[pfn] = req.val;
 12.1494 +            okay = 1;
 12.1495 +
 12.1496 +            /*
 12.1497 +             * If in log-dirty mode, mark the corresponding pseudo-physical
 12.1498 +             * page as dirty.
 12.1499 +             */
 12.1500 +            if ( unlikely(shadow_mode(d) == SHM_logdirty) && 
 12.1501 +                 mark_dirty(d, pfn) )
 12.1502 +                d->arch.shadow_dirty_block_count++;
 12.1503 +
 12.1504 +            put_page(&frame_table[pfn]);
 12.1505 +            break;
 12.1506 +
 12.1507 +            /*
 12.1508 +             * MMU_EXTENDED_COMMAND: Extended command is specified
12.1509 +             * in the least-significant bits of the 'value' field.
 12.1510 +             */
 12.1511 +        case MMU_EXTENDED_COMMAND:
 12.1512 +            req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
 12.1513 +            okay = do_extended_command(req.ptr, req.val);
 12.1514 +            break;
 12.1515 +
 12.1516 +        default:
 12.1517 +            MEM_LOG("Invalid page update command %08lx", req.ptr);
 12.1518 +            break;
 12.1519 +        }
 12.1520 +
 12.1521 +        if ( unlikely(!okay) )
 12.1522 +        {
 12.1523 +            rc = -EINVAL;
 12.1524 +            break;
 12.1525 +        }
 12.1526 +
 12.1527 +        ureqs++;
 12.1528 +    }
 12.1529 +
 12.1530 + out:
 12.1531 +    if ( prev_pfn != 0 )
 12.1532 +        unmap_domain_mem((void *)va);
 12.1533 +
 12.1534 +    if ( unlikely(prev_spl1e != 0) ) 
 12.1535 +        unmap_domain_mem((void *)prev_spl1e);
 12.1536 +
 12.1537 +    deferred_ops = percpu_info[cpu].deferred_ops;
 12.1538 +    percpu_info[cpu].deferred_ops = 0;
 12.1539 +
 12.1540 +    if ( deferred_ops & DOP_FLUSH_TLB )
 12.1541 +        local_flush_tlb();
 12.1542 +        
 12.1543 +    if ( deferred_ops & DOP_RELOAD_LDT )
 12.1544 +        (void)map_ldt_shadow_page(0);
 12.1545 +
 12.1546 +    if ( unlikely(percpu_info[cpu].foreign != NULL) )
 12.1547 +    {
 12.1548 +        put_domain(percpu_info[cpu].foreign);
 12.1549 +        percpu_info[cpu].foreign = NULL;
 12.1550 +    }
 12.1551 +
 12.1552 +    /* Add incremental work we have done to the @done output parameter. */
 12.1553 +    if ( unlikely(pdone != NULL) )
 12.1554 +        __put_user(done + i, pdone);
 12.1555 +
 12.1556 +    if ( unlikely(shadow_mode(d)) )
 12.1557 +        check_pagetable(d, ed->arch.pagetable, "post-mmu"); /* debug */
 12.1558 +
 12.1559 +    UNLOCK_BIGLOCK(d);
 12.1560 +    return rc;
 12.1561 +}
 12.1562 +
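
    The resume encoding defined at the top of do_mmu_update() packs three things
    into the one 'count' word: the remaining request count in the low 16 bits, the
    current FOREIGNDOM in bits 30:16, and the preempted flag in bit 31. A
    standalone illustration of the round trip, mirroring those #defines for a
    32-bit unsigned int (values are hypothetical):

    #include <assert.h>
    #include <stdio.h>

    #define MMU_UPDATE_PREEMPTED          (~(~0U >> 1))            /* bit 31 */
    #define MMU_UPDATE_PREEMPT_FDOM_SHIFT ((sizeof(int) * 8) - 16) /* 16     */
    #define MMU_UPDATE_PREEMPT_FDOM_MASK  (0x7FFFU << MMU_UPDATE_PREEMPT_FDOM_SHIFT)

    int main(void)
    {
        unsigned int remaining = 400, fdom = 5;   /* hypothetical values     */

        /* Encode, as the hypercall continuation above does... */
        unsigned int count = remaining | MMU_UPDATE_PREEMPTED |
            (fdom << MMU_UPDATE_PREEMPT_FDOM_SHIFT);

        /* ...and decode, as the resume path at the top of do_mmu_update(). */
        assert(count & MMU_UPDATE_PREEMPTED);
        count &= ~MMU_UPDATE_PREEMPTED;
        unsigned int domid = count >> MMU_UPDATE_PREEMPT_FDOM_SHIFT;
        count &= ~MMU_UPDATE_PREEMPT_FDOM_MASK;

        printf("domid=%u remaining=%u\n", domid, count); /* 5, 400           */
        return 0;
    }
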
 12.1563 +
 12.1564 +int do_update_va_mapping(unsigned long va,
 12.1565 +                         unsigned long val, 
 12.1566 +                         unsigned long flags)
 12.1567 +{
 12.1568 +    struct exec_domain *ed = current;
 12.1569 +    struct domain *d = ed->domain;
 12.1570 +    int err = 0;
 12.1571 +    unsigned int cpu = ed->processor;
 12.1572 +    unsigned long deferred_ops;
 12.1573 +
 12.1574 +    perfc_incrc(calls_to_update_va);
 12.1575 +
 12.1576 +    if ( unlikely(!__addr_ok(va)) )
 12.1577 +        return -EINVAL;
 12.1578 +
 12.1579 +    LOCK_BIGLOCK(d);
 12.1580 +
 12.1581 +    cleanup_writable_pagetable(d);
 12.1582 +
 12.1583 +    /*
 12.1584 +     * XXX When we make this support 4MB superpages we should also deal with 
 12.1585 +     * the case of updating L2 entries.
 12.1586 +     */
 12.1587 +
 12.1588 +    if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
 12.1589 +                                mk_l1_pgentry(val))) )
 12.1590 +        err = -EINVAL;
 12.1591 +
 12.1592 +    if ( unlikely(shadow_mode(d)) )
 12.1593 +    {
 12.1594 +        unsigned long sval = 0;
 12.1595 +
 12.1596 +        l1pte_propagate_from_guest(d, &val, &sval);
 12.1597 +
 12.1598 +        if ( unlikely(__put_user(sval, ((unsigned long *)(
 12.1599 +            &shadow_linear_pg_table[l1_linear_offset(va)])))) )
 12.1600 +        {
 12.1601 +            /*
12.1602 +             * Since L2s are guaranteed RW, failure indicates the page was not
 12.1603 +             * shadowed, so ignore.
 12.1604 +             */
 12.1605 +            perfc_incrc(shadow_update_va_fail);
 12.1606 +        }
 12.1607 +
 12.1608 +        /*
 12.1609 +         * If we're in log-dirty mode then we need to note that we've updated
 12.1610 +         * the PTE in the PT-holding page. We need the machine frame number
 12.1611 +         * for this.
 12.1612 +         */
 12.1613 +        if ( shadow_mode(d) == SHM_logdirty )
 12.1614 +            mark_dirty(d, va_to_l1mfn(va));
 12.1615 +  
 12.1616 +        check_pagetable(d, ed->arch.pagetable, "va"); /* debug */
 12.1617 +    }
 12.1618 +
 12.1619 +    deferred_ops = percpu_info[cpu].deferred_ops;
 12.1620 +    percpu_info[cpu].deferred_ops = 0;
 12.1621 +
 12.1622 +    if ( unlikely(deferred_ops & DOP_FLUSH_TLB) || 
 12.1623 +         unlikely(flags & UVMF_FLUSH_TLB) )
 12.1624 +        local_flush_tlb();
 12.1625 +    else if ( unlikely(flags & UVMF_INVLPG) )
 12.1626 +        __flush_tlb_one(va);
 12.1627 +
 12.1628 +    if ( unlikely(deferred_ops & DOP_RELOAD_LDT) )
 12.1629 +        (void)map_ldt_shadow_page(0);
 12.1630 +    
 12.1631 +    UNLOCK_BIGLOCK(d);
 12.1632 +
 12.1633 +    return err;
 12.1634 +}
 12.1635 +
 12.1636 +int do_update_va_mapping_otherdomain(unsigned long va,
 12.1637 +                                     unsigned long val, 
 12.1638 +                                     unsigned long flags,
 12.1639 +                                     domid_t domid)
 12.1640 +{
 12.1641 +    unsigned int cpu = smp_processor_id();
 12.1642 +    struct domain *d;
 12.1643 +    int rc;
 12.1644 +
 12.1645 +    if ( unlikely(!IS_PRIV(current->domain)) )
 12.1646 +        return -EPERM;
 12.1647 +
 12.1648 +    percpu_info[cpu].foreign = d = find_domain_by_id(domid);
 12.1649 +    if ( unlikely(d == NULL) )
 12.1650 +    {
 12.1651 +        MEM_LOG("Unknown domain '%u'", domid);
 12.1652 +        return -ESRCH;
 12.1653 +    }
 12.1654 +
 12.1655 +    rc = do_update_va_mapping(va, val, flags);
 12.1656 +
 12.1657 +    put_domain(d);
 12.1658 +    percpu_info[cpu].foreign = NULL;
 12.1659 +
 12.1660 +    return rc;
 12.1661 +}
 12.1662 +
 12.1663 +
 12.1664 +
 12.1665 +/*************************
 12.1666 + * Descriptor Tables
 12.1667 + */
 12.1668 +
 12.1669 +void destroy_gdt(struct exec_domain *ed)
 12.1670 +{
 12.1671 +    int i;
 12.1672 +    unsigned long pfn;
 12.1673 +
 12.1674 +    for ( i = 0; i < 16; i++ )
 12.1675 +    {
 12.1676 +        if ( (pfn = l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[i])) != 0 )
 12.1677 +            put_page_and_type(&frame_table[pfn]);
 12.1678 +        ed->arch.perdomain_ptes[i] = mk_l1_pgentry(0);
 12.1679 +    }
 12.1680 +}
 12.1681 +
 12.1682 +
 12.1683 +long set_gdt(struct exec_domain *ed, 
 12.1684 +             unsigned long *frames,
 12.1685 +             unsigned int entries)
 12.1686 +{
 12.1687 +    struct domain *d = ed->domain;
 12.1688 +    /* NB. There are 512 8-byte entries per GDT page. */
 12.1689 +    int i = 0, nr_pages = (entries + 511) / 512;
 12.1690 +    struct desc_struct *vgdt;
 12.1691 +    unsigned long pfn;
 12.1692 +
 12.1693 +    /* Check the first page in the new GDT. */
 12.1694 +    if ( (pfn = frames[0]) >= max_page )
 12.1695 +        goto fail;
 12.1696 +
 12.1697 +    /* The first page is special because Xen owns a range of entries in it. */
 12.1698 +    if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
 12.1699 +    {
 12.1700 +        /* GDT checks failed: try zapping the Xen reserved entries. */
 12.1701 +        if ( !get_page_and_type(&frame_table[pfn], d, PGT_writable_page) )
 12.1702 +            goto fail;
 12.1703 +        vgdt = map_domain_mem(pfn << PAGE_SHIFT);
 12.1704 +        memset(vgdt + FIRST_RESERVED_GDT_ENTRY, 0,
 12.1705 +               NR_RESERVED_GDT_ENTRIES*8);
 12.1706 +        unmap_domain_mem(vgdt);
 12.1707 +        put_page_and_type(&frame_table[pfn]);
 12.1708 +
 12.1709 +        /* Okay, we zapped the entries. Now try the GDT checks again. */
 12.1710 +        if ( !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
 12.1711 +            goto fail;
 12.1712 +    }
 12.1713 +
 12.1714 +    /* Check the remaining pages in the new GDT. */
 12.1715 +    for ( i = 1; i < nr_pages; i++ )
 12.1716 +        if ( ((pfn = frames[i]) >= max_page) ||
 12.1717 +             !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) )
 12.1718 +            goto fail;
 12.1719 +
 12.1720 +    /* Copy reserved GDT entries to the new GDT. */
 12.1721 +    vgdt = map_domain_mem(frames[0] << PAGE_SHIFT);
 12.1722 +    memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY, 
 12.1723 +           gdt_table + FIRST_RESERVED_GDT_ENTRY, 
 12.1724 +           NR_RESERVED_GDT_ENTRIES*8);
 12.1725 +    unmap_domain_mem(vgdt);
 12.1726 +
 12.1727 +    /* Tear down the old GDT. */
 12.1728 +    destroy_gdt(ed);
 12.1729 +
 12.1730 +    /* Install the new GDT. */
 12.1731 +    for ( i = 0; i < nr_pages; i++ )
 12.1732 +        ed->arch.perdomain_ptes[i] =
 12.1733 +            mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR);
 12.1734 +
 12.1735 +    SET_GDT_ADDRESS(ed, GDT_VIRT_START(ed));
 12.1736 +    SET_GDT_ENTRIES(ed, entries);
 12.1737 +
 12.1738 +    return 0;
 12.1739 +
 12.1740 + fail:
 12.1741 +    while ( i-- > 0 )
 12.1742 +        put_page_and_type(&frame_table[frames[i]]);
 12.1743 +    return -EINVAL;
 12.1744 +}
 12.1745 +
 12.1746 +
 12.1747 +long do_set_gdt(unsigned long *frame_list, unsigned int entries)
 12.1748 +{
 12.1749 +    int nr_pages = (entries + 511) / 512;
 12.1750 +    unsigned long frames[16];
 12.1751 +    long ret;
 12.1752 +
 12.1753 +    if ( (entries <= LAST_RESERVED_GDT_ENTRY) || (entries > 8192) ) 
 12.1754 +        return -EINVAL;
 12.1755 +    
 12.1756 +    if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
 12.1757 +        return -EFAULT;
 12.1758 +
 12.1759 +    LOCK_BIGLOCK(current->domain);
 12.1760 +
 12.1761 +    if ( (ret = set_gdt(current, frames, entries)) == 0 )
 12.1762 +    {
 12.1763 +        local_flush_tlb();
 12.1764 +        __asm__ __volatile__ ("lgdt %0" : "=m" (*current->arch.gdt));
 12.1765 +    }
 12.1766 +
 12.1767 +    UNLOCK_BIGLOCK(current->domain);
 12.1768 +
 12.1769 +    return ret;
 12.1770 +}
 12.1771 +
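
    set_gdt()/do_set_gdt() rely on there being 512 eight-byte descriptors per
    4 KiB page, so the 8192-entry ceiling enforced above can never need more than
    the 16 frames provided for in do_set_gdt(). A small sanity check of that
    arithmetic (illustrative only):

    #include <stdio.h>

    int main(void)
    {
        for (unsigned int entries = 512; entries <= 8192; entries += 512) {
            int nr_pages = (entries + 511) / 512;     /* as in set_gdt()     */
            printf("entries=%4u -> nr_pages=%2d\n", entries, nr_pages);
        }
        return 0;                                     /* tops out at 16      */
    }
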
 12.1772 +
 12.1773 +long do_update_descriptor(
 12.1774 +    unsigned long pa, unsigned long word1, unsigned long word2)
 12.1775 +{
 12.1776 +    unsigned long pfn = pa >> PAGE_SHIFT;
 12.1777 +    struct desc_struct *gdt_pent, d;
 12.1778 +    struct pfn_info *page;
 12.1779 +    struct exec_domain *ed;
 12.1780 +    long ret = -EINVAL;
 12.1781 +
 12.1782 +    d.a = (u32)word1;
 12.1783 +    d.b = (u32)word2;
 12.1784 +
 12.1785 +    LOCK_BIGLOCK(current->domain);
 12.1786 +
 12.1787 +    if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(&d) ) {
 12.1788 +        UNLOCK_BIGLOCK(current->domain);
 12.1789 +        return -EINVAL;
 12.1790 +    }
 12.1791 +
 12.1792 +    page = &frame_table[pfn];
 12.1793 +    if ( unlikely(!get_page(page, current->domain)) ) {
 12.1794 +        UNLOCK_BIGLOCK(current->domain);
 12.1795 +        return -EINVAL;
 12.1796 +    }
 12.1797 +
 12.1798 +    /* Check if the given frame is in use in an unsafe context. */
 12.1799 +    switch ( page->u.inuse.type_info & PGT_type_mask )
 12.1800 +    {
 12.1801 +    case PGT_gdt_page:
 12.1802 +        /* Disallow updates of Xen-reserved descriptors in the current GDT. */
 12.1803 +        for_each_exec_domain(current->domain, ed) {
 12.1804 +            if ( (l1_pgentry_to_pagenr(ed->arch.perdomain_ptes[0]) == pfn) &&
 12.1805 +                 (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) &&
 12.1806 +                 (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) )
 12.1807 +                goto out;
 12.1808 +        }
 12.1809 +        if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
 12.1810 +            goto out;
 12.1811 +        break;
 12.1812 +    case PGT_ldt_page:
 12.1813 +        if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
 12.1814 +            goto out;
 12.1815 +        break;
 12.1816 +    default:
 12.1817 +        if ( unlikely(!get_page_type(page, PGT_writable_page)) )
 12.1818 +            goto out;
 12.1819 +        break;
 12.1820 +    }
 12.1821 +
 12.1822 +    /* All is good so make the update. */
 12.1823 +    gdt_pent = map_domain_mem(pa);
 12.1824 +    memcpy(gdt_pent, &d, 8);
 12.1825 +    unmap_domain_mem(gdt_pent);
 12.1826 +
 12.1827 +    put_page_type(page);
 12.1828 +
 12.1829 +    ret = 0; /* success */
 12.1830 +
 12.1831 + out:
 12.1832 +    put_page(page);
 12.1833 +
 12.1834 +    UNLOCK_BIGLOCK(current->domain);
 12.1835 +
 12.1836 +    return ret;
 12.1837 +}
 12.1838 +
 12.1839 +
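
    do_update_descriptor() takes the descriptor pre-split into its two 32-bit
    halves. As a hedged example of what a guest might pass, a flat 4 GiB, ring-1,
    readable code segment (base 0, limit 0xFFFFF, granularity and 32-bit flags
    set) should encode as the pair below; the destination machine address of the
    GDT/LDT slot is elided, since it depends on the guest's layout.

    #include <stdio.h>

    int main(void)
    {
        /* word1 = base[15:0]<<16 | limit[15:0]; word2 = the upper dword.   */
        unsigned long word1 = 0x0000FFFFUL;  /* base 0, limit low 16 bits   */
        unsigned long word2 = 0x00CFBA00UL;  /* G=1 D=1 lim[19:16]=F P=1 DPL=1 code */
        printf("descriptor words: %08lx %08lx\n", word2, word1);
        return 0;
    }
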
 12.1840 +
 12.1841 +/*************************
 12.1842 + * Writable Pagetables
 12.1843 + */
 12.1844 +
 12.1845 +ptwr_info_t ptwr_info[NR_CPUS];
 12.1846 +
 12.1847 +#ifdef VERBOSE
 12.1848 +int ptwr_debug = 0x0;
 12.1849 +#define PTWR_PRINTK(_f, _a...) \
 12.1850 + do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 )
 12.1851 +#define PTWR_PRINT_WHICH (which ? 'I' : 'A')
 12.1852 +#else
 12.1853 +#define PTWR_PRINTK(_f, _a...) ((void)0)
 12.1854 +#endif
 12.1855 +
 12.1856 +/* Flush the given writable p.t. page and write-protect it again. */
 12.1857 +void ptwr_flush(const int which)
 12.1858 +{
 12.1859 +    unsigned long  sstat, spte, pte, *ptep, l1va;
 12.1860 +    l1_pgentry_t  *sl1e = NULL, *pl1e, ol1e, nl1e;
 12.1861 +    l2_pgentry_t  *pl2e;
 12.1862 +    int            i, cpu = smp_processor_id();
 12.1863 +    struct exec_domain *ed = current;
 12.1864 +    struct domain *d = ed->domain;
 12.1865 +
 12.1866 +    l1va = ptwr_info[cpu].ptinfo[which].l1va;
 12.1867 +    ptep = (unsigned long *)&linear_pg_table[l1va>>PAGE_SHIFT];
 12.1868 +
 12.1869 +    /*
 12.1870 +     * STEP 1. Write-protect the p.t. page so no more updates can occur.
 12.1871 +     */
 12.1872 +
 12.1873 +    if ( unlikely(__get_user(pte, ptep)) )
 12.1874 +    {
 12.1875 +        MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
 12.1876 +        /*
 12.1877 +         * Really a bug. We could read this PTE during the initial fault,
 12.1878 +         * and pagetables can't have changed meantime. XXX Multi-CPU guests?
 12.1879 +         */
 12.1880 +        BUG();
 12.1881 +    }
 12.1882 +    PTWR_PRINTK("[%c] disconnected_l1va at %p is %08lx\n",
 12.1883 +                PTWR_PRINT_WHICH, ptep, pte);
 12.1884 +    pte &= ~_PAGE_RW;
 12.1885 +
 12.1886 +    if ( unlikely(shadow_mode(d)) )
 12.1887 +    {
 12.1888 +        /* Write-protect the p.t. page in the shadow page table. */
 12.1889 +        l1pte_propagate_from_guest(d, &pte, &spte);
 12.1890 +        __put_user(
 12.1891 +            spte, (unsigned long *)&shadow_linear_pg_table[l1va>>PAGE_SHIFT]);
 12.1892 +
 12.1893 +        /* Is the p.t. page itself shadowed? Map it into Xen space if so. */
 12.1894 +        sstat = get_shadow_status(d, pte >> PAGE_SHIFT);
 12.1895 +        if ( sstat & PSH_shadowed )
 12.1896 +            sl1e = map_domain_mem((sstat & PSH_pfn_mask) << PAGE_SHIFT);
 12.1897 +    }
 12.1898 +
 12.1899 +    /* Write-protect the p.t. page in the guest page table. */
 12.1900 +    if ( unlikely(__put_user(pte, ptep)) )
 12.1901 +    {
 12.1902 +        MEM_LOG("ptwr: Could not update pte at %p\n", ptep);
 12.1903 +        /*
 12.1904 +         * Really a bug. We could write this PTE during the initial fault,
 12.1905 +         * and pagetables can't have changed meantime. XXX Multi-CPU guests?
 12.1906 +         */
 12.1907 +        BUG();
 12.1908 +    }
 12.1909 +
 12.1910 +    /* Ensure that there are no stale writable mappings in any TLB. */
 12.1911 +    /* NB. INVLPG is a serialising instruction: flushes pending updates. */
 12.1912 +#if 1
 12.1913 +    __flush_tlb_one(l1va); /* XXX Multi-CPU guests? */
 12.1914 +#else
 12.1915 +    flush_tlb_all();
 12.1916 +#endif
 12.1917 +    PTWR_PRINTK("[%c] disconnected_l1va at %p now %08lx\n",
 12.1918 +                PTWR_PRINT_WHICH, ptep, pte);
 12.1919 +
 12.1920 +    /*
 12.1921 +     * STEP 2. Validate any modified PTEs.
 12.1922 +     */
 12.1923 +
 12.1924 +    pl1e = ptwr_info[cpu].ptinfo[which].pl1e;
 12.1925 +    for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
 12.1926 +    {
 12.1927 +        ol1e = ptwr_info[cpu].ptinfo[which].page[i];
 12.1928 +        nl1e = pl1e[i];
 12.1929 +
 12.1930 +        if ( likely(l1_pgentry_val(ol1e) == l1_pgentry_val(nl1e)) )
 12.1931 +            continue;
 12.1932 +
 12.1933 +        /*
 12.1934 +         * Fast path for PTEs that have merely been write-protected
 12.1935 +         * (e.g., during a Unix fork()). A strict reduction in privilege.
 12.1936 +         */
 12.1937 +        if ( likely(l1_pgentry_val(ol1e) == (l1_pgentry_val(nl1e)|_PAGE_RW)) )
 12.1938 +        {
 12.1939 +            if ( likely(l1_pgentry_val(nl1e) & _PAGE_PRESENT) )
 12.1940 +            {
 12.1941 +                if ( unlikely(sl1e != NULL) )
 12.1942 +                    l1pte_propagate_from_guest(
 12.1943 +                        d, &l1_pgentry_val(nl1e), 
 12.1944 +                        &l1_pgentry_val(sl1e[i]));
 12.1945 +                put_page_type(&frame_table[l1_pgentry_to_pagenr(nl1e)]);
 12.1946 +            }
 12.1947 +            continue;
 12.1948 +        }
 12.1949 +
 12.1950 +        if ( unlikely(!get_page_from_l1e(nl1e, d)) )
 12.1951 +        {
 12.1952 +            MEM_LOG("ptwr: Could not re-validate l1 page\n");
 12.1953 +            /*
 12.1954 +             * Make the remaining p.t's consistent before crashing, so the
 12.1955 +             * reference counts are correct.
 12.1956 +             */
 12.1957 +            memcpy(&pl1e[i], &ptwr_info[cpu].ptinfo[which].page[i],
 12.1958 +                   (ENTRIES_PER_L1_PAGETABLE - i) * sizeof(l1_pgentry_t));
 12.1959 +            unmap_domain_mem(pl1e);
 12.1960 +            ptwr_info[cpu].ptinfo[which].l1va = 0;
 12.1961 +            UNLOCK_BIGLOCK(d);
 12.1962 +            domain_crash();
 12.1963 +        }
 12.1964 +        
 12.1965 +        if ( unlikely(sl1e != NULL) )
 12.1966 +            l1pte_propagate_from_guest(
 12.1967 +                d, &l1_pgentry_val(nl1e), &l1_pgentry_val(sl1e[i]));
 12.1968 +
 12.1969 +        if ( unlikely(l1_pgentry_val(ol1e) & _PAGE_PRESENT) )
 12.1970 +            put_page_from_l1e(ol1e, d);
 12.1971 +    }
 12.1972 +    unmap_domain_mem(pl1e);
 12.1973 +
 12.1974 +    /*
 12.1975 +     * STEP 3. Reattach the L1 p.t. page into the current address space.
 12.1976 +     */
 12.1977 +
 12.1978 +    if ( (which == PTWR_PT_ACTIVE) && likely(!shadow_mode(d)) )
 12.1979 +    {
 12.1980 +        pl2e = &linear_l2_table[ptwr_info[cpu].ptinfo[which].l2_idx];
 12.1981 +        *pl2e = mk_l2_pgentry(l2_pgentry_val(*pl2e) | _PAGE_PRESENT); 
 12.1982 +    }
 12.1983 +
 12.1984 +    /*
 12.1985 +     * STEP 4. Final tidy-up.
 12.1986 +     */
 12.1987 +
 12.1988 +    ptwr_info[cpu].ptinfo[which].l1va = 0;
 12.1989 +
 12.1990 +    if ( unlikely(sl1e != NULL) )
 12.1991 +    {
 12.1992 +        unmap_domain_mem(sl1e);
 12.1993 +        put_shadow_status(d);
 12.1994 +    }
 12.1995 +}
 12.1996 +
 12.1997 +/* Write page fault handler: check if guest is trying to modify a PTE. */
 12.1998 +int ptwr_do_page_fault(unsigned long addr)
 12.1999 +{
 12.2000 +    unsigned long    pte, pfn, l2e;
 12.2001 +    struct pfn_info *page;
 12.2002 +    l2_pgentry_t    *pl2e;
 12.2003 +    int              which, cpu = smp_processor_id();
 12.2004 +    u32              l2_idx;
 12.2005 +
 12.2006 +#ifdef __x86_64__
 12.2007 +    return 0; /* Writable pagetables need fixing for x86_64. */
 12.2008 +#endif
 12.2009 +
 12.2010 +    /*
 12.2011 +     * Attempt to read the PTE that maps the VA being accessed. By checking for
 12.2012 +     * PDE validity in the L2 we avoid many expensive fixups in __get_user().
 12.2013 +     */
 12.2014 +    if ( !(l2_pgentry_val(linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) &
 12.2015 +           _PAGE_PRESENT) ||
 12.2016 +         __get_user(pte, (unsigned long *)&linear_pg_table[addr>>PAGE_SHIFT]) )
 12.2017 +    {
 12.2018 +        return 0;
 12.2019 +    }
 12.2020 +
 12.2021 +    pfn  = pte >> PAGE_SHIFT;
 12.2022 +    page = &frame_table[pfn];
 12.2023 +
 12.2024 +    /* We are looking only for read-only mappings of p.t. pages. */
 12.2025 +    if ( ((pte & (_PAGE_RW | _PAGE_PRESENT)) != _PAGE_PRESENT) ||
 12.2026 +         ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) )
 12.2027 +    {
 12.2028 +        return 0;
 12.2029 +    }
 12.2030 +    
 12.2031 +    /* Get the L2 index at which this L1 p.t. is always mapped. */
 12.2032 +    l2_idx = page->u.inuse.type_info & PGT_va_mask;
 12.2033 +    if ( unlikely(l2_idx >= PGT_va_unknown) )
 12.2034 +    {
 12.2035 +        domain_crash(); /* Urk! This L1 is mapped in multiple L2 slots! */
 12.2036 +    }
 12.2037 +    l2_idx >>= PGT_va_shift;
 12.2038 +
 12.2039 +    if ( l2_idx == (addr >> L2_PAGETABLE_SHIFT) )
 12.2040 +    {
 12.2041 +        MEM_LOG("PTWR failure! Pagetable maps itself at %08lx\n", addr);
 12.2042 +        domain_crash();
 12.2043 +    }
 12.2044 +
 12.2045 +    /*
 12.2046 +     * Is the L1 p.t. mapped into the current address space? If so we call it
 12.2047 +     * an ACTIVE p.t., otherwise it is INACTIVE.
 12.2048 +     */
 12.2049 +    pl2e = &linear_l2_table[l2_idx];
 12.2050 +    l2e  = l2_pgentry_val(*pl2e);
 12.2051 +    which = PTWR_PT_INACTIVE;
 12.2052 +    if ( (l2e >> PAGE_SHIFT) == pfn )
 12.2053 +    {
 12.2054 +        /* Check the PRESENT bit to set ACTIVE. */
 12.2055 +        if ( likely(l2e & _PAGE_PRESENT) )
 12.2056 +            which = PTWR_PT_ACTIVE;
 12.2057 +        else {
 12.2058 +            /*
 12.2059 +             * If the PRESENT bit is clear, we may be conflicting with
 12.2060 +             * the current ACTIVE p.t. (it may be the same p.t. mapped
 12.2061 +             * at another virt addr).
 12.2062 +             * The ptwr_flush call below will restore the PRESENT bit.
 12.2063 +             */
 12.2064 +            if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va &&
 12.2065 +                 l2_idx == ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l2_idx )
 12.2066 +                which = PTWR_PT_ACTIVE;
 12.2067 +        }
 12.2068 +    }
 12.2069 +    
 12.2070 +    PTWR_PRINTK("[%c] page_fault on l1 pt at va %08lx, pt for %08x, "
 12.2071 +                "pfn %08lx\n", PTWR_PRINT_WHICH,
 12.2072 +                addr, l2_idx << L2_PAGETABLE_SHIFT, pfn);
 12.2073 +    
 12.2074 +    /*
12.2075 +     * We only allow one ACTIVE and one INACTIVE p.t. to be updated at a
12.2076 +     * time. If there is already one, we must flush it out.
 12.2077 +     */
 12.2078 +    if ( ptwr_info[cpu].ptinfo[which].l1va )
 12.2079 +        ptwr_flush(which);
 12.2080 +
 12.2081 +    ptwr_info[cpu].ptinfo[which].l1va   = addr | 1;
 12.2082 +    ptwr_info[cpu].ptinfo[which].l2_idx = l2_idx;
 12.2083 +    
 12.2084 +    /* For safety, disconnect the L1 p.t. page from current space. */
 12.2085 +    if ( (which == PTWR_PT_ACTIVE) && 
 12.2086 +         likely(!shadow_mode(current->domain)) )
 12.2087 +    {
 12.2088 +        *pl2e = mk_l2_pgentry(l2e & ~_PAGE_PRESENT);
 12.2089 +#if 1
 12.2090 +        flush_tlb(); /* XXX Multi-CPU guests? */
 12.2091 +#else
 12.2092 +        flush_tlb_all();
 12.2093 +#endif
 12.2094 +    }
 12.2095 +    
 12.2096 +    /* Temporarily map the L1 page, and make a copy of it. */
 12.2097 +    ptwr_info[cpu].ptinfo[which].pl1e = map_domain_mem(pfn << PAGE_SHIFT);
 12.2098 +    memcpy(ptwr_info[cpu].ptinfo[which].page,
 12.2099 +           ptwr_info[cpu].ptinfo[which].pl1e,
 12.2100 +           ENTRIES_PER_L1_PAGETABLE * sizeof(l1_pgentry_t));
 12.2101 +    
 12.2102 +    /* Finally, make the p.t. page writable by the guest OS. */
 12.2103 +    pte |= _PAGE_RW;
 12.2104 +    PTWR_PRINTK("[%c] update %p pte to %08lx\n", PTWR_PRINT_WHICH,
 12.2105 +                &linear_pg_table[addr>>PAGE_SHIFT], pte);
 12.2106 +    if ( unlikely(__put_user(pte, (unsigned long *)
 12.2107 +                             &linear_pg_table[addr>>PAGE_SHIFT])) )
 12.2108 +    {
 12.2109 +        MEM_LOG("ptwr: Could not update pte at %p\n", (unsigned long *)
 12.2110 +                &linear_pg_table[addr>>PAGE_SHIFT]);
 12.2111 +        /* Toss the writable pagetable state and crash. */
 12.2112 +        unmap_domain_mem(ptwr_info[cpu].ptinfo[which].pl1e);
 12.2113 +        ptwr_info[cpu].ptinfo[which].l1va = 0;
 12.2114 +        domain_crash();
 12.2115 +    }
 12.2116 +    
 12.2117 +    return EXCRET_fault_fixed;
 12.2118 +}
 12.2119 +
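
    Read together, ptwr_do_page_fault() above and ptwr_flush() before it form a
    trap-and-batch scheme for guest writes to L1 page tables. The outline below
    merely restates that control flow in condensed form; it adds no behaviour and
    omits the error paths.

    /*
     * Write fault on a read-only L1 page-table page (ptwr_do_page_fault):
     *   1. snapshot the page and map it writable for the guest;
     *   2. if it is the ACTIVE table, clear _PAGE_PRESENT in its L2 slot so
     *      the hardware cannot walk the now-unvalidated entries.
     * Later, before the entries can matter again (ptwr_flush):
     *   3. write-protect the page once more and flush the stale TLB entry;
     *   4. diff each PTE against the snapshot and (re)validate only the
     *      entries the guest actually changed;
     *   5. for the ACTIVE table, restore _PAGE_PRESENT in the L2 slot.
     */
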
 12.2120 +static __init int ptwr_init(void)
 12.2121 +{
 12.2122 +    int i;
 12.2123 +
 12.2124 +    for ( i = 0; i < smp_num_cpus; i++ )
 12.2125 +    {
 12.2126 +        ptwr_info[i].ptinfo[PTWR_PT_ACTIVE].page =
 12.2127 +            (void *)alloc_xenheap_page();
 12.2128 +        ptwr_info[i].ptinfo[PTWR_PT_INACTIVE].page =
 12.2129 +            (void *)alloc_xenheap_page();
 12.2130 +    }
 12.2131 +
 12.2132 +    return 0;
 12.2133 +}
 12.2134 +__initcall(ptwr_init);
 12.2135 +
 12.2136 +
 12.2137 +
 12.2138 +
 12.2139 +/************************************************************************/
 12.2140 +/************************************************************************/
 12.2141 +/************************************************************************/
 12.2142 +
 12.2143 +#ifndef NDEBUG
 12.2144 +
 12.2145 +void ptwr_status(void)
 12.2146 +{
 12.2147 +    unsigned long pte, *ptep, pfn;
 12.2148 +    struct pfn_info *page;
 12.2149 +    int cpu = smp_processor_id();
 12.2150 +
 12.2151 +    ptep = (unsigned long *)&linear_pg_table
 12.2152 +        [ptwr_info[cpu].ptinfo[PTWR_PT_INACTIVE].l1va>>PAGE_SHIFT];
 12.2153 +
 12.2154 +    if ( __get_user(pte, ptep) ) {
 12.2155 +        MEM_LOG("ptwr: Could not read pte at %p\n", ptep);
 12.2156 +        domain_crash();
 12.2157 +    }
 12.2158 +
 12.2159 +    pfn = pte >> PAGE_SHIFT;
 12.2160 +    page = &frame_table[pfn];
 12.2161 +    printk("need to alloc l1 page %p\n", page);
 12.2162 +    /* make pt page writable */
12.2163 +    printk("need to make read-only: l1 page at %p is %08lx\n",
 12.2164 +           ptep, pte);
 12.2165 +
 12.2166 +    if ( ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va == 0 )
 12.2167 +        return;
 12.2168 +
 12.2169 +    if ( __get_user(pte, (unsigned long *)
 12.2170 +                    ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va) ) {
 12.2171 +        MEM_LOG("ptwr: Could not read pte at %p\n", (unsigned long *)
 12.2172 +                ptwr_info[cpu].ptinfo[PTWR_PT_ACTIVE].l1va);
 12.2173 +        domain_crash();
 12.2174 +    }
 12.2175 +    pfn = pte >> PAGE_SHIFT;
 12.2176 +    page = &frame_table[pfn];
 12.2177 +}
 12.2178 +
 12.2179 +void audit_domain(struct domain *d)
 12.2180 +{
 12.2181 +    int ttot=0, ctot=0, io_mappings=0, lowmem_mappings=0;
 12.2182 +
 12.2183 +    void adjust (struct pfn_info *page, int dir, int adjtype)
 12.2184 +    {
 12.2185 +        int count = page->count_info & PGC_count_mask;
 12.2186 +
 12.2187 +        if ( adjtype )
 12.2188 +        {
 12.2189 +            int tcount = page->u.inuse.type_info & PGT_count_mask;
 12.2190 +            
 12.2191 +            ttot++;
 12.2192 +
 12.2193 +            tcount += dir;
 12.2194 +
 12.2195 +            if ( tcount < 0 )
 12.2196 +            {
 12.2197 +                /* This will only come out once. */
12.2198 +                printk("Audit %d: type count went below zero pfn=%x "
 12.2199 +                       "taf=%x otaf=%x\n",
 12.2200 +                       d->id, page-frame_table,
 12.2201 +                       page->u.inuse.type_info,
 12.2202 +                       page->tlbflush_timestamp);
 12.2203 +            }
 12.2204 +            
 12.2205 +            page->u.inuse.type_info =
 12.2206 +                (page->u.inuse.type_info & ~PGT_count_mask) | 
 12.2207 +                (tcount & PGT_count_mask);
 12.2208 +        }
 12.2209 +
 12.2210 +        ctot++;
 12.2211 +        count += dir;
 12.2212 +        if ( count < 0 )
 12.2213 +        {
 12.2214 +            /* This will only come out once. */
12.2215 +            printk("Audit %d: general count went below zero pfn=%x "
 12.2216 +                   "taf=%x otaf=%x\n",
 12.2217 +                   d->id, page-frame_table,
 12.2218 +                   page->u.inuse.type_info,
 12.2219 +                   page->tlbflush_timestamp);
 12.2220 +        }
 12.2221 +            
 12.2222 +        page->count_info =
 12.2223 +            (page->count_info & ~PGC_count_mask) | 
 12.2224 +            (count & PGC_count_mask);            
 12.2225 +
 12.2226 +    }
 12.2227 +
 12.2228 +    void scan_for_pfn(struct domain *d, unsigned long xpfn)
 12.2229 +    {
 12.2230 +        unsigned long pfn, *pt;
 12.2231 +        struct list_head *list_ent;
 12.2232 +        struct pfn_info *page;
 12.2233 +        int i;
 12.2234 +
 12.2235 +        list_ent = d->page_list.next;
 12.2236 +        for ( i = 0; (list_ent != &d->page_list); i++ )
 12.2237 +        {
 12.2238 +            pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
 12.2239 +            page = &frame_table[pfn];
 12.2240 +            
 12.2241 +            switch ( page->u.inuse.type_info & PGT_type_mask )
 12.2242 +            {
 12.2243 +            case PGT_l1_page_table:
 12.2244 +            case PGT_l2_page_table:
 12.2245 +                pt = map_domain_mem(pfn<<PAGE_SHIFT);
 12.2246 +                for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
 12.2247 +                    if ( (pt[i] & _PAGE_PRESENT) &&
 12.2248 +                         ((pt[i] >> PAGE_SHIFT) == xpfn) )
 12.2249 +                        printk("     found dom=%d i=%x pfn=%lx t=%x c=%x\n",
 12.2250 +                               d->id, i, pfn, page->u.inuse.type_info,
 12.2251 +                               page->count_info);
 12.2252 +                unmap_domain_mem(pt);           
 12.2253 +            }
 12.2254 +
 12.2255 +            list_ent = frame_table[pfn].list.next;
 12.2256 +        }
 12.2257 +
 12.2258 +    }
 12.2259 +
 12.2260 +    void scan_for_pfn_remote(unsigned long xpfn)
 12.2261 +    {
 12.2262 +        struct domain *e;
 12.2263 +        for_each_domain ( e )
 12.2264 +            scan_for_pfn( e, xpfn );            
 12.2265 +    }   
 12.2266 +
 12.2267 +    int i;
 12.2268 +    unsigned long pfn;
 12.2269 +    struct list_head *list_ent;
 12.2270 +    struct pfn_info *page;
 12.2271 +
 12.2272 +    if ( d != current->domain )
 12.2273 +        domain_pause(d);
 12.2274 +    synchronise_pagetables(~0UL);
 12.2275 +
 12.2276 +    printk("pt base=%lx sh_info=%x\n",
 12.2277 +           pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT,
 12.2278 +           virt_to_page(d->shared_info)-frame_table);
 12.2279 +           
 12.2280 +    spin_lock(&d->page_alloc_lock);
 12.2281 +
 12.2282 +    /* PHASE 0 */
 12.2283 +
 12.2284 +    list_ent = d->page_list.next;
 12.2285 +    for ( i = 0; (list_ent != &d->page_list); i++ )
 12.2286 +    {
 12.2287 +        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
 12.2288 +        page = &frame_table[pfn];
 12.2289 +
 12.2290 +        if ( page_get_owner(page) != d )
 12.2291 +            BUG();
 12.2292 +
 12.2293 +        if ( (page->u.inuse.type_info & PGT_count_mask) >
 12.2294 +             (page->count_info & PGC_count_mask) )
 12.2295 +            printk("taf > caf %x %x pfn=%lx\n",
 12.2296 +                   page->u.inuse.type_info, page->count_info, pfn );
 12.2297 + 
 12.2298 +#if 0   /* SYSV shared memory pages plus writeable files. */
 12.2299 +        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page && 
 12.2300 +             (page->u.inuse.type_info & PGT_count_mask) > 1 )
 12.2301 +        {
 12.2302 +            printk("writeable page with type count >1: pfn=%lx t=%x c=%x\n",
 12.2303 +                  pfn,
 12.2304 +                  page->u.inuse.type_info,
 12.2305 +                  page->count_info );
 12.2306 +            scan_for_pfn_remote(pfn);
 12.2307 +        }
 12.2308 +#endif
 12.2309 +        if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_none && 
 12.2310 +             (page->u.inuse.type_info & PGT_count_mask) > 1 )
 12.2311 +        {
 12.2312 +            printk("normal page with type count >1: pfn=%lx t=%x c=%x\n",
 12.2313 +                  pfn,
 12.2314 +                  page->u.inuse.type_info,
 12.2315 +                  page->count_info );
 12.2316 +        }
 12.2317 +
 12.2318 +        /* Use tlbflush_timestamp to store original type_info. */
 12.2319 +        page->tlbflush_timestamp = page->u.inuse.type_info;
 12.2320 +
 12.2321 +        list_ent = frame_table[pfn].list.next;
 12.2322 +    }
 12.2323 +
 12.2324 +
 12.2325 +    /* PHASE 1 */
 12.2326 +
 12.2327 +    adjust(&frame_table[pagetable_val(d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], -1, 1);
 12.2328 +
 12.2329 +    list_ent = d->page_list.next;
 12.2330 +    for ( i = 0; (list_ent != &d->page_list); i++ )
 12.2331 +    {
 12.2332 +        unsigned long *pt;
 12.2333 +        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;       
 12.2334 +        page = &frame_table[pfn];
 12.2335 +
 12.2336 +        if ( page_get_owner(page) != d )
 12.2337 +            BUG();
 12.2338 +
 12.2339 +        switch ( page->u.inuse.type_info & PGT_type_mask )
 12.2340 +        {
 12.2341 +        case PGT_l2_page_table:
 12.2342 +
 12.2343 +            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
 12.2344 +                printk("Audit %d: L2 not validated %x\n",
 12.2345 +                       d->id, page->u.inuse.type_info);
 12.2346 +
 12.2347 +            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
 12.2348 +                printk("Audit %d: L2 not pinned %x\n",
 12.2349 +                       d->id, page->u.inuse.type_info);
 12.2350 +            else
 12.2351 +                adjust( page, -1, 1 );
 12.2352 +           
 12.2353 +            pt = map_domain_mem( pfn<<PAGE_SHIFT );
 12.2354 +
 12.2355 +            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
 12.2356 +            {
 12.2357 +                if ( pt[i] & _PAGE_PRESENT )
 12.2358 +                {
 12.2359 +                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
 12.2360 +                    struct pfn_info *l1page = &frame_table[l1pfn];
 12.2361 +
 12.2362 +                    if ( page_get_owner(l1page) != d )
 12.2363 +                    {
 12.2364 +                        printk("L2: Skip bizarre page belonging to other "
 12.2365 +                               "dom %p\n", page_get_owner(l1page));
 12.2366 +                        continue;
 12.2367 +                    }
 12.2368 +                    
 12.2369 +                    if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
 12.2370 +                         PGT_l2_page_table )
 12.2371 +                        printk("Audit %d: [%x] Found %s Linear PT "
 12.2372 +                               "t=%x pfn=%lx\n", d->id, i, 
 12.2373 +                               (l1pfn==pfn) ? "Self" : "Other",
 12.2374 +                               l1page->u.inuse.type_info,
 12.2375 +                               l1pfn);
 12.2376 +                    else if ( (l1page->u.inuse.type_info & PGT_type_mask) !=
 12.2377 +                              PGT_l1_page_table )
 12.2378 +                        printk("Audit %d: [%x] Expected L1 t=%x pfn=%lx\n",
 12.2379 +                               d->id, i,
 12.2380 +                               l1page->u.inuse.type_info,
 12.2381 +                               l1pfn);
 12.2382 +
 12.2383 +                    adjust(l1page, -1, 1);
 12.2384 +                }
 12.2385 +            }
 12.2386 +
 12.2387 +            unmap_domain_mem(pt);
 12.2388 +
 12.2389 +            break;
 12.2390 +
 12.2391 +
 12.2392 +        case PGT_l1_page_table:
 12.2393 +            
 12.2394 +            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
 12.2395 +                adjust( page, -1, 1 );
 12.2396 +
 12.2397 +            if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated )
 12.2398 +                printk("Audit %d: L1 not validated %x\n",
 12.2399 +                       d->id, page->u.inuse.type_info);
 12.2400 +#if 0
 12.2401 +            if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned )
 12.2402 +                printk("Audit %d: L1 not pinned %x\n",
 12.2403 +                       d->id, page->u.inuse.type_info);
 12.2404 +#endif
 12.2405 +            pt = map_domain_mem( pfn<<PAGE_SHIFT );
 12.2406 +
 12.2407 +            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
 12.2408 +            {
 12.2409 +                if ( pt[i] & _PAGE_PRESENT )
 12.2410 +                {
 12.2411 +                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
 12.2412 +                    struct pfn_info *l1page = &frame_table[l1pfn];
 12.2413 +
 12.2414 +                    if ( l1pfn < 0x100 )
 12.2415 +                    {
 12.2416 +                        lowmem_mappings++;
 12.2417 +                        continue;
 12.2418 +                    }
 12.2419 +
 12.2420 +                    if ( l1pfn > max_page )
 12.2421 +                    {
 12.2422 +                        io_mappings++;
 12.2423 +                        continue;
 12.2424 +                    }
 12.2425 +
 12.2426 +                    if ( pt[i] & _PAGE_RW )
 12.2427 +                    {
 12.2428 +
 12.2429 +                        if ( (l1page->u.inuse.type_info & PGT_type_mask) ==
 12.2430 +                             PGT_l1_page_table ||
 12.2431 +                             (l1page->u.inuse.type_info & PGT_type_mask) ==
 12.2432 +                             PGT_l2_page_table )
12.2433 +                            printk("Audit %d: [%x] Illegal RW t=%x pfn=%lx\n",
 12.2434 +                                   d->id, i,
 12.2435 +                                   l1page->u.inuse.type_info,
 12.2436 +                                   l1pfn);
 12.2437 +
 12.2438 +                    }
 12.2439 +
 12.2440 +                    if ( page_get_owner(l1page) != d )
 12.2441 +                    {
 12.2442 +                        printk("Audit %d: [%lx,%x] Skip foreign page dom=%p "
 12.2443 +                               "pfn=%lx c=%08x t=%08x m2p=%lx\n",
 12.2444 +                               d->id, pfn, i,
 12.2445 +                               page_get_owner(l1page),
 12.2446 +                               l1pfn,
 12.2447 +                               l1page->count_info,
 12.2448 +                               l1page->u.inuse.type_info,
 12.2449 +                               machine_to_phys_mapping[l1pfn]);    
 12.2450 +                        continue;
 12.2451 +                    }
 12.2452 +
 12.2453 +                    adjust(l1page, -1, 0);
 12.2454 +                }
 12.2455 +            }
 12.2456 +
 12.2457 +            unmap_domain_mem(pt);
 12.2458 +
 12.2459 +            break;
 12.2460 +        }       
 12.2461 +
 12.2462 +        list_ent = frame_table[pfn].list.next;
 12.2463 +    }
 12.2464 +
 12.2465 +    if ( (io_mappings > 0) || (lowmem_mappings > 0) )
 12.2466 +        printk("Audit %d: Found %d lowmem mappings and %d io mappings\n",
 12.2467 +               d->id, lowmem_mappings, io_mappings);
 12.2468 +
 12.2469 +    /* PHASE 2 */
 12.2470 +
 12.2471 +    ctot = ttot = 0;
 12.2472 +    list_ent = d->page_list.next;
 12.2473 +    for ( i = 0; (list_ent != &d->page_list); i++ )
 12.2474 +    {
 12.2475 +        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
 12.2476 +        page = &frame_table[pfn];
 12.2477 +
 12.2478 +        switch ( page->u.inuse.type_info & PGT_type_mask)
 12.2479 +        {
 12.2480 +        case PGT_l1_page_table:
 12.2481 +        case PGT_l2_page_table:
 12.2482 +            if ( (page->u.inuse.type_info & PGT_count_mask) != 0 )
 12.2483 +            {
 12.2484 +                printk("Audit %d: type count!=0 t=%x ot=%x c=%x pfn=%lx\n",
 12.2485 +                       d->id, page->u.inuse.type_info, 
 12.2486 +                       page->tlbflush_timestamp,
 12.2487 +                       page->count_info, pfn );
 12.2488 +                scan_for_pfn_remote(pfn);
 12.2489 +            }
 12.2490 +        default:
 12.2491 +            if ( (page->count_info & PGC_count_mask) != 1 )
 12.2492 +            {
 12.2493 +                printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x pfn=%lx\n",
 12.2494 +                       d->id, 
 12.2495 +                       page->count_info,
 12.2496 +                       page->u.inuse.type_info, 
 12.2497 +                       page->tlbflush_timestamp, pfn );
 12.2498 +                scan_for_pfn_remote(pfn);
 12.2499 +            }
 12.2500 +            break;
 12.2501 +        }
 12.2502 +
 12.2503 +        list_ent = frame_table[pfn].list.next;
 12.2504 +    }
 12.2505 +
 12.2506 +    /* PHASE 3 */
 12.2507 +    list_ent = d->page_list.next;
 12.2508 +    for ( i = 0; (list_ent != &d->page_list); i++ )
 12.2509 +    {
 12.2510 +        unsigned long *pt;
 12.2511 +        pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
 12.2512 +        page = &frame_table[pfn];
 12.2513 +
 12.2514 +        switch ( page->u.inuse.type_info & PGT_type_mask )
 12.2515 +        {
 12.2516 +        case PGT_l2_page_table:
 12.2517 +            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
 12.2518 +                adjust( page, 1, 1 );          
 12.2519 +
 12.2520 +            pt = map_domain_mem( pfn<<PAGE_SHIFT );
 12.2521 +
 12.2522 +            for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
 12.2523 +            {
 12.2524 +                if ( pt[i] & _PAGE_PRESENT )
 12.2525 +                {
 12.2526 +                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
 12.2527 +                    struct pfn_info *l1page;
 12.2528 +
 12.2529 +                    if (l1pfn>max_page)
 12.2530 +                        continue;
 12.2531 +
 12.2532 +                    l1page = &frame_table[l1pfn];
 12.2533 +
 12.2534 +                    if ( page_get_owner(l1page) == d )
 12.2535 +                        adjust(l1page, 1, 1);
 12.2536 +                }
 12.2537 +            }
 12.2538 +
 12.2539 +            unmap_domain_mem(pt);
 12.2540 +            break;
 12.2541 +
 12.2542 +        case PGT_l1_page_table:
 12.2543 +            if ( (page->u.inuse.type_info & PGT_pinned) == PGT_pinned )
 12.2544 +                adjust( page, 1, 1 );
 12.2545 +
 12.2546 +            pt = map_domain_mem( pfn<<PAGE_SHIFT );
 12.2547 +
 12.2548 +            for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
 12.2549 +            {
 12.2550 +                if ( pt[i] & _PAGE_PRESENT )
 12.2551 +                {
 12.2552 +                    unsigned long l1pfn = pt[i]>>PAGE_SHIFT;
 12.2553 +                    struct pfn_info *l1page;
 12.2554 +
 12.2555 +                    if (l1pfn>max_page)
 12.2556 +                        continue;
 12.2557 +
 12.2558 +                    l1page = &frame_table[l1pfn];
 12.2559 +
 12.2560 +                    if ( (page_get_owner(l1page) != d) ||
 12.2561 +                         (l1pfn < 0x100) || (l1pfn > max_page) )
 12.2562 +                        continue;
 12.2563 +
 12.2564 +                    adjust(l1page, 1, 0);
 12.2565 +                }
 12.2566 +            }
 12.2567 +
 12.2568 +            unmap_domain_mem(pt);
 12.2569 +            break;
 12.2570 +        }
 12.2571 +
 12.2572 +
 12.2573 +        page->tlbflush_timestamp = 0;
 12.2574 +
 12.2575 +        list_ent = frame_table[pfn].list.next;
 12.2576 +    }
 12.2577 +
 12.2578 +    spin_unlock(&d->page_alloc_lock);
 12.2579 +
 12.2580 +    adjust(&frame_table[pagetable_val(
 12.2581 +        d->exec_domain[0]->arch.pagetable)>>PAGE_SHIFT], 1, 1);
 12.2582 +
 12.2583 +    printk("Audit %d: Done. ctot=%d ttot=%d\n", d->id, ctot, ttot );
 12.2584 +
 12.2585 +    if ( d != current->domain )
 12.2586 +        domain_unpause(d);
 12.2587 +}
 12.2588 +
 12.2589 +void audit_domains(void)
 12.2590 +{
 12.2591 +    struct domain *d;
 12.2592 +    for_each_domain ( d )
 12.2593 +        audit_domain(d);
 12.2594 +}
 12.2595 +
 12.2596 +void audit_domains_key(unsigned char key)
 12.2597 +{
 12.2598 +    audit_domains();
 12.2599 +}
 12.2600 +
 12.2601 +#endif
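
The audit code above checks one invariant: after phase 1 has subtracted every
reference that the domain's own pagetable entries contribute, each page on
d->page_list should be left with a general count of exactly one (the
allocation reference) and a type count of zero; phase 2 reports violations and
phase 3 restores the counts. A minimal standalone sketch of the phase-2 test,
using only fields and masks that appear in the code above, is:

    /* Sketch only: the real audit walks d->page_list and prints diagnostics. */
    static int audit_counts_consistent(struct pfn_info *page)
    {
        /* Only the allocation reference should remain after phase 1... */
        int general_ok = (page->count_info & PGC_count_mask) == 1;
        /* ...and pagetable pages should have had all type references removed. */
        int type_ok = (page->u.inuse.type_info & PGT_count_mask) == 0;
        return general_ok && type_ok;
    }
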
    13.1 --- a/xen/arch/x86/traps.c	Tue Feb 08 12:27:23 2005 +0000
    13.2 +++ b/xen/arch/x86/traps.c	Tue Feb 08 15:13:51 2005 +0000
    13.3 @@ -528,7 +528,7 @@ asmlinkage int do_general_protection(str
    13.4  
    13.5      /* Emulate some simple privileged instructions when exec'ed in ring 1. */
    13.6      if ( (regs->error_code == 0) &&
    13.7 -         RING_1(regs) &&
    13.8 +         GUESTOS_FAULT(regs) &&
    13.9           emulate_privileged_op(regs) )
   13.10          return 0;
   13.11  
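
The switch from RING_1() to GUESTOS_FAULT() above matters because a fault taken
in virtual-8086 mode can present CS low bits that look like ring 1: in VM86
mode those bits are just part of the real-mode segment value and carry no
privilege information. The new macro (defined in the regs.h hunks below)
excludes VM86 mode before trusting the ring test. A standalone sketch of the
x86_32 predicate, assuming VM86_MODE() tests the EFLAGS.VM bit (bit 17), is:

    /* Hypothetical standalone helper, not Xen's macro: would a fault with this
     * saved CS/EFLAGS pair be treated as a guest-kernel fault on x86_32? */
    static inline int guestos_fault_x86_32(unsigned long cs, unsigned long eflags)
    {
        int vm86 = (eflags >> 17) & 1;   /* EFLAGS.VM: virtual-8086 mode */
        int rpl  = cs & 3;               /* low bits of the CS selector  */
        return !vm86 && (rpl == 1);      /* x86_32 PV guest kernel is in ring 1 */
    }
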
    14.1 --- a/xen/common/dom_mem_ops.c	Tue Feb 08 12:27:23 2005 +0000
    14.2 +++ b/xen/common/dom_mem_ops.c	Tue Feb 08 15:13:51 2005 +0000
    14.3 @@ -122,7 +122,7 @@ free_dom_mem(struct domain *d,
    14.4  long
    14.5  do_dom_mem_op(unsigned long  op, 
    14.6                unsigned long *extent_list, 
    14.7 -              unsigned long  nr_extents,
    14.8 +              unsigned int   nr_extents,
    14.9                unsigned int   extent_order,
   14.10                domid_t        domid)
   14.11  {
   14.12 @@ -133,8 +133,7 @@ do_dom_mem_op(unsigned long  op,
   14.13      start_extent  = op >> START_EXTENT_SHIFT;
   14.14      op           &= (1 << START_EXTENT_SHIFT) - 1;
   14.15  
   14.16 -    if ( unlikely(start_extent > nr_extents) || 
   14.17 -         unlikely(nr_extents > ~0U) ) /* can pack into a uint? */
   14.18 +    if ( unlikely(start_extent > nr_extents) )
   14.19          return -EINVAL;
   14.20  
   14.21      if ( likely(domid == DOMID_SELF) )
   14.22 @@ -150,13 +149,11 @@ do_dom_mem_op(unsigned long  op,
   14.23      {
   14.24      case MEMOP_increase_reservation:
   14.25          rc = alloc_dom_mem(
   14.26 -            d, extent_list, start_extent, 
   14.27 -            (unsigned int)nr_extents, extent_order);
   14.28 +            d, extent_list, start_extent, nr_extents, extent_order);
   14.29          break;
   14.30      case MEMOP_decrease_reservation:
   14.31          rc = free_dom_mem(
   14.32 -            d, extent_list, start_extent, 
   14.33 -            (unsigned int)nr_extents, extent_order);
   14.34 +            d, extent_list, start_extent, nr_extents, extent_order);
   14.35          break;
   14.36      default:
   14.37          rc = -ENOSYS;
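
With nr_extents now declared unsigned int in the hypercall signature itself,
the defensive "can pack into a uint?" test and the casts at the
alloc_dom_mem()/free_dom_mem() call sites become redundant and are dropped.
The op argument still multiplexes two values: the low START_EXTENT_SHIFT bits
carry the MEMOP_* command and the high bits carry the extent index at which
the call should start (or resume, if it was preempted). A self-contained
sketch of that packing, where the value 4 for START_EXTENT_SHIFT is only an
assumption for illustration and not taken from this patch:

    /* Sketch of the op encoding unpacked at the top of do_dom_mem_op(). */
    #define START_EXTENT_SHIFT 4   /* assumed value, for illustration only */

    static unsigned long pack_memop(unsigned long cmd, unsigned long start_extent)
    {
        return cmd | (start_extent << START_EXTENT_SHIFT);
    }

    static void unpack_memop(unsigned long op, unsigned long *cmd,
                             unsigned long *start_extent)
    {
        *start_extent = op >> START_EXTENT_SHIFT;
        *cmd          = op & ((1UL << START_EXTENT_SHIFT) - 1);
    }
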
    15.1 --- a/xen/include/asm-x86/page.h	Tue Feb 08 12:27:23 2005 +0000
    15.2 +++ b/xen/include/asm-x86/page.h	Tue Feb 08 15:13:51 2005 +0000
    15.3 @@ -99,6 +99,13 @@ typedef struct { unsigned long l4_lo; } 
    15.4    (((_a) >> L4_PAGETABLE_SHIFT) & (ENTRIES_PER_L4_PAGETABLE - 1))
    15.5  #endif
    15.6  
    15.7 +/* Given a virtual address, get an entry offset into a linear page table. */
    15.8 +#if defined(__i386__)
    15.9 +#define l1_linear_offset(_a) ((_a) >> PAGE_SHIFT)
   15.10 +#elif defined(__x86_64__)
   15.11 +#define l1_linear_offset(_a) (((_a) & ((1UL << 48) - 1)) >> PAGE_SHIFT)
   15.12 +#endif
   15.13 +
   15.14  #if defined(__i386__)
   15.15  #define pagetable_t l2_pgentry_t
   15.16  #define pagetable_val(_x)  ((_x).l2_lo)
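
l1_linear_offset() turns a virtual address into an index into the linear
(self-mapped) page table, presumably for use as
linear_pg_table[l1_linear_offset(va)] in place of the older va >> PAGE_SHIFT
form. On x86_64 the mask to 48 bits strips the sign-extension bits 63:48 of a
canonical address so that upper-half addresses still produce an in-range
index. A small self-contained demonstration of the 64-bit case:

    /* Standalone demo of the x86_64 variant above, assuming PAGE_SHIFT = 12. */
    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define l1_linear_offset(_a) (((_a) & ((1UL << 48) - 1)) >> PAGE_SHIFT)

    int main(void)
    {
        /* A canonical upper-half address: bits 63:48 merely sign-extend bit 47. */
        unsigned long va = 0xffff830000002000UL;
        printf("offset = %#lx\n", l1_linear_offset(va));  /* prints 0x830000002 */
        return 0;
    }
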
    16.1 --- a/xen/include/asm-x86/x86_32/regs.h	Tue Feb 08 12:27:23 2005 +0000
    16.2 +++ b/xen/include/asm-x86/x86_32/regs.h	Tue Feb 08 15:13:51 2005 +0000
    16.3 @@ -39,4 +39,6 @@ struct xen_regs
    16.4  #define RING_2(_r)    (((_r)->cs & 3) == 2)
    16.5  #define RING_3(_r)    (((_r)->cs & 3) == 3)
    16.6  
    16.7 +#define GUESTOS_FAULT(_r) (!VM86_MODE(_r) && RING_1(_r))
    16.8 +
    16.9  #endif
    17.1 --- a/xen/include/asm-x86/x86_64/regs.h	Tue Feb 08 12:27:23 2005 +0000
    17.2 +++ b/xen/include/asm-x86/x86_64/regs.h	Tue Feb 08 15:13:51 2005 +0000
    17.3 @@ -36,4 +36,6 @@ struct xen_regs
    17.4  #define RING_2(_r)    (((_r)->cs & 3) == 2)
    17.5  #define RING_3(_r)    (((_r)->cs & 3) == 3)
    17.6  
    17.7 +#define GUESTOS_FAULT(_r) (!VM86_MODE(_r) && RING_3(_r))
    17.8 +
    17.9  #endif
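
Note the asymmetry with the x86_32 definition: a 64-bit PV guest kernel is
expected to run in ring 3 (long mode offers no segment-limit protection, so
Xen relies on paging and the guest kernel shares ring 3 with its
applications), hence the RING_3 test here; virtual-8086 mode is unreachable
from long mode, so the VM86_MODE() term should always evaluate false on this
path. A matching standalone sketch, mirroring the x86_32 helper above:

    /* Hypothetical standalone helper, not Xen's macro. */
    static inline int guestos_fault_x86_64(unsigned long cs, unsigned long rflags)
    {
        int vm86 = (rflags >> 17) & 1;    /* always 0 in long mode */
        return !vm86 && ((cs & 3) == 3);  /* 64-bit PV guest kernel is in ring 3 */
    }
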
    18.1 --- a/xen/include/asm-x86/x86_64/uaccess.h	Tue Feb 08 12:27:23 2005 +0000
    18.2 +++ b/xen/include/asm-x86/x86_64/uaccess.h	Tue Feb 08 15:13:51 2005 +0000
    18.3 @@ -15,34 +15,19 @@
    18.4  #define VERIFY_READ 0
    18.5  #define VERIFY_WRITE 1
    18.6  
    18.7 -#define __addr_ok(addr) ((unsigned long)(addr) < HYPERVISOR_VIRT_START)
    18.8 -
    18.9  /*
   18.10 - * Test whether a block of memory is a valid user space address.
   18.11 - * Returns 0 if the range is valid, nonzero otherwise.
   18.12 - *
   18.13 - * This is equivalent to the following test:
   18.14 - * ((u65)addr >= (u65)HYPERVISOR_VIRT_END) ?
   18.15 - * (((u65)addr + (u65)size) >= ((u65)1 << 64)) :
   18.16 - * (((u65)addr + (u65)size) >= ((u65)HYPERVISOR_VIRT_START))
   18.17 + * Valid if in +ve half of 48-bit address space, or above Xen-reserved area.
    18.18 + * This is also valid for range checks (addr, addr+size): as long as the
    18.19 + * start address is outside the Xen-reserved area, we will hit a
    18.20 + * non-canonical address (and thus fault) before reaching HYPERVISOR_VIRT_START.
   18.21   */
   18.22 -#define __range_not_ok(addr,size) ({ \
   18.23 -    unsigned long flag,sum; \
   18.24 -    if ((unsigned long)addr >= HYPERVISOR_VIRT_END) \
   18.25 -        asm("addq %3,%1 ; sbbq %0,%0" \
   18.26 -            :"=&r" (flag), "=r" (sum) \
   18.27 -            :"1" (addr),"g" ((long)(size))); \
   18.28 -    else \
   18.29 -        asm("addq %3,%1 ; sbbq %0,%0 ; cmpq %1,%4 ; sbbq $0,%0"  \
   18.30 -            :"=&r" (flag), "=r" (sum) \
   18.31 -            :"1" (addr),"g" ((long)(size)),"r" (HYPERVISOR_VIRT_START)); \
   18.32 -    flag; })
   18.33 +#define __addr_ok(addr) \
   18.34 +    (((unsigned long)(addr) < (1UL<<48)) || \
   18.35 +     ((unsigned long)(addr) >= HYPERVISOR_VIRT_END))
   18.36  
   18.37 -#define access_ok(type, addr, size) (__range_not_ok(addr,size) == 0)
   18.38 +#define access_ok(type, addr, size) (__addr_ok(addr))
   18.39  
   18.40 -#define array_access_ok(type,addr,count,size)                    \
   18.41 -    (likely(sizeof(count) <= 4) /* disallow 64-bit counts */ &&  \
   18.42 -     access_ok(type,addr,(unsigned long)count*(unsigned long)size))
   18.43 +#define array_access_ok(type,addr,count,size) (__addr_ok(addr))
   18.44  
   18.45  extern long __get_user_bad(void);
   18.46  extern void __put_user_bad(void);
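
The rewrite above replaces the old carry-chain range check with a purely
architectural argument: an address below 2^48 is either a legitimate
guest-half address or non-canonical (so any access to it faults), while an
address at or above HYPERVISOR_VIRT_END lies past the Xen-reserved area, so a
single comparison suffices and the size argument can be ignored. A standalone
sketch of the predicate, with an assumed placeholder for HYPERVISOR_VIRT_END
(the real constant lives in Xen's memory-layout headers):

    /* Sketch of the new check; the constant below is an assumption, not the
     * value from Xen's headers. */
    #define HYPERVISOR_VIRT_END_ASSUMED 0xffff880000000000UL

    static inline int addr_ok_sketch(unsigned long addr)
    {
        return (addr < (1UL << 48)) ||                 /* guest half or non-canonical */
               (addr >= HYPERVISOR_VIRT_END_ASSUMED);  /* above the Xen-reserved area */
    }
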