ia64/xen-unstable

changeset 7871:3f39f030fa89

Merged.
author emellor@leeni.uk.xensource.com
date Wed Nov 16 20:33:23 2005 +0100 (2005-11-16)
parents cbf6f95e9c62 e4e1674a747d
children 0adacfa2e33f 2ce5edc0ccbd 418954da5c0f
files
line diff
     1.1 --- a/linux-2.6-xen-sparse/arch/xen/i386/mm/ioremap.c	Wed Nov 16 20:33:12 2005 +0100
     1.2 +++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/ioremap.c	Wed Nov 16 20:33:23 2005 +0100
     1.3 @@ -136,21 +136,19 @@ int direct_kernel_remap_pfn_range(unsign
     1.4  }
     1.5  EXPORT_SYMBOL(direct_kernel_remap_pfn_range);
     1.6  
     1.7 -/* FIXME: This is horribly broken on PAE */ 
     1.8  static int lookup_pte_fn(
     1.9  	pte_t *pte, struct page *pte_page, unsigned long addr, void *data)
    1.10  {
    1.11 -	unsigned long *ptep = (unsigned long *)data;
    1.12 +	uint64_t *ptep = (uint64_t *)data;
    1.13  	if (ptep)
    1.14 -		*ptep = (pfn_to_mfn(page_to_pfn(pte_page)) <<
    1.15 -			 PAGE_SHIFT) |
    1.16 -			((unsigned long)pte & ~PAGE_MASK);
    1.17 +		*ptep = ((uint64_t)pfn_to_mfn(page_to_pfn(pte_page)) <<
    1.18 +			 PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK);
    1.19  	return 0;
    1.20  }
    1.21  
    1.22  int create_lookup_pte_addr(struct mm_struct *mm, 
    1.23  			   unsigned long address,
    1.24 -			   unsigned long *ptep)
    1.25 +			   uint64_t *ptep)
    1.26  {
    1.27  	return generic_page_range(mm, address, PAGE_SIZE, lookup_pte_fn, ptep);
    1.28  }
     2.1 --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c	Wed Nov 16 20:33:12 2005 +0100
     2.2 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c	Wed Nov 16 20:33:23 2005 +0100
     2.3 @@ -770,9 +770,9 @@ void __init setup_arch(char **cmdline_p)
     2.4  		pfn_to_mfn_frame_list_list = alloc_bootmem(PAGE_SIZE);
     2.5  		HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
     2.6  		  virt_to_mfn(pfn_to_mfn_frame_list_list);
     2.7 -	       
     2.8 +
     2.9  		fpp = PAGE_SIZE/sizeof(unsigned long);
    2.10 -		for ( i=0, j=0, k=-1; i< max_pfn; i+=fpp, j++ )
    2.11 +		for ( i=0, j=0, k=-1; i< end_pfn; i+=fpp, j++ )
    2.12  		{
    2.13  			if ( (j % fpp) == 0 )
    2.14  			{
    2.15 @@ -786,9 +786,12 @@ void __init setup_arch(char **cmdline_p)
    2.16  			pfn_to_mfn_frame_list[k][j] = 
    2.17  				virt_to_mfn(&phys_to_machine_mapping[i]);
    2.18  		}
    2.19 -		HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
    2.20 +		HYPERVISOR_shared_info->arch.max_pfn = end_pfn;
    2.21 +
    2.22  	}
    2.23  
    2.24 +
    2.25 +
    2.26  	if ( ! (xen_start_info->flags & SIF_INITDOMAIN))
    2.27  	{
    2.28  		acpi_disabled = 1;
     3.1 --- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c	Wed Nov 16 20:33:12 2005 +0100
     3.2 +++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c	Wed Nov 16 20:33:23 2005 +0100
     3.3 @@ -412,7 +412,7 @@ static void fast_flush_area(int idx, int
     3.4  	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2];
     3.5  	unsigned int i, op = 0;
     3.6  	struct grant_handle_pair *handle;
     3.7 -	unsigned long ptep;
     3.8 +	uint64_t ptep;
     3.9  	int ret;
    3.10  
    3.11  	for ( i = 0; i < nr_pages; i++)
    3.12 @@ -427,9 +427,9 @@ static void fast_flush_area(int idx, int
    3.13  		op++;
    3.14  
    3.15  		if (create_lookup_pte_addr(
    3.16 -			blktap_vma->vm_mm,
    3.17 -			MMAP_VADDR(user_vstart, idx, i), 
    3.18 -			&ptep) !=0) {
    3.19 +			    blktap_vma->vm_mm,
    3.20 +			    MMAP_VADDR(user_vstart, idx, i), 
    3.21 +			    &ptep) !=0) {
    3.22  			DPRINTK("Couldn't get a pte addr!\n");
    3.23  			return;
    3.24  		}
    3.25 @@ -705,7 +705,7 @@ static void dispatch_rw_block_io(blkif_t
    3.26  
    3.27  		unsigned long uvaddr;
    3.28  		unsigned long kvaddr;
    3.29 -		unsigned long ptep;
    3.30 +		uint64_t ptep;
    3.31  
    3.32  		uvaddr = MMAP_VADDR(user_vstart, pending_idx, i);
    3.33  		kvaddr = MMAP_VADDR(mmap_vstart, pending_idx, i);
     4.1 --- a/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c	Wed Nov 16 20:33:12 2005 +0100
     4.2 +++ b/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c	Wed Nov 16 20:33:23 2005 +0100
     4.3 @@ -152,7 +152,8 @@ static int privcmd_ioctl(struct inode *i
     4.4  		privcmd_mmapbatch_t m;
     4.5  		struct vm_area_struct *vma = NULL;
     4.6  		unsigned long *p, addr;
     4.7 -		unsigned long mfn, ptep;
     4.8 +		unsigned long mfn; 
     4.9 +		uint64_t ptep;
    4.10  		int i;
    4.11  
    4.12  		if (copy_from_user(&m, (void *)data, sizeof(m))) {
    4.13 @@ -217,15 +218,39 @@ static int privcmd_ioctl(struct inode *i
    4.14  #endif
    4.15  
    4.16  #ifndef __ia64__
    4.17 -	case IOCTL_PRIVCMD_GET_MACH2PHYS_START_MFN: {
    4.18 -		unsigned long m2pv = (unsigned long)machine_to_phys_mapping;
    4.19 -		pgd_t *pgd = pgd_offset_k(m2pv);
    4.20 -		pud_t *pud = pud_offset(pgd, m2pv);
    4.21 -		pmd_t *pmd = pmd_offset(pud, m2pv);
    4.22 -		unsigned long m2p_start_mfn =
    4.23 -			(*(unsigned long *)pmd) >> PAGE_SHIFT; 
    4.24 -		ret = put_user(m2p_start_mfn, (unsigned long *)data) ?
    4.25 -			-EFAULT: 0;
    4.26 +	case IOCTL_PRIVCMD_GET_MACH2PHYS_MFNS: {
    4.27 +
    4.28 +		pgd_t *pgd; 
    4.29 +		pud_t *pud; 
    4.30 +		pmd_t *pmd; 
    4.31 +		unsigned long m2pv, m2p_mfn; 	
    4.32 +		privcmd_m2pmfns_t m; 
    4.33 +		unsigned long *p; 
    4.34 +		int i; 
    4.35 +
    4.36 +		if (copy_from_user(&m, (void *)data, sizeof(m)))
    4.37 +			return -EFAULT;
    4.38 +
    4.39 +		m2pv = (unsigned long)machine_to_phys_mapping;
    4.40 +
    4.41 +		p = m.arr; 
    4.42 +
    4.43 +		for(i=0; i < m.num; i++) { 
    4.44 +
    4.45 +			pgd = pgd_offset_k(m2pv);
    4.46 +			pud = pud_offset(pgd, m2pv);
    4.47 +			pmd = pmd_offset(pud, m2pv);
    4.48 +			m2p_mfn = (*(uint64_t *)pmd >> PAGE_SHIFT)&0xFFFFFFFF;
    4.49 +			
    4.50 +			if (put_user(m2p_mfn, p + i))
    4.51 +				return -EFAULT;
    4.52 +
    4.53 +			m2pv += (1 << 21); 
    4.54 +		}
    4.55 +
    4.56 +		ret = 0; 
    4.57 +		break; 
    4.58 +
    4.59  	}
    4.60  	break;
    4.61  #endif
     5.1 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Wed Nov 16 20:33:12 2005 +0100
     5.2 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable.h	Wed Nov 16 20:33:23 2005 +0100
     5.3 @@ -450,11 +450,11 @@ void make_pages_writable(void *va, unsig
     5.4  #endif /* !CONFIG_DISCONTIGMEM */
     5.5  
     5.6  int direct_remap_pfn_range(struct vm_area_struct *vma,
     5.7 -                            unsigned long address, 
     5.8 -                            unsigned long mfn,
     5.9 -                            unsigned long size, 
    5.10 -                            pgprot_t prot,
    5.11 -                            domid_t  domid);
    5.12 +                           unsigned long address, 
    5.13 +                           unsigned long mfn,
    5.14 +                           unsigned long size, 
    5.15 +                           pgprot_t prot,
    5.16 +                           domid_t  domid);
    5.17  int direct_kernel_remap_pfn_range(unsigned long address, 
    5.18  				  unsigned long mfn,
    5.19  				  unsigned long size, 
    5.20 @@ -462,7 +462,7 @@ int direct_kernel_remap_pfn_range(unsign
    5.21  				  domid_t  domid);
    5.22  int create_lookup_pte_addr(struct mm_struct *mm,
    5.23                             unsigned long address,
    5.24 -                           unsigned long *ptep);
    5.25 +                           uint64_t *ptep);
    5.26  int touch_pte_range(struct mm_struct *mm,
    5.27                      unsigned long address,
    5.28                      unsigned long size);
     6.1 --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h	Wed Nov 16 20:33:12 2005 +0100
     6.2 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h	Wed Nov 16 20:33:23 2005 +0100
     6.3 @@ -541,7 +541,7 @@ int direct_kernel_remap_pfn_range(unsign
     6.4  
     6.5  int create_lookup_pte_addr(struct mm_struct *mm,
     6.6                             unsigned long address,
     6.7 -                           unsigned long *ptep);
     6.8 +                           uint64_t *ptep);
     6.9  
    6.10  int touch_pte_range(struct mm_struct *mm,
    6.11                      unsigned long address,
     7.1 --- a/linux-2.6-xen-sparse/include/asm-xen/linux-public/privcmd.h	Wed Nov 16 20:33:12 2005 +0100
     7.2 +++ b/linux-2.6-xen-sparse/include/asm-xen/linux-public/privcmd.h	Wed Nov 16 20:33:23 2005 +0100
     7.3 @@ -55,6 +55,11 @@ typedef struct privcmd_mmapbatch {
     7.4  	unsigned long *arr; /* array of mfns - top nibble set on err */
     7.5  } privcmd_mmapbatch_t; 
     7.6  
     7.7 +typedef struct privcmd_m2pmfns { 
     7.8 +	int num;    /* max number of mfns to return */
     7.9 +	unsigned long *arr; /* array of mfns */
    7.10 +} privcmd_m2pmfns_t; 
    7.11 +
    7.12  typedef struct privcmd_blkmsg
    7.13  {
    7.14  	unsigned long op;
    7.15 @@ -69,12 +74,11 @@ typedef struct privcmd_blkmsg
    7.16   */
    7.17  #define IOCTL_PRIVCMD_HYPERCALL					\
    7.18  	_IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t))
    7.19 -
    7.20  #define IOCTL_PRIVCMD_MMAP					\
    7.21  	_IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t))
    7.22  #define IOCTL_PRIVCMD_MMAPBATCH					\
    7.23  	_IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t))
    7.24 -#define IOCTL_PRIVCMD_GET_MACH2PHYS_START_MFN			\
    7.25 +#define IOCTL_PRIVCMD_GET_MACH2PHYS_MFNS			\
    7.26  	_IOC(_IOC_READ, 'P', 4, sizeof(unsigned long))
    7.27  
    7.28  #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
     8.1 --- a/tools/examples/vif-common.sh	Wed Nov 16 20:33:12 2005 +0100
     8.2 +++ b/tools/examples/vif-common.sh	Wed Nov 16 20:33:23 2005 +0100
     8.3 @@ -63,7 +63,9 @@ function frob_iptable()
     8.4    fi
     8.5  
     8.6    iptables "$c" FORWARD -m physdev --physdev-in "$vif" "$@" -j ACCEPT ||
     8.7 -    fatal "iptables $c FORWARD -m physdev --physdev-in $vif $@ -j ACCEPT failed"
     8.8 +    log err \
     8.9 +     "iptables $c FORWARD -m physdev --physdev-in $vif $@ -j ACCEPT failed.
    8.10 +If you are using iptables, this may affect networking for guest domains."
    8.11  }
    8.12  
    8.13  
     9.1 --- a/tools/libxc/xc_linux_build.c	Wed Nov 16 20:33:12 2005 +0100
     9.2 +++ b/tools/libxc/xc_linux_build.c	Wed Nov 16 20:33:23 2005 +0100
     9.3 @@ -629,7 +629,7 @@ static int setup_guest(int xc_handle,
     9.4      memset(start_info, 0, sizeof(*start_info));
     9.5      rc = xc_version(xc_handle, XENVER_version, NULL);
     9.6      sprintf(start_info->magic, "xen-%i.%i-x86_%d%s",
     9.7 -            rc >> 16, rc & (0xFFFF), sizeof(long)*8,
     9.8 +            rc >> 16, rc & (0xFFFF), (unsigned int)sizeof(long)*8,
     9.9              dsi.pae_kernel ? "p" : "");
    9.10      start_info->nr_pages     = nr_pages;
    9.11      start_info->shared_info  = shared_info_frame << PAGE_SHIFT;
    10.1 --- a/tools/libxc/xc_linux_restore.c	Wed Nov 16 20:33:12 2005 +0100
    10.2 +++ b/tools/libxc/xc_linux_restore.c	Wed Nov 16 20:33:23 2005 +0100
    10.3 @@ -13,13 +13,13 @@
    10.4  #include "xg_save_restore.h"
    10.5  
    10.6  /* max mfn of the whole machine */
    10.7 -static uint32_t max_mfn; 
    10.8 +static unsigned long max_mfn; 
    10.9  
   10.10  /* virtual starting address of the hypervisor */
   10.11 -static uint32_t hvirt_start; 
   10.12 +static unsigned long hvirt_start; 
   10.13  
   10.14  /* #levels of page tables used by the currrent guest */
   10.15 -static uint32_t pt_levels; 
   10.16 +static unsigned int pt_levels; 
   10.17  
   10.18  /* total number of pages used by the current guest */
   10.19  static unsigned long max_pfn;
   10.20 @@ -50,7 +50,6 @@ read_exact(int fd, void *buf, size_t cou
   10.21      return (r == count) ? 1 : 0; 
   10.22  }
   10.23  
   10.24 -
   10.25  /*
   10.26  ** In the state file (or during transfer), all page-table pages are 
   10.27  ** converted into a 'canonical' form where references to actual mfns 
   10.28 @@ -60,23 +59,11 @@ read_exact(int fd, void *buf, size_t cou
   10.29  */
   10.30  int uncanonicalize_pagetable(unsigned long type, void *page) 
   10.31  { 
   10.32 -    int i, pte_last, xen_start, xen_end; 
   10.33 +    int i, pte_last; 
   10.34      unsigned long pfn; 
   10.35      uint64_t pte; 
   10.36  
   10.37 -    /* 
   10.38 -    ** We need to determine which entries in this page table hold
   10.39 -    ** reserved hypervisor mappings. This depends on the current
   10.40 -    ** page table type as well as the number of paging levels. 
   10.41 -    */
   10.42 -    xen_start = xen_end = pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8); 
   10.43 -    
   10.44 -    if (pt_levels == 2 && type == L2TAB)
   10.45 -        xen_start = (hvirt_start >> L2_PAGETABLE_SHIFT); 
   10.46 -
   10.47 -    if (pt_levels == 3 && type == L3TAB) 
   10.48 -        xen_start = L3_PAGETABLE_ENTRIES_PAE; 
   10.49 -
   10.50 +    pte_last = PAGE_SIZE / ((pt_levels == 2)? 4 : 8); 
   10.51  
   10.52      /* Now iterate through the page table, uncanonicalizing each PTE */
   10.53      for(i = 0; i < pte_last; i++) { 
   10.54 @@ -85,13 +72,10 @@ int uncanonicalize_pagetable(unsigned lo
   10.55              pte = ((uint32_t *)page)[i]; 
   10.56          else 
   10.57              pte = ((uint64_t *)page)[i]; 
   10.58 -        
   10.59 -        if(i >= xen_start && i < xen_end) 
   10.60 -            pte = 0; 
   10.61 -        
   10.62 +
   10.63          if(pte & _PAGE_PRESENT) { 
   10.64 -            
   10.65 -            pfn = pte >> PAGE_SHIFT; 
   10.66 +
   10.67 +            pfn = (pte >> PAGE_SHIFT) & 0xffffffff;
   10.68              
   10.69              if(pfn >= max_pfn) { 
   10.70                  ERR("Frame number in type %lu page table is out of range: "
   10.71 @@ -101,17 +85,16 @@ int uncanonicalize_pagetable(unsigned lo
   10.72              } 
   10.73              
   10.74              
   10.75 -            if(type == L1TAB) 
   10.76 -                pte &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PAT);
   10.77 -            else 
   10.78 -                pte &= (PAGE_SIZE - 1) & ~(_PAGE_GLOBAL | _PAGE_PSE);
   10.79 -            
   10.80 -            pte |= p2m[pfn] << PAGE_SHIFT;
   10.81 -            
   10.82 +            pte &= 0xffffff0000000fffULL;
   10.83 +            pte |= (uint64_t)p2m[pfn] << PAGE_SHIFT;
   10.84 +
   10.85              if(pt_levels == 2) 
   10.86                  ((uint32_t *)page)[i] = (uint32_t)pte; 
   10.87              else 
   10.88                  ((uint64_t *)page)[i] = (uint64_t)pte; 
   10.89 +
   10.90 +        
   10.91 +
   10.92          }
   10.93      }
   10.94      
   10.95 @@ -143,6 +126,9 @@ int xc_linux_restore(int xc_handle, int 
   10.96      /* A table of MFNs to map in the current region */
   10.97      unsigned long *region_mfn = NULL;
   10.98  
   10.99 +    /* Types of the pfns in the current region */
  10.100 +    unsigned long region_pfn_type[MAX_BATCH_SIZE];
  10.101 +
  10.102      /* A temporary mapping, and a copy, of one frame of guest memory. */
  10.103      unsigned long *page = NULL;
  10.104  
  10.105 @@ -233,11 +219,13 @@ int xc_linux_restore(int xc_handle, int 
  10.106      
  10.107      if(xc_domain_memory_increase_reservation(
  10.108             xc_handle, dom, max_pfn, 0, 0, NULL) != 0) { 
  10.109 -        ERR("Failed to increase reservation by %lx KB\n", max_pfn); 
  10.110 +        ERR("Failed to increase reservation by %lx KB\n", PFN_TO_KB(max_pfn));
  10.111          errno = ENOMEM;
  10.112          goto out;
  10.113      }
  10.114  
  10.115 +    DPRINTF("Increased domain reservation by %lx KB\n", PFN_TO_KB(max_pfn)); 
  10.116 +
  10.117      /* Build the pfn-to-mfn table. We choose MFN ordering returned by Xen. */
  10.118      if (xc_get_pfn_list(xc_handle, dom, p2m, max_pfn) != max_pfn) {
  10.119          ERR("Did not read correct number of frame numbers for new dom");
  10.120 @@ -249,6 +237,7 @@ int xc_linux_restore(int xc_handle, int 
  10.121          goto out;
  10.122      }
  10.123  
  10.124 +
  10.125      DPRINTF("Reloading memory pages:   0%%\n");
  10.126  
  10.127      /*
  10.128 @@ -261,7 +250,6 @@ int xc_linux_restore(int xc_handle, int 
  10.129      while (1) { 
  10.130  
  10.131          int j;
  10.132 -        unsigned long region_pfn_type[MAX_BATCH_SIZE];
  10.133  
  10.134          this_pc = (n * 100) / max_pfn;
  10.135          if ( (this_pc - prev_pc) >= 5 )
  10.136 @@ -322,7 +310,7 @@ int xc_linux_restore(int xc_handle, int 
  10.137              if (pagetype == XTAB) 
  10.138                  /* a bogus/unmapped page: skip it */
  10.139                  continue;
  10.140 -            
  10.141 +
  10.142              if (pfn > max_pfn) {
  10.143                  ERR("pfn out of range");
  10.144                  goto out;
  10.145 @@ -348,10 +336,20 @@ int xc_linux_restore(int xc_handle, int 
  10.146                  ** A page table page - need to 'uncanonicalize' it, i.e. 
  10.147                  ** replace all the references to pfns with the corresponding 
  10.148                  ** mfns for the new domain. 
  10.149 -                */ 
  10.150 -                if(!uncanonicalize_pagetable(pagetype, page))
  10.151 -                    goto out; 
  10.152 +                ** 
  10.153 +                ** On PAE we need to ensure that PGDs are in MFNs < 4G, and 
  10.154 +                ** so we may need to update the p2m after the main loop. 
  10.155 +                ** Hence we defer canonicalization of L1s until then. 
  10.156 +                */
  10.157 +                if(pt_levels != 3 || pagetype != L1TAB) { 
  10.158  
  10.159 +                    if(!uncanonicalize_pagetable(pagetype, page)) {
  10.160 +                        ERR("failed uncanonicalize pt!\n"); 
  10.161 +                        goto out; 
  10.162 +                    }
  10.163 +
  10.164 +                } 
  10.165 +                    
  10.166              } else if(pagetype != NOTAB) { 
  10.167  
  10.168                  ERR("Bogus page type %lx page table is out of range: "
  10.169 @@ -361,7 +359,6 @@ int xc_linux_restore(int xc_handle, int 
  10.170              } 
  10.171  
  10.172  
  10.173 -
  10.174              if (verify) {
  10.175  
  10.176                  int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
  10.177 @@ -386,9 +383,9 @@ int xc_linux_restore(int xc_handle, int 
  10.178              }
  10.179  
  10.180              if (xc_add_mmu_update(xc_handle, mmu, 
  10.181 -                                  (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
  10.182 -                                  pfn)) {
  10.183 -                ERR("machpys mfn=%ld pfn=%ld", mfn, pfn);
  10.184 +                                  (((unsigned long long)mfn) << PAGE_SHIFT) 
  10.185 +                                  | MMU_MACHPHYS_UPDATE, pfn)) {
  10.186 +                ERR("failed machpys update mfn=%lx pfn=%lx", mfn, pfn);
  10.187                  goto out;
  10.188              }
  10.189          } /* end of 'batch' for loop */
  10.190 @@ -399,14 +396,39 @@ int xc_linux_restore(int xc_handle, int 
  10.191  
  10.192      DPRINTF("Received all pages\n");
  10.193  
  10.194 -    if (pt_levels == 3) {
  10.195 +    if(pt_levels == 3) { 
  10.196  
  10.197 -        /* Get all PGDs below 4GB. */
  10.198 +        /* 
  10.199 +        ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This 
  10.200 +        ** is a little awkward and involves (a) finding all such PGDs and
  10.201 +        ** replacing them with 'lowmem' versions; (b) upating the p2m[] 
  10.202 +        ** with the new info; and (c) canonicalizing all the L1s using the
  10.203 +        ** (potentially updated) p2m[]. 
  10.204 +        ** 
  10.205 +        ** This is relatively slow (and currently involves two passes through
  10.206 +        ** the pfn_type[] array), but at least seems to be correct. May wish
  10.207 +        ** to consider more complex approaches to optimize this later. 
  10.208 +        */
  10.209 +
  10.210 +        int j, k; 
  10.211 +
  10.212 +        /* First pass: find all L3TABs currently in > 4G mfns and get new mfns */
  10.213          for (i = 0; i < max_pfn; i++) {
  10.214              
  10.215              if (((pfn_type[i] & LTABTYPE_MASK)==L3TAB) && (p2m[i]>0xfffffUL)) {
  10.216  
  10.217                  unsigned long new_mfn; 
  10.218 +                uint64_t l3ptes[4]; 
  10.219 +                uint64_t *l3tab; 
  10.220 +
  10.221 +                l3tab = (uint64_t *)
  10.222 +                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, 
  10.223 +                                         PROT_READ, p2m[i]); 
  10.224 +
  10.225 +                for(j = 0; j < 4; j++) 
  10.226 +                    l3ptes[j] = l3tab[j]; 
  10.227 +                
  10.228 +                munmap(l3tab, PAGE_SIZE); 
  10.229  
  10.230                  if (!(new_mfn=xc_make_page_below_4G(xc_handle, dom, p2m[i]))) {
  10.231                      ERR("Couldn't get a page below 4GB :-(");
  10.232 @@ -414,15 +436,58 @@ int xc_linux_restore(int xc_handle, int 
  10.233                  }
  10.234                  
  10.235                  p2m[i] = new_mfn;
  10.236 -                if (xc_add_mmu_update(
  10.237 -                        xc_handle, mmu, 
  10.238 -                        (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, i)) {
  10.239 +                if (xc_add_mmu_update(xc_handle, mmu, 
  10.240 +                                      (((unsigned long long)new_mfn) 
  10.241 +                                       << PAGE_SHIFT) | 
  10.242 +                                      MMU_MACHPHYS_UPDATE, i)) {
  10.243                      ERR("Couldn't m2p on PAE root pgdir");
  10.244                      goto out;
  10.245                  }
  10.246 +                
  10.247 +                l3tab = (uint64_t *)
  10.248 +                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, 
  10.249 +                                         PROT_READ | PROT_WRITE, p2m[i]); 
  10.250 +                
  10.251 +                for(j = 0; j < 4; j++) 
  10.252 +                    l3tab[j] = l3ptes[j]; 
  10.253 +                
  10.254 +                munmap(l3tab, PAGE_SIZE); 
  10.255 +                
  10.256              }
  10.257          }
  10.258 -        
  10.259 +
  10.260 +        /* Second pass: find all L1TABs and uncanonicalize them */
  10.261 +        j = 0; 
  10.262 +
  10.263 +        for(i = 0; i < max_pfn; i++) { 
  10.264 +            
  10.265 +            if (((pfn_type[i] & LTABTYPE_MASK)==L1TAB)) { 
  10.266 +                region_mfn[j] = p2m[i]; 
  10.267 +                j++; 
  10.268 +            }
  10.269 +
  10.270 +            if(i == (max_pfn-1) || j == MAX_BATCH_SIZE) { 
  10.271 +
  10.272 +                if (!(region_base = xc_map_foreign_batch(
  10.273 +                          xc_handle, dom, PROT_READ | PROT_WRITE, 
  10.274 +                          region_mfn, j))) {  
  10.275 +                    ERR("map batch failed");
  10.276 +                    goto out;
  10.277 +                }
  10.278 +
  10.279 +                for(k = 0; k < j; k++) {
  10.280 +                    if(!uncanonicalize_pagetable(L1TAB, 
  10.281 +                                                 region_base + k*PAGE_SIZE)) {
  10.282 +                        ERR("failed uncanonicalize pt!\n"); 
  10.283 +                        goto out; 
  10.284 +                    } 
  10.285 +                }
  10.286 +                
  10.287 +                munmap(region_base, j*PAGE_SIZE); 
  10.288 +                j = 0; 
  10.289 +            }
  10.290 +        }
  10.291 +
  10.292      }
  10.293  
  10.294  
  10.295 @@ -431,6 +496,7 @@ int xc_linux_restore(int xc_handle, int 
  10.296          goto out;
  10.297      } 
  10.298  
  10.299 +
  10.300      /*
  10.301       * Pin page tables. Do this after writing to them as otherwise Xen
  10.302       * will barf when doing the type-checking.
  10.303 @@ -439,7 +505,7 @@ int xc_linux_restore(int xc_handle, int 
  10.304  
  10.305          if ( (pfn_type[i] & LPINTAB) == 0 )
  10.306              continue;
  10.307 -        
  10.308 +
  10.309          switch(pfn_type[i]) { 
  10.310  
  10.311          case (L1TAB|LPINTAB): 
  10.312 @@ -463,24 +529,17 @@ int xc_linux_restore(int xc_handle, int 
  10.313          }
  10.314  
  10.315          pin[nr_pins].arg1.mfn = p2m[i];
  10.316 +
  10.317 +        nr_pins ++; 
  10.318          
  10.319 -        if (++nr_pins == MAX_PIN_BATCH) {
  10.320 +        if (i == (max_pfn-1) || nr_pins == MAX_PIN_BATCH) {
  10.321              if (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) { 
  10.322                  ERR("Failed to pin batch of %d page tables", nr_pins); 
  10.323                  goto out;
  10.324              } 
  10.325 -            DPRINTF("successfully pinned batch of %d page tables", nr_pins); 
  10.326              nr_pins = 0;
  10.327          }
  10.328      }
  10.329 -    
  10.330 -    if (nr_pins != 0) { 
  10.331 -        if((rc = xc_mmuext_op(xc_handle, pin, nr_pins, dom)) < 0) { 
  10.332 -            ERR("Failed (2) to pin batch of %d page tables", nr_pins); 
  10.333 -            DPRINTF("rc is %d\n", rc); 
  10.334 -            goto out;
  10.335 -        }
  10.336 -    }
  10.337  
  10.338      DPRINTF("\b\b\b\b100%%\n");
  10.339      DPRINTF("Memory reloaded.\n");
  10.340 @@ -579,23 +638,20 @@ int xc_linux_restore(int xc_handle, int 
  10.341      pfn = ctxt.ctrlreg[3] >> PAGE_SHIFT;
  10.342  
  10.343      if (pfn >= max_pfn) {
  10.344 -        DPRINTF("PT base is bad: pfn=%lu max_pfn=%lu type=%08lx\n",
  10.345 -                pfn, max_pfn, pfn_type[pfn]); 
  10.346 -        ERR("PT base is bad.");
  10.347 +        ERR("PT base is bad: pfn=%lu max_pfn=%lu type=%08lx",
  10.348 +            pfn, max_pfn, pfn_type[pfn]); 
  10.349          goto out;
  10.350      }
  10.351  
  10.352      if ((pt_levels == 2) && ((pfn_type[pfn]&LTABTYPE_MASK) != L2TAB)) { 
  10.353 -        DPRINTF("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
  10.354 -                pfn, max_pfn, pfn_type[pfn], (unsigned long)L2TAB);
  10.355 -        ERR("PT base is bad.");
  10.356 +        ERR("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
  10.357 +            pfn, max_pfn, pfn_type[pfn], (unsigned long)L2TAB);
  10.358          goto out;
  10.359      }
  10.360  
  10.361      if ((pt_levels == 3) && ((pfn_type[pfn]&LTABTYPE_MASK) != L3TAB)) { 
  10.362 -        DPRINTF("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx\n",
  10.363 -                pfn, max_pfn, pfn_type[pfn], (unsigned long)L3TAB);
  10.364 -        ERR("PT base is bad.");
  10.365 +        ERR("PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx",
  10.366 +            pfn, max_pfn, pfn_type[pfn], (unsigned long)L3TAB);
  10.367          goto out;
  10.368      }
  10.369      
    11.1 --- a/tools/libxc/xc_linux_save.c	Wed Nov 16 20:33:12 2005 +0100
    11.2 +++ b/tools/libxc/xc_linux_save.c	Wed Nov 16 20:33:23 2005 +0100
    11.3 @@ -27,13 +27,13 @@
    11.4  
    11.5  
    11.6  /* max mfn of the whole machine */
    11.7 -static uint32_t max_mfn; 
    11.8 +static unsigned long max_mfn; 
    11.9  
   11.10  /* virtual starting address of the hypervisor */
   11.11 -static uint32_t hvirt_start; 
   11.12 +static unsigned long hvirt_start; 
   11.13  
   11.14  /* #levels of page tables used by the currrent guest */
   11.15 -static uint32_t pt_levels; 
   11.16 +static unsigned int pt_levels; 
   11.17  
   11.18  /* total number of pages used by the current guest */
   11.19  static unsigned long max_pfn;
   11.20 @@ -73,7 +73,7 @@ static unsigned long *live_m2p = NULL;
   11.21  */
   11.22  
   11.23  #define BITS_PER_LONG (sizeof(unsigned long) * 8) 
   11.24 -#define BITMAP_SIZE   ((max_pfn + BITS_PER_LONG - 1) / BITS_PER_LONG)
   11.25 +#define BITMAP_SIZE   ((max_pfn + BITS_PER_LONG - 1) / 8)
   11.26  
   11.27  #define BITMAP_ENTRY(_nr,_bmap) \
   11.28     ((unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
   11.29 @@ -500,6 +500,70 @@ void canonicalize_pagetable(unsigned lon
   11.30  
   11.31  
   11.32  
   11.33 +static unsigned long *xc_map_m2p(int xc_handle, 
   11.34 +                                 unsigned long max_mfn, 
   11.35 +                                 int prot) 
   11.36 +{ 
   11.37 +    privcmd_m2pmfns_t m2p_mfns; 
   11.38 +    privcmd_mmap_t ioctlx; 
   11.39 +    privcmd_mmap_entry_t *entries; 
   11.40 +    unsigned long m2p_chunks, m2p_size; 
   11.41 +    unsigned long *m2p; 
   11.42 +    int i, rc; 
   11.43 +
   11.44 +    m2p_size   = M2P_SIZE(max_mfn); 
   11.45 +    m2p_chunks = M2P_CHUNKS(max_mfn); 
   11.46 +
   11.47 +
   11.48 +    m2p_mfns.num = m2p_chunks; 
   11.49 +
   11.50 +    if(!(m2p_mfns.arr = malloc(m2p_chunks * sizeof(unsigned long)))) { 
   11.51 +        ERR("failed to allocate space for m2p mfns!\n"); 
   11.52 +        return NULL; 
   11.53 +    } 
   11.54 +
   11.55 +    if (ioctl(xc_handle, IOCTL_PRIVCMD_GET_MACH2PHYS_MFNS, &m2p_mfns) < 0) {
   11.56 +        ERR("xc_get_m2p_mfns:"); 
   11.57 +        return NULL;
   11.58 +    }
   11.59 +
   11.60 +    if((m2p = mmap(NULL, m2p_size, prot, 
   11.61 +                   MAP_SHARED, xc_handle, 0)) == MAP_FAILED) {
   11.62 +        ERR("failed to mmap m2p"); 
   11.63 +        return NULL; 
   11.64 +    } 
   11.65 +    
   11.66 +
   11.67 +    if(!(entries = malloc(m2p_chunks * sizeof(privcmd_mmap_entry_t)))) { 
   11.68 +        ERR("failed to allocate space for mmap entries!\n"); 
   11.69 +        return NULL; 
   11.70 +    } 
   11.71 +
   11.72 +
   11.73 +    ioctlx.num   = m2p_chunks;
   11.74 +    ioctlx.dom   = DOMID_XEN; 
   11.75 +    ioctlx.entry = entries; 
   11.76 +    
   11.77 +    for(i=0; i < m2p_chunks; i++) { 
   11.78 +        
   11.79 +        entries[i].va = (unsigned long)(((void *)m2p) + (i * M2P_CHUNK_SIZE)); 
   11.80 +        entries[i].mfn = m2p_mfns.arr[i]; 
   11.81 +        entries[i].npages = M2P_CHUNK_SIZE >> PAGE_SHIFT;
   11.82 +
   11.83 +    }
   11.84 +
   11.85 +    if((rc = ioctl(xc_handle, IOCTL_PRIVCMD_MMAP, &ioctlx)) < 0) {
   11.86 +        ERR("ioctl_mmap failed (rc = %d)", rc); 
   11.87 +        return NULL; 
   11.88 +    }
   11.89 +        
   11.90 +    free(m2p_mfns.arr); 
   11.91 +    free(entries); 
   11.92 +
   11.93 +    return m2p; 
   11.94 +}
   11.95 +
   11.96 +
   11.97  
   11.98  int xc_linux_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, 
   11.99                    uint32_t max_factor, uint32_t flags)
  11.100 @@ -531,16 +595,12 @@ int xc_linux_save(int xc_handle, int io_
  11.101      /* A copy of the pfn-to-mfn table frame list. */
  11.102      unsigned long *p2m_frame_list = NULL;
  11.103  
  11.104 -    unsigned long m2p_start_mfn;
  11.105 -    
  11.106      /* Live mapping of shared info structure */
  11.107      shared_info_t *live_shinfo = NULL;
  11.108  
  11.109      /* base of the region in which domain memory is mapped */
  11.110      unsigned char *region_base = NULL;
  11.111  
  11.112 -
  11.113 -    
  11.114      /* power of 2 order of max_pfn */
  11.115      int order_nr; 
  11.116  
  11.117 @@ -564,9 +624,6 @@ int xc_linux_save(int xc_handle, int io_
  11.118      
  11.119      initialize_mbit_rate(); 
  11.120  
  11.121 -    DPRINTF("xc_linux_save start DOM%u live=%s\n", dom, live ? 
  11.122 -            "true" : "false"); 
  11.123 -
  11.124      if(!get_platform_info(xc_handle, dom, 
  11.125                            &max_mfn, &hvirt_start, &pt_levels)) {
  11.126          ERR("Unable to get platform info."); 
  11.127 @@ -647,11 +704,13 @@ int xc_linux_save(int xc_handle, int io_
  11.128      }
  11.129  
  11.130      /* Setup the mfn_to_pfn table mapping */
  11.131 -    m2p_start_mfn = xc_get_m2p_start_mfn(xc_handle);
  11.132 -    live_m2p      = xc_map_foreign_range(xc_handle, DOMID_XEN, M2P_SIZE, 
  11.133 -                                         PROT_READ, m2p_start_mfn);
  11.134 +    if(!(live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ))) { 
  11.135 +        ERR("Failed to map live M2P table"); 
  11.136 +        goto out; 
  11.137 +    } 
  11.138 +
  11.139      
  11.140 -    /* Get a local copy fo the live_P2M_frame_list */
  11.141 +    /* Get a local copy of the live_P2M_frame_list */
  11.142      if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) { 
  11.143          ERR("Couldn't allocate p2m_frame_list array");
  11.144          goto out;
  11.145 @@ -662,6 +721,8 @@ int xc_linux_save(int xc_handle, int io_
  11.146      for (i = 0; i < max_pfn; i += ulpp) {
  11.147          if (!translate_mfn_to_pfn(&p2m_frame_list[i/ulpp])) { 
  11.148              ERR("Frame# in pfn-to-mfn frame list is not in pseudophys");
  11.149 +            ERR("entry %d: p2m_frame_list[%ld] is 0x%lx", i, i/ulpp, 
  11.150 +                p2m_frame_list[i/ulpp]); 
  11.151              goto out;
  11.152          }
  11.153      }
  11.154 @@ -693,11 +754,8 @@ int xc_linux_save(int xc_handle, int io_
  11.155          
  11.156      }
  11.157  
  11.158 -#if 0
  11.159 -    sent_last_iter = 0xFFFFFFFF; /* Pretend we sent a /lot/ last time */
  11.160 -#else
  11.161 -    sent_last_iter = 1 << 20; 
  11.162 -#endif
  11.163 +    /* pretend we sent all the pages last iteration */
  11.164 +    sent_last_iter = max_pfn; 
  11.165  
  11.166  
  11.167      /* calculate the power of 2 order of max_pfn, e.g.
  11.168 @@ -705,9 +763,6 @@ int xc_linux_save(int xc_handle, int io_
  11.169      for (i = max_pfn-1, order_nr = 0; i ; i >>= 1, order_nr++)
  11.170          continue;
  11.171  
  11.172 -#undef BITMAP_SIZE
  11.173 -#define BITMAP_SIZE ((1<<20)/8) 
  11.174 -
  11.175      /* Setup to_send / to_fix and to_skip bitmaps */
  11.176      to_send = malloc(BITMAP_SIZE); 
  11.177      to_fix  = calloc(1, BITMAP_SIZE); 
  11.178 @@ -922,10 +977,8 @@ int xc_linux_save(int xc_handle, int io_
  11.179  
  11.180  
  11.181                  /* write out pages in batch */
  11.182 -                if (pagetype == XTAB) {
  11.183 -                    DPRINTF("SKIP BOGUS page %i mfn %08lx\n", j, pfn_type[j]);
  11.184 +                if (pagetype == XTAB)
  11.185                      continue;
  11.186 -                }
  11.187  
  11.188                  pagetype &= LTABTYPE_MASK; 
  11.189                  
  11.190 @@ -950,11 +1003,11 @@ int xc_linux_save(int xc_handle, int io_
  11.191              } /* end of the write out for this batch */
  11.192              
  11.193              sent_this_iter += batch;
  11.194 -            
  11.195 +
  11.196 +            munmap(region_base, batch*PAGE_SIZE);
  11.197 +        
  11.198          } /* end of this while loop for this iteration */
  11.199          
  11.200 -        munmap(region_base, batch*PAGE_SIZE);
  11.201 -        
  11.202        skip: 
  11.203          
  11.204          total_sent += sent_this_iter;
  11.205 @@ -1027,13 +1080,9 @@ int xc_linux_save(int xc_handle, int io_
  11.206  
  11.207      DPRINTF("All memory is saved\n");
  11.208  
  11.209 -    /* Success! */
  11.210 -    rc = 0;
  11.211 -    
  11.212 -    /* ^^^^^^ XXX SMH: hmm.. not sure that's really success! */
  11.213 -    
  11.214      /* Zero terminate */
  11.215 -    if (!write_exact(io_fd, &rc, sizeof(int))) { 
  11.216 +    i = 0; 
  11.217 +    if (!write_exact(io_fd, &i, sizeof(int))) { 
  11.218          ERR("Error when writing to state file (6)");
  11.219          goto out;
  11.220      }
  11.221 @@ -1043,17 +1092,17 @@ int xc_linux_save(int xc_handle, int io_
  11.222          unsigned int i,j;
  11.223          unsigned long pfntab[1024]; 
  11.224  
  11.225 -        for ( i = 0, j = 0; i < max_pfn; i++ ) {
  11.226 -            if ( ! is_mapped(live_p2m[i]) )
  11.227 +        for (i = 0, j = 0; i < max_pfn; i++) {
  11.228 +            if (!is_mapped(live_p2m[i]))
  11.229                  j++;
  11.230          }
  11.231 -
  11.232 +        
  11.233          if(!write_exact(io_fd, &j, sizeof(unsigned int))) { 
  11.234              ERR("Error when writing to state file (6a)");
  11.235              goto out;
  11.236          }	
  11.237          
  11.238 -        for ( i = 0, j = 0; i < max_pfn; ) {
  11.239 +        for (i = 0, j = 0; i < max_pfn; ) {
  11.240  
  11.241              if (!is_mapped(live_p2m[i]))
  11.242                  pfntab[j++] = i;
  11.243 @@ -1097,7 +1146,10 @@ int xc_linux_save(int xc_handle, int io_
  11.244          ERR("Error when writing to state file (1)");
  11.245          goto out;
  11.246      }
  11.247 -    
  11.248 +
  11.249 +    /* Success! */
  11.250 +    rc = 0;
  11.251 +
  11.252   out:
  11.253  
  11.254      if (live_shinfo)
  11.255 @@ -1110,7 +1162,7 @@ int xc_linux_save(int xc_handle, int io_
  11.256          munmap(live_p2m, P2M_SIZE); 
  11.257  
  11.258      if(live_m2p) 
  11.259 -        munmap(live_m2p, M2P_SIZE); 
  11.260 +        munmap(live_m2p, M2P_SIZE(max_mfn)); 
  11.261  
  11.262      free(pfn_type);
  11.263      free(pfn_batch);
    12.1 --- a/tools/libxc/xc_private.c	Wed Nov 16 20:33:12 2005 +0100
    12.2 +++ b/tools/libxc/xc_private.c	Wed Nov 16 20:33:23 2005 +0100
    12.3 @@ -260,18 +260,6 @@ long long xc_domain_get_cpu_usage( int x
    12.4  }
    12.5  
    12.6  
    12.7 -unsigned long xc_get_m2p_start_mfn ( int xc_handle )
    12.8 -{
    12.9 -    unsigned long mfn;
   12.10 -
   12.11 -    if ( ioctl( xc_handle, IOCTL_PRIVCMD_GET_MACH2PHYS_START_MFN, &mfn ) < 0 )
   12.12 -    {
   12.13 -        perror("xc_get_m2p_start_mfn:");
   12.14 -        return 0;
   12.15 -    }
   12.16 -    return mfn;
   12.17 -}
   12.18 -
   12.19  int xc_get_pfn_list(int xc_handle,
   12.20                      uint32_t domid, 
   12.21                      unsigned long *pfn_buf, 
    13.1 --- a/tools/libxc/xg_private.h	Wed Nov 16 20:33:12 2005 +0100
    13.2 +++ b/tools/libxc/xg_private.h	Wed Nov 16 20:33:23 2005 +0100
    13.3 @@ -153,8 +153,6 @@ typedef struct mfn_mapper {
    13.4      
    13.5  } mfn_mapper_t;
    13.6  
    13.7 -unsigned long xc_get_m2p_start_mfn (int xc_handle);
    13.8 -
    13.9  int xc_copy_to_domain_page(int xc_handle, uint32_t domid,
   13.10                              unsigned long dst_pfn, void *src_page);
   13.11  
    14.1 --- a/tools/libxc/xg_save_restore.h	Wed Nov 16 20:33:12 2005 +0100
    14.2 +++ b/tools/libxc/xg_save_restore.h	Wed Nov 16 20:33:23 2005 +0100
    14.3 @@ -4,6 +4,8 @@
    14.4  ** Defintions and utilities for save / restore. 
    14.5  */
    14.6  
    14.7 +#include "xc_private.h"
    14.8 +
    14.9  #define DEBUG    1
   14.10  #define PROGRESS 0
   14.11  
   14.12 @@ -55,25 +57,24 @@ while (0)
   14.13  ** Returns 1 on success, 0 on failure. 
   14.14  */
   14.15  static int get_platform_info(int xc_handle, uint32_t dom, 
   14.16 -                             /* OUT */ uint32_t *max_mfn,  
   14.17 -                             /* OUT */ uint32_t *hvirt_start, 
   14.18 -                             /* OUT */ uint32_t *pt_levels)
   14.19 +                             /* OUT */ unsigned long *max_mfn,  
   14.20 +                             /* OUT */ unsigned long *hvirt_start, 
   14.21 +                             /* OUT */ unsigned int *pt_levels)
   14.22      
   14.23  { 
   14.24      xen_capabilities_info_t xen_caps = "";
   14.25      xen_platform_parameters_t xen_params;
   14.26 -    xc_physinfo_t physinfo;
   14.27      
   14.28 -    if (xc_physinfo(xc_handle, &physinfo) != 0) 
   14.29 -        return 0;
   14.30 -    
   14.31 +
   14.32      if (xc_version(xc_handle, XENVER_platform_parameters, &xen_params) != 0)
   14.33          return 0;
   14.34      
   14.35      if (xc_version(xc_handle, XENVER_capabilities, &xen_caps) != 0)
   14.36          return 0;
   14.37  
   14.38 -    *max_mfn =     physinfo.total_pages;
   14.39 +    if (xc_memory_op(xc_handle, XENMEM_maximum_ram_page, max_mfn) != 0)
   14.40 +        return 0; 
   14.41 +    
   14.42      *hvirt_start = xen_params.virt_start;
   14.43  
   14.44      if (strstr(xen_caps, "xen-3.0-x86_64"))
   14.45 @@ -95,13 +96,22 @@ static int get_platform_info(int xc_hand
   14.46  ** entry tell us whether or not the the PFN is currently mapped.
   14.47  */
   14.48  
   14.49 -#define PFN_TO_KB(_pfn) ((_pfn) * PAGE_SIZE / 1024)
   14.50 +#define PFN_TO_KB(_pfn) ((_pfn) << (PAGE_SHIFT - 10))
   14.51  #define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1))
   14.52  
   14.53 -/* Size in bytes of the M2P and P2M (both rounded up to nearest PAGE_SIZE) */
   14.54 -#define M2P_SIZE ROUNDUP((max_mfn * sizeof(unsigned long)), PAGE_SHIFT) 
   14.55 -#define P2M_SIZE ROUNDUP((max_pfn * sizeof(unsigned long)), PAGE_SHIFT) 
   14.56  
   14.57 +/* 
   14.58 +** The M2P is made up of some number of 'chunks' of at least 2MB in size. 
   14.59 +** The below definitions and utility function(s) deal with mapping the M2P 
   14.60 +** regardless of the underlying machine memory size or architecture. 
   14.61 +*/
   14.62 +#define M2P_SHIFT       L2_PAGETABLE_SHIFT_PAE 
   14.63 +#define M2P_CHUNK_SIZE  (1 << M2P_SHIFT) 
   14.64 +#define M2P_SIZE(_m)    ROUNDUP(((_m) * sizeof(unsigned long)), M2P_SHIFT) 
   14.65 +#define M2P_CHUNKS(_m)  (M2P_SIZE((_m)) >> M2P_SHIFT)
   14.66 +
   14.67 +/* Size in bytes of the P2M (rounded up to the nearest PAGE_SIZE bytes) */
   14.68 +#define P2M_SIZE        ROUNDUP((max_pfn * sizeof(unsigned long)), PAGE_SHIFT) 
   14.69  
   14.70  /* Number of unsigned longs in a page */
   14.71  #define ulpp            (PAGE_SIZE/sizeof(unsigned long))
    15.1 --- a/tools/python/xen/xend/XendCheckpoint.py	Wed Nov 16 20:33:12 2005 +0100
    15.2 +++ b/tools/python/xen/xend/XendCheckpoint.py	Wed Nov 16 20:33:23 2005 +0100
    15.3 @@ -129,7 +129,7 @@ def restore(xd, fd):
    15.4          l = read_exact(fd, sizeof_unsigned_long,
    15.5                         "not a valid guest state file: pfn count read")
    15.6          nr_pfns = unpack("=L", l)[0]   # XXX endianess
    15.7 -        if nr_pfns > 1024*1024:     # XXX
    15.8 +        if nr_pfns > 16*1024*1024:     # XXX 
    15.9              raise XendError(
   15.10                  "not a valid guest state file: pfn count out of range")
   15.11  
    16.1 --- a/tools/python/xen/xend/XendDomain.py	Wed Nov 16 20:33:12 2005 +0100
    16.2 +++ b/tools/python/xen/xend/XendDomain.py	Wed Nov 16 20:33:23 2005 +0100
    16.3 @@ -63,14 +63,19 @@ class XendDomain:
    16.4          self.domains = {}
    16.5          self.domains_lock = threading.RLock()
    16.6  
    16.7 -        xswatch("@releaseDomain", self.onReleaseDomain)
    16.8 -
    16.9          self.domains_lock.acquire()
   16.10          try:
   16.11              self._add_domain(
   16.12                  XendDomainInfo.recreate(self.xen_domains()[PRIV_DOMAIN],
   16.13                                          True))
   16.14              self.dom0_setup()
   16.15 +
   16.16 +            # This watch registration needs to be before the refresh call, so
   16.17 +            # that we're sure that we haven't missed any releases, but inside
   16.18 +            # the domains_lock, as we don't want the watch to fire until after
   16.19 +            # the refresh call has completed.
   16.20 +            xswatch("@releaseDomain", self.onReleaseDomain)
   16.21 +            
   16.22              self.refresh(True)
   16.23          finally:
   16.24              self.domains_lock.release()
    17.1 --- a/tools/python/xen/xend/XendDomainInfo.py	Wed Nov 16 20:33:12 2005 +0100
    17.2 +++ b/tools/python/xen/xend/XendDomainInfo.py	Wed Nov 16 20:33:23 2005 +0100
    17.3 @@ -45,6 +45,8 @@ import uuid
    17.4  
    17.5  from xen.xend.xenstore.xstransact import xstransact
    17.6  from xen.xend.xenstore.xsutil import GetDomainPath, IntroduceDomain
    17.7 +from xen.xend.xenstore.xswatch import xswatch
    17.8 +
    17.9  
   17.10  """Shutdown code for poweroff."""
   17.11  DOMAIN_POWEROFF = 0
   17.12 @@ -82,7 +84,6 @@ STATE_DOM_SHUTDOWN = 2
   17.13  
   17.14  SHUTDOWN_TIMEOUT = 30
   17.15  
   17.16 -DOMROOT = '/local/domain/'
   17.17  VMROOT  = '/vm/'
   17.18  
   17.19  ZOMBIE_PREFIX = 'Zombie-'
   17.20 @@ -100,27 +101,53 @@ log = logging.getLogger("xend.XendDomain
   17.21  #log.setLevel(logging.TRACE)
   17.22  
   17.23  
   17.24 -## Configuration entries that we expect to round-trip -- be read from the
   17.25 +##
   17.26 +# All parameters of VMs that may be configured on-the-fly, or at start-up.
   17.27 +# 
   17.28 +VM_CONFIG_PARAMS = [
   17.29 +    ('name',        str),
   17.30 +    ('on_poweroff', str),
   17.31 +    ('on_reboot',   str),
   17.32 +    ('on_crash',    str),
   17.33 +    ]
   17.34 +
   17.35 +
   17.36 +##
   17.37 +# Configuration entries that we expect to round-trip -- be read from the
   17.38  # config file or xc, written to save-files (i.e. through sxpr), and reused as
   17.39  # config on restart or restore, all without munging.  Some configuration
   17.40  # entries are munged for backwards compatibility reasons, or because they
   17.41  # don't come out of xc in the same form as they are specified in the config
   17.42  # file, so those are handled separately.
   17.43  ROUNDTRIPPING_CONFIG_ENTRIES = [
   17.44 -        ('name',         str),
   17.45 -        ('uuid',         str),
   17.46 -        ('ssidref',      int),
   17.47 -        ('vcpus',        int),
   17.48 -        ('vcpu_avail',   int),
   17.49 -        ('cpu_weight',   float),
   17.50 -        ('memory',       int),
   17.51 -        ('maxmem',       int),
   17.52 -        ('bootloader',   str),
   17.53 -        ('on_poweroff',  str),
   17.54 -        ('on_reboot',    str),
   17.55 -        ('on_crash',     str)
   17.56 +    ('uuid',       str),
   17.57 +    ('ssidref',    int),
   17.58 +    ('vcpus',      int),
   17.59 +    ('vcpu_avail', int),
   17.60 +    ('cpu_weight', float),
   17.61 +    ('memory',     int),
   17.62 +    ('maxmem',     int),
   17.63 +    ('bootloader', str),
   17.64      ]
   17.65  
   17.66 +ROUNDTRIPPING_CONFIG_ENTRIES += VM_CONFIG_PARAMS
   17.67 +
   17.68 +
   17.69 +##
   17.70 +# All entries written to the store.  This is VM_CONFIG_PARAMS, plus
   17.71 +# those entries written to the store that cannot be reconfigured on-the-fly.
   17.72 +#
   17.73 +VM_STORE_ENTRIES = [
   17.74 +    ('uuid',       str),
   17.75 +    ('ssidref',    int),
   17.76 +    ('vcpus',      int),
   17.77 +    ('vcpu_avail', int),
   17.78 +    ('memory',     int),
   17.79 +    ('maxmem',     int),
   17.80 +    ]
   17.81 +
   17.82 +VM_STORE_ENTRIES += VM_CONFIG_PARAMS
   17.83 +
   17.84  
   17.85  #
   17.86  # There are a number of CPU-related fields:
   17.87 @@ -156,6 +183,7 @@ def create(config):
   17.88          vm.initDomain()
   17.89          vm.storeVmDetails()
   17.90          vm.storeDomDetails()
   17.91 +        vm.registerWatch()
   17.92          vm.refreshShutdown()
   17.93          return vm
   17.94      except:
   17.95 @@ -211,6 +239,7 @@ def recreate(xeninfo, priv):
   17.96          vm.storeVmDetails()
   17.97          vm.storeDomDetails()
   17.98  
   17.99 +    vm.registerWatch()
  17.100      vm.refreshShutdown(xeninfo)
  17.101      return vm
  17.102  
  17.103 @@ -371,6 +400,8 @@ class XendDomainInfo:
  17.104          self.console_port = None
  17.105          self.console_mfn = None
  17.106  
  17.107 +        self.vmWatch = None
  17.108 +
  17.109          self.state = STATE_DOM_OK
  17.110          self.state_updated = threading.Condition()
  17.111          self.refresh_shutdown_lock = threading.Condition()
  17.112 @@ -378,6 +409,42 @@ class XendDomainInfo:
  17.113  
  17.114      ## private:
  17.115  
  17.116 +    def readVMDetails(self, params):
  17.117 +        """Read from the store all of those entries listed in params.
  17.118 +        """
  17.119 +        try:
  17.120 +            return self.gatherVm(*params)
  17.121 +        except ValueError:
  17.122 +            # One of the int/float entries in params has a corresponding store
  17.123 +            # entry that is invalid.  We recover, because older versions of
  17.124 +            # Xend may have put the entry there (memory/target, for example),
  17.125 +            # but this is in general a bad situation to have reached.
  17.126 +            log.exception(
  17.127 +                "Store corrupted at %s!  Domain %d's configuration may be "
  17.128 +                "affected.", self.vmpath, self.domid)
  17.129 +            return []
  17.130 +
  17.131 +
  17.132 +    def storeChanged(self):
  17.133 +        log.debug("XendDomainInfo.storeChanged");
  17.134 +
  17.135 +        changed = False
  17.136 +        
  17.137 +        def f(x, y):
  17.138 +            if y is not None and self.info[x[0]] != y:
  17.139 +                self.info[x[0]] = y
  17.140 +                changed = True
  17.141 +
  17.142 +        map(f, VM_CONFIG_PARAMS, self.readVMDetails(VM_CONFIG_PARAMS))
  17.143 +
  17.144 +        if changed:
  17.145 +            # Update the domain section of the store, as this contains some
  17.146 +            # parameters derived from the VM configuration.
  17.147 +            self.storeDomDetails()
  17.148 +
  17.149 +        return 1
  17.150 +
  17.151 +
  17.152      def augmentInfo(self):
  17.153          """Augment self.info, as given to us through {@link #recreate}, with
  17.154          values taken from the store.  This recovers those values known to xend
  17.155 @@ -387,30 +454,8 @@ class XendDomainInfo:
  17.156              if not self.infoIsSet(name) and val is not None:
  17.157                  self.info[name] = val
  17.158  
  17.159 -        params = (("name", str),
  17.160 -                  ("on_poweroff",  str),
  17.161 -                  ("on_reboot",    str),
  17.162 -                  ("on_crash",     str),
  17.163 -                  ("image",        str),
  17.164 -                  ("memory",       int),
  17.165 -                  ("maxmem",       int),
  17.166 -                  ("vcpus",        int),
  17.167 -                  ("vcpu_avail",   int),
  17.168 -                  ("start_time", float))
  17.169 -
  17.170 -        try:
  17.171 -            from_store = self.gatherVm(*params)
  17.172 -        except ValueError, exn:
  17.173 -            # One of the int/float entries in params has a corresponding store
  17.174 -            # entry that is invalid.  We recover, because older versions of
  17.175 -            # Xend may have put the entry there (memory/target, for example),
  17.176 -            # but this is in general a bad situation to have reached.
  17.177 -            log.exception(
  17.178 -                "Store corrupted at %s!  Domain %d's configuration may be "
  17.179 -                "affected.", self.vmpath, self.domid)
  17.180 -            return
  17.181 -
  17.182 -        map(lambda x, y: useIfNeeded(x[0], y), params, from_store)
  17.183 +        map(lambda x, y: useIfNeeded(x[0], y), VM_STORE_ENTRIES,
  17.184 +            self.readVMDetails(VM_STORE_ENTRIES))
  17.185  
  17.186          device = []
  17.187          for c in controllerClasses:
  17.188 @@ -536,23 +581,24 @@ class XendDomainInfo:
  17.189  
  17.190          self.introduceDomain()
  17.191          self.storeDomDetails()
  17.192 +        self.registerWatch()
  17.193          self.refreshShutdown()
  17.194  
  17.195          log.debug("XendDomainInfo.completeRestore done")
  17.196  
  17.197  
  17.198      def storeVmDetails(self):
  17.199 -        to_store = {
  17.200 -            'uuid':               self.info['uuid']
  17.201 -            }
  17.202 +        to_store = {}
  17.203 +
  17.204 +        for k in VM_STORE_ENTRIES:
  17.205 +            if self.infoIsSet(k[0]):
  17.206 +                to_store[k[0]] = str(self.info[k[0]])
  17.207  
  17.208          if self.infoIsSet('image'):
  17.209              to_store['image'] = sxp.to_string(self.info['image'])
  17.210  
  17.211 -        for k in ['name', 'ssidref', 'memory', 'maxmem', 'on_poweroff',
  17.212 -                  'on_reboot', 'on_crash', 'vcpus', 'vcpu_avail']:
  17.213 -            if self.infoIsSet(k):
  17.214 -                to_store[k] = str(self.info[k])
  17.215 +        if self.infoIsSet('start_time'):
  17.216 +            to_store['start_time'] = str(self.info['start_time'])
  17.217  
  17.218          log.debug("Storing VM details: %s", to_store)
  17.219  
  17.220 @@ -599,13 +645,16 @@ class XendDomainInfo:
  17.221          return result
  17.222  
  17.223  
  17.224 -    def setDomid(self, domid):
  17.225 -        """Set the domain id.
  17.226 +    ## public:
  17.227  
  17.228 -        @param dom: domain id
  17.229 -        """
  17.230 -        self.domid = domid
  17.231 -        self.storeDom("domid", self.domid)
  17.232 +    def registerWatch(self):
  17.233 +        """Register a watch on this VM's entries in the store, so that
  17.234 +        when they are changed externally, we keep up to date.  This should
  17.235 +        only be called by {@link #create}, {@link #recreate}, or {@link
  17.236 +        #restore}, once the domain's details have been written, but before the
  17.237 +        new instance is returned."""
  17.238 +        self.vmWatch = xswatch(self.vmpath, self.storeChanged)
  17.239 +
  17.240  
  17.241      def getDomid(self):
  17.242          return self.domid
  17.243 @@ -1116,6 +1165,13 @@ class XendDomainInfo:
  17.244          """Cleanup VM resources.  Idempotent.  Nothrow guarantee."""
  17.245  
  17.246          try:
  17.247 +            try:
  17.248 +                if self.vmWatch:
  17.249 +                    self.vmWatch.unwatch()
  17.250 +                self.vmWatch = None
  17.251 +            except:
  17.252 +                log.exception("Unwatching VM path failed.")
  17.253 +
  17.254              self.removeVm()
  17.255          except:
  17.256              log.exception("Removing VM path failed.")
    18.1 --- a/tools/python/xen/xend/xenstore/xswatch.py	Wed Nov 16 20:33:12 2005 +0100
    18.2 +++ b/tools/python/xen/xend/xenstore/xswatch.py	Wed Nov 16 20:33:23 2005 +0100
    18.3 @@ -22,6 +22,10 @@ class xswatch:
    18.4          xs.watch(path, self)
    18.5  
    18.6  
    18.7 +    def unwatch(self):
    18.8 +        xs.unwatch(self.path, self)
    18.9 +
   18.10 +
   18.11  watchThread = None
   18.12  xs = None
   18.13  xslock = threading.Lock()
   18.14 @@ -49,7 +53,7 @@ def watchMain():
   18.15              watch = we[1]
   18.16              res = watch.fn(*watch.args, **watch.kwargs)
   18.17              if not res:
   18.18 -                xs.unwatch(watch.path, watch)
   18.19 +                watch.unwatch()
   18.20          except:
   18.21              log.exception("read_watch failed")
   18.22              # Ignore this exception -- there's no point throwing it
    19.1 --- a/xen/arch/x86/mm.c	Wed Nov 16 20:33:12 2005 +0100
    19.2 +++ b/xen/arch/x86/mm.c	Wed Nov 16 20:33:23 2005 +0100
    19.3 @@ -898,6 +898,7 @@ static int alloc_l3_table(struct pfn_inf
    19.4      return 1;
    19.5  
    19.6   fail:
    19.7 +    MEM_LOG("Failure in alloc_l3_table: entry %d", i);
    19.8      while ( i-- > 0 )
    19.9          if ( is_guest_l3_slot(i) )
   19.10              put_page_from_l3e(pl3e[i], pfn);
   19.11 @@ -948,6 +949,7 @@ static int alloc_l4_table(struct pfn_inf
   19.12      return 1;
   19.13  
   19.14   fail:
   19.15 +    MEM_LOG("Failure in alloc_l4_table: entry %d", i);
   19.16      while ( i-- > 0 )
   19.17          if ( is_guest_l4_slot(i) )
   19.18              put_page_from_l4e(pl4e[i], pfn);