ia64/xen-unstable

changeset 5727:ba925b4aef28

The 1:1 page table should be a 3-level PAE page table on x86-64

This is needed to support > 4GB machine physical addresses.
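To see why, note that a non-PAE page-table entry is only 32 bits wide: shifting a machine frame number that lives above the 4GB boundary into it truncates the address, while the 64-bit entries of the 3-level PAE layout hold it intact. A minimal standalone sketch (illustrative only, not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12

    int main(void)
    {
        uint64_t mfn = 0x100001ULL;          /* machine frame just above 4GB */

        /* Non-PAE, 2-level paging: the PTE is 32 bits, so the high
         * address bits are silently truncated. */
        uint32_t pte32 = (uint32_t)(mfn << PAGE_SHIFT);

        /* PAE, 3-level paging: the PTE is 64 bits and holds the frame. */
        uint64_t pte64 = mfn << PAGE_SHIFT;

        printf("non-PAE PTE: 0x%x (frame bits lost)\n", pte32);
        printf("PAE PTE:     0x%llx\n", (unsigned long long)pte64);
        return 0;
    }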

Signed-off-by: Chengyuan Li <chengyuan.li@intel.com>
Signed-off-by: Arun Sharma <arun.sharma@intel.com>
author kaf24@firebug.cl.cam.ac.uk
date Mon Jul 11 14:39:10 2005 +0000 (2005-07-11)
parents a29b4174d39c
children 56a63f9f378f
files tools/libxc/xc_vmx_build.c tools/python/xen/xend/image.py xen/arch/x86/shadow32.c xen/arch/x86/shadow_public.c xen/arch/x86/vmx.c xen/arch/x86/vmx_vmcs.c
line diff
--- a/tools/libxc/xc_vmx_build.c	Mon Jul 11 10:23:19 2005 +0000
+++ b/tools/libxc/xc_vmx_build.c	Mon Jul 11 14:39:10 2005 +0000
@@ -13,6 +13,9 @@
 
 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
 #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
+#ifdef __x86_64__
+#define L3_PROT (_PAGE_PRESENT)
+#endif
 
 #define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
 #define round_pgdown(_p)  ((_p)&PAGE_MASK)
@@ -91,6 +94,7 @@ static void build_e820map(struct mem_map
     mem_mapp->nr_map = nr_map;
 }
 
+#ifdef __i386__
 static int zap_mmio_range(int xc_handle, u32 dom,
                             l2_pgentry_32_t *vl2tab,
                             unsigned long mmio_range_start,
@@ -138,6 +142,65 @@ static int zap_mmio_ranges(int xc_handle
     munmap(vl2tab, PAGE_SIZE);
     return 0;
 }
+#else
+static int zap_mmio_range(int xc_handle, u32 dom,
+                           l3_pgentry_t *vl3tab,
+                           unsigned long mmio_range_start,
+                           unsigned long mmio_range_size)
+{
+   unsigned long mmio_addr;
+   unsigned long mmio_range_end = mmio_range_start + mmio_range_size;
+   unsigned long vl2e = 0;
+   unsigned long vl3e;
+   l1_pgentry_t *vl1tab;
+   l2_pgentry_t *vl2tab;
+ 
+   mmio_addr = mmio_range_start & PAGE_MASK;
+   for (; mmio_addr < mmio_range_end; mmio_addr += PAGE_SIZE) {
+       vl3e = vl3tab[l3_table_offset(mmio_addr)];
+       if (vl3e == 0)
+           continue;
+       vl2tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+               PROT_READ|PROT_WRITE, vl3e >> PAGE_SHIFT);
+       if (vl2tab == 0) {
+           PERROR("Failed zap MMIO range");
+           return -1;
+       }
+       vl2e = vl2tab[l2_table_offset(mmio_addr)];
+       if (vl2e == 0)
+           continue;
+       vl1tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+               PROT_READ|PROT_WRITE, vl2e >> PAGE_SHIFT);
+
+       vl1tab[l1_table_offset(mmio_addr)] = 0;
+       munmap(vl2tab, PAGE_SIZE);
+       munmap(vl1tab, PAGE_SIZE);
+   }
+   return 0;
+}
+
+static int zap_mmio_ranges(int xc_handle, u32 dom,
+                           unsigned long l3tab,
+                           struct mem_map *mem_mapp)
+{
+   int i;
+   l3_pgentry_t *vl3tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+                                               PROT_READ|PROT_WRITE,
+                                               l3tab >> PAGE_SHIFT);
+   if (vl3tab == 0)
+   	return -1;
+   for (i = 0; i < mem_mapp->nr_map; i++) {
+       if ((mem_mapp->map[i].type == E820_IO)
+         && (mem_mapp->map[i].caching_attr == MEMMAP_UC))
+           if (zap_mmio_range(xc_handle, dom, vl3tab,
+	    		mem_mapp->map[i].addr, mem_mapp->map[i].size) == -1)
+		return -1;
+   }
+   munmap(vl3tab, PAGE_SIZE);
+   return 0;
+}
+
+#endif
 
 static int setup_guest(int xc_handle,
                          u32 dom, int memsize,
@@ -151,9 +214,13 @@ static int setup_guest(int xc_handle,
                          unsigned long flags,
                          struct mem_map * mem_mapp)
 {
-    l1_pgentry_32_t *vl1tab=NULL, *vl1e=NULL;
-    l2_pgentry_32_t *vl2tab=NULL, *vl2e=NULL;
+    l1_pgentry_t *vl1tab=NULL, *vl1e=NULL;
+    l2_pgentry_t *vl2tab=NULL, *vl2e=NULL;
     unsigned long *page_array = NULL;
+#ifdef __x86_64__
+    l3_pgentry_t *vl3tab=NULL, *vl3e=NULL;
+    unsigned long l3tab;
+#endif
     unsigned long l2tab;
     unsigned long l1tab;
     unsigned long count, i;
@@ -212,7 +279,11 @@ static int setup_guest(int xc_handle,
     if(initrd_len == 0)
         vinitrd_start = vinitrd_end = 0;
 
+#ifdef __i386__
     nr_pt_pages = 1 + ((memsize + 3) >> 2);
+#else
+    nr_pt_pages = 5 + ((memsize + 1) >> 1);
+#endif
     vpt_start   = v_end;
     vpt_end     = vpt_start + (nr_pt_pages * PAGE_SIZE);
 
@@ -274,6 +345,7 @@ static int setup_guest(int xc_handle,
     if ( (mmu = init_mmu_updates(xc_handle, dom)) == NULL )
         goto error_out;
 
+#ifdef __i386__
     /* First allocate page for page dir. */
     ppt_alloc = (vpt_start - dsi.v_start) >> PAGE_SHIFT;
     l2tab = page_array[ppt_alloc++] << PAGE_SHIFT;
@@ -310,7 +382,64 @@ static int setup_guest(int xc_handle,
     }
     munmap(vl1tab, PAGE_SIZE);
     munmap(vl2tab, PAGE_SIZE);
+#else
+    /* First allocate pdpt */
+    ppt_alloc = (vpt_start - dsi.v_start) >> PAGE_SHIFT;
+    /* here l3tab means pdpt, only 4 entry is used */
+    l3tab = page_array[ppt_alloc++] << PAGE_SHIFT;
+    ctxt->ctrlreg[3] = l3tab;
 
+    /* Initialise the page tables. */
+    if ( (vl3tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, 
+                                        PROT_READ|PROT_WRITE, 
+                                        l3tab >> PAGE_SHIFT)) == NULL )
+        goto error_out;
+    memset(vl3tab, 0, PAGE_SIZE);
+
+    vl3e = &vl3tab[l3_table_offset(dsi.v_start)];
+
+    for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
+    {
+        if (!(count % (1 << (L3_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)))){
+            l2tab = page_array[ppt_alloc++] << PAGE_SHIFT;
+
+            if (vl2tab != NULL)
+                munmap(vl2tab, PAGE_SIZE);
+
+            if ( (vl2tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+                      PROT_READ|PROT_WRITE,
+                      l2tab >> PAGE_SHIFT)) == NULL )
+                goto error_out;
+
+            memset(vl2tab, 0, PAGE_SIZE);
+            *vl3e++ = l2tab | L3_PROT;
+            vl2e = &vl2tab[l2_table_offset(dsi.v_start + (count << PAGE_SHIFT))];
+        }
+        if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 )
+        {
+            l1tab = page_array[ppt_alloc++] << PAGE_SHIFT;
+            if ( vl1tab != NULL )
+                munmap(vl1tab, PAGE_SIZE);
+            if ( (vl1tab = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+                      PROT_READ|PROT_WRITE,
+                      l1tab >> PAGE_SHIFT)) == NULL )
+            {
+                munmap(vl2tab, PAGE_SIZE);
+                goto error_out;
+            }
+            memset(vl1tab, 0, PAGE_SIZE);
+            vl1e = &vl1tab[l1_table_offset(dsi.v_start + (count<<PAGE_SHIFT))];
+            *vl2e++ = l1tab | L2_PROT;
+        }
+
+        *vl1e = (page_array[count] << PAGE_SHIFT) | L1_PROT;
+        vl1e++;
+    }
+
+    munmap(vl1tab, PAGE_SIZE);
+    munmap(vl2tab, PAGE_SIZE);
+    munmap(vl3tab, PAGE_SIZE);
+#endif
     /* Write the machine->phys table entries. */
     for ( count = 0; count < nr_pages; count++ )
     {
@@ -325,6 +454,7 @@ static int setup_guest(int xc_handle,
 		xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE,
 		page_array[(vboot_params_start-dsi.v_start)>>PAGE_SHIFT])) == 0)
         goto error_out;
+
     memset(boot_paramsp, 0, sizeof(*boot_paramsp));
 
     strncpy((char *)boot_paramsp->cmd_line, cmdline, 0x800);
@@ -381,7 +511,11 @@ static int setup_guest(int xc_handle,
 
     /* memsize is in megabytes */
     build_e820map(mem_mapp, memsize << 20);
+#if defined (__i386__)
     if (zap_mmio_ranges(xc_handle, dom, l2tab, mem_mapp) == -1)
+#else
+    if (zap_mmio_ranges(xc_handle, dom, l3tab, mem_mapp) == -1)
+#endif
     	goto error_out;
     boot_paramsp->e820_map_nr = mem_mapp->nr_map;
     for (i=0; i<mem_mapp->nr_map; i++) {
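The construction loop added to setup_guest() above allocates a fresh L2 page whenever count crosses a 1 << (L3_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT) boundary, and a fresh L1 page whenever vl1e wraps onto a new page. A small standalone sketch of the PAE geometry this arithmetic relies on, using the architectural shift values rather than the Xen headers:

    #include <stdio.h>

    /* Standard x86 PAE paging geometry (values are architectural). */
    #define PAGE_SHIFT          12   /* 4KB pages */
    #define L1_PAGETABLE_SHIFT  12
    #define L2_PAGETABLE_SHIFT  21   /* one L1 page maps 2MB */
    #define L3_PAGETABLE_SHIFT  30   /* one L2 page maps 1GB */

    int main(void)
    {
        /* A new L2 page is needed every 1 << (30 - 12) = 262144 small
         * pages, i.e. each time the loop crosses a 1GB boundary. */
        unsigned long pages_per_l2 = 1UL << (L3_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT);

        /* PAE entries are 8 bytes, so an L1 page holds 4096/8 = 512 PTEs
         * and vl1e wraps onto a new page every 2MB of guest memory. */
        unsigned long ptes_per_l1 = 1UL << (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT);

        printf("pages mapped per L2 page: %lu (1GB)\n", pages_per_l2);
        printf("PTEs per L1 page:         %lu (2MB)\n", ptes_per_l1);
        return 0;
    }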
--- a/tools/python/xen/xend/image.py	Mon Jul 11 10:23:19 2005 +0000
+++ b/tools/python/xen/xend/image.py	Mon Jul 11 14:39:10 2005 +0000
@@ -351,6 +351,8 @@ class VmxImageHandler(ImageHandler):
         @param mem_mb: size in MB
         @return size in KB
         """
-        # Logic x86-32 specific. 
         # 1 page for the PGD + 1 pte page for 4MB of memory (rounded)
-        return (1 + ((mem_mb + 3) >> 2)) * 4
+        if os.uname()[4] == 'x86_64':
+            return (5 + ((mem_mb + 1) >> 1)) * 4
+        else:
+            return (1 + ((mem_mb + 3) >> 2)) * 4
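The Python hunk mirrors the nr_pt_pages formulas from xc_vmx_build.c (the * 4 in image.py converts pages to KB). A consolidated sketch of the reservation arithmetic; the helper name is illustrative, not from the tree:

    #include <stdio.h>

    /* Sketch of the page-table reservation formulas used above.
     * memsize_mb is the guest memory size in megabytes; the result
     * is a page count. */
    static unsigned long nr_pt_pages(unsigned long memsize_mb, int pae)
    {
        if (pae)
            /* PAE: one L1 page maps 2MB, so ceil(memsize/2) L1 pages,
             * plus (presumably) the PDPT and 4 page directories
             * covering 4GB -> the constant 5. */
            return 5 + ((memsize_mb + 1) >> 1);

        /* Non-PAE i386: one L1 page maps 4MB, so ceil(memsize/4) L1
         * pages, plus a single page directory -> the constant 1. */
        return 1 + ((memsize_mb + 3) >> 2);
    }

    int main(void)
    {
        printf("512MB guest, PAE:     %lu pages\n", nr_pt_pages(512, 1));
        printf("512MB guest, non-PAE: %lu pages\n", nr_pt_pages(512, 0));
        return 0;
    }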
--- a/xen/arch/x86/shadow32.c	Mon Jul 11 10:23:19 2005 +0000
+++ b/xen/arch/x86/shadow32.c	Mon Jul 11 14:39:10 2005 +0000
@@ -677,7 +677,7 @@ int _shadow_mode_refcounts(struct domain
     return shadow_mode_refcounts(d);
 }
 
-void alloc_monitor_pagetable(struct vcpu *v)
+static void alloc_monitor_pagetable(struct vcpu *v)
 {
     unsigned long mmfn;
     l2_pgentry_t *mpl2e;
--- a/xen/arch/x86/shadow_public.c	Mon Jul 11 10:23:19 2005 +0000
+++ b/xen/arch/x86/shadow_public.c	Mon Jul 11 14:39:10 2005 +0000
@@ -162,7 +162,7 @@ static pagetable_t page_table_convert(st
     return mk_pagetable(page_to_phys(l4page));
 }
 
-void alloc_monitor_pagetable(struct vcpu *v)
+static void alloc_monitor_pagetable(struct vcpu *v)
 {
     unsigned long mmfn;
     l4_pgentry_t *mpl4e;
--- a/xen/arch/x86/vmx.c	Mon Jul 11 10:23:19 2005 +0000
+++ b/xen/arch/x86/vmx.c	Mon Jul 11 14:39:10 2005 +0000
@@ -801,7 +801,11 @@ vmx_world_restore(struct vcpu *d, struct
 skip_cr3:
 
     error |= __vmread(CR4_READ_SHADOW, &old_cr4);
+#if defined (__i386__)
     error |= __vmwrite(GUEST_CR4, (c->cr4 | X86_CR4_VMXE));
+#else
+    error |= __vmwrite(GUEST_CR4, (c->cr4 | X86_CR4_VMXE | X86_CR4_PAE));
+#endif
     error |= __vmwrite(CR4_READ_SHADOW, c->cr4);
 
     error |= __vmwrite(GUEST_IDTR_LIMIT, c->idtr_limit);
@@ -860,7 +864,7 @@ vmx_assist(struct vcpu *d, int mode)
 {
     struct vmx_assist_context c;
     u32 magic;
-    unsigned long cp;
+    u32 cp;
 
     /* make sure vmxassist exists (this is not an error) */
     if (!vmx_copy(&magic, VMXASSIST_MAGIC_OFFSET, sizeof(magic), COPY_IN))
@@ -1191,7 +1195,7 @@ static int mov_to_cr(int gp, int cr, str
 
         __vmread(CR4_READ_SHADOW, &old_cr);
         if (pae_disabled)
-            __vmwrite(GUEST_CR4, ((value & ~X86_CR4_PAE) | X86_CR4_VMXE));
+            __vmwrite(GUEST_CR4, value| X86_CR4_VMXE);
         else
             __vmwrite(GUEST_CR4, value| X86_CR4_VMXE);
 
--- a/xen/arch/x86/vmx_vmcs.c	Mon Jul 11 10:23:19 2005 +0000
+++ b/xen/arch/x86/vmx_vmcs.c	Mon Jul 11 14:39:10 2005 +0000
@@ -122,6 +122,7 @@ int vmx_setup_platform(struct vcpu *d, s
     struct e820entry *e820p;
     unsigned long gpfn = 0;
 
+    local_flush_tlb_pge();
     regs->ebx = 0;   /* Linux expects ebx to be 0 for boot proc */
 
     n = regs->ecx;
@@ -311,8 +312,7 @@ construct_init_vmcs_guest(struct cpu_use
     error |= __vmwrite(CR0_READ_SHADOW, shadow_cr);
     /* CR3 is set in vmx_final_setup_guest */
 #ifdef __x86_64__
-    error |= __vmwrite(GUEST_CR4, host_env->cr4 & ~X86_CR4_PAE);
-    printk("construct_init_vmcs_guest: guest CR4 is %lx\n", host_env->cr4 );
+    error |= __vmwrite(GUEST_CR4, host_env->cr4 & ~X86_CR4_PSE);
 #else
     error |= __vmwrite(GUEST_CR4, host_env->cr4);
 #endif
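Taken together, the vmx.c and vmx_vmcs.c hunks keep CR4.PAE set for the guest on x86-64 (vmx_world_restore ORs it in alongside VMXE) and make VMCS construction clear PSE instead of PAE. A standalone sketch of that bit arithmetic, using the architectural CR4 bit values; the sample CR4 contents are illustrative:

    #include <stdint.h>
    #include <stdio.h>

    /* Architectural CR4 bits (values from the Intel manuals). */
    #define X86_CR4_PSE  0x0010u   /* page-size extensions */
    #define X86_CR4_PAE  0x0020u   /* physical-address extension */
    #define X86_CR4_VMXE 0x2000u   /* VMX enable */

    int main(void)
    {
        uint64_t saved_cr4 = 0x06f0;   /* illustrative saved guest value */
        uint64_t host_cr4  = 0x26f0;   /* illustrative host value */

        /* vmx_world_restore, x86-64 path: PAE is forced on along with
         * VMXE, matching the 3-level 1:1 table this changeset builds. */
        uint64_t restored = saved_cr4 | X86_CR4_VMXE | X86_CR4_PAE;

        /* construct_init_vmcs_guest, x86-64 path: start from the host
         * CR4 but clear PSE rather than PAE, as the hunk above now does. */
        uint64_t initial = host_cr4 & ~(uint64_t)X86_CR4_PSE;

        printf("restored GUEST_CR4: 0x%llx\n", (unsigned long long)restored);
        printf("initial  GUEST_CR4: 0x%llx\n", (unsigned long long)initial);
        return 0;
    }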