ia64/xen-unstable

changeset 11903:a855c7d3a536

[HVM][VMX] Use CPUID instruction virtualization to work around VMXAssist 4G limit.

The address space access limit in VMXAssist is 4G, because IA-32 only has
a 4GB virtual address space which VMXAssist can use to map physical
memory. The issue is that win2k3 server with more than 4G of memory will
put the AP GDT above 4G, so when an AP changes its mode from real mode to
PAE paging mode, the long jump instruction it uses needs to access AP GDT
entries which reside above 4G; because of this constraint, it cannot
access the GDT and so fails to boot.

Signed-off-by: Xin Li <xin.b.li@intel.com>
author kfraser@localhost.localdomain
date Thu Oct 19 15:49:16 2006 +0100 (2006-10-19)
parents d27d1f8ca25c
children f5321161c649
files tools/firmware/vmxassist/util.c tools/firmware/vmxassist/util.h tools/firmware/vmxassist/vm86.c xen/arch/x86/hvm/vmx/vmx.c
line diff
     1.1 --- a/tools/firmware/vmxassist/util.c	Thu Oct 19 15:15:36 2006 +0100
     1.2 +++ b/tools/firmware/vmxassist/util.c	Thu Oct 19 15:49:16 2006 +0100
     1.3 @@ -29,6 +29,31 @@ static void putchar(int);
     1.4  static char *printnum(char *, unsigned long, int);
     1.5  static void _doprint(void (*)(int), char const *, va_list);
     1.6  
     1.7 +void
     1.8 +cpuid_addr_value(uint64_t addr, uint64_t *value)
     1.9 +{
    1.10 +	uint32_t addr_low   = (uint32_t)addr;
    1.11 +	uint32_t addr_high  = (uint32_t)(addr >> 32);
    1.12 +	uint32_t value_low, value_high;
    1.13 +	static unsigned int addr_leaf;
    1.14 +
    1.15 +	if (!addr_leaf) {
    1.16 +		unsigned int eax, ebx, ecx, edx;
    1.17 +		__asm__ __volatile__(
    1.18 +			"cpuid"
    1.19 +			: "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
    1.20 +			: "0" (0x40000000));
    1.21 +		addr_leaf = eax + 1;
    1.22 +	}
    1.23 +
    1.24 +	__asm__ __volatile__(
    1.25 +		"cpuid"
    1.26 +		: "=c" (value_low), "=d" (value_high)
    1.27 +		: "a" (addr_leaf), "0" (addr_low), "1" (addr_high)
    1.28 +		: "ebx");
    1.29 +
    1.30 +	*value = (uint64_t)value_high << 32 | value_low;
    1.31 +}
    1.32  
    1.33  void
    1.34  dump_regs(struct regs *regs)
     2.1 --- a/tools/firmware/vmxassist/util.h	Thu Oct 19 15:15:36 2006 +0100
     2.2 +++ b/tools/firmware/vmxassist/util.h	Thu Oct 19 15:49:16 2006 +0100
     2.3 @@ -31,6 +31,7 @@
     2.4  
     2.5  struct vmx_assist_context;
     2.6  
     2.7 +extern void cpuid_addr_value(uint64_t addr, uint64_t *value);
     2.8  extern void hexdump(unsigned char *, int);
     2.9  extern void dump_regs(struct regs *);
    2.10  extern void dump_vmx_context(struct vmx_assist_context *);
     3.1 --- a/tools/firmware/vmxassist/vm86.c	Thu Oct 19 15:15:36 2006 +0100
     3.2 +++ b/tools/firmware/vmxassist/vm86.c	Thu Oct 19 15:49:16 2006 +0100
     3.3 @@ -56,8 +56,8 @@ static char *rnames[] = { "ax", "cx", "d
     3.4  #define PT_ENTRY_PRESENT 0x1
     3.5  
     3.6  /* We only support access to <=4G physical memory due to 1:1 mapping */
     3.7 -static unsigned
     3.8 -guest_linear_to_real(uint32_t base)
     3.9 +static uint64_t
    3.10 +guest_linear_to_phys(uint32_t base)
    3.11  {
    3.12  	uint32_t gcr3 = oldctx.cr3;
    3.13  	uint64_t l2_mfn;
    3.14 @@ -89,23 +89,32 @@ guest_linear_to_real(uint32_t base)
    3.15  		l2_mfn = ((uint64_t *)(long)gcr3)[(base >> 30) & 0x3];
    3.16  		if (!(l2_mfn & PT_ENTRY_PRESENT))
    3.17  			panic("l3 entry not present\n");
    3.18 -		l2_mfn &= 0x3fffff000ULL;
    3.19 +		l2_mfn &= 0xffffff000ULL;
    3.20  
    3.21 -		l1_mfn = ((uint64_t *)(long)l2_mfn)[(base >> 21) & 0x1ff];
    3.22 +		if (l2_mfn & 0xf00000000ULL) {
    3.23 +			printf("l2 page above 4G\n");
    3.24 +			cpuid_addr_value(l2_mfn + 8 * ((base >> 21) & 0x1ff), &l1_mfn);
    3.25 +		} else
    3.26 +			l1_mfn = ((uint64_t *)(long)l2_mfn)[(base >> 21) & 0x1ff];
    3.27  		if (!(l1_mfn & PT_ENTRY_PRESENT))
    3.28  			panic("l2 entry not present\n");
    3.29  
    3.30  		if (l1_mfn & PDE_PS) { /* CR4.PSE is ignored in PAE mode */
    3.31 -			l0_mfn = l1_mfn & 0x3ffe00000ULL;
    3.32 +			l0_mfn = l1_mfn & 0xfffe00000ULL;
    3.33  			return l0_mfn + (base & 0x1fffff);
    3.34  		}
    3.35  
    3.36 -		l1_mfn &= 0x3fffff000ULL;
    3.37 +		l1_mfn &= 0xffffff000ULL;
    3.38  
    3.39 -		l0_mfn = ((uint64_t *)(long)l1_mfn)[(base >> 12) & 0x1ff];
    3.40 +		if (l1_mfn & 0xf00000000ULL) {
    3.41 +			printf("l1 page above 4G\n");
    3.42 +			cpuid_addr_value(l1_mfn + 8 * ((base >> 12) & 0x1ff), &l0_mfn);
    3.43 +		} else
    3.44 +			l0_mfn = ((uint64_t *)(long)l1_mfn)[(base >> 12) & 0x1ff];
    3.45  		if (!(l0_mfn & PT_ENTRY_PRESENT))
    3.46  			panic("l1 entry not present\n");
    3.47 -		l0_mfn &= 0x3fffff000ULL;
    3.48 +
    3.49 +		l0_mfn &= 0xffffff000ULL;
    3.50  
    3.51  		return l0_mfn + (base & 0xfff);
    3.52  	}
    3.53 @@ -114,6 +123,7 @@ guest_linear_to_real(uint32_t base)
    3.54  static unsigned
    3.55  address(struct regs *regs, unsigned seg, unsigned off)
    3.56  {
    3.57 +	uint64_t gdt_phys_base;
    3.58  	unsigned long long entry;
    3.59  	unsigned seg_base, seg_limit;
    3.60  	unsigned entry_low, entry_high;
    3.61 @@ -129,8 +139,13 @@ address(struct regs *regs, unsigned seg,
    3.62  	    (mode == VM86_REAL_TO_PROTECTED && regs->cs == seg))
    3.63  		return ((seg & 0xFFFF) << 4) + off;
    3.64  
    3.65 -	entry = ((unsigned long long *)
    3.66 -                 guest_linear_to_real(oldctx.gdtr_base))[seg >> 3];
    3.67 +	gdt_phys_base = guest_linear_to_phys(oldctx.gdtr_base);
    3.68 +	if (gdt_phys_base != (uint32_t)gdt_phys_base) {
    3.69 +		printf("gdt base address above 4G\n");
    3.70 +		cpuid_addr_value(gdt_phys_base + 8 * (seg >> 3), &entry);
    3.71 +	} else
    3.72 +		entry = ((unsigned long long *)(long)gdt_phys_base)[seg >> 3];
    3.73 +
    3.74  	entry_high = entry >> 32;
    3.75  	entry_low = entry & 0xFFFFFFFF;
    3.76  
    3.77 @@ -804,6 +819,7 @@ pop(struct regs *regs, unsigned prefix, 
    3.78  static int
    3.79  load_seg(unsigned long sel, uint32_t *base, uint32_t *limit, union vmcs_arbytes *arbytes)
    3.80  {
    3.81 +	uint64_t gdt_phys_base;
    3.82  	unsigned long long entry;
    3.83  
    3.84  	/* protected mode: use seg as index into gdt */
    3.85 @@ -815,8 +831,12 @@ load_seg(unsigned long sel, uint32_t *ba
    3.86  		return 1;
    3.87  	}
    3.88  
    3.89 -	entry = ((unsigned long long *)
    3.90 -                 guest_linear_to_real(oldctx.gdtr_base))[sel >> 3];
    3.91 +	gdt_phys_base = guest_linear_to_phys(oldctx.gdtr_base);
    3.92 +	if (gdt_phys_base != (uint32_t)gdt_phys_base) {
    3.93 +		printf("gdt base address above 4G\n");
    3.94 +		cpuid_addr_value(gdt_phys_base + 8 * (sel >> 3), &entry);
    3.95 +	} else
    3.96 +		entry = ((unsigned long long *)(long)gdt_phys_base)[sel >> 3];
    3.97  
    3.98  	/* Check the P bit first */
    3.99  	if (!((entry >> (15+32)) & 0x1) && sel != 0)
     4.1 --- a/xen/arch/x86/hvm/vmx/vmx.c	Thu Oct 19 15:15:36 2006 +0100
     4.2 +++ b/xen/arch/x86/hvm/vmx/vmx.c	Thu Oct 19 15:49:16 2006 +0100
     4.3 @@ -921,7 +921,32 @@ static void vmx_do_cpuid(struct cpu_user
     4.4      if ( input == CPUID_LEAF_0x4 )
     4.5      {
     4.6          cpuid_count(input, count, &eax, &ebx, &ecx, &edx);
     4.7 -        eax &= NUM_CORES_RESET_MASK;  
     4.8 +        eax &= NUM_CORES_RESET_MASK;
     4.9 +    }
    4.10 +    else if ( input == 0x40000003 )
    4.11 +    {
    4.12 +        /*
    4.13 +         * NB. Unsupported interface for private use of VMXASSIST only.
    4.14 +         * Note that this leaf lives at <max-hypervisor-leaf> + 1.
    4.15 +         */
    4.16 +        u64 value = ((u64)regs->edx << 32) | (u32)regs->ecx;
    4.17 +        unsigned long mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
    4.18 +        char *p;
    4.19 +
    4.20 +        DPRINTK("Input address is 0x%"PRIx64".\n", value);
    4.21 +
    4.22 +        /* 8-byte aligned valid pseudophys address from vmxassist, please. */
    4.23 +        if ( (value & 7) || (mfn == INVALID_MFN) ||
    4.24 +             !v->arch.hvm_vmx.vmxassist_enabled )
    4.25 +            domain_crash_synchronous();
    4.26 +
    4.27 +        p = map_domain_page(mfn);
    4.28 +        value = *((uint64_t *)(p + (value & (PAGE_SIZE - 1))));
    4.29 +        unmap_domain_page(p);
    4.30 +
    4.31 +        DPRINTK("Output value is 0x%"PRIx64".\n", value);
    4.32 +        ecx = (u32)(value >>  0);
    4.33 +        edx = (u32)(value >> 32);
    4.34      }
    4.35      else if ( !cpuid_hypervisor_leaves(input, &eax, &ebx, &ecx, &edx) )
    4.36      {