ia64/xen-unstable

changeset 18901:6595393a3d28

Use virtual 8086 mode for VMX guests with CR0.PE == 0

When a VMX guest tries to enter real mode, put it in virtual 8086 mode
instead, if that's possible. Handle all errors and corner cases by
falling back to the real-mode emulator.

This is similar to the old VMXASSIST system except it uses Xen's
x86_emulate emulator instead of having a partial emulator in the guest
firmware. It more than doubles the speed of real-mode operation on
VMX.

Signed-off-by: Tim Deegan <Tim.Deegan@citrix.com>
author Keir Fraser <keir.fraser@citrix.com>
date Tue Dec 09 16:28:02 2008 +0000 (2008-12-09)
parents 5535efd8e011
children b73f3646a17f
files tools/firmware/hvmloader/hvmloader.c tools/libxc/xc_domain_restore.c tools/libxc/xc_domain_save.c xen/arch/x86/hvm/vmx/entry.S xen/arch/x86/hvm/vmx/realmode.c xen/arch/x86/hvm/vmx/vmcs.c xen/arch/x86/hvm/vmx/vmx.c xen/arch/x86/x86_32/asm-offsets.c xen/arch/x86/x86_64/asm-offsets.c xen/arch/x86/x86_emulate/x86_emulate.h xen/include/asm-x86/hvm/vmx/vmcs.h xen/include/asm-x86/perfc_defn.h xen/include/public/hvm/params.h
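
In outline, the pieces below cooperate as follows: the entry stub (entry.S) checks two new flags and a segment mask before vmentering. A minimal C sketch of that decision, using the vmx_emulate, vmx_realmode and vm86_segment_mask fields this changeset adds (the wrapper function itself is illustrative, not part of the patch):

    void vmentry_realmode_policy(struct vcpu *v, struct cpu_user_regs *regs)
    {
        if ( v->arch.hvm_vmx.vmx_emulate )
            vmx_realmode(regs);              /* run the x86_emulate loop */
        else if ( v->arch.hvm_vmx.vmx_realmode )
        {
            if ( v->arch.hvm_vmx.vm86_segment_mask != 0 )
                vmx_realmode(regs);          /* a segment is unsafe for vm86 */
            else
                vmx_enter_realmode(regs);    /* set EFLAGS.VM/IOPL, vmenter */
        }
        /* otherwise: ordinary protected- or long-mode vmentry */
    }

vmx_realmode() and vmx_enter_realmode() are the real entry points shown in the diff below.
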
line diff
     1.1 --- a/tools/firmware/hvmloader/hvmloader.c	Tue Dec 09 13:23:15 2008 +0000
     1.2 +++ b/tools/firmware/hvmloader/hvmloader.c	Tue Dec 09 16:28:02 2008 +0000
     1.3 @@ -536,6 +536,23 @@ static uint16_t init_xen_platform_io_bas
     1.4      return bios_info->xen_pfiob;
     1.5  }
     1.6  
     1.7 +/* Set up an empty TSS area for virtual 8086 mode to use. 
     1.8 + * The only important thing is that it mustn't have any bits set
     1.9 + * in the interrupt redirection bitmap, so all zeros will do.  */
    1.10 +static void init_vm86_tss(void)
    1.11 +{
    1.12 +    uint32_t tss;
    1.13 +    struct xen_hvm_param p;
    1.14 +
    1.15 +    tss = e820_malloc(128, 128);
    1.16 +    memset((char *)tss, 0, 128);
    1.17 +    p.domid = DOMID_SELF;
    1.18 +    p.index = HVM_PARAM_VM86_TSS;
    1.19 +    p.value = tss;
    1.20 +    hypercall_hvm_op(HVMOP_set_param, &p);
    1.21 +    printf("vm86 TSS at %08x\n", tss);
    1.22 +}
    1.23 +
    1.24  int main(void)
    1.25  {
    1.26      int option_rom_sz = 0, vgabios_sz = 0, etherboot_sz = 0;
    1.27 @@ -606,6 +623,8 @@ int main(void)
    1.28          acpi_build_tables();
    1.29      }
    1.30  
    1.31 +    init_vm86_tss();
    1.32 +
    1.33      cmos_write_memory_size();
    1.34  
    1.35      printf("BIOS map:\n");
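
Why an all-zero block is enough: with CR4.VME set (which the hypervisor side of this patch enables for vm86), a software INT n executed in virtual 8086 mode consults bit n of the interrupt redirection bitmap held in this TSS. A sketch of that rule, for illustration only:

    /* Illustrative restatement of the CR4.VME redirection rule that the
     * zeroed TSS relies on (not code from the patch). */
    int int_n_stays_in_vm86(const uint8_t *redir_bitmap, unsigned int n)
    {
        /* Bit n clear: the INT is delivered through the real-mode IVT
         * inside vm86, with no fault and no vmexit.  Bit n set: it is
         * handled as in ordinary vm86, i.e. via the protected-mode IDT
         * at IOPL 3, which Xen must avoid here. */
        return !(redir_bitmap[n >> 3] & (1u << (n & 7)));
    }

Hence the 128 zeroed bytes: every bitmap bit is clear, so the guest's BIOS INT calls run at full speed inside vm86.
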
     2.1 --- a/tools/libxc/xc_domain_restore.c	Tue Dec 09 13:23:15 2008 +0000
     2.2 +++ b/tools/libxc/xc_domain_restore.c	Tue Dec 09 16:28:02 2008 +0000
     2.3 @@ -490,6 +490,22 @@ int xc_domain_restore(int xc_handle, int
     2.4              continue;
     2.5          }
     2.6  
     2.7 +        if ( j == -4 )
     2.8 +        {
     2.9 +            uint64_t vm86_tss;
    2.10 +
    2.11 +            /* Skip padding 4 bytes then read the vm86 TSS location. */
    2.12 +            if ( read_exact(io_fd, &vm86_tss, sizeof(uint32_t)) ||
    2.13 +                 read_exact(io_fd, &vm86_tss, sizeof(uint64_t)) )
    2.14 +            {
     2.15 +                ERROR("error reading the address of the vm86 TSS");
    2.16 +                goto out;
    2.17 +            }
    2.18 +
    2.19 +            xc_set_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, vm86_tss);
    2.20 +            continue;
    2.21 +        }
    2.22 +
    2.23          if ( j == 0 )
    2.24              break;  /* our work here is done */
    2.25  
     3.1 --- a/tools/libxc/xc_domain_save.c	Tue Dec 09 13:23:15 2008 +0000
     3.2 +++ b/tools/libxc/xc_domain_save.c	Tue Dec 09 16:28:02 2008 +0000
     3.3 @@ -1388,20 +1388,32 @@ int xc_domain_save(int xc_handle, int io
     3.4      if ( hvm )
     3.5      {
     3.6          struct {
     3.7 -            int minusthree;
     3.8 +            int id;
     3.9              uint32_t pad;
    3.10 -            uint64_t ident_pt;
    3.11 -        } chunk = { -3, 0 };
    3.12 +            uint64_t data;
    3.13 +        } chunk = { 0, };
    3.14  
    3.15 +        chunk.id = -3;
    3.16          xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
    3.17 -                         (unsigned long *)&chunk.ident_pt);
    3.18 +                         (unsigned long *)&chunk.data);
    3.19  
    3.20 -        if ( (chunk.ident_pt != 0) &&
    3.21 +        if ( (chunk.data != 0) &&
    3.22               write_exact(io_fd, &chunk, sizeof(chunk)) )
    3.23          {
    3.24              PERROR("Error when writing the ident_pt for EPT guest");
    3.25              goto out;
    3.26          }
    3.27 +
    3.28 +        chunk.id = -4;
    3.29 +        xc_get_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS,
    3.30 +                         (unsigned long *)&chunk.data);
    3.31 +
    3.32 +        if ( (chunk.data != 0) &&
    3.33 +             write_exact(io_fd, &chunk, sizeof(chunk)) )
    3.34 +        {
     3.35 +            PERROR("Error when writing the vm86 TSS for the guest");
    3.36 +            goto out;
    3.37 +        }
    3.38      }
    3.39  
    3.40      /* Zero terminate */
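
Together with the restore hunk above, these writes define a small tagged-chunk framing at the tail of the HVM save image. A sketch of the layout (the struct in the code above is anonymous; the name here is illustrative):

    struct hvm_tail_chunk {            /* 16 bytes on the wire */
        int32_t  id;    /* -3: HVM_PARAM_IDENT_PT, -4: HVM_PARAM_VM86_TSS,
                         *  0: terminates the stream */
        uint32_t pad;   /* restore skips these 4 bytes explicitly */
        uint64_t data;  /* the parameter value */
    };

A chunk is only emitted when the parameter is non-zero, so old images restore unchanged and old readers simply never see an id of -4.
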
     4.1 --- a/xen/arch/x86/hvm/vmx/entry.S	Tue Dec 09 13:23:15 2008 +0000
     4.2 +++ b/xen/arch/x86/hvm/vmx/entry.S	Tue Dec 09 16:28:02 2008 +0000
     4.3 @@ -133,9 +133,15 @@ vmx_asm_do_vmentry:
     4.4          cmpl $0,(r(dx),r(ax),1)
     4.5          jnz  .Lvmx_process_softirqs
     4.6  
     4.7 -        testb $0xff,VCPU_vmx_emul(r(bx))
     4.8 -        jnz  .Lvmx_goto_realmode
     4.9 +        testb $0xff,VCPU_vmx_emulate(r(bx))
    4.10 +        jnz .Lvmx_goto_emulator
    4.11 +        testb $0xff,VCPU_vmx_realmode(r(bx))
    4.12 +        jz .Lvmx_not_realmode
    4.13 +        cmpw $0,VCPU_vm86_seg_mask(r(bx))
    4.14 +        jnz .Lvmx_goto_emulator
    4.15 +        call_with_regs(vmx_enter_realmode) 
    4.16  
    4.17 +.Lvmx_not_realmode:
    4.18          mov  VCPU_hvm_guest_cr2(r(bx)),r(ax)
    4.19          mov  r(ax),%cr2
    4.20          call vmx_trace_vmentry
    4.21 @@ -189,7 +195,7 @@ vmx_asm_do_vmentry:
    4.22          call vm_launch_fail
    4.23          ud2
    4.24  
    4.25 -.Lvmx_goto_realmode:
    4.26 +.Lvmx_goto_emulator:
    4.27          sti
    4.28          call_with_regs(vmx_realmode)
    4.29          jmp  vmx_asm_do_vmentry
     5.1 --- a/xen/arch/x86/hvm/vmx/realmode.c	Tue Dec 09 13:23:15 2008 +0000
     5.2 +++ b/xen/arch/x86/hvm/vmx/realmode.c	Tue Dec 09 16:28:02 2008 +0000
     5.3 @@ -103,31 +103,13 @@ static void realmode_deliver_exception(
     5.4  static void realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt)
     5.5  {
     5.6      struct vcpu *curr = current;
     5.7 -    unsigned long seg_reg_dirty;
     5.8      uint32_t intr_info;
     5.9      int rc;
    5.10  
    5.11 -    seg_reg_dirty = hvmemul_ctxt->seg_reg_dirty;
    5.12 -    hvmemul_ctxt->seg_reg_dirty = 0;
    5.13 +    perfc_incr(realmode_emulations);
    5.14  
    5.15      rc = hvm_emulate_one(hvmemul_ctxt);
    5.16  
    5.17 -    if ( test_bit(x86_seg_cs, &hvmemul_ctxt->seg_reg_dirty) )
    5.18 -    {
    5.19 -        curr->arch.hvm_vmx.vmxemul &= ~VMXEMUL_BAD_CS;
    5.20 -        if ( hvmemul_get_seg_reg(x86_seg_cs, hvmemul_ctxt)->sel & 3 )
    5.21 -            curr->arch.hvm_vmx.vmxemul |= VMXEMUL_BAD_CS;
    5.22 -    }
    5.23 -
    5.24 -    if ( test_bit(x86_seg_ss, &hvmemul_ctxt->seg_reg_dirty) )
    5.25 -    {
    5.26 -        curr->arch.hvm_vmx.vmxemul &= ~VMXEMUL_BAD_SS;
    5.27 -        if ( hvmemul_get_seg_reg(x86_seg_ss, hvmemul_ctxt)->sel & 3 )
    5.28 -            curr->arch.hvm_vmx.vmxemul |= VMXEMUL_BAD_SS;
    5.29 -    }
    5.30 -
    5.31 -    hvmemul_ctxt->seg_reg_dirty |= seg_reg_dirty;
    5.32 -
    5.33      if ( rc == X86EMUL_UNHANDLEABLE )
    5.34      {
    5.35          gdprintk(XENLOG_ERR, "Failed to emulate insn.\n");
    5.36 @@ -210,7 +192,8 @@ void vmx_realmode(struct cpu_user_regs *
    5.37          intr_info = 0;
    5.38      }
    5.39  
    5.40 -    while ( curr->arch.hvm_vmx.vmxemul &&
    5.41 +    curr->arch.hvm_vmx.vmx_emulate = 1;
    5.42 +    while ( curr->arch.hvm_vmx.vmx_emulate &&
    5.43              !softirq_pending(smp_processor_id()) &&
    5.44              (curr->arch.hvm_vcpu.io_state == HVMIO_none) )
    5.45      {
    5.46 @@ -220,13 +203,27 @@ void vmx_realmode(struct cpu_user_regs *
    5.47           * in real mode, because we don't emulate protected-mode IDT vectoring.
    5.48           */
    5.49          if ( unlikely(!(++emulations & 15)) &&
    5.50 -             !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) &&
    5.51 +             curr->arch.hvm_vmx.vmx_realmode && 
    5.52               hvm_local_events_need_delivery(curr) )
    5.53              break;
    5.54 +
    5.55          realmode_emulate_one(&hvmemul_ctxt);
    5.56 +
     5.57 +        /* Keep emulating only while the segment state is unsafe to vmenter */
    5.58 +        if ( curr->arch.hvm_vmx.vmx_realmode )
    5.59 +            curr->arch.hvm_vmx.vmx_emulate = 
    5.60 +                (curr->arch.hvm_vmx.vm86_segment_mask != 0);
    5.61 +        else
    5.62 +            curr->arch.hvm_vmx.vmx_emulate = 
    5.63 +                 ((hvmemul_ctxt.seg_reg[x86_seg_cs].sel & 3)
    5.64 +                  || (hvmemul_ctxt.seg_reg[x86_seg_ss].sel & 3));
    5.65      }
    5.66  
    5.67 -    if ( !curr->arch.hvm_vmx.vmxemul )
    5.68 +    /* Need to emulate next time if we've started an IO operation */
    5.69 +    if ( curr->arch.hvm_vcpu.io_state != HVMIO_none )
    5.70 +        curr->arch.hvm_vmx.vmx_emulate = 1;
    5.71 +
    5.72 +    if ( !curr->arch.hvm_vmx.vmx_emulate && !curr->arch.hvm_vmx.vmx_realmode )
    5.73      {
    5.74          /*
    5.75           * Cannot enter protected mode with bogus selector RPLs and DPLs.
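
The loop's continuation test above can be read as a single predicate; restated in C (the function name is hypothetical, the fields are from the patch):

    static int must_keep_emulating(const struct vcpu *v,
                                   const struct hvm_emulate_ctxt *ctxt)
    {
        /* In real mode: keep going while any segment is still unusable
         * under vm86.  In protected mode: keep going while CS or SS
         * still carries a non-zero RPL, which vmentry would reject. */
        if ( v->arch.hvm_vmx.vmx_realmode )
            return v->arch.hvm_vmx.vm86_segment_mask != 0;
        return (ctxt->seg_reg[x86_seg_cs].sel & 3) ||
               (ctxt->seg_reg[x86_seg_ss].sel & 3);
    }
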
     6.1 --- a/xen/arch/x86/hvm/vmx/vmcs.c	Tue Dec 09 13:23:15 2008 +0000
     6.2 +++ b/xen/arch/x86/hvm/vmx/vmcs.c	Tue Dec 09 16:28:02 2008 +0000
     6.3 @@ -880,15 +880,6 @@ void vmx_do_resume(struct vcpu *v)
     6.4      reset_stack_and_jump(vmx_asm_do_vmentry);
     6.5  }
     6.6  
     6.7 -static void vmx_dump_sel(char *name, enum x86_segment seg)
     6.8 -{
     6.9 -    struct segment_register sreg;
    6.10 -    hvm_get_segment_register(current, seg, &sreg);
    6.11 -    printk("%s: sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016llx\n", 
    6.12 -           name, sreg.sel, sreg.attr.bytes, sreg.limit,
    6.13 -           (unsigned long long)sreg.base);
    6.14 -}
    6.15 -
    6.16  static unsigned long vmr(unsigned long field)
    6.17  {
    6.18      int rc;
    6.19 @@ -897,6 +888,28 @@ static unsigned long vmr(unsigned long f
    6.20      return rc ? 0 : val;
    6.21  }
    6.22  
    6.23 +static void vmx_dump_sel(char *name, uint32_t selector)
    6.24 +{
    6.25 +    uint32_t sel, attr, limit;
    6.26 +    uint64_t base;
    6.27 +    sel = vmr(selector);
    6.28 +    attr = vmr(selector + (GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR));
    6.29 +    limit = vmr(selector + (GUEST_ES_LIMIT - GUEST_ES_SELECTOR));
    6.30 +    base = vmr(selector + (GUEST_ES_BASE - GUEST_ES_SELECTOR));
    6.31 +    printk("%s: sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016"PRIx64"\n",
    6.32 +           name, sel, attr, limit, base);
    6.33 +}
    6.34 +
    6.35 +static void vmx_dump_sel2(char *name, uint32_t lim)
    6.36 +{
    6.37 +    uint32_t limit;
    6.38 +    uint64_t base;
    6.39 +    limit = vmr(lim);
    6.40 +    base = vmr(lim + (GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
    6.41 +    printk("%s:                           limit=0x%08x, base=0x%016"PRIx64"\n",
    6.42 +           name, limit, base);
    6.43 +}
    6.44 +
    6.45  void vmcs_dump_vcpu(struct vcpu *v)
    6.46  {
    6.47      struct cpu_user_regs *regs = &v->arch.guest_context.user_regs;
    6.48 @@ -938,16 +951,16 @@ void vmcs_dump_vcpu(struct vcpu *v)
    6.49             (unsigned long long)vmr(GUEST_SYSENTER_ESP),
    6.50             (int)vmr(GUEST_SYSENTER_CS),
    6.51             (unsigned long long)vmr(GUEST_SYSENTER_EIP));
    6.52 -    vmx_dump_sel("CS", x86_seg_cs);
    6.53 -    vmx_dump_sel("DS", x86_seg_ds);
    6.54 -    vmx_dump_sel("SS", x86_seg_ss);
    6.55 -    vmx_dump_sel("ES", x86_seg_es);
    6.56 -    vmx_dump_sel("FS", x86_seg_fs);
    6.57 -    vmx_dump_sel("GS", x86_seg_gs);
    6.58 -    vmx_dump_sel("GDTR", x86_seg_gdtr);
    6.59 -    vmx_dump_sel("LDTR", x86_seg_ldtr);
    6.60 -    vmx_dump_sel("IDTR", x86_seg_idtr);
    6.61 -    vmx_dump_sel("TR", x86_seg_tr);
    6.62 +    vmx_dump_sel("CS", GUEST_CS_SELECTOR);
    6.63 +    vmx_dump_sel("DS", GUEST_DS_SELECTOR);
    6.64 +    vmx_dump_sel("SS", GUEST_SS_SELECTOR);
    6.65 +    vmx_dump_sel("ES", GUEST_ES_SELECTOR);
    6.66 +    vmx_dump_sel("FS", GUEST_FS_SELECTOR);
    6.67 +    vmx_dump_sel("GS", GUEST_GS_SELECTOR);
    6.68 +    vmx_dump_sel2("GDTR", GUEST_GDTR_LIMIT);
    6.69 +    vmx_dump_sel("LDTR", GUEST_LDTR_SELECTOR);
    6.70 +    vmx_dump_sel2("IDTR", GUEST_IDTR_LIMIT);
    6.71 +    vmx_dump_sel("TR", GUEST_TR_SELECTOR);
    6.72      x  = (unsigned long long)vmr(TSC_OFFSET_HIGH) << 32;
    6.73      x |= (uint32_t)vmr(TSC_OFFSET);
    6.74      printk("TSC Offset = %016llx\n", x);
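
The rewritten dump helpers lean on the VMCS encoding scheme: each class of per-segment guest field is laid out at a constant stride, so a selector encoding plus a fixed delta reaches the matching attr/limit/base field. An illustration of the arithmetic (helper name hypothetical; it presumes vmcs.c's vmr() above):

    /* Works for ES, CS, SS, DS, FS, GS, LDTR and TR alike, because the
     * deltas between field classes are the same for every segment. */
    static unsigned long vmr_attr_for(uint32_t selector_field)
    {
        return vmr(selector_field + (GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR));
    }
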
     7.1 --- a/xen/arch/x86/hvm/vmx/vmx.c	Tue Dec 09 13:23:15 2008 +0000
     7.2 +++ b/xen/arch/x86/hvm/vmx/vmx.c	Tue Dec 09 16:28:02 2008 +0000
     7.3 @@ -704,6 +704,26 @@ static void vmx_ctxt_switch_to(struct vc
     7.4      vpmu_load(v);
     7.5  }
     7.6  
     7.7 +
     7.8 +/* SDM volume 3b section 22.3.1.2: we can only enter virtual 8086 mode
     7.9 + * if all of CS, SS, DS, ES, FS and GS are 16bit ring-3 data segments.
    7.10 + * The guest thinks it's got ring-0 segments, so we need to fudge
    7.11 + * things.  We store the ring-3 version in the VMCS to avoid lots of
    7.12 + * shuffling on vmenter and vmexit, and translate in these accessors. */
    7.13 +
    7.14 +#define rm_cs_attr (((union segment_attributes) {                       \
    7.15 +        .fields = { .type = 0xb, .s = 1, .dpl = 0, .p = 1, .avl = 0,    \
    7.16 +                    .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
    7.17 +#define rm_ds_attr (((union segment_attributes) {                       \
    7.18 +        .fields = { .type = 0x3, .s = 1, .dpl = 0, .p = 1, .avl = 0,    \
    7.19 +                    .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
    7.20 +#define vm86_ds_attr (((union segment_attributes) {                     \
    7.21 +        .fields = { .type = 0x3, .s = 1, .dpl = 3, .p = 1, .avl = 0,    \
    7.22 +                    .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
    7.23 +#define vm86_tr_attr (((union segment_attributes) {                     \
    7.24 +        .fields = { .type = 0xb, .s = 0, .dpl = 0, .p = 1, .avl = 0,    \
    7.25 +                    .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
    7.26 +
    7.27  static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
    7.28                                       struct segment_register *reg)
    7.29  {
    7.30 @@ -779,14 +799,85 @@ static void vmx_get_segment_register(str
    7.31      /* Unusable flag is folded into Present flag. */
    7.32      if ( attr & (1u<<16) )
    7.33          reg->attr.fields.p = 0;
    7.34 +
    7.35 +    /* Adjust for virtual 8086 mode */
    7.36 +    if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr 
    7.37 +         && !(v->arch.hvm_vmx.vm86_segment_mask & (1u << seg)) )
    7.38 +    {
    7.39 +        struct segment_register *sreg = &v->arch.hvm_vmx.vm86_saved_seg[seg];
    7.40 +        if ( seg == x86_seg_tr ) 
    7.41 +            *reg = *sreg;
    7.42 +        else if ( reg->base != sreg->base || seg == x86_seg_ss )
    7.43 +        {
    7.44 +            /* If the guest's reloaded the segment, remember the new version.
    7.45 +             * We can't tell if the guest reloaded the segment with another 
    7.46 +             * one that has the same base.  By default we assume it hasn't,
    7.47 +             * since we don't want to lose big-real-mode segment attributes,
    7.48 +             * but for SS we assume it has: the Ubuntu graphical bootloader
    7.49 +             * does this and gets badly confused if we leave the old SS in 
    7.50 +             * place. */
    7.51 +            reg->attr.bytes = (seg == x86_seg_cs ? rm_cs_attr : rm_ds_attr);
    7.52 +            *sreg = *reg;
    7.53 +        }
    7.54 +        else 
    7.55 +        {
    7.56 +            /* Always give realmode guests a selector that matches the base
    7.57 +             * but keep the attr and limit from before */
    7.58 +            *reg = *sreg;
    7.59 +            reg->sel = reg->base >> 4;
    7.60 +        }
    7.61 +    }
    7.62  }
    7.63  
    7.64  static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
    7.65                                       struct segment_register *reg)
    7.66  {
    7.67 -    uint32_t attr;
    7.68 +    uint32_t attr, sel, limit;
    7.69 +    uint64_t base;
    7.70 +
    7.71 +    sel = reg->sel;
    7.72 +    attr = reg->attr.bytes;
    7.73 +    limit = reg->limit;
    7.74 +    base = reg->base;
    7.75  
    7.76 -    attr = reg->attr.bytes;
    7.77 +    /* Adjust CS/SS/DS/ES/FS/GS/TR for virtual 8086 mode */
    7.78 +    if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr )
    7.79 +    {
    7.80 +        /* Remember the proper contents */
    7.81 +        v->arch.hvm_vmx.vm86_saved_seg[seg] = *reg;
    7.82 +        
    7.83 +        if ( seg == x86_seg_tr ) 
    7.84 +        {
    7.85 +            if ( v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS] )
    7.86 +            {
    7.87 +                sel = 0;
    7.88 +                attr = vm86_tr_attr;
    7.89 +                limit = 0xff;
    7.90 +                base = v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS];
    7.91 +                v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
    7.92 +            }
    7.93 +            else
    7.94 +                v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
    7.95 +        }
    7.96 +        else
    7.97 +        {
    7.98 +            /* Try to fake it out as a 16bit data segment.  This could
    7.99 +             * cause confusion for the guest if it reads the selector,
   7.100 +             * but otherwise we have to emulate if *any* segment hasn't
   7.101 +             * been reloaded. */
   7.102 +            if ( base < 0x100000 && !(base & 0xf) && limit >= 0xffff
   7.103 +                 && reg->attr.fields.p )
   7.104 +            {
   7.105 +                sel = base >> 4;
   7.106 +                attr = vm86_ds_attr;
   7.107 +                limit = 0xffff;
   7.108 +                v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
   7.109 +            }
   7.110 +            else 
   7.111 +                v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
   7.112 +        }
   7.113 +    }
   7.114 +
   7.115      attr = ((attr & 0xf00) << 4) | (attr & 0xff);
   7.116  
   7.117      /* Not-present must mean unusable. */
   7.118 @@ -794,67 +885,67 @@ static void vmx_set_segment_register(str
   7.119          attr |= (1u << 16);
   7.120  
   7.121      /* VMX has strict consistency requirement for flag G. */
   7.122 -    attr |= !!(reg->limit >> 20) << 15;
   7.123 +    attr |= !!(limit >> 20) << 15;
   7.124  
   7.125      vmx_vmcs_enter(v);
   7.126  
   7.127      switch ( seg )
   7.128      {
   7.129      case x86_seg_cs:
   7.130 -        __vmwrite(GUEST_CS_SELECTOR, reg->sel);
   7.131 -        __vmwrite(GUEST_CS_LIMIT, reg->limit);
   7.132 -        __vmwrite(GUEST_CS_BASE, reg->base);
   7.133 +        __vmwrite(GUEST_CS_SELECTOR, sel);
   7.134 +        __vmwrite(GUEST_CS_LIMIT, limit);
   7.135 +        __vmwrite(GUEST_CS_BASE, base);
   7.136          __vmwrite(GUEST_CS_AR_BYTES, attr);
   7.137          break;
   7.138      case x86_seg_ds:
   7.139 -        __vmwrite(GUEST_DS_SELECTOR, reg->sel);
   7.140 -        __vmwrite(GUEST_DS_LIMIT, reg->limit);
   7.141 -        __vmwrite(GUEST_DS_BASE, reg->base);
   7.142 +        __vmwrite(GUEST_DS_SELECTOR, sel);
   7.143 +        __vmwrite(GUEST_DS_LIMIT, limit);
   7.144 +        __vmwrite(GUEST_DS_BASE, base);
   7.145          __vmwrite(GUEST_DS_AR_BYTES, attr);
   7.146          break;
   7.147      case x86_seg_es:
   7.148 -        __vmwrite(GUEST_ES_SELECTOR, reg->sel);
   7.149 -        __vmwrite(GUEST_ES_LIMIT, reg->limit);
   7.150 -        __vmwrite(GUEST_ES_BASE, reg->base);
   7.151 +        __vmwrite(GUEST_ES_SELECTOR, sel);
   7.152 +        __vmwrite(GUEST_ES_LIMIT, limit);
   7.153 +        __vmwrite(GUEST_ES_BASE, base);
   7.154          __vmwrite(GUEST_ES_AR_BYTES, attr);
   7.155          break;
   7.156      case x86_seg_fs:
   7.157 -        __vmwrite(GUEST_FS_SELECTOR, reg->sel);
   7.158 -        __vmwrite(GUEST_FS_LIMIT, reg->limit);
   7.159 -        __vmwrite(GUEST_FS_BASE, reg->base);
   7.160 +        __vmwrite(GUEST_FS_SELECTOR, sel);
   7.161 +        __vmwrite(GUEST_FS_LIMIT, limit);
   7.162 +        __vmwrite(GUEST_FS_BASE, base);
   7.163          __vmwrite(GUEST_FS_AR_BYTES, attr);
   7.164          break;
   7.165      case x86_seg_gs:
   7.166 -        __vmwrite(GUEST_GS_SELECTOR, reg->sel);
   7.167 -        __vmwrite(GUEST_GS_LIMIT, reg->limit);
   7.168 -        __vmwrite(GUEST_GS_BASE, reg->base);
   7.169 +        __vmwrite(GUEST_GS_SELECTOR, sel);
   7.170 +        __vmwrite(GUEST_GS_LIMIT, limit);
   7.171 +        __vmwrite(GUEST_GS_BASE, base);
   7.172          __vmwrite(GUEST_GS_AR_BYTES, attr);
   7.173          break;
   7.174      case x86_seg_ss:
   7.175 -        __vmwrite(GUEST_SS_SELECTOR, reg->sel);
   7.176 -        __vmwrite(GUEST_SS_LIMIT, reg->limit);
   7.177 -        __vmwrite(GUEST_SS_BASE, reg->base);
   7.178 +        __vmwrite(GUEST_SS_SELECTOR, sel);
   7.179 +        __vmwrite(GUEST_SS_LIMIT, limit);
   7.180 +        __vmwrite(GUEST_SS_BASE, base);
   7.181          __vmwrite(GUEST_SS_AR_BYTES, attr);
   7.182          break;
   7.183      case x86_seg_tr:
   7.184 -        __vmwrite(GUEST_TR_SELECTOR, reg->sel);
   7.185 -        __vmwrite(GUEST_TR_LIMIT, reg->limit);
   7.186 -        __vmwrite(GUEST_TR_BASE, reg->base);
   7.187 +        __vmwrite(GUEST_TR_SELECTOR, sel);
   7.188 +        __vmwrite(GUEST_TR_LIMIT, limit);
   7.189 +        __vmwrite(GUEST_TR_BASE, base);
    7.190          /* VMX checks that the busy flag (bit 1) is set. */
   7.191          __vmwrite(GUEST_TR_AR_BYTES, attr | 2);
   7.192          break;
   7.193      case x86_seg_gdtr:
   7.194 -        __vmwrite(GUEST_GDTR_LIMIT, reg->limit);
   7.195 -        __vmwrite(GUEST_GDTR_BASE, reg->base);
   7.196 +        __vmwrite(GUEST_GDTR_LIMIT, limit);
   7.197 +        __vmwrite(GUEST_GDTR_BASE, base);
   7.198          break;
   7.199      case x86_seg_idtr:
   7.200 -        __vmwrite(GUEST_IDTR_LIMIT, reg->limit);
   7.201 -        __vmwrite(GUEST_IDTR_BASE, reg->base);
   7.202 +        __vmwrite(GUEST_IDTR_LIMIT, limit);
   7.203 +        __vmwrite(GUEST_IDTR_BASE, base);
   7.204          break;
   7.205      case x86_seg_ldtr:
   7.206 -        __vmwrite(GUEST_LDTR_SELECTOR, reg->sel);
   7.207 -        __vmwrite(GUEST_LDTR_LIMIT, reg->limit);
   7.208 -        __vmwrite(GUEST_LDTR_BASE, reg->base);
   7.209 +        __vmwrite(GUEST_LDTR_SELECTOR, sel);
   7.210 +        __vmwrite(GUEST_LDTR_LIMIT, limit);
   7.211 +        __vmwrite(GUEST_LDTR_BASE, base);
   7.212          __vmwrite(GUEST_LDTR_AR_BYTES, attr);
   7.213          break;
   7.214      default:
   7.215 @@ -970,6 +1061,7 @@ static void vmx_update_guest_cr(struct v
   7.216      switch ( cr )
   7.217      {
   7.218      case 0: {
   7.219 +        int realmode;
   7.220          unsigned long hw_cr0_mask =
   7.221              X86_CR0_NE | X86_CR0_PG | X86_CR0_PE;
   7.222  
   7.223 @@ -998,9 +1090,44 @@ static void vmx_update_guest_cr(struct v
   7.224                  vmx_fpu_enter(v);
   7.225          }
   7.226  
   7.227 -        v->arch.hvm_vmx.vmxemul &= ~VMXEMUL_REALMODE;
   7.228 -        if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) )
   7.229 -            v->arch.hvm_vmx.vmxemul |= VMXEMUL_REALMODE;
   7.230 +        realmode = !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE); 
   7.231 +        if ( realmode != v->arch.hvm_vmx.vmx_realmode )
   7.232 +        {
   7.233 +            enum x86_segment s; 
   7.234 +            struct segment_register reg[x86_seg_tr + 1];
   7.235 +
   7.236 +            /* Entering or leaving real mode: adjust the segment registers.
   7.237 +             * Need to read them all either way, as realmode reads can update
   7.238 +             * the saved values we'll use when returning to prot mode. */
   7.239 +            for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
   7.240 +                vmx_get_segment_register(v, s, &reg[s]);
   7.241 +            v->arch.hvm_vmx.vmx_realmode = realmode;
   7.242 +            
   7.243 +            if ( realmode )
   7.244 +            {
   7.245 +                for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
   7.246 +                    vmx_set_segment_register(v, s, &reg[s]);
   7.247 +                v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
   7.248 +                __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
   7.249 +                __vmwrite(EXCEPTION_BITMAP, 0xffffffff);
   7.250 +            }
   7.251 +            else 
   7.252 +            {
   7.253 +                for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ ) 
   7.254 +                    if ( !(v->arch.hvm_vmx.vm86_segment_mask & (1<<s)) )
   7.255 +                        vmx_set_segment_register(
   7.256 +                            v, s, &v->arch.hvm_vmx.vm86_saved_seg[s]);
   7.257 +                v->arch.hvm_vcpu.hw_cr[4] =
   7.258 +                    ((v->arch.hvm_vcpu.hw_cr[4] & ~X86_CR4_VME)
   7.259 +                     |(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_VME));
   7.260 +                __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
   7.261 +                __vmwrite(EXCEPTION_BITMAP, 
   7.262 +                          HVM_TRAP_MASK
   7.263 +                          | (paging_mode_hap(v->domain) ?
   7.264 +                             0 : (1U << TRAP_page_fault))
   7.265 +                          | (1U << TRAP_no_device));
   7.266 +            }
   7.267 +        }
   7.268  
   7.269          v->arch.hvm_vcpu.hw_cr[0] =
   7.270              v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask;
   7.271 @@ -1028,6 +1155,8 @@ static void vmx_update_guest_cr(struct v
   7.272          if ( paging_mode_hap(v->domain) )
   7.273              v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
   7.274          v->arch.hvm_vcpu.hw_cr[4] |= v->arch.hvm_vcpu.guest_cr[4];
   7.275 +        if ( v->arch.hvm_vmx.vmx_realmode ) 
   7.276 +            v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
   7.277          if ( paging_mode_hap(v->domain) && !hvm_paging_enabled(v) )
   7.278          {
   7.279              v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE;
   7.280 @@ -1097,6 +1226,7 @@ void ept_sync_domain(struct domain *d)
   7.281  static void __vmx_inject_exception(int trap, int type, int error_code)
   7.282  {
   7.283      unsigned long intr_fields;
   7.284 +    struct vcpu *curr = current;
   7.285  
   7.286      /*
   7.287       * NB. Callers do not need to worry about clearing STI/MOV-SS blocking:
   7.288 @@ -1113,6 +1243,11 @@ static void __vmx_inject_exception(int t
   7.289      }
   7.290  
   7.291      __vmwrite(VM_ENTRY_INTR_INFO, intr_fields);
   7.292 +
   7.293 +    /* Can't inject exceptions in virtual 8086 mode because they would 
   7.294 +     * use the protected-mode IDT.  Emulate at the next vmenter instead. */
   7.295 +    if ( curr->arch.hvm_vmx.vmx_realmode ) 
   7.296 +        curr->arch.hvm_vmx.vmx_emulate = 1;
   7.297  }
   7.298  
   7.299  void vmx_inject_hw_exception(int trap, int error_code)
   7.300 @@ -2072,6 +2207,17 @@ static void vmx_failed_vmentry(unsigned 
   7.301      domain_crash(curr->domain);
   7.302  }
   7.303  
   7.304 +asmlinkage void vmx_enter_realmode(struct cpu_user_regs *regs)
   7.305 +{
   7.306 +    struct vcpu *v = current;
   7.307 +
   7.308 +    /* Adjust RFLAGS to enter virtual 8086 mode with IOPL == 3.  Since
   7.309 +     * we have CR4.VME == 1 and our own TSS with an empty interrupt
    7.310 +     * redirection bitmap, all software INTs will be handled by vm86. */
   7.311 +    v->arch.hvm_vmx.vm86_saved_eflags = regs->eflags;
   7.312 +    regs->eflags |= (X86_EFLAGS_VM | X86_EFLAGS_IOPL);
   7.313 +}
   7.314 +
   7.315  asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
   7.316  {
   7.317      unsigned int exit_reason, idtv_info;
   7.318 @@ -2100,6 +2246,42 @@ asmlinkage void vmx_vmexit_handler(struc
   7.319      if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
   7.320          return vmx_failed_vmentry(exit_reason, regs);
   7.321  
   7.322 +    if ( v->arch.hvm_vmx.vmx_realmode )
   7.323 +    {
   7.324 +        unsigned int vector;
   7.325 +
   7.326 +        /* Put RFLAGS back the way the guest wants it */
   7.327 +        regs->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IOPL);
   7.328 +        regs->eflags |= (v->arch.hvm_vmx.vm86_saved_eflags & X86_EFLAGS_IOPL);
   7.329 +
   7.330 +        /* Unless this exit was for an interrupt, we've hit something
   7.331 +         * vm86 can't handle.  Try again, using the emulator. */
   7.332 +        switch ( exit_reason )
   7.333 +        {
   7.334 +        case EXIT_REASON_EXCEPTION_NMI:
    7.335 +            vector = __vmread(VM_EXIT_INTR_INFO) & INTR_INFO_VECTOR_MASK;
   7.336 +            if ( vector != TRAP_page_fault
   7.337 +                 && vector != TRAP_nmi 
   7.338 +                 && vector != TRAP_machine_check ) 
   7.339 +            {
   7.340 +                perfc_incr(realmode_exits);
   7.341 +                v->arch.hvm_vmx.vmx_emulate = 1;
   7.342 +                return;
   7.343 +            }
   7.344 +        case EXIT_REASON_EXTERNAL_INTERRUPT:
   7.345 +        case EXIT_REASON_INIT:
   7.346 +        case EXIT_REASON_SIPI:
   7.347 +        case EXIT_REASON_PENDING_VIRT_INTR:
   7.348 +        case EXIT_REASON_PENDING_VIRT_NMI:
   7.349 +        case EXIT_REASON_MACHINE_CHECK:
   7.350 +            break;
   7.351 +        default:
   7.352 +            v->arch.hvm_vmx.vmx_emulate = 1;
   7.353 +            perfc_incr(realmode_exits);
   7.354 +            return;
   7.355 +        }
   7.356 +    }
   7.357 +
   7.358      hvm_maybe_deassert_evtchn_irq();
   7.359  
   7.360      /* Event delivery caused this intercept? Queue for redelivery. */
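
Two of the hunks above form a matched pair around every vm86 vmentry and vmexit; condensed, the RFLAGS round trip is:

    /* On the way in (vmx_enter_realmode): remember the guest's flags,
     * then force VM=1 and IOPL=3 so vm86 handles INTs and IO itself. */
    v->arch.hvm_vmx.vm86_saved_eflags = regs->eflags;
    regs->eflags |= X86_EFLAGS_VM | X86_EFLAGS_IOPL;

    /* On every vmexit while vmx_realmode is set: strip VM entirely and
     * put back only the guest's own IOPL, so the fix-up never leaks. */
    regs->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IOPL);
    regs->eflags |= v->arch.hvm_vmx.vm86_saved_eflags & X86_EFLAGS_IOPL;
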
     8.1 --- a/xen/arch/x86/x86_32/asm-offsets.c	Tue Dec 09 13:23:15 2008 +0000
     8.2 +++ b/xen/arch/x86/x86_32/asm-offsets.c	Tue Dec 09 16:28:02 2008 +0000
     8.3 @@ -88,7 +88,9 @@ void __dummy__(void)
     8.4      BLANK();
     8.5  
     8.6      OFFSET(VCPU_vmx_launched, struct vcpu, arch.hvm_vmx.launched);
     8.7 -    OFFSET(VCPU_vmx_emul, struct vcpu, arch.hvm_vmx.vmxemul);
     8.8 +    OFFSET(VCPU_vmx_realmode, struct vcpu, arch.hvm_vmx.vmx_realmode);
     8.9 +    OFFSET(VCPU_vmx_emulate, struct vcpu, arch.hvm_vmx.vmx_emulate);
    8.10 +    OFFSET(VCPU_vm86_seg_mask, struct vcpu, arch.hvm_vmx.vm86_segment_mask);
    8.11      OFFSET(VCPU_hvm_guest_cr2, struct vcpu, arch.hvm_vcpu.guest_cr[2]);
    8.12      BLANK();
    8.13  
     9.1 --- a/xen/arch/x86/x86_64/asm-offsets.c	Tue Dec 09 13:23:15 2008 +0000
     9.2 +++ b/xen/arch/x86/x86_64/asm-offsets.c	Tue Dec 09 16:28:02 2008 +0000
     9.3 @@ -107,7 +107,9 @@ void __dummy__(void)
     9.4      BLANK();
     9.5  
     9.6      OFFSET(VCPU_vmx_launched, struct vcpu, arch.hvm_vmx.launched);
     9.7 -    OFFSET(VCPU_vmx_emul, struct vcpu, arch.hvm_vmx.vmxemul);
     9.8 +    OFFSET(VCPU_vmx_realmode, struct vcpu, arch.hvm_vmx.vmx_realmode);
     9.9 +    OFFSET(VCPU_vmx_emulate, struct vcpu, arch.hvm_vmx.vmx_emulate);
    9.10 +    OFFSET(VCPU_vm86_seg_mask, struct vcpu, arch.hvm_vmx.vm86_segment_mask);
    9.11      OFFSET(VCPU_hvm_guest_cr2, struct vcpu, arch.hvm_vcpu.guest_cr[2]);
    9.12      BLANK();
    9.13  
    10.1 --- a/xen/arch/x86/x86_emulate/x86_emulate.h	Tue Dec 09 13:23:15 2008 +0000
    10.2 +++ b/xen/arch/x86/x86_emulate/x86_emulate.h	Tue Dec 09 16:28:02 2008 +0000
    10.3 @@ -67,6 +67,7 @@ typedef union segment_attributes {
    10.4          uint16_t l:   1;    /* 9;  Bit 53 */
    10.5          uint16_t db:  1;    /* 10; Bit 54 */
    10.6          uint16_t g:   1;    /* 11; Bit 55 */
    10.7 +        uint16_t pad: 4;
    10.8      } fields;
    10.9  } __attribute__ ((packed)) segment_attributes_t;
   10.10  
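
The new pad bits make the union cover the full 16 attribute bits, which the rm_*/vm86_* literals in vmx.c depend on. How one of them packs, and how vmx_set_segment_register() then spreads it into the VMCS AR-bytes format (values worked by hand from the bitfield layout above):

    /* vm86_ds_attr: type=0x3, s=1, dpl=3, p=1
     *   => 0x3 | (1u << 4) | (3u << 5) | (1u << 7) == 0xf3
     * rm_cs_attr:   type=0xb, s=1, dpl=0, p=1 == 0x9b */
    uint16_t attr = 0xf3;

    /* VMCS AR bytes keep attribute bits 8-11 at bits 12-15: */
    uint32_t ar = ((attr & 0xf00) << 4) | (attr & 0xff);   /* 0xf3 again */
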
    11.1 --- a/xen/include/asm-x86/hvm/vmx/vmcs.h	Tue Dec 09 13:23:15 2008 +0000
    11.2 +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h	Tue Dec 09 16:28:02 2008 +0000
    11.3 @@ -109,11 +109,16 @@ struct arch_vmx_struct {
    11.4  
    11.5      unsigned long        host_cr0;
    11.6  
    11.7 +    /* Is the guest in real mode? */
    11.8 +    uint8_t              vmx_realmode;
    11.9      /* Are we emulating rather than VMENTERing? */
   11.10 -#define VMXEMUL_REALMODE 1  /* Yes, because CR0.PE == 0   */
   11.11 -#define VMXEMUL_BAD_CS   2  /* Yes, because CS.RPL != CPL */
   11.12 -#define VMXEMUL_BAD_SS   4  /* Yes, because SS.RPL != CPL */
   11.13 -    uint8_t              vmxemul;
   11.14 +    uint8_t              vmx_emulate;
   11.15 +    /* Bitmask of segments that we can't safely use in virtual 8086 mode */
   11.16 +    uint16_t             vm86_segment_mask;
   11.17 +    /* Shadow CS, SS, DS, ES, FS, GS, TR while in virtual 8086 mode */
   11.18 +    struct segment_register vm86_saved_seg[x86_seg_tr + 1];
   11.19 +    /* Remember EFLAGS while in virtual 8086 mode */
   11.20 +    uint32_t             vm86_saved_eflags;
   11.21  };
   11.22  
   11.23  int vmx_create_vmcs(struct vcpu *v);
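
vm86_segment_mask is indexed by enum x86_segment (CS through TR), so one 16-bit compare covers all seven registers; the invariant the entry stub tests with "cmpw $0,VCPU_vm86_seg_mask" amounts to (predicate illustrative):

    /* A bit is set when that saved segment cannot be expressed under
     * vm86, and cleared when it can; vmentry through vm86 is allowed
     * only when no bit is set. */
    static int all_segments_vm86_safe(const struct vcpu *v)
    {
        return v->arch.hvm_vmx.vm86_segment_mask == 0;
    }
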
    12.1 --- a/xen/include/asm-x86/perfc_defn.h	Tue Dec 09 13:23:15 2008 +0000
    12.2 +++ b/xen/include/asm-x86/perfc_defn.h	Tue Dec 09 16:28:02 2008 +0000
    12.3 @@ -127,4 +127,7 @@ PERFCOUNTER(mshv_wrmsr_icr,             
    12.4  PERFCOUNTER(mshv_wrmsr_tpr,             "MS Hv wrmsr tpr")
    12.5  PERFCOUNTER(mshv_wrmsr_eoi,             "MS Hv wrmsr eoi")
    12.6  
    12.7 +PERFCOUNTER(realmode_emulations, "realmode instructions emulated")
    12.8 +PERFCOUNTER(realmode_exits,      "vmexits from realmode")
    12.9 +
   12.10  /*#endif*/ /* __XEN_PERFC_DEFN_H__ */
    13.1 --- a/xen/include/public/hvm/params.h	Tue Dec 09 13:23:15 2008 +0000
    13.2 +++ b/xen/include/public/hvm/params.h	Tue Dec 09 16:28:02 2008 +0000
    13.3 @@ -100,6 +100,9 @@
    13.4  /* ACPI S state: currently support S0 and S3 on x86. */
    13.5  #define HVM_PARAM_ACPI_S_STATE 14
    13.6  
    13.7 -#define HVM_NR_PARAMS          15
    13.8 +/* TSS used on Intel when CR0.PE=0. */
    13.9 +#define HVM_PARAM_VM86_TSS     15
   13.10 +
   13.11 +#define HVM_NR_PARAMS          16
   13.12  
   13.13  #endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
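
For completeness, a toolstack can read the new parameter back exactly as the save path does; a minimal libxc usage sketch (error handling elided):

    #include <xenctrl.h>
    #include <xen/hvm/params.h>

    unsigned long tss_gpa = 0;
    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, &tss_gpa);
    /* A value of 0 means the firmware never placed a TSS; Xen then marks
     * TR unsafe in vm86_segment_mask and falls back to full emulation. */
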