ia64/xen-unstable

changeset 6214:027812e4a63c

Fix a context-switching race which could cause vcpu_pause()
to not actually do its job. This requires us to move the
clearing of VCPUF_running so that it is protected by the
scheduler lock.

Signed-off-by: Keir Fraser <keir@xensource.com>
author kaf24@firebug.cl.cam.ac.uk
date Tue Aug 16 18:02:24 2005 +0000 (2005-08-16)
parents 26c03c17c418
children c7689e1e0768
files xen/arch/ia64/xenmisc.c xen/arch/x86/domain.c xen/arch/x86/vmx.c xen/common/schedule.c xen/include/asm-x86/vmx_vmcs.h xen/include/xen/sched.h
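
The race in question: vcpu_pause() synchronises against a remote VCPU by waiting for that VCPU's _VCPUF_running flag to clear. Before this changeset the flag was cleared inside the architecture's context_switch(), after the scheduler lock had already been dropped, so a remote CPU could see the flag go clear while the outgoing VCPU's context was still live on its old CPU and not yet committed to memory. After this changeset the flag only changes while the scheduler lock is held with interrupts disabled, and the lock is not released until context_switch() has returned. A hedged sketch of the kind of waiter that relies on this ordering follows; wait_for_descheduled() is a hypothetical name, and the real vcpu_pause() path is not part of this diff.

    /*
     * Hedged sketch, not Xen code: a waiter that depends on the new
     * ordering.  It uses Xen's test_bit()/cpu_relax() helpers; the
     * name wait_for_descheduled() is invented for illustration.
     */
    static void wait_for_descheduled(struct vcpu *v)
    {
        /* Spin until the scheduler has cleared _VCPUF_running for v. */
        while ( test_bit(_VCPUF_running, &v->vcpu_flags) )
            cpu_relax();

        /*
         * The flag is now only cleared with the scheduler lock held and
         * interrupts off, so once it is observed clear, context_switch()
         * on v's old CPU will complete before that CPU services any
         * cross-CPU sync (e.g. the IPI behind sync_lazy_execstate_cpu()),
         * which can then safely commit v's state to memory.
         */
    }
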
line diff
     1.1 --- a/xen/arch/ia64/xenmisc.c	Tue Aug 16 17:02:49 2005 +0000
     1.2 +++ b/xen/arch/ia64/xenmisc.c	Tue Aug 16 18:02:24 2005 +0000
     1.3 @@ -280,7 +280,6 @@ void cs01foo(void) {}
     1.4  
     1.5  unsigned long context_switch_count = 0;
     1.6  
     1.7 -// context_switch
     1.8  void context_switch(struct vcpu *prev, struct vcpu *next)
     1.9  {
    1.10  //printk("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n");
    1.11 @@ -290,22 +289,14 @@ void context_switch(struct vcpu *prev, s
    1.12  //if (prev->domain->domain_id == 0 && next->domain->domain_id == 1) cs01foo();
    1.13  //printk("@@sw %d->%d\n",prev->domain->domain_id,next->domain->domain_id);
    1.14  #ifdef CONFIG_VTI
    1.15 -	unsigned long psr;
    1.16 -	/* Interrupt is enabled after next task is chosen.
    1.17 -	 * So we have to disable it for stack switch.
    1.18 -	 */
    1.19 -	local_irq_save(psr);
    1.20  	vtm_domain_out(prev);
    1.21 -	/* Housekeeping for prev domain */
    1.22 -#endif // CONFIG_VTI
    1.23 -
    1.24 +#endif
    1.25  	context_switch_count++;
    1.26  	switch_to(prev,next,prev);
    1.27  #ifdef CONFIG_VTI
    1.28 -	/* Post-setup for new domain */
    1.29  	 vtm_domain_in(current);
    1.30 -	local_irq_restore(psr);
    1.31 -#endif // CONFIG_VTI
    1.32 +#endif
    1.33 +
    1.34  // leave this debug for now: it acts as a heartbeat when more than
    1.35  // one domain is active
    1.36  {
    1.37 @@ -315,25 +306,27 @@ int id = ((struct vcpu *)current)->domai
    1.38  if (!cnt[id]--) { printk("%x",id); cnt[id] = 500000; }
    1.39  if (!i--) { printk("+",id); i = 1000000; }
    1.40  }
    1.41 -	clear_bit(_VCPUF_running, &prev->vcpu_flags);
    1.42 -	//if (!is_idle_task(next->domain) )
    1.43 -		//send_guest_virq(next, VIRQ_TIMER);
    1.44 +
    1.45  #ifdef CONFIG_VTI
    1.46  	if (VMX_DOMAIN(current))
    1.47  		vmx_load_all_rr(current);
    1.48 -	return;
    1.49 -#else // CONFIG_VTI
    1.50 +#else
    1.51  	if (!is_idle_task(current->domain)) {
    1.52  		load_region_regs(current);
    1.53  		if (vcpu_timer_expired(current)) vcpu_pend_timer(current);
    1.54  	}
    1.55  	if (vcpu_timer_expired(current)) vcpu_pend_timer(current);
    1.56 -#endif // CONFIG_VTI
    1.57 +#endif
    1.58 +}
    1.59 +
    1.60 +void context_switch_finalise(struct vcpu *next)
    1.61 +{
    1.62 +	/* nothing to do */
    1.63  }
    1.64  
    1.65  void continue_running(struct vcpu *same)
    1.66  {
    1.67 -    /* nothing to do */
    1.68 +	/* nothing to do */
    1.69  }
    1.70  
    1.71  void panic_domain(struct pt_regs *regs, const char *fmt, ...)
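
On ia64 the new context_switch_finalise() hook is an empty stub: all the work still happens in context_switch(), and the clear_bit(_VCPUF_running, ...) call and the local_irq_save()/local_irq_restore() around the VTI path are gone because the common scheduler now calls context_switch() with the scheduler lock held and interrupts already disabled, and clears the flag itself under that lock. The hook exists so that an architecture which cannot do all of its switch work with interrupts off can defer part of it. A hedged sketch of how such an architecture might split the work follows; arch_save_state(), arch_start_load() and arch_finish_load() are hypothetical helpers, not Xen functions.

    /* Hedged sketch for a hypothetical architecture: splitting the switch
     * across the two hooks introduced by this changeset. */
    void context_switch(struct vcpu *prev, struct vcpu *next)
    {
        /* Scheduler lock held, IRQs off: prev's context must be saved
         * (or made syncable via __sync_lazy_execstate()) before return. */
        arch_save_state(prev);
        arch_start_load(next);
    }

    void context_switch_finalise(struct vcpu *next)
    {
        /* Lock dropped, IRQs back on: finish any loading that needs
         * interrupts enabled.  prev must not be touched here. */
        arch_finish_load(next);
    }
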
     2.1 --- a/xen/arch/x86/domain.c	Tue Aug 16 17:02:49 2005 +0000
     2.2 +++ b/xen/arch/x86/domain.c	Tue Aug 16 18:02:24 2005 +0000
     2.3 @@ -48,6 +48,8 @@ boolean_param("noreboot", opt_noreboot);
     2.4  
     2.5  struct percpu_ctxt {
     2.6      struct vcpu *curr_vcpu;
     2.7 +    unsigned int context_not_finalised;
     2.8 +    unsigned int dirty_segment_mask;
     2.9  } __cacheline_aligned;
    2.10  static struct percpu_ctxt percpu_ctxt[NR_CPUS];
    2.11  
    2.12 @@ -541,51 +543,59 @@ void toggle_guest_mode(struct vcpu *v)
    2.13      __r; })
    2.14  
    2.15  #if CONFIG_VMX
    2.16 -#define load_msrs(_p, _n)     if (vmx_switch_on) vmx_load_msrs((_p), (_n))
    2.17 +#define load_msrs(n)     if (vmx_switch_on) vmx_load_msrs(n)
    2.18  #else
    2.19 -#define load_msrs(_p, _n)     ((void)0)
    2.20 +#define load_msrs(n)     ((void)0)
    2.21  #endif 
    2.22  
    2.23 -static void load_segments(struct vcpu *p, struct vcpu *n)
    2.24 +/*
    2.25 + * save_segments() writes a mask of segments which are dirty (non-zero),
    2.26 + * allowing load_segments() to avoid some expensive segment loads and
    2.27 + * MSR writes.
    2.28 + */
    2.29 +#define DIRTY_DS           0x01
    2.30 +#define DIRTY_ES           0x02
    2.31 +#define DIRTY_FS           0x04
    2.32 +#define DIRTY_GS           0x08
    2.33 +#define DIRTY_FS_BASE      0x10
    2.34 +#define DIRTY_GS_BASE_USER 0x20
    2.35 +
    2.36 +static void load_segments(struct vcpu *n)
    2.37  {
    2.38 -    struct vcpu_guest_context *pctxt = &p->arch.guest_context;
    2.39      struct vcpu_guest_context *nctxt = &n->arch.guest_context;
    2.40      int all_segs_okay = 1;
    2.41 +    unsigned int dirty_segment_mask, cpu = smp_processor_id();
    2.42 +
    2.43 +    /* Load and clear the dirty segment mask. */
    2.44 +    dirty_segment_mask = percpu_ctxt[cpu].dirty_segment_mask;
    2.45 +    percpu_ctxt[cpu].dirty_segment_mask = 0;
    2.46  
    2.47      /* Either selector != 0 ==> reload. */
    2.48 -    if ( unlikely(pctxt->user_regs.ds | nctxt->user_regs.ds) )
    2.49 +    if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
    2.50          all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);
    2.51  
    2.52      /* Either selector != 0 ==> reload. */
    2.53 -    if ( unlikely(pctxt->user_regs.es | nctxt->user_regs.es) )
    2.54 +    if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
    2.55          all_segs_okay &= loadsegment(es, nctxt->user_regs.es);
    2.56  
    2.57      /*
    2.58       * Either selector != 0 ==> reload.
    2.59       * Also reload to reset FS_BASE if it was non-zero.
    2.60       */
    2.61 -    if ( unlikely(pctxt->user_regs.fs |
    2.62 -                  pctxt->fs_base |
    2.63 +    if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
    2.64                    nctxt->user_regs.fs) )
    2.65 -    {
    2.66          all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);
    2.67 -        if ( pctxt->user_regs.fs ) /* != 0 selector kills fs_base */
    2.68 -            pctxt->fs_base = 0;
    2.69 -    }
    2.70  
    2.71      /*
    2.72       * Either selector != 0 ==> reload.
    2.73       * Also reload to reset GS_BASE if it was non-zero.
    2.74       */
    2.75 -    if ( unlikely(pctxt->user_regs.gs |
    2.76 -                  pctxt->gs_base_user |
    2.77 +    if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
    2.78                    nctxt->user_regs.gs) )
    2.79      {
    2.80          /* Reset GS_BASE with user %gs? */
    2.81 -        if ( pctxt->user_regs.gs || !nctxt->gs_base_user )
    2.82 +        if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
    2.83              all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
    2.84 -        if ( pctxt->user_regs.gs ) /* != 0 selector kills gs_base_user */
    2.85 -            pctxt->gs_base_user = 0;
    2.86      }
    2.87  
    2.88      /* This can only be non-zero if selector is NULL. */
    2.89 @@ -650,7 +660,9 @@ static void load_segments(struct vcpu *p
    2.90  
    2.91  static void save_segments(struct vcpu *v)
    2.92  {
    2.93 -    struct cpu_user_regs *regs = &v->arch.guest_context.user_regs;
    2.94 +    struct vcpu_guest_context *ctxt = &v->arch.guest_context;
    2.95 +    struct cpu_user_regs      *regs = &ctxt->user_regs;
    2.96 +    unsigned int dirty_segment_mask = 0;
    2.97  
    2.98      if ( VMX_DOMAIN(v) )
    2.99          rdmsrl(MSR_SHADOW_GS_BASE, v->arch.arch_vmx.msr_content.shadow_gs);
   2.100 @@ -659,18 +671,34 @@ static void save_segments(struct vcpu *v
   2.101      __asm__ __volatile__ ( "movl %%es,%0" : "=m" (regs->es) );
   2.102      __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (regs->fs) );
   2.103      __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (regs->gs) );
   2.104 -}
   2.105  
   2.106 -static void clear_segments(void)
   2.107 -{
   2.108 -    __asm__ __volatile__ (
   2.109 -        " movl %0,%%ds; "
   2.110 -        " movl %0,%%es; "
   2.111 -        " movl %0,%%fs; "
   2.112 -        " movl %0,%%gs; "
   2.113 -        ""safe_swapgs"  "
   2.114 -        " movl %0,%%gs"
   2.115 -        : : "r" (0) );
   2.116 +    if ( regs->ds )
   2.117 +        dirty_segment_mask |= DIRTY_DS;
   2.118 +
   2.119 +    if ( regs->es )
   2.120 +        dirty_segment_mask |= DIRTY_ES;
   2.121 +
   2.122 +    if ( regs->fs )
   2.123 +    {
   2.124 +        dirty_segment_mask |= DIRTY_FS;
   2.125 +        ctxt->fs_base = 0; /* != 0 selector kills fs_base */
   2.126 +    }
   2.127 +    else if ( ctxt->fs_base )
   2.128 +    {
   2.129 +        dirty_segment_mask |= DIRTY_FS_BASE;
   2.130 +    }
   2.131 +
   2.132 +    if ( regs->gs )
   2.133 +    {
   2.134 +        dirty_segment_mask |= DIRTY_GS;
   2.135 +        ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
   2.136 +    }
   2.137 +    else if ( ctxt->gs_base_user )
   2.138 +    {
   2.139 +        dirty_segment_mask |= DIRTY_GS_BASE_USER;
   2.140 +    }
   2.141 +
   2.142 +    percpu_ctxt[smp_processor_id()].dirty_segment_mask = dirty_segment_mask;
   2.143  }
   2.144  
   2.145  long do_switch_to_user(void)
   2.146 @@ -706,10 +734,9 @@ long do_switch_to_user(void)
   2.147  
   2.148  #elif defined(__i386__)
   2.149  
   2.150 -#define load_segments(_p, _n) ((void)0)
   2.151 -#define load_msrs(_p, _n)     ((void)0)
   2.152 -#define save_segments(_p)     ((void)0)
   2.153 -#define clear_segments()      ((void)0)
   2.154 +#define load_segments(n) ((void)0)
   2.155 +#define load_msrs(n)     ((void)0)
   2.156 +#define save_segments(p) ((void)0)
   2.157  
   2.158  static inline void switch_kernel_stack(struct vcpu *n, unsigned int cpu)
   2.159  {
   2.160 @@ -726,9 +753,9 @@ static inline void switch_kernel_stack(s
   2.161  static void __context_switch(void)
   2.162  {
   2.163      struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
   2.164 -    unsigned int         cpu = smp_processor_id();
   2.165 -    struct vcpu  *p = percpu_ctxt[cpu].curr_vcpu;
   2.166 -    struct vcpu  *n = current;
   2.167 +    unsigned int          cpu = smp_processor_id();
   2.168 +    struct vcpu          *p = percpu_ctxt[cpu].curr_vcpu;
   2.169 +    struct vcpu          *n = current;
   2.170  
   2.171      if ( !is_idle_task(p->domain) )
   2.172      {
   2.173 @@ -786,23 +813,27 @@ static void __context_switch(void)
   2.174  
   2.175  void context_switch(struct vcpu *prev, struct vcpu *next)
   2.176  {
   2.177 -    struct vcpu *realprev;
   2.178 -
   2.179 -    local_irq_disable();
   2.180 +    unsigned int cpu = smp_processor_id();
   2.181  
   2.182      set_current(next);
   2.183  
   2.184 -    if ( ((realprev = percpu_ctxt[smp_processor_id()].curr_vcpu) == next) || 
   2.185 -         is_idle_task(next->domain) )
   2.186 -    {
   2.187 -        local_irq_enable();
   2.188 -    }
   2.189 -    else
   2.190 +    if ( (percpu_ctxt[cpu].curr_vcpu != next) && !is_idle_task(next->domain) )
   2.191      {
   2.192          __context_switch();
   2.193 +        percpu_ctxt[cpu].context_not_finalised = 1;
   2.194 +    }
   2.195 +}
   2.196  
   2.197 -        local_irq_enable();
   2.198 -        
   2.199 +void context_switch_finalise(struct vcpu *next)
   2.200 +{
   2.201 +    unsigned int cpu = smp_processor_id();
   2.202 +
   2.203 +    if ( percpu_ctxt[cpu].context_not_finalised )
   2.204 +    {
   2.205 +        percpu_ctxt[cpu].context_not_finalised = 0;
   2.206 +
   2.207 +        BUG_ON(percpu_ctxt[cpu].curr_vcpu != next);
   2.208 +
   2.209          if ( VMX_DOMAIN(next) )
   2.210          {
   2.211              vmx_restore_msrs(next);
   2.212 @@ -810,19 +841,11 @@ void context_switch(struct vcpu *prev, s
   2.213          else
   2.214          {
   2.215              load_LDT(next);
   2.216 -            load_segments(realprev, next);
   2.217 -            load_msrs(realprev, next);
   2.218 +            load_segments(next);
   2.219 +            load_msrs(next);
   2.220          }
   2.221      }
   2.222  
   2.223 -    /*
   2.224 -     * We do this late on because it doesn't need to be protected by the
   2.225 -     * schedule_lock, and because we want this to be the very last use of
   2.226 -     * 'prev' (after this point, a dying domain's info structure may be freed
   2.227 -     * without warning). 
   2.228 -     */
   2.229 -    clear_bit(_VCPUF_running, &prev->vcpu_flags);
   2.230 -
   2.231      schedule_tail(next);
   2.232      BUG();
   2.233  }
   2.234 @@ -835,12 +858,19 @@ void continue_running(struct vcpu *same)
   2.235  
   2.236  int __sync_lazy_execstate(void)
   2.237  {
   2.238 -    if ( percpu_ctxt[smp_processor_id()].curr_vcpu == current )
   2.239 -        return 0;
   2.240 -    __context_switch();
   2.241 -    load_LDT(current);
   2.242 -    clear_segments();
   2.243 -    return 1;
   2.244 +    unsigned long flags;
   2.245 +    int switch_required;
   2.246 +
   2.247 +    local_irq_save(flags);
   2.248 +
   2.249 +    switch_required = (percpu_ctxt[smp_processor_id()].curr_vcpu != current);
   2.250 +
   2.251 +    if ( switch_required )
   2.252 +        __context_switch();
   2.253 +
   2.254 +    local_irq_restore(flags);
   2.255 +
   2.256 +    return switch_required;
   2.257  }
   2.258  
   2.259  void sync_lazy_execstate_cpu(unsigned int cpu)
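
The x86 hunks do two things. First, load_segments(), load_msrs() and (below) vmx_load_msrs() lose their prev argument: they now run from context_switch_finalise(), by which point prev may no longer be a valid pointer (the sched.h comment at the end of this changeset forbids touching it), so whatever used to be read from prev is instead recorded at save time in a per-CPU dirty_segment_mask. Second, __sync_lazy_execstate() becomes interrupt-safe. The dirty-mask rule itself is simple: a segment is reloaded only if the outgoing context left it dirty (non-zero selector, or a non-zero base for fs/gs) or the incoming context wants a non-zero selector. The following is a hedged, standalone model of that rule in plain C (not Xen code, and it only models %ds and %es):

    /*
     * Standalone model of the dirty-segment-mask decision introduced above.
     * A segment is reloaded only if the outgoing context left it dirty or
     * the incoming context wants a non-zero selector.
     */
    #include <stdio.h>

    #define DIRTY_DS 0x01
    #define DIRTY_ES 0x02

    struct ctx { unsigned short ds, es; };

    static unsigned int save_mask(const struct ctx *prev)
    {
        unsigned int mask = 0;
        if ( prev->ds ) mask |= DIRTY_DS;
        if ( prev->es ) mask |= DIRTY_ES;
        return mask;
    }

    static int needs_reload_ds(unsigned int dirty_mask, const struct ctx *next)
    {
        /* Same test shape as the patch: (mask & DIRTY_DS) | next selector. */
        return (dirty_mask & DIRTY_DS) | next->ds;
    }

    int main(void)
    {
        struct ctx prev = { .ds = 0x2b, .es = 0 };
        struct ctx next = { .ds = 0,    .es = 0 };
        unsigned int mask = save_mask(&prev);

        /* prev dirtied %ds, so it must be reloaded even though next's is 0. */
        printf("reload ds: %d\n", needs_reload_ds(mask, &next) != 0);
        /* Neither side touched %es, so the expensive reload is skipped. */
        printf("reload es: %d\n", ((mask & DIRTY_ES) | next.es) != 0);
        return 0;
    }
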
     3.1 --- a/xen/arch/x86/vmx.c	Tue Aug 16 17:02:49 2005 +0000
     3.2 +++ b/xen/arch/x86/vmx.c	Tue Aug 16 18:02:24 2005 +0000
     3.3 @@ -65,7 +65,7 @@ static u32 msr_data_index[VMX_MSR_COUNT]
     3.4   * are not modified once set for generic domains, we don't save them, 
     3.5   * but simply reset them to the values set at percpu_traps_init().
     3.6   */
     3.7 -void vmx_load_msrs(struct vcpu *p, struct vcpu *n)
     3.8 +void vmx_load_msrs(struct vcpu *n)
     3.9  {
    3.10      struct msr_state *host_state;
    3.11      host_state = &percpu_msr[smp_processor_id()];
     4.1 --- a/xen/common/schedule.c	Tue Aug 16 17:02:49 2005 +0000
     4.2 +++ b/xen/common/schedule.c	Tue Aug 16 18:02:24 2005 +0000
     4.3 @@ -474,13 +474,14 @@ static void __enter_scheduler(void)
     4.4  
     4.5      set_ac_timer(&schedule_data[cpu].s_timer, now + r_time);
     4.6  
     4.7 -    /* Must be protected by the schedule_lock! */
     4.8 -    set_bit(_VCPUF_running, &next->vcpu_flags);
     4.9 +    if ( unlikely(prev == next) )
    4.10 +    {
    4.11 +        spin_unlock_irq(&schedule_data[cpu].schedule_lock);
    4.12 +        return continue_running(prev);
    4.13 +    }
    4.14  
    4.15 -    spin_unlock_irq(&schedule_data[cpu].schedule_lock);
    4.16 -
    4.17 -    if ( unlikely(prev == next) )
    4.18 -        return continue_running(prev);
    4.19 +    clear_bit(_VCPUF_running, &prev->vcpu_flags);
    4.20 +    set_bit(_VCPUF_running, &next->vcpu_flags);
    4.21  
    4.22      perfc_incrc(sched_ctx);
    4.23  
    4.24 @@ -517,6 +518,10 @@ static void __enter_scheduler(void)
    4.25               next->domain->domain_id, next->vcpu_id);
    4.26  
    4.27      context_switch(prev, next);
    4.28 +
    4.29 +    spin_unlock_irq(&schedule_data[cpu].schedule_lock);
    4.30 +
    4.31 +    context_switch_finalise(next);
    4.32  }
    4.33  
    4.34  /* No locking needed -- pointer comparison is safe :-) */
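
The schedule.c hunks are the heart of the fix. Both running flags are now flipped while the scheduler lock is still held, context_switch() is called with the lock held and interrupts off, and only once the lock has been dropped does the new context_switch_finalise() run. Reconstructed from the two hunks above, the tail of __enter_scheduler() now reads (intervening accounting and trace code elided):

    if ( unlikely(prev == next) )
    {
        spin_unlock_irq(&schedule_data[cpu].schedule_lock);
        return continue_running(prev);
    }

    clear_bit(_VCPUF_running, &prev->vcpu_flags);  /* now under the lock */
    set_bit(_VCPUF_running, &next->vcpu_flags);

    /* ... accounting elided ... */

    context_switch(prev, next);                    /* lock held, IRQs off */

    spin_unlock_irq(&schedule_data[cpu].schedule_lock);

    context_switch_finalise(next);                 /* lock and IRQs released */
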
     5.1 --- a/xen/include/asm-x86/vmx_vmcs.h	Tue Aug 16 17:02:49 2005 +0000
     5.2 +++ b/xen/include/asm-x86/vmx_vmcs.h	Tue Aug 16 18:02:24 2005 +0000
     5.3 @@ -28,10 +28,10 @@ extern int start_vmx(void);
     5.4  extern void stop_vmx(void);
     5.5  
     5.6  #if defined (__x86_64__)
     5.7 -extern void vmx_load_msrs(struct vcpu *p, struct vcpu *n);
     5.8 +extern void vmx_load_msrs(struct vcpu *n);
     5.9  void vmx_restore_msrs(struct vcpu *d);
    5.10  #else
    5.11 -#define vmx_load_msrs(_p, _n)      ((void)0)
    5.12 +#define vmx_load_msrs(_n)          ((void)0)
    5.13  #define vmx_restore_msrs(_v)       ((void)0)
    5.14  #endif
    5.15  
     6.1 --- a/xen/include/xen/sched.h	Tue Aug 16 17:02:49 2005 +0000
     6.2 +++ b/xen/include/xen/sched.h	Tue Aug 16 18:02:24 2005 +0000
     6.3 @@ -258,12 +258,32 @@ extern void sync_lazy_execstate_mask(cpu
     6.4  extern void sync_lazy_execstate_all(void);
     6.5  extern int __sync_lazy_execstate(void);
     6.6  
     6.7 -/* Called by the scheduler to switch to another vcpu. */
     6.8 +/*
     6.9 + * Called by the scheduler to switch to another VCPU. On entry, although
    6.10 + * VCPUF_running is no longer asserted for @prev, its context is still running
    6.11 + * on the local CPU and is not committed to memory. The local scheduler lock
    6.12 + * is therefore still held, and interrupts are disabled, because the local CPU
    6.13 + * is in an inconsistent state.
    6.14 + * 
    6.15 + * The callee must ensure that the local CPU is no longer running in @prev's
    6.16 + * context, and that the context is saved to memory, before returning.
    6.17 + * Alternatively, if implementing lazy context switching, it suffices to ensure
    6.18 + * that invoking __sync_lazy_execstate() will switch and commit @prev's state.
    6.19 + */
    6.20  extern void context_switch(
    6.21      struct vcpu *prev, 
    6.22      struct vcpu *next);
    6.23  
    6.24 -/* Called by the scheduler to continue running the current vcpu. */
    6.25 +/*
    6.26 + * On some architectures (notably x86) it is not possible to entirely load
    6.27 + * @next's context with interrupts disabled. These may implement a function to
    6.28 + * finalise loading the new context after interrupts are re-enabled. This
    6.29 + * function is not given @prev and is not permitted to access it.
    6.30 + */
    6.31 +extern void context_switch_finalise(
    6.32 +    struct vcpu *next);
    6.33 +
    6.34 +/* Called by the scheduler to continue running the current VCPU. */
    6.35  extern void continue_running(
    6.36      struct vcpu *same);
    6.37
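
The comment block above also names the lazy alternative: an architecture may leave prev's register state live on the CPU as long as __sync_lazy_execstate() can later switch and commit it. For reference, this is the x86 implementation after this changeset (copied from the xen/arch/x86/domain.c hunk earlier on this page); it is now interrupt-safe, presumably so the sync can be driven from interrupt context on the target CPU (for example via sync_lazy_execstate_cpu()):

    int __sync_lazy_execstate(void)
    {
        unsigned long flags;
        int switch_required;

        local_irq_save(flags);

        switch_required = (percpu_ctxt[smp_processor_id()].curr_vcpu != current);

        if ( switch_required )
            __context_switch();

        local_irq_restore(flags);

        return switch_required;
    }
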